In [50]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    max as spark_max,
    monotonically_increasing_id,
    regexp_replace,
    split,
    struct,
    to_date,
    when,
)

In [2]:
# Build the Spark session for this notebook under the application name "epl";
# getOrCreate() hands back an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("epl")
    .getOrCreate()
)

In [3]:
# Bare expression: echoes the SparkSession object as this cell's output.
spark

In [27]:
# Location of the raw results CSV. Set the EPL_RESULTS_PATH environment variable to
# run the notebook on another machine; without it the original local path is used.
DEFAULT_RESULTS_PATH = r"C:\Users\Stanisław\Desktop\Programowanie\GitHub\PL_predictions\epl_predictions\data\raw\results.csv"
results_path = os.environ.get("EPL_RESULTS_PATH", DEFAULT_RESULTS_PATH)
resutls_path = results_path  # misspelled original name, kept so existing references still resolve

# Read with a header row and no schema inference: every column arrives as a string
# and is cast to its real type in the cells that follow.
df = spark.read.option("header", True).csv(results_path)

In [28]:
# Preview the first 5 rows of the freshly loaded frame.
df.show(5)

+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+-----+---------+
| Wk|Day|      Date|Time|         Home|Score|           Away|Attendance|          Venue|     Referee|        Match Report|Notes|xG_Home|xG_Away|index|   Season|
+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+-----+---------+
|  1|Sat|1995-08-19|NULL|  Southampton|  3–4|Nott'ham Forest|    15,164|       The Dell|Gary Willard|https://fbref.com...| NULL|   NULL|   NULL|    0|1995-1996|
|  1|Sat|1995-08-19|NULL|Newcastle Utd|  3–0|  Coventry City|    36,485|St. James' Park|Roger Dilkes|https://fbref.com...| NULL|   NULL|   NULL|    1|1995-1996|
|  1|Sat|1995-08-19|NULL|    Wimbledon|  3–2|         Bolton|     9,317|  Selhurst Park|Keith Cooper|https://fbref.com...| NULL|   NULL|   NULL|    2|1995-1996|
|  1|Sat|1995-08-19|NULL|    Liver

In [46]:
# Fetch the last row of the frame: tag every row with an increasing id, aggregate the
# max of a struct whose first field is that id (structs compare field by field, so the
# row with the largest id wins), then unpack the struct and discard the helper id.
# spark_max is pyspark.sql.functions.max; Python's builtin max() cannot aggregate a
# Column and raises "TypeError: Column is not iterable" on a fresh kernel.
last_row = (
    df.withColumn("id", monotonically_increasing_id())
    .select(spark_max(struct("id", *df.columns)).alias("x"))
    .select(col("x.*"))
    .drop("id")
)

last_row.show()

+---+---+----------+-----+--------------+-----+---------------+----------+------------------+------------+--------------------+-----+-------+-------+-----+---------+----------+----------+
| Wk|Day|      Date| Time|          Home|Score|           Away|Attendance|             Venue|     Referee|        Match Report|Notes|xG_Home|xG_Away|index|   Season|Home_score|Away_score|
+---+---+----------+-----+--------------+-----+---------------+----------+------------------+------------+--------------------+-----+-------+-------+-----+---------+----------+----------+
|  9|Fri|2024-10-25|20:00|Leicester City|  1–3|Nott'ham Forest|    31,879|King Power Stadium|Craig Pawson|https://fbref.com...| NULL|    0.6|    1.7|   80|2024-2025|         1|         3|
+---+---+----------+-----+--------------+-----+---------------+----------+------------------+------------+--------------------+-----+-------+-------+-----+---------+----------+----------+



In [30]:
# Turn the Date strings into a real date column using an explicit year-month-day pattern.
date_pattern = "yyyy-MM-dd"
df = df.withColumn("Date", to_date(col("Date"), date_pattern))

In [40]:
# Score looks like "3–4" (home–away, separated by an en dash, not an ASCII hyphen).
# Split it once and cast each side to an integer column.
score_parts = split(col("Score"), "–")
df = (
    df.withColumn("Home_score", score_parts.getItem(0).cast("integer"))
    .withColumn("Away_score", score_parts.getItem(1).cast("integer"))
)

In [43]:
# Both expected-goals columns are read as strings; cast each to float in turn.
for xg_column in ("xG_Home", "xG_Away"):
    df = df.withColumn(xg_column, col(xg_column).cast("float"))

In [48]:
# Remove the Notes column; the isNotNull filter on Notes in this notebook returned
# zero rows, so the column holds no values.
df = df.drop("Notes")

In [51]:
# Attendance is formatted with thousands separators (e.g. "15,164"): strip the commas
# first, then cast what remains to an integer.
attendance_digits = regexp_replace(col("Attendance"), ",", "")
df = df.withColumn("Attendance", attendance_digits.cast("integer"))

In [54]:
# Goal difference from the home side's point of view (home goals minus away goals).
df = df.withColumn("Score_diff", col("Home_score") - col("Away_score"))

# W / D / L for the home team. The loss branch is an explicit comparison instead of
# otherwise("L"): a NULL Score_diff (missing or unparseable Score) fails every
# comparison, so Match_result stays NULL rather than being recorded as a home defeat.
result_condition = (
    when(col("Score_diff") > 0, "W")
    .when(col("Score_diff") == 0, "D")
    .when(col("Score_diff") < 0, "L")
)
df = df.withColumn("Match_result", result_condition)

In [55]:
# Print column names and types to verify the casts applied in the earlier cells.
df.printSchema()

root
 |-- Wk: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- Home: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Away: string (nullable = true)
 |-- Attendance: integer (nullable = true)
 |-- Venue: string (nullable = true)
 |-- Referee: string (nullable = true)
 |-- Match Report: string (nullable = true)
 |-- xG_Home: float (nullable = true)
 |-- xG_Away: float (nullable = true)
 |-- index: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Home_score: integer (nullable = true)
 |-- Away_score: integer (nullable = true)
 |-- Score_diff: integer (nullable = true)
 |-- Match_result: string (nullable = false)



In [56]:
# Preview the transformed frame (show() with no argument prints at most 20 rows).
df.show()

+---+---+----------+----+---------------+-----+---------------+----------+---------------+----------------+--------------------+-------+-------+-----+---------+----------+----------+----------+------------+
| Wk|Day|      Date|Time|           Home|Score|           Away|Attendance|          Venue|         Referee|        Match Report|xG_Home|xG_Away|index|   Season|Home_score|Away_score|Score_diff|Match_result|
+---+---+----------+----+---------------+-----+---------------+----------+---------------+----------------+--------------------+-------+-------+-----+---------+----------+----------+----------+------------+
|  1|Sat|1995-08-19|NULL|    Southampton|  3–4|Nott'ham Forest|     15164|       The Dell|    Gary Willard|https://fbref.com...|   NULL|   NULL|    0|1995-1996|         3|         4|        -1|           L|
|  1|Sat|1995-08-19|NULL|  Newcastle Utd|  3–0|  Coventry City|     36485|St. James' Park|    Roger Dilkes|https://fbref.com...|   NULL|   NULL|    1|1995-1996|         3| 

In [47]:
# Sanity check: list any rows that carry a value in Notes.
# Guarded because an earlier cell drops Notes from df; filtering on a column that no
# longer exists would abort a top-to-bottom run with an unresolved-column error.
if "Notes" in df.columns:
    df.filter(col("Notes").isNotNull()).show()
else:
    print("Column 'Notes' is no longer present in df; run this check before dropping it.")

+---+---+----+----+----+-----+----+----------+-----+-------+------------+-----+-------+-------+-----+------+----------+----------+
| Wk|Day|Date|Time|Home|Score|Away|Attendance|Venue|Referee|Match Report|Notes|xG_Home|xG_Away|index|Season|Home_score|Away_score|
+---+---+----+----+----+-----+----+----------+-----+-------+------------+-----+-------+-------+-----+------+----------+----------+
+---+---+----+----+----+-----+----+----------+-----+-------+------------+-----+-------+-------+-----+------+----------+----------+

