In [79]:
import os

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import to_date, col, split, regexp_replace, when, count, sum, concat_ws, least, greatest, lag

In [80]:
# Start (or reuse) the Spark session for this notebook under the app name "epl".
spark = (
    SparkSession.builder
    .appName("epl")
    .getOrCreate()
)

In [81]:
# Echo the SparkSession object as this cell's output.
spark

### Code for Results

In [82]:
# Location of the raw results CSV. Set the EPL_RESULTS_PATH environment variable to
# point at the file on another machine; when it is unset the original local path is used.
results_path = os.environ.get(
    "EPL_RESULTS_PATH",
    r"C:\Users\Stanisław\Desktop\Programowanie\GitHub\PL_predictions\epl_predictions\data\raw\results.csv",
)
resutls_path = results_path  # original (misspelled) name, kept in case other cells still use it

# The first row is used as the header. No schema inference is requested, so the
# columns are cast to their proper types in the cells that follow.
df = spark.read.option("header", True).csv(results_path)

In [83]:
# Preview the first 5 rows of the raw data.
df.show(n=5)

+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+
|index| Wk|Day|      Date|Time|         Home|Score|           Away|Attendance|          Venue|     Referee|        Match Report|Notes|xG_Home|xG_Away|   Season|
+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+
|    0|  1|Sat|1995-08-19|NULL|  Southampton|  3–4|Nott'ham Forest|    15,164|       The Dell|Gary Willard|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    1|  1|Sat|1995-08-19|NULL|Newcastle Utd|  3–0|  Coventry City|    36,485|St. James' Park|Roger Dilkes|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    2|  1|Sat|1995-08-19|NULL|    Wimbledon|  3–2|         Bolton|     9,317|  Selhurst Park|Keith Cooper|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    3|  1|Sat|1995-08-19|NULL|   

Casting the Date column to the right format (date type)

In [84]:
# Parse the Date strings (e.g. 1995-08-19) into a proper date column.
date_pattern = "yyyy-MM-dd"
df = df.withColumn("Date", to_date(col("Date"), date_pattern))

Creating Home_score and Away_score columns

In [85]:
# Score looks like "3–4" (home–away, separated by an en dash): split it once
# and take each side as an integer.
score_parts = split(col("Score"), "–")
df = (
    df.withColumn("Home_score", score_parts.getItem(0).cast("integer"))
      .withColumn("Away_score", score_parts.getItem(1).cast("integer"))
)

Changing type of xG columns to float

In [86]:
# Expected-goals columns are cast to float.
df = (
    df.withColumn("xG_Home", col("xG_Home").cast("float"))
      .withColumn("xG_Away", col("xG_Away").cast("float"))
)

Deleting Notes column

In [87]:
# Columns that are not carried forward into the rest of the notebook.
columns_to_drop = ["Notes"]
df = df.drop(*columns_to_drop)

Changing type of Attendance column to int

In [88]:
# Attendance is written with thousands separators (e.g. "15,164"):
# strip the commas first, then cast to integer.
attendance_digits = regexp_replace(col("Attendance"), ",", "")
df = df.withColumn("Attendance", attendance_digits.cast("integer"))

Creating column Match_result (W/D/L from the home team's perspective)

In [89]:
# Goal difference from the home side's point of view (null when a score is missing).
# The column is referenced by its exact name, Away_score, so this does not rely on
# Spark's case-insensitive column resolution.
df = df.withColumn("Score_diff", col("Home_score") - col("Away_score"))

# W / D / L for the home team. Every outcome has its own condition and there is no
# catch-all branch, so a match without a score gets a null result instead of "L".
result_condition = (
    when(col("Score_diff") > 0, "W")
    .when(col("Score_diff") == 0, "D")
    .when(col("Score_diff") < 0, "L")
)
df = df.withColumn("Match_result", result_condition)

In [90]:
# Preview the first 5 rows with the new score and result columns.
df.show(n=5)

+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-------+-------+---------+----------+----------+----------+------------+
|index| Wk|Day|      Date|Time|         Home|Score|           Away|Attendance|          Venue|     Referee|        Match Report|xG_Home|xG_Away|   Season|Home_score|Away_score|Score_diff|Match_result|
+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-------+-------+---------+----------+----------+----------+------------+
|    0|  1|Sat|1995-08-19|NULL|  Southampton|  3–4|Nott'ham Forest|     15164|       The Dell|Gary Willard|https://fbref.com...|   NULL|   NULL|1995-1996|         3|         4|        -1|           L|
|    1|  1|Sat|1995-08-19|NULL|Newcastle Utd|  3–0|  Coventry City|     36485|St. James' Park|Roger Dilkes|https://fbref.com...|   NULL|   NULL|1995-1996|         3|         0|         3|         

Creating the Home_points_last_5_matches and Away_points_last_5_matches columns (points each team earned in its previous 5 matches)

In [91]:
# Build two views of every match: one where "Team" is the home side and one where
# "Team" is the away side. Both have the columns Date, Team, Opponent, Result in that order.
home_df = df.select(col("Date"), col("Home").alias("Team"), col("Away").alias("Opponent"), col("Match_result").alias("Result"))

# Match_result is written from the home team's perspective, so W and L are swapped for
# the away team and D stays D. Each outcome is mapped explicitly: a missing result stays
# null instead of silently becoming a draw.
away_result = (
    when(col("Match_result") == "W", "L")
    .when(col("Match_result") == "L", "W")
    .when(col("Match_result") == "D", "D")
)
away_df = df.select(col("Date"), col("Away").alias("Team"), col("Home").alias("Opponent"), away_result.alias("Result"))

In [92]:
# Stack the two views. The result has twice as many rows as df, because every match
# now appears once for the home team and once for the away team.
combined_df = home_df.union(away_df)

# League points for the result: win = 3, draw = 1, loss = 0. Each outcome is mapped
# explicitly, so a missing result yields null points rather than counting as a draw.
combined_df = combined_df.withColumn(
    "Points",
    when(col("Result") == "W", 3)
    .when(col("Result") == "D", 1)
    .when(col("Result") == "L", 0),
)

In [93]:
# Per team, ordered by date: every earlier row up to (but not including) the current one.
match_count_window = (
    Window.partitionBy("Team")
    .orderBy("Date")
    .rowsBetween(Window.unboundedPreceding, -1)
)

# Number of earlier matches of the team that have a Points value.
previous_matches = count("Points").over(match_count_window)
combined_df = combined_df.withColumn("Match_count", previous_matches)

In [94]:
# Points collected in the 5 matches before the current one (current match excluded).
form_window = Window.partitionBy("Team").orderBy("Date").rowsBetween(-5, -1)

# `sum` here is the pyspark.sql.functions aggregate imported at the top, not the builtin.
points_in_window = sum("Points").over(form_window)
# The form is only filled in once the team already has at least 5 earlier matches.
has_full_history = col("Match_count") >= 5

combined_df = combined_df.withColumn(
    "Points_last_5_matches",
    when(has_full_history, points_in_window).otherwise(None),
)

In [95]:
# Keep only the columns needed for the joins below and rename Date, so the joined
# frame does not end up with two columns called "Date".
combined_df_prepared = combined_df.select(
    col("Date").alias("Date_combined"),
    "Team",
    "Points_last_5_matches",
)

In [96]:
# Attach the home team's form to the original df: match on date and home team,
# rename the form column, then remove the helper join columns.
home_form_condition = (
    (df["Date"] == combined_df_prepared["Date_combined"])
    & (df["Home"] == combined_df_prepared["Team"])
)
df = (
    df.join(combined_df_prepared, home_form_condition, "left")
    .withColumnRenamed("Points_last_5_matches", "Home_points_last_5_matches")
    .drop("Date_combined", "Team")
)

In [97]:
# Attach the away team's form to the original df: match on date and away team,
# rename the form column, then remove the helper join columns.
away_form_condition = (
    (df["Date"] == combined_df_prepared["Date_combined"])
    & (df["Away"] == combined_df_prepared["Team"])
)
df = (
    df.join(combined_df_prepared, away_form_condition, "left")
    .withColumnRenamed("Points_last_5_matches", "Away_points_last_5_matches")
    .drop("Date_combined", "Team")
)

In [98]:
# Preview the first 20 rows, now including both form columns.
df.show(n=20)

+-----+---+---+----------+----+---------------+-----+---------------+----------+---------------+----------------+--------------------+-------+-------+---------+----------+----------+----------+------------+--------------------------+--------------------------+
|index| Wk|Day|      Date|Time|           Home|Score|           Away|Attendance|          Venue|         Referee|        Match Report|xG_Home|xG_Away|   Season|Home_score|Away_score|Score_diff|Match_result|Home_points_last_5_matches|Away_points_last_5_matches|
+-----+---+---+----------+----+---------------+-----+---------------+----------+---------------+----------------+--------------------+-------+-------+---------+----------+----------+----------+------------+--------------------------+--------------------------+
|    0|  1|Sat|1995-08-19|NULL|    Southampton|  3–4|Nott'ham Forest|     15164|       The Dell|    Gary Willard|https://fbref.com...|   NULL|   NULL|1995-1996|         3|         4|        -1|           L|           