In [46]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import to_date, col, split, regexp_replace, when, count, sum, concat_ws, least, greatest

In [2]:
spark = SparkSession.builder.appName("epl").getOrCreate()

In [3]:
spark

### Code for Results

In [26]:
resutls_path = r"C:\Users\Stanisław\Desktop\Programowanie\GitHub\PL_predictions\epl_predictions\data\raw\results.csv"

df = spark.read.option("header", True).csv(resutls_path)

In [27]:
df.show(5)

+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+
|index| Wk|Day|      Date|Time|         Home|Score|           Away|Attendance|          Venue|     Referee|        Match Report|Notes|xG_Home|xG_Away|   Season|
+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+
|    0|  1|Sat|1995-08-19|NULL|  Southampton|  3–4|Nott'ham Forest|    15,164|       The Dell|Gary Willard|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    1|  1|Sat|1995-08-19|NULL|Newcastle Utd|  3–0|  Coventry City|    36,485|St. James' Park|Roger Dilkes|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    2|  1|Sat|1995-08-19|NULL|    Wimbledon|  3–2|         Bolton|     9,317|  Selhurst Park|Keith Cooper|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    3|  1|Sat|1995-08-19|NULL|   

Casting Data column to right format

In [28]:
df = df.withColumn("Date", to_date(col("Date"), "yyyy-MM-dd"))

Creating Home_score and Away_score columns

In [29]:
df = df.withColumn("Home_score", split(col("Score"), "–").getItem(0).cast("integer"))
df = df.withColumn("Away_score", split(col("Score"), "–").getItem(1).cast("integer"))

Changing type of xG columns to float

In [30]:
df = df.withColumn("xG_Home", col("xG_Home").cast("float"))
df = df.withColumn("xG_Away", col("xG_Away").cast("float"))

Deleting Notes column

In [31]:
df = df.drop("Notes")

Changing type of Attendance column to int

In [32]:
df = df.withColumn("Attendance", regexp_replace(col("Attendance"), ",", "").cast("integer"))

Creating column Match_result

In [33]:
df = df.withColumn("Score_diff", col("Home_score") - col("Away_Score"))

result_condition = when(col("Score_diff") > 0, "W").when(col("Score_diff") == 0, "D").otherwise("L")
df = df.withColumn("Match_result", result_condition)

In [34]:
df.show(5)

+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-------+-------+---------+----------+----------+----------+------------+
|index| Wk|Day|      Date|Time|         Home|Score|           Away|Attendance|          Venue|     Referee|        Match Report|xG_Home|xG_Away|   Season|Home_score|Away_score|Score_diff|Match_result|
+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-------+-------+---------+----------+----------+----------+------------+
|    0|  1|Sat|1995-08-19|NULL|  Southampton|  3–4|Nott'ham Forest|     15164|       The Dell|Gary Willard|https://fbref.com...|   NULL|   NULL|1995-1996|         3|         4|        -1|           L|
|    1|  1|Sat|1995-08-19|NULL|Newcastle Utd|  3–0|  Coventry City|     36485|St. James' Park|Roger Dilkes|https://fbref.com...|   NULL|   NULL|1995-1996|         3|         0|         3|         

Creating column points_last_5_matches for home and away team

In [35]:
#Creating two df where the main team is home team or away team
home_df = df.select(col("Date"), col("Home").alias("Team"), col("Away").alias("Opponent"), col("Match_result").alias("Result"))
away_df = df.select(col("Date"), col("Away").alias("Team"), col("Home").alias("Opponent"), when(col("Match_result") == "W", "L").when(col("Match_result") == "L", "W").otherwise("D").alias("Result"))

In [36]:
#Combining two df. Now we have with double the rows, beacuase we split the home and away team, to new rows
combined_df = home_df.union(away_df)
combined_df = combined_df.withColumn("Points", when(col("Result") == "W", 3).when(col("Result") == "L", 0).otherwise(1))

In [37]:
match_count_window = Window.partitionBy("Team").orderBy("Date").rowsBetween(Window.unboundedPreceding, -1)
combined_df = combined_df.withColumn("Match_count", count("Points").over(match_count_window))

In [38]:
#Sum of last 5 matches
form_window = Window.partitionBy("Team").orderBy("Date").rowsBetween(-5, -1)

combined_df = combined_df.withColumn("Points_last_5_matches", when(col("Match_count") >= 5, sum("Points").over(form_window)).otherwise(None))

In [None]:
#This line of code fix te issue with naming of columns and adding the same columns with join
combined_df_prepared = combined_df.select("Date", "Team", "Points_last_5_matches").withColumnRenamed("Date", "Date_combined")

In [40]:
#Adding form column for the home team, to original df
df= df.join(combined_df_prepared, (df["Date"] == combined_df_prepared["Date_combined"]) & (df["Home"] == combined_df_prepared["Team"]), "left") \
       .withColumnRenamed("Points_last_5_matches", "Home_points_last_5_matches") \
       .drop("Date_combined", "Team")

In [41]:
#Adding form column for the away team, to original df
df = df.join(combined_df_prepared, (df["Date"] == combined_df_prepared["Date_combined"]) & (df["Away"] == combined_df_prepared["Team"]), "left") \
       .withColumnRenamed("Points_last_5_matches", "Away_points_last_5_matches") \
       .drop("Date_combined", "Team")

In [42]:
df.show(20)

+-----+---+---+----------+----+---------------+-----+---------------+----------+---------------+----------------+--------------------+-------+-------+---------+----------+----------+----------+------------+--------------------------+--------------------------+
|index| Wk|Day|      Date|Time|           Home|Score|           Away|Attendance|          Venue|         Referee|        Match Report|xG_Home|xG_Away|   Season|Home_score|Away_score|Score_diff|Match_result|Home_points_last_5_matches|Away_points_last_5_matches|
+-----+---+---+----------+----+---------------+-----+---------------+----------+---------------+----------------+--------------------+-------+-------+---------+----------+----------+----------+------------+--------------------------+--------------------------+
|    0|  1|Sat|1995-08-19|NULL|    Southampton|  3–4|Nott'ham Forest|     15164|       The Dell|    Gary Willard|https://fbref.com...|   NULL|   NULL|1995-1996|         3|         4|        -1|           L|           

Creating column with form in last 3 matches between two teams

In [None]:
#Creating second df with all the calculations
home_head_to_head = df.select(col("Home").alias("Team"), 
                              col("Away").alias("Opponent"), 
                              col("Date").alias("Date_head_to_head"), 
                              col("Match_result").alias("Result"))

away_head_to_head = df.select(col("Away").alias("Team"),
                              col("Home").alias("Opponent"), 
                              col("Date").alias("Date_head_to_head"), 
                              when(col("Match_result") == "W", "L").when(col("Match_result") == "L", "W").otherwise("D").alias("Result"))

In [129]:
#Creating column with unique paring of team and opponent
home_head_to_head = home_head_to_head.withColumn("Matchup", concat_ws("_", least("Team", "Opponent"), greatest("Team", "Opponent")))
away_head_to_head = away_head_to_head.withColumn("Matchup", concat_ws("_", least("Team", "Opponent"), greatest("Team", "Opponent")))

In [130]:
combined_head_to_head = home_head_to_head.union(away_head_to_head)
combined_head_to_head = combined_head_to_head.withColumn("Points", when(col("Result") == "W", 3).when(col("Result") == "L", 0).otherwise(1))

In [131]:
combined_head_to_head.show(5)

+-------------+---------------+-----------------+------+--------------------+------+
|         Team|       Opponent|Date_head_to_head|Result|             Matchup|Points|
+-------------+---------------+-----------------+------+--------------------+------+
|  Southampton|Nott'ham Forest|       1995-08-19|     L|Nott'ham Forest_S...|     0|
|Newcastle Utd|  Coventry City|       1995-08-19|     W|Coventry City_New...|     3|
|    Wimbledon|         Bolton|       1995-08-19|     W|    Bolton_Wimbledon|     3|
|    Liverpool| Sheffield Weds|       1995-08-19|     W|Liverpool_Sheffie...|     3|
|     West Ham|   Leeds United|       1995-08-19|     L|Leeds United_West...|     0|
+-------------+---------------+-----------------+------+--------------------+------+
only showing top 5 rows



In [None]:
# home_head_to_head = home_head_to_head.withColumn("Home_Points", when(col("Match_result") == "W", 3).when(col("Match_result") == "D", 1).otherwise(0))

# df_between_clubs = df_between_clubs.withColumn("Away_Points", when(col("Match_result") == "L", 3).when(col("Match_result") == "D", 1).otherwise(0))

In [132]:
#Creating Window that counts last matches - we don't want to have a value if certain number of matches between two clubs already happend
match_count_window = Window.partitionBy("Matchup").orderBy("Date_head_to_head").rowsBetween(Window.unboundedPreceding, -1)

#Sum of last 3 matches
# form_matchup_window = Window.partitionBy("Matchup").orderBy("Data_between_clubs").rowsBetween(-3, -1)

form_matchup_window = Window.partitionBy("Team").partitionBy("Matchup").orderBy("Date_head_to_head").rowsBetween(-3, -1)

In [None]:
#Creating column that helps with minimum number of matches
# home_head_to_head = home_head_to_head.withColumn("Match_count", count("Home_Points").over(match_count_window))

In [133]:
combined_head_to_head = combined_head_to_head.withColumn("Head_to_Head_Form", sum("Points").over(form_matchup_window))

# df_between_clubs = df_between_clubs.withColumn("Away_Head_to_Head_Form", when(col("Match_count") >= 3, sum("Away_Points").over(form_matchup_window)).otherwise(None))

In [134]:
combined_head_to_head.show(20)

+-----------+-----------+-----------------+------+-------------------+------+-----------------+
|       Team|   Opponent|Date_head_to_head|Result|            Matchup|Points|Head_to_Head_Form|
+-----------+-----------+-----------------+------+-------------------+------+-----------------+
|    Arsenal|Aston Villa|       1995-10-21|     W|Arsenal_Aston Villa|     3|             NULL|
|Aston Villa|    Arsenal|       1995-10-21|     L|Arsenal_Aston Villa|     0|                3|
|Aston Villa|    Arsenal|       1995-12-02|     D|Arsenal_Aston Villa|     1|                3|
|    Arsenal|Aston Villa|       1995-12-02|     D|Arsenal_Aston Villa|     1|                4|
|Aston Villa|    Arsenal|       1996-09-07|     D|Arsenal_Aston Villa|     1|                2|
|    Arsenal|Aston Villa|       1996-09-07|     D|Arsenal_Aston Villa|     1|                3|
|    Arsenal|Aston Villa|       1996-12-28|     D|Arsenal_Aston Villa|     1|                3|
|Aston Villa|    Arsenal|       1996-12-

In [None]:
test = [['AR', 'AV', 'AV', 'AR', 'AR', 'AV', 'AV', 'AR'], 
        ['AV', 'AR', 'AR', 'AV', 'AV', 'AR', 'AR', 'AV'],
        [3, 1, 1, 1, 1, 3, 3, 3],
        [0, 1, 1, 1, 1, 0, 0, 0]]

col = ['Home', 'Away', 'Home_points', 'Away_points']

In [22]:
df.printSchema()

root
 |-- index: string (nullable = true)
 |-- xG_Home: float (nullable = true)
 |-- xG_Away: float (nullable = true)
 |-- Match Report: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Wk: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- Home: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Away: string (nullable = true)
 |-- Attendance: integer (nullable = true)
 |-- Venue: string (nullable = true)
 |-- Referee: string (nullable = true)
 |-- Home_score: integer (nullable = true)
 |-- Away_score: integer (nullable = true)
 |-- Score_diff: integer (nullable = true)
 |-- Match_result: string (nullable = false)
 |-- Home_points_last_5_matches: long (nullable = true)
 |-- Away_points_last_5_matches: long (nullable = true)
 |-- Home_Points: integer (nullable = false)
 |-- Away_Points: integer (nullable = false)



In [23]:
df.show()

+-----+-------+-------+--------------------+---------+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+----------+----------+----------+------------+--------------------------+--------------------------+-----------+-----------+
|index|xG_Home|xG_Away|        Match Report|   Season| Wk|Day|      Date|Time|           Home|Score|           Away|Attendance|            Venue|         Referee|Home_score|Away_score|Score_diff|Match_result|Home_points_last_5_matches|Away_points_last_5_matches|Home_Points|Away_Points|
+-----+-------+-------+--------------------+---------+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+----------+----------+----------+------------+--------------------------+--------------------------+-----------+-----------+
|    0|   NULL|   NULL|https://fbref.com...|2000-2001|  1|Sat|2000-08-19|NULL|   Leeds United|  2–0|        Everton|     40010|      Elland