In [114]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import to_date, col, split, regexp_replace, when, lag, sum

In [115]:
spark = SparkSession.builder.appName("epl").getOrCreate()

In [116]:
spark

### Code for Results

In [117]:
resutls_path = r"C:\Users\Stanisław\Desktop\Programowanie\GitHub\PL_predictions\epl_predictions\data\raw\results.csv"

df = spark.read.option("header", True).csv(resutls_path)

In [118]:
df.show(5)

+-----+-------+-------+--------------------+---------+---+---+----------+----+-------------+-----+-------------+----------+---------------+----------------+-----+
|index|xG_Home|xG_Away|        Match Report|   Season| Wk|Day|      Date|Time|         Home|Score|         Away|Attendance|          Venue|         Referee|Notes|
+-----+-------+-------+--------------------+---------+---+---+----------+----+-------------+-----+-------------+----------+---------------+----------------+-----+
|    0|   NULL|   NULL|https://fbref.com...|2000-2001|  1|Sat|2000-08-19|NULL| Leeds United|  2–0|      Everton|    40,010|    Elland Road|Dermot Gallagher| NULL|
|    1|   NULL|   NULL|https://fbref.com...|2000-2001|  1|Sat|2000-08-19|NULL|    Liverpool|  1–0|Bradford City|    44,183|        Anfield|     Paul Durkin| NULL|
|    2|   NULL|   NULL|https://fbref.com...|2000-2001|  1|Sat|2000-08-19|NULL|      Chelsea|  4–2|     West Ham|    34,914|Stamford Bridge|   Graham Barber| NULL|
|    3|   NULL|   NULL

Casting Data column to right format

In [119]:
df = df.withColumn("Date", to_date(col("Date"), "yyyy-MM-dd"))

Creating Home_score and Away_score columns

In [120]:
df = df.withColumn("Home_score", split(col("Score"), "–").getItem(0).cast("integer"))
df = df.withColumn("Away_score", split(col("Score"), "–").getItem(1).cast("integer"))

Changing type of xG columns to float

In [121]:
df = df.withColumn("xG_Home", col("xG_Home").cast("float"))
df = df.withColumn("xG_Away", col("xG_Away").cast("float"))

Deleting Notes column

In [122]:
df = df.drop("Notes")

Changing type of Attendance column to int

In [123]:
df = df.withColumn("Attendance", regexp_replace(col("Attendance"), ",", "").cast("integer"))

Creating column Match_result

In [124]:
df = df.withColumn("Score_diff", col("Home_score") - col("Away_Score"))

result_condition = when(col("Score_diff") > 0, "W").when(col("Score_diff") == 0, "D").otherwise("L")
df = df.withColumn("Match_result", result_condition)

Creating column home_point_last_5_matches

In [125]:
df_copy = df.select("*")

In [206]:
df = df_copy.select("*")

In [207]:
#Creating two df where the main team is home team or away team
home_df = df.select(col("Date"), col("Home").alias("Team"), col("Away").alias("Opponent"), col("Match_result").alias("Result"))
away_df = df.select(col("Date"), col("Away").alias("Team"), col("Home").alias("Opponent"), when(col("Match_result") == "W", "L").when(col("Match_result") == "L", "W").otherwise("D").alias("Result"))

In [208]:
#Combining two df. Now we have with double the rows, beacuase we split the home and away team, to new rows
combined_df = home_df.union(away_df)
combined_df = combined_df.withColumn("Points", when(col("Result") == "W", 3).when(col("Result") == "L", 0).otherwise(1))

In [209]:
#Sum of last 5 matches
form_window = Window.partitionBy("Team").orderBy("Date").rowsBetween(-5, -1)

combined_df = combined_df.withColumn("Points_last_5_matches", sum("Points").over(form_window))

In [210]:
combined_df_prepared = combined_df.select("Date", "Team", "Points_last_5_matches").withColumnRenamed("Date", "Date_combined")

In [211]:
#Adding form column for the home team, to original df
df= df.join(combined_df_prepared, (df["Date"] == combined_df_prepared["Date_combined"]) & (df["Home"] == combined_df_prepared["Team"]), "left") \
       .withColumnRenamed("Points_last_5_matches", "Home_points_last_5_matches") \
       .drop("Date_combined", "Team")

In [None]:
#Adding form column for the away team, to original df
df = df.join(combined_df_prepared, (df["Date"] == combined_df_prepared["Date_combined"]) & (df["Away"] == combined_df_prepared["Team"]), "left") \
       .withColumnRenamed("Points_last_5_matches", "Away_points_last_5_matches") \
       .drop("Date_combined", "Team")

In [214]:
df.show(20)

+-----+-------+-------+--------------------+---------+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+----------+----------+----------+------------+--------------------------+--------------------------+
|index|xG_Home|xG_Away|        Match Report|   Season| Wk|Day|      Date|Time|           Home|Score|           Away|Attendance|            Venue|         Referee|Home_score|Away_score|Score_diff|Match_result|Home_points_last_5_matches|Away_points_last_5_matches|
+-----+-------+-------+--------------------+---------+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+----------+----------+----------+------------+--------------------------+--------------------------+
|    0|   NULL|   NULL|https://fbref.com...|2000-2001|  1|Sat|2000-08-19|NULL|   Leeds United|  2–0|        Everton|     40010|      Elland Road|Dermot Gallagher|         2|         0|         2|           W|   

In [41]:
#Creating points columns
df = df.withColumn("Home_Points", when(col("Match_result") == "W", 3).when(col("Match_result") == "D", 1).otherwise(0))
df = df.withColumn("Away_Points", when(col("Match_result") == "W", 3).when(col("Match_result") == "D", 1).otherwise(0))

home_window = Window.partitionBy("Home").orderBy("Date").rowsBetween(-5, -1)

home_test = df.withColumn("Home_team_points_last_5_matches", sum("Home_Points").over(home_window))
home_test.show()

+-----+-------+-------+--------------------+---------+---+---+----------+----+-------+-----+---------------+----------+--------+----------------+----------+----------+----------+------------+-----------+-----------+-------------------------------+
|index|xG_Home|xG_Away|        Match Report|   Season| Wk|Day|      Date|Time|   Home|Score|           Away|Attendance|   Venue|         Referee|Home_score|Away_score|Score_diff|Match_result|Home_Points|Away_Points|Home_team_points_last_5_matches|
+-----+-------+-------+--------------------+---------+---+---+----------+----+-------+-----+---------------+----------+--------+----------------+----------+----------+----------+------------+-----------+-----------+-------------------------------+
|   10|   NULL|   NULL|https://fbref.com...|2000-2001|  2|Mon|2000-08-21|NULL|Arsenal|  2–0|      Liverpool|     38014|Highbury|     Graham Poll|         2|         0|         2|           W|          3|          3|                           NULL|
|   23| 

In [32]:
df.printSchema()

root
 |-- index: string (nullable = true)
 |-- xG_Home: float (nullable = true)
 |-- xG_Away: float (nullable = true)
 |-- Match Report: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Wk: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- Home: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Away: string (nullable = true)
 |-- Attendance: integer (nullable = true)
 |-- Venue: string (nullable = true)
 |-- Referee: string (nullable = true)
 |-- Home_score: integer (nullable = true)
 |-- Away_score: integer (nullable = true)
 |-- Score_diff: integer (nullable = true)
 |-- Match_result: string (nullable = false)



In [33]:
df.show()

+-----+-------+-------+--------------------+---------+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+----------+----------+----------+------------+
|index|xG_Home|xG_Away|        Match Report|   Season| Wk|Day|      Date|Time|           Home|Score|           Away|Attendance|            Venue|         Referee|Home_score|Away_score|Score_diff|Match_result|
+-----+-------+-------+--------------------+---------+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+----------+----------+----------+------------+
|    0|   NULL|   NULL|https://fbref.com...|2000-2001|  1|Sat|2000-08-19|NULL|   Leeds United|  2–0|        Everton|     40010|      Elland Road|Dermot Gallagher|         2|         0|         2|           W|
|    1|   NULL|   NULL|https://fbref.com...|2000-2001|  1|Sat|2000-08-19|NULL|      Liverpool|  1–0|  Bradford City|     44183|          Anfield|     Paul Durkin|  

In [34]:
df.filter(col("Notes").isNotNull()).show()

+-----+-------+-------+------------+------+---+---+----+----+----+-----+----+----------+-----+-------+----------+----------+----------+------------+
|index|xG_Home|xG_Away|Match Report|Season| Wk|Day|Date|Time|Home|Score|Away|Attendance|Venue|Referee|Home_score|Away_score|Score_diff|Match_result|
+-----+-------+-------+------------+------+---+---+----+----+----+-----+----+----------+-----+-------+----------+----------+----------+------------+
+-----+-------+-------+------------+------+---+---+----+----+----+-----+----+----------+-----+-------+----------+----------+----------+------------+

