In [17]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import to_date, col, split, regexp_replace, when, count, sum, concat_ws, least, greatest, lag, collect_list, array_join
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [18]:
# Start the Spark session used by the whole notebook, or reuse the one that
# already exists under the application name "epl".
spark = (
    SparkSession.builder
    .appName("epl")
    .getOrCreate()
)

In [19]:
# Display the active SparkSession object as the cell output.
spark

### Code for League Table

In [28]:
# Location of the current league-table CSV.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs on this machine as written.
path = r"C:\Users\Stanisław\Desktop\Programowanie\GitHub\PL_predictions\epl_predictions\data\raw\current_table.csv"

In [29]:
# Load the league-table CSV, using its first line as the column names.
df = spark.read.option("header", True).csv(path)

In [30]:
# Preview the first five rows of the raw league table.
df.show(5)

+------+------+---+---------------+---+---+---+---+---+---+---+---+------+----+----+-----+------+---------+----------+-------------------+--------------+-----+
|index0|index1| Rk|          Squad| MP|  W|  D|  L| GF| GA| GD|Pts|Pts/MP|  xG| xGA|  xGD|xGD/90|   Last 5|Attendance|    Top Team Scorer|    Goalkeeper|Notes|
+------+------+---+---------------+---+---+---+---+---+---+---+---+------+----+----+-----+------+---------+----------+-------------------+--------------+-----+
|     0|     0|  1|      Liverpool| 21| 15|  5|  1| 50| 20|+30| 50|  2.38|49.4|18.8|+30.7| +1.46|W W D D W|    60,276| Mohamed Salah - 18|       Alisson| NULL|
|     1|     1|  2|        Arsenal| 22| 12|  8|  2| 43| 21|+22| 44|  2.00|36.7|19.6|+17.1| +0.78|W W D W D|    60,269|    Kai Havertz - 8|    David Raya| NULL|
|     2|     2|  3|Nott'ham Forest| 21| 12|  5|  4| 30| 20|+10| 41|  1.95|25.7|23.2| +2.5| +0.12|W W W W D|    30,032|    Chris Wood - 13|     Matz Sels| NULL|
|     3|     3|  4|  Newcastle Utd| 22| 

In [23]:
# Print the schema; the CSV was read without a schema or inferSchema, so every
# column comes back as a string.
print(df.schema)

StructType([StructField('index0', StringType(), True), StructField('index1', StringType(), True), StructField('Rk', StringType(), True), StructField('Squad', StringType(), True), StructField('MP', StringType(), True), StructField('W', StringType(), True), StructField('D', StringType(), True), StructField('L', StringType(), True), StructField('GF', StringType(), True), StructField('GA', StringType(), True), StructField('GD', StringType(), True), StructField('Pts', StringType(), True), StructField('Pts/MP', StringType(), True), StructField('xG', StringType(), True), StructField('xGA', StringType(), True), StructField('xGD', StringType(), True), StructField('xGD/90', StringType(), True), StructField('Last 5', StringType(), True), StructField('Attendance', StringType(), True), StructField('Top Team Scorer', StringType(), True), StructField('Goalkeeper', StringType(), True), StructField('Notes', StringType(), True)])


In [24]:
# Goal difference arrives as signed text (e.g. "+30"); store it as an integer.
goal_difference = col("GD").cast("integer")
df = df.withColumn("GD", goal_difference)

In [25]:
# Remove every space from the scorer text, including the one inside the
# player's name ("Mohamed Salah - 18" becomes "MohamedSalah-18").
scorer_without_spaces = regexp_replace(col("Top Team Scorer"), " ", "")
df = df.withColumn("Top Team Scorer", scorer_without_spaces)

In [26]:
# Preview the first five rows after the GD cast and the scorer clean-up.
df.show(5)

+------+------+---+---------------+---+---+---+---+---+---+---+---+------+----+----+-----+------+---------+----------+----------------+--------------+-----+
|index0|index1| Rk|          Squad| MP|  W|  D|  L| GF| GA| GD|Pts|Pts/MP|  xG| xGA|  xGD|xGD/90|   Last 5|Attendance| Top Team Scorer|    Goalkeeper|Notes|
+------+------+---+---------------+---+---+---+---+---+---+---+---+------+----+----+-----+------+---------+----------+----------------+--------------+-----+
|     0|     0|  1|      Liverpool| 21| 15|  5|  1| 50| 20| 30| 50|  2.38|49.4|18.8|+30.7| +1.46|W W D D W|    60,276| MohamedSalah-18|       Alisson| NULL|
|     1|     1|  2|        Arsenal| 22| 12|  8|  2| 43| 21| 22| 44|  2.00|36.7|19.6|+17.1| +0.78|W W D W D|    60,269|    KaiHavertz-8|    David Raya| NULL|
|     2|     2|  3|Nott'ham Forest| 21| 12|  5|  4| 30| 20| 10| 41|  1.95|25.7|23.2| +2.5| +0.12|W W W W D|    30,032|    ChrisWood-13|     Matz Sels| NULL|
|     3|     3|  4|  Newcastle Utd| 22| 11|  5|  6| 38| 26

In [27]:
# Collect the league table to the driver as a pandas DataFrame and print it.
print(df.toPandas())

   index0 index1  Rk            Squad  MP   W   D   L  GF  GA  ...  Pts/MP  \
0       0      0   1        Liverpool  21  15   5   1  50  20  ...    2.38   
1       1      1   2          Arsenal  22  12   8   2  43  21  ...    2.00   
2       2      2   3  Nott'ham Forest  21  12   5   4  30  20  ...    1.95   
3       3      3   4    Newcastle Utd  22  11   5   6  38  26  ...    1.73   
4       4      4   5          Chelsea  21  10   7   4  41  26  ...    1.76   
5       5      5   6      Bournemouth  22  10   7   5  36  26  ...    1.68   
6       6      6   7      Aston Villa  22  10   6   6  33  34  ...    1.64   
7       7      7   8  Manchester City  21  10   5   6  38  29  ...    1.67   
8       8      8   9           Fulham  22   8   9   5  34  30  ...    1.50   
9       9      9  10         Brighton  21   7  10   4  32  29  ...    1.48   
10     10     10  11        Brentford  22   8   4  10  40  39  ...    1.27   
11     11     11  12   Crystal Palace  22   6   9   7  25  28  .

### Code for Results

In [4]:
# Location of the historical match results CSV (absolute, machine-specific path).
resutls_path = r"C:\Users\Stanisław\Desktop\Programowanie\GitHub\PL_predictions\epl_predictions\data\raw\results.csv"

# From here on `df` holds the match results; it replaces the league-table
# frame that used the same name in the previous section.
df = spark.read.csv(resutls_path, header=True)

In [5]:
# Preview the first five rows of the raw match results.
df.show(5)

+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+
|index| Wk|Day|      Date|Time|         Home|Score|           Away|Attendance|          Venue|     Referee|        Match Report|Notes|xG_Home|xG_Away|   Season|
+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+
|    0|  1|Sat|1995-08-19|NULL|  Southampton|  3–4|Nott'ham Forest|    15,164|       The Dell|Gary Willard|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    1|  1|Sat|1995-08-19|NULL|Newcastle Utd|  3–0|  Coventry City|    36,485|St. James' Park|Roger Dilkes|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    2|  1|Sat|1995-08-19|NULL|    Wimbledon|  3–2|         Bolton|     9,317|  Selhurst Park|Keith Cooper|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|
|    3|  1|Sat|1995-08-19|NULL|   

In [6]:
# The row index is read as text; convert it to an integer so it sorts numerically.
row_index = F.col("index").cast("integer")
df = df.withColumn("index", row_index)

In [7]:
# Cast the matchweek number to an integer. The source column is spelled "Wk";
# the previews before and after this cell show it being replaced by a column
# spelled "WK". Presumably this relies on Spark resolving column names
# case-insensitively (the default) — TODO confirm if the setting is changed.
df = df.withColumn("WK", col("WK").cast("integer"))

Casting the Date column to the right format

In [8]:
# Parse the ISO-formatted text (e.g. "1995-08-19") into a proper date column.
match_date = F.to_date(F.col("Date"), "yyyy-MM-dd")
df = df.withColumn("Date", match_date)

Creating Home_score and Away_score columns

In [9]:
# "Score" looks like "3–4" (separated by an en dash, not a hyphen): the first
# part is the home side's goals, the second the away side's.
score_parts = split(col("Score"), "–")
df = (
    df.withColumn("Home_score", score_parts.getItem(0).cast("integer"))
    .withColumn("Away_score", score_parts.getItem(1).cast("integer"))
)

Changing type of xG columns to float

In [10]:
# Expected-goals columns are read as text; convert both to float.
for xg_column in ("xG_Home", "xG_Away"):
    df = df.withColumn(xg_column, col(xg_column).cast("float"))

Changing type of Attendance column to int

In [11]:
# Attendance uses a thousands separator ("15,164"): strip the commas, convert
# to integer, then replace missing attendances with 0.
attendance_digits = regexp_replace(col("Attendance"), ",", "")
df = (
    df.withColumn("Attendance", attendance_digits.cast("integer"))
    .fillna(0, subset=["Attendance"])
)

Creating column Match_result

In [12]:
# Goal margin from the home side's point of view. It is NULL whenever either
# score is missing.
df = df.withColumn("Score_diff", col("Home_score") - col("Away_score"))

# Home result: "W" (win), "D" (draw) or "L" (loss). Every outcome has its own
# branch on purpose: a catch-all `.otherwise("L")` would record a home loss for
# rows whose score is missing (NULL Score_diff). With no matching branch those
# rows stay NULL instead.
df = df.withColumn(
    "Match_result_home",
    when(col("Score_diff") > 0, "W")
    .when(col("Score_diff") == 0, "D")
    .when(col("Score_diff") < 0, "L"),
)

In [13]:
# Preview the first five rows with the typed columns and the derived result columns.
df.show(5)

+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+----------+----------+----------+-----------------+
|index| WK|Day|      Date|Time|         Home|Score|           Away|Attendance|          Venue|     Referee|        Match Report|Notes|xG_Home|xG_Away|   Season|Home_score|Away_score|Score_diff|Match_result_home|
+-----+---+---+----------+----+-------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+----------+----------+----------+-----------------+
|    0|  1|Sat|1995-08-19|NULL|  Southampton|  3–4|Nott'ham Forest|     15164|       The Dell|Gary Willard|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|         3|         4|        -1|                L|
|    1|  1|Sat|1995-08-19|NULL|Newcastle Utd|  3–0|  Coventry City|     36485|St. James' Park|Roger Dilkes|https://fbref.com...| NULL|   NULL|   NULL|19

Creating the Form_last_5_matches column (results of the previous five matches) for the home and away teams

In [14]:
# Build two views of every match, one per participating team, so that each
# team's matches can later be processed as a single sequence.

# View 1: the home side is "Team"; its result is the home result as stored.
home_df = df.select(
    col("Date"),
    col("Home").alias("Team"),
    col("Away").alias("Opponent"),
    col("Match_result_home").alias("Result"),
)

# View 2: the away side is "Team"; its result is the mirror of the home result.
# "D" is mapped explicitly (no catch-all), so a missing home result stays NULL
# rather than being recorded as a draw.
away_result = (
    when(col("Match_result_home") == "W", "L")
    .when(col("Match_result_home") == "L", "W")
    .when(col("Match_result_home") == "D", "D")
)

away_df = df.select(
    col("Date"),
    col("Away").alias("Team"),
    col("Home").alias("Opponent"),
    away_result.alias("Result"),
)

In [15]:
# Stack the home and away views. The result has twice as many rows as df,
# because every match now appears once per participating team.
combined_df = home_df.union(away_df)

# League points for the match: win = 3, draw = 1, loss = 0. Each result is
# matched explicitly, so a missing Result yields NULL points instead of being
# credited as a draw by a catch-all branch.
combined_df = combined_df.withColumn(
    "Points",
    when(col("Result") == "W", 3)
    .when(col("Result") == "D", 1)
    .when(col("Result") == "L", 0),
)

In [16]:
# For each team, look at all of its earlier matches in date order; the frame
# ends one row before the current one, so the current match is excluded.
match_count_window = (
    Window.partitionBy("Team")
    .orderBy("Date")
    .rowsBetween(Window.unboundedPreceding, -1)
)

# Number of matches (with non-null Points) the team has played before this one.
matches_played_before = F.count("Points").over(match_count_window)
combined_df = combined_df.withColumn("Match_count", matches_played_before)

In [17]:
# Form string: the results of the team's five previous matches, oldest first,
# concatenated (e.g. "DWDDW"). The current match is not part of the frame.
form_window = (
    Window.partitionBy("Team")
    .orderBy("Date")
    .rowsBetween(-5, -1)
)

previous_results = collect_list("Result").over(form_window)

# Only publish the form once the team has five earlier matches; before that
# the value is NULL.
combined_df = combined_df.withColumn(
    "Form_last_5_matches",
    when(col("Match_count") >= 5, array_join(previous_results, '')).otherwise(None),
)

In [18]:
# Preview the first ten per-team rows, including the match counter and form string.
combined_df.show(10)

+----------+-------+---------------+------+------+-----------+-------------------+
|      Date|   Team|       Opponent|Result|Points|Match_count|Form_last_5_matches|
+----------+-------+---------------+------+------+-----------+-------------------+
|1995-08-20|Arsenal|  Middlesbrough|     D|     1|          0|               NULL|
|1995-08-23|Arsenal|        Everton|     W|     3|          1|               NULL|
|1995-08-26|Arsenal|  Coventry City|     D|     1|          2|               NULL|
|1995-08-29|Arsenal|Nott'ham Forest|     D|     1|          3|               NULL|
|1995-09-10|Arsenal|Manchester City|     W|     3|          4|               NULL|
|1995-09-16|Arsenal|       West Ham|     W|     3|          5|              DWDDW|
|1995-09-23|Arsenal|    Southampton|     W|     3|          6|              WDDWW|
|1995-09-30|Arsenal|        Chelsea|     L|     0|          7|              DDWWW|
|1995-10-14|Arsenal|   Leeds United|     W|     3|          8|              DWWWL|
|199

In [19]:
# Keep only the columns needed for the joins below, and expose the date under a
# different name so that joining back to df does not produce two "Date" columns.
combined_df_prepared = combined_df.select(
    col("Date").alias("Date_combined"),
    col("Team"),
    col("Form_last_5_matches"),
)

In [20]:
# Attach the home team's form: match each fixture to the per-team row with the
# same date whose team is the home side.
home_form_match = (
    (df["Date"] == combined_df_prepared["Date_combined"])
    & (df["Home"] == combined_df_prepared["Team"])
)

df = (
    df.join(combined_df_prepared, home_form_match, "left")
    .withColumnRenamed("Form_last_5_matches", "Home_form_last_5_matches")
    .drop("Date_combined", "Team")
)

In [21]:
# Attach the away team's form: match each fixture to the per-team row with the
# same date whose team is the away side.
away_form_match = (
    (df["Date"] == combined_df_prepared["Date_combined"])
    & (df["Away"] == combined_df_prepared["Team"])
)

df = (
    df.join(combined_df_prepared, away_form_match, "left")
    .withColumnRenamed("Form_last_5_matches", "Away_form_last_5_matches")
    .drop("Date_combined", "Team")
)

In [22]:
# Put the matches in chronological order.
df = df.sort("Date")

In [23]:
# Print up to 200 rows to inspect the two form columns added by the joins.
df.show(200)

+-----+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+--------------------+-----+-------+-------+---------+----------+----------+----------+-----------------+------------------------+------------------------+
|index| WK|Day|      Date|Time|           Home|Score|           Away|Attendance|            Venue|         Referee|        Match Report|Notes|xG_Home|xG_Away|   Season|Home_score|Away_score|Score_diff|Match_result_home|Home_form_last_5_matches|Away_form_last_5_matches|
+-----+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+--------------------+-----+-------+-------+---------+----------+----------+----------+-----------------+------------------------+------------------------+
|    0|  1|Sat|1995-08-19|NULL|    Southampton|  3–4|Nott'ham Forest|     15164|         The Dell|    Gary Willard|https://fbref.com...| NULL|   NULL|   NULL|1995-1996|         3|         4|

Creating result of the previous match between two teams

In [24]:
# Working copy for the head-to-head feature: date, both clubs and the home
# result, each under an "_hth" name so they cannot collide with df's columns.
hth_aliases = {
    "Date": "Date_hth",
    "Home": "Home_hth",
    "Away": "Away_hth",
    "Match_result_home": "Match_result_hth",
}
head_to_head = df.select(
    [F.col(source).alias(alias) for source, alias in hth_aliases.items()]
)

head_to_head.show()

+----------+---------------+---------------+----------------+
|  Date_hth|       Home_hth|       Away_hth|Match_result_hth|
+----------+---------------+---------------+----------------+
|1995-08-19|    Southampton|Nott'ham Forest|               L|
|1995-08-19|  Newcastle Utd|  Coventry City|               W|
|1995-08-19|      Wimbledon|         Bolton|               W|
|1995-08-19|      Liverpool| Sheffield Weds|               W|
|1995-08-19|       West Ham|   Leeds United|               L|
|1995-08-19|      Blackburn|            QPR|               W|
|1995-08-19|    Aston Villa| Manchester Utd|               W|
|1995-08-19|Manchester City|      Tottenham|               D|
|1995-08-19|        Chelsea|        Everton|               D|
|1995-08-20|        Arsenal|  Middlesbrough|               D|
|1995-08-21|   Leeds United|      Liverpool|               W|
|1995-08-22|         Bolton|  Newcastle Utd|               L|
|1995-08-23|      Tottenham|    Aston Villa|               L|
|1995-08

In [25]:
# Result of the same match from the away side's point of view (mirror of the
# home result). "D" is mapped explicitly rather than through a catch-all, so a
# missing home result stays NULL instead of being reported as a draw.
head_to_head = head_to_head.withColumn(
    "Match_result_away",
    when(col("Match_result_hth") == "W", "L")
    .when(col("Match_result_hth") == "L", "W")
    .when(col("Match_result_hth") == "D", "D"),
)

In [26]:
# Order-independent key for a pair of clubs: the two names sorted and joined
# with "_", so A-vs-B and B-vs-A share the same Matchup value.
first_club = least("Home_hth", "Away_hth")
second_club = greatest("Home_hth", "Away_hth")
head_to_head = head_to_head.withColumn("Matchup", concat_ws("_", first_club, second_club))

In [27]:
# Meetings of each pair of clubs in chronological order, used with lag() to
# look at the previous meeting.
# No explicit row frame is given: lag() determines which row it reads from its
# own offset. The earlier rowsBetween(-1, -1) only worked because it happened
# to coincide with lag's default offset of 1 — presumably Spark rejects an
# explicit frame that does not match the offset (TODO confirm).
head_to_head_window = Window.partitionBy("Matchup").orderBy("Date_hth")

In [28]:
# Values from the previous meeting of the same two clubs (NULL for the first meeting).
previous_home = lag("Home_hth").over(head_to_head_window)
previous_away = lag("Away_hth").over(head_to_head_window)
previous_home_result = lag("Match_result_hth").over(head_to_head_window)
previous_away_result = lag("Match_result_away").over(head_to_head_window)

# Previous meeting's result from the perspective of the club that is at home
# now: take the home result if that club hosted last time, the away result if
# it travelled, and NULL when there is no previous meeting.
head_to_head = head_to_head.withColumn(
    "Last_match_between_clubs",
    when(col("Home_hth") == previous_home, previous_home_result)
    .when(col("Home_hth") == previous_away, previous_away_result)
    .otherwise(None),
)

In [29]:
# Preview the head-to-head rows with the previous-meeting result.
head_to_head.show()

+----------+-----------+-----------+----------------+-----------------+-------------------+------------------------+
|  Date_hth|   Home_hth|   Away_hth|Match_result_hth|Match_result_away|            Matchup|Last_match_between_clubs|
+----------+-----------+-----------+----------------+-----------------+-------------------+------------------------+
|1995-10-21|    Arsenal|Aston Villa|               W|                L|Arsenal_Aston Villa|                    NULL|
|1995-12-02|Aston Villa|    Arsenal|               D|                D|Arsenal_Aston Villa|                       L|
|1996-09-07|Aston Villa|    Arsenal|               D|                D|Arsenal_Aston Villa|                       D|
|1996-12-28|    Arsenal|Aston Villa|               D|                D|Arsenal_Aston Villa|                       D|
|1997-10-26|    Arsenal|Aston Villa|               D|                D|Arsenal_Aston Villa|                       D|
|1998-05-10|Aston Villa|    Arsenal|               W|           

In [30]:
# Attach the previous head-to-head result (seen from the home team's side) to
# the original df, matching rows on match date and home team.
# Note: "Match_result_away" is not in the drop list below, so that helper
# column stays in df after the join.
df = df.join(head_to_head, (df["Date"] == head_to_head["Date_hth"]) & (df["Home"] == head_to_head["Home_hth"]), "left") \
       .withColumnRenamed("Last_match_between_clubs", "Home_last_match_between_clubs") \
       .drop("Date_hth", "Home_hth", "Matchup", "Away_hth", "Match_result_hth")

In [31]:
# Restore the original row order of the results file.
df = df.sort("index")

In [32]:
# Print up to 200 rows to inspect the head-to-head column added by the join.
df.show(200)

+-----+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+--------------------+-----+-------+-------+---------+----------+----------+----------+-----------------+------------------------+------------------------+-----------------+-----------------------------+
|index| WK|Day|      Date|Time|           Home|Score|           Away|Attendance|            Venue|         Referee|        Match Report|Notes|xG_Home|xG_Away|   Season|Home_score|Away_score|Score_diff|Match_result_home|Home_form_last_5_matches|Away_form_last_5_matches|Match_result_away|Home_last_match_between_clubs|
+-----+---+---+----------+----+---------------+-----+---------------+----------+-----------------+----------------+--------------------+-----+-------+-------+---------+----------+----------+----------+-----------------+------------------------+------------------------+-----------------+-----------------------------+
|    0|  1|Sat|1995-08-19|NULL|    Southampton

In [33]:
# The away team's view of the previous meeting is the mirror of the home
# team's view; when there is no previous meeting the value stays NULL.
home_view = col("Home_last_match_between_clubs")
away_view = (
    when(home_view == "W", "L")
    .when(home_view == "D", "D")
    .when(home_view == "L", "W")
    .otherwise(None)
)
df = df.withColumn("Away_last_match_between_clubs", away_view)

In [34]:
# Collect the processed results to the driver as a pandas DataFrame.
df_pandas = df.toPandas()

# The CSV export below is commented out, so this cell writes nothing to disk.
# df_pandas.to_csv("results_processed.csv", index=False)

Creating league table after each matchweek

In [35]:
# Build two per-team views of every match (home side as "Team", then away side
# as "Team") with scores, xG and attendance expressed relative to that team.
mk_home_df = df.select(
    col("WK"),
    col("Season"),
    col("Home").alias("Team"),
    col("Home_score").alias("Team_score"),
    col("Away_score").alias("Opponent_score"),
    col("xG_Home").alias("xG_Team"),
    col("xG_Away").alias("xG_Opponent"),
    col("Match_result_home").alias("Result"),
    col("Attendance").alias("Match_attendance"),
)

# The away side's result is the mirror of the home result. "D" is mapped
# explicitly rather than through a catch-all, so a missing home result stays
# NULL instead of being counted as a draw.
mk_away_result = (
    when(col("Match_result_home") == "W", "L")
    .when(col("Match_result_home") == "L", "W")
    .when(col("Match_result_home") == "D", "D")
)

mk_away_df = df.select(
    col("WK"),
    col("Season"),
    col("Away").alias("Team"),
    col("Away_score").alias("Team_score"),
    col("Home_score").alias("Opponent_score"),
    col("xG_Away").alias("xG_Team"),
    col("xG_Home").alias("xG_Opponent"),
    mk_away_result.alias("Result"),
    col("Attendance").alias("Match_attendance"),
)

In [36]:
# Stack both views: one row per (match, team).
mk_combined_df = mk_home_df.union(mk_away_df)

# Per-match points and 0/1 indicators for win, loss and draw. Points match each
# result explicitly (win = 3, draw = 1, loss = 0): with a catch-all branch a
# row with a missing Result would receive a draw's point while its Wins,
# Losses and Draws indicators are all 0.
mk_combined_df = (
    mk_combined_df
    .withColumn(
        "Points_this_match",
        when(col("Result") == "W", 3)
        .when(col("Result") == "D", 1)
        .when(col("Result") == "L", 0),
    )
    .withColumn("Wins", when(col("Result") == "W", 1).otherwise(0))
    .withColumn("Losses", when(col("Result") == "L", 1).otherwise(0))
    .withColumn("Draws", when(col("Result") == "D", 1).otherwise(0))
)

mk_combined_df.show()

+---+---------+---------------+----------+--------------+-------+-----------+------+----------------+-----------------+----+------+-----+
| WK|   Season|           Team|Team_score|Opponent_score|xG_Team|xG_Opponent|Result|Match_attendance|Points_this_match|Wins|Losses|Draws|
+---+---------+---------------+----------+--------------+-------+-----------+------+----------------+-----------------+----+------+-----+
|  1|1995-1996|    Southampton|         3|             4|   NULL|       NULL|     L|           15164|                0|   0|     1|    0|
|  1|1995-1996|  Newcastle Utd|         3|             0|   NULL|       NULL|     W|           36485|                3|   1|     0|    0|
|  1|1995-1996|      Wimbledon|         3|             2|   NULL|       NULL|     W|            9317|                3|   1|     0|    0|
|  1|1995-1996|      Liverpool|         1|             0|   NULL|       NULL|     W|           40535|                3|   1|     0|    0|
|  1|1995-1996|       West Ham|   

In [37]:
# Running window per team and season: every matchweek from the first one up to
# and including the current row, ordered by matchweek number.
mk_window = (
    Window.partitionBy("Season", "Team")
    .orderBy("WK")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [38]:
# Season-to-date statistics for each team after each matchweek. Every running
# total is a sum over mk_window; the derived columns (MP, GD, per-match ratios)
# are computed from those totals.
league_table_df = (
    mk_combined_df
    .withColumn("Cumulative_Goals_For", F.sum("Team_score").over(mk_window))
    .withColumn("Cumulative_Goals_Against", F.sum("Opponent_score").over(mk_window))
    .withColumn("Cumulative_Points", F.sum("Points_this_match").over(mk_window))
    .withColumn("Cumulative_xG", F.round(F.sum("xG_Team").over(mk_window), 2))
    .withColumn("Cumulative_xGA", F.round(F.sum("xG_Opponent").over(mk_window), 2))
    .withColumn("W", F.sum("Wins").over(mk_window))
    .withColumn("D", F.sum("Draws").over(mk_window))
    .withColumn("L", F.sum("Losses").over(mk_window))
    # Matches played = wins + draws + losses so far.
    .withColumn("MP", col("W") + col("D") + col("L"))
    .withColumn("GD", col("Cumulative_Goals_For") - col("Cumulative_Goals_Against"))
    .withColumn('Pts/MP', F.round(col("Cumulative_Points") / col("MP"), 2))
    # Despite its name this is the mean attendance per match played so far
    # (running attendance total divided by MP), rounded to a whole number.
    .withColumn(
        "Cumulative_attendance",
        F.round(F.sum("Match_attendance").over(mk_window) / col("MP")).cast("long"),
    )
    .withColumn("Cumulative_xGD", F.round(col("Cumulative_xG") - col("Cumulative_xGA"), 2))
    .withColumn("Cumulative_xGD_per90", F.round(col("Cumulative_xGD") / col("MP"), 2))
    # Same value as "GD" above, kept under the name used for ranking later.
    .withColumn("Goal_Difference", col("Cumulative_Goals_For") - col("Cumulative_Goals_Against"))
)

In [39]:
# Preview 40 rows of the running per-team statistics.
league_table_df.show(40)

+---+---------+-----------+----------+--------------+-------+-----------+------+----------------+-----------------+----+------+-----+--------------------+------------------------+-----------------+-------------+--------------+---+---+---+---+---+------+---------------------+--------------+--------------------+---------------+
| WK|   Season|       Team|Team_score|Opponent_score|xG_Team|xG_Opponent|Result|Match_attendance|Points_this_match|Wins|Losses|Draws|Cumulative_Goals_For|Cumulative_Goals_Against|Cumulative_Points|Cumulative_xG|Cumulative_xGA|  W|  D|  L| MP| GD|Pts/MP|Cumulative_attendance|Cumulative_xGD|Cumulative_xGD_per90|Goal_Difference|
+---+---------+-----------+----------+--------------+-------+-----------+------+----------------+-----------------+----+------+-----+--------------------+------------------------+-----------------+-------------+--------------+---+---+---+---+---+------+---------------------+--------------+--------------------+---------------+
|  1|1995-1996| 

In [40]:
# Rank the teams within each season and matchweek. Ordering follows the usual
# league-table criteria: points, then goal difference, then goals scored.
# Goals scored was previously missing, so teams level on points and goal
# difference always shared a position. rank() still gives equal positions to
# teams that are level on all three criteria.
league_table_window_rank = (
    Window.partitionBy("Season", "WK")
    .orderBy(
        col("Cumulative_Points").desc(),
        col("Goal_Difference").desc(),
        col("Cumulative_Goals_For").desc(),
    )
)
league_table_df = league_table_df.withColumn("Position", F.rank().over(league_table_window_rank))

In [41]:
# Preview 40 rows of the table including the new Position column.
league_table_df.show(40)

+---+---------+---------------+----------+--------------+-------+-----------+------+----------------+-----------------+----+------+-----+--------------------+------------------------+-----------------+-------------+--------------+---+---+---+---+---+------+---------------------+--------------+--------------------+---------------+--------+
| WK|   Season|           Team|Team_score|Opponent_score|xG_Team|xG_Opponent|Result|Match_attendance|Points_this_match|Wins|Losses|Draws|Cumulative_Goals_For|Cumulative_Goals_Against|Cumulative_Points|Cumulative_xG|Cumulative_xGA|  W|  D|  L| MP| GD|Pts/MP|Cumulative_attendance|Cumulative_xGD|Cumulative_xGD_per90|Goal_Difference|Position|
+---+---------+---------------+----------+--------------+-------+-----------+------+----------------+-----------------+----+------+-----+--------------------+------------------------+-----------------+-------------+--------------+---+---+---+---+---+------+---------------------+--------------+--------------------+---

In [42]:
# Number of (season, matchweek, team) rows in the reconstructed league tables.
league_table_df.count()

22040

In [43]:
# Remove the single-match columns; only the season-to-date figures are kept.
per_match_columns = [
    "Team_score",
    "Opponent_score",
    "xG_Team",
    "xG_Opponent",
    "Result",
    "Points_this_match",
    "Wins",
    "Losses",
    "Draws",
]
league_table_df = league_table_df.drop(*per_match_columns)

In [44]:
# Add the columns that exist in the current league table but cannot be derived
# from match results; they are filled with NULL strings.
for placeholder_column in ("Top Team Scorer", "Goalkeeper", "Notes"):
    league_table_df = league_table_df.withColumn(placeholder_column, F.lit(None).cast(StringType()))

# Row identifier: increasing and unique, but not guaranteed to be consecutive.
league_table_df = league_table_df.withColumn("index", F.monotonically_increasing_id())

In [45]:
# Final column layout of the reconstructed league table: cumulative columns get
# the short names used in the current league table, and columns that would
# clash with the results frame get a "_league_table" suffix.
league_table_columns = [
    F.col("index").alias("index_league_table"),
    F.col("WK").alias("wk_league_table"),
    F.col("Season").alias("season_league_table"),
    F.col("Position").alias("rk"),
    F.col("Team").alias("Squad"),
    F.col("MP"),
    F.col("W"),
    F.col("D"),
    F.col("L"),
    F.col("Cumulative_Goals_For").alias("GF"),
    F.col("Cumulative_Goals_Against").alias("GA"),
    F.col("Goal_Difference").alias("GD"),
    F.col("Cumulative_Points").alias("Pts"),
    F.col("Pts/MP"),
    F.col("Cumulative_attendance").alias("attendance_league_table"),
    F.col("Top Team Scorer"),
    F.col("Goalkeeper"),
    F.col("Notes").alias("notes_league_table"),
    F.col("Cumulative_xG").alias("xG"),
    F.col("Cumulative_xGA").alias("xGA"),
    F.col("Cumulative_xGD").alias("xGD"),
    F.col("Cumulative_xGD_per90").alias("xGD/90"),
]
league_table_df = league_table_df.select(*league_table_columns)

In [55]:
# Preview the league table in its final column layout.
league_table_df.show()

+------------------+---------------+-------------------+---+---------------+---+---+---+---+---+---+---+---+------+-----------------------+---------------+----------+------------------+----+----+----+------+
|index_league_table|wk_league_table|season_league_table| rk|          Squad| MP|  W|  D|  L| GF| GA| GD|Pts|Pts/MP|attendance_league_table|Top Team Scorer|Goalkeeper|notes_league_table|  xG| xGA| xGD|xGD/90|
+------------------+---------------+-------------------+---+---------------+---+---+---+---+---+---+---+---+------+-----------------------+---------------+----------+------------------+----+----+----+------+
|                 0|              2|          1995-1996|  1|  Newcastle Utd|  1|  1|  0|  0|  3|  0|  3|  3|   3.0|                  36485|           NULL|      NULL|              NULL|NULL|NULL|NULL|  NULL|
|                 1|              2|          1995-1996|  2|    Aston Villa|  1|  1|  0|  0|  3|  1|  2|  3|   3.0|                  34655|           NULL|      NULL|  

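Shifting wk_league_table and season_league_table forward by one matchweek, so that each match is joined to the table from before it was played (matchweek 1 uses the final table of the previous season)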

In [54]:
# Move every table forward by one matchweek so that a match in week N is
# paired with the standings after week N-1.
# NOTE: this cell reads and overwrites the same columns, so running it more
# than once shifts the tables again.
table_week = F.col("wk_league_table")
table_season = F.col("season_league_table")

# "1995-1996" -> "1996-1997": add one to both years of the season label.
following_season = F.concat(
    (table_season.substr(1, 4).cast("int") + 1).cast("string"),
    F.lit("-"),
    (table_season.substr(6, 4).cast("int") + 1).cast("string"),
)

league_table_df = (
    league_table_df
    .withColumn("wk_league_table", table_week + 1)
    # A shifted value of 39 wraps around to matchweek 1 ...
    .withColumn("wk_league_table", F.when(table_week == 39, 1).otherwise(table_week))
    # ... and those wrapped rows are relabelled with the following season.
    .withColumn("season_league_table", F.when(table_week == 1, following_season).otherwise(table_season))
)

In [47]:
# Collect the shifted league tables to the driver as a pandas DataFrame.
df_pandas = league_table_df.toPandas()

# The CSV export below is commented out, so this cell writes nothing to disk.
# df_pandas.to_csv("league_tables_after_mk.csv", index=False)

Connecting tables into one that will be used in ml

Mogę podzielić tabelę z wynikami na dwie: jedną dla home team i drugą dla away team. Wtedy może jeszcze powstać trzecia tabela, która będzie łączyć obie (ale dodatkowo może łączyć się z ligową tabelą).

Mogę też połączyć to wszystko w jedną dużą tabelę, która będzie zawierać wszystkie potrzebne informacje (niekoniecznie wszystkie muszą być użyteczne)

Niezależnie od sposobu, do aktualnego meczu muszę przypisywać dane z poprzedniego meczu (np. dlatego, że przewidując wynik, nie mamy jeszcze informacji o ligowej tabeli po tamtej kolejce). Problemem jest pierwsza kolejka, która dla jakości przewidywań nie powinna mieć pustych wartości. Moim zdaniem najlepszym rozwiązaniem jest skorzystanie z takich samych danych jak przy innych kolejkach, ale dla końcowej tabeli z poprzedniego sezonu (Jak to zrobić?).

In [56]:
# Attach the home team's league-table row for the same season and matchweek.
home_table_match = (
    (df["Home"] == league_table_df["Squad"])
    & (df["WK"] == league_table_df["WK_league_table"])
    & (df["Season"] == league_table_df["season_league_table"])
)

# League-table column -> name it takes for the home side. Entries that match
# no column in the joined frame are ignored by withColumnRenamed.
home_column_names = {
    "rk": "Rk_home",
    "MP": "Mp_home",
    "W": "W_home",
    "D": "D_home",
    "L": "L_home",
    "GF": "Season_GF_home",
    "GA": "Season_GA_home",
    "GD": "Season_GD_home",
    "Pts": "Pts_home",
    "Pts/MP": "Pts_per_mp_home",
    "xG": "Season_xG_home",
    "xGA": "Season_xGA_home",
    "xGD": "Season_xGD_home",
    "xGD/90": "Season_xGD_per_90_home",
    "Last_match_between_clubs": "Last_match_between_clubs_home",
    "notes_league_table": "Notes_league_table_home",
}

combined_df_league_table = df.join(league_table_df, home_table_match, "left")
for table_column, home_column in home_column_names.items():
    combined_df_league_table = combined_df_league_table.withColumnRenamed(table_column, home_column)

# Remove the join keys and the league-table columns that are not used as features.
combined_df_league_table = combined_df_league_table.drop(
    "Date_hth", "Home_hth", "Matchup", "Away_hth", "Match_result_hth", "index_league_table", "season_league_table",
    "attendance_league_table", "Top Team Scorer", "Goalkeeper", "wk_league_table", "Squad",
)

In [57]:
# Attach the away team's league-table row for the same season and matchweek.
# The condition refers to the frame that is actually being joined
# (combined_df_league_table) rather than to its ancestor df.
away_table_match = (
    (combined_df_league_table["Away"] == league_table_df["Squad"])
    & (combined_df_league_table["WK"] == league_table_df["WK_league_table"])
    & (combined_df_league_table["Season"] == league_table_df["season_league_table"])
)

# League-table column -> name it takes for the away side. The home-side join
# has already renamed its copies, so these short names are unambiguous here.
away_column_names = {
    "rk": "Rk_away",
    "MP": "Mp_away",
    "W": "W_away",
    "D": "D_away",
    "L": "L_away",
    "GF": "Season_GF_away",
    "GA": "Season_GA_away",
    "GD": "Season_GD_away",
    "Pts": "Pts_away",
    "Pts/MP": "Pts_per_mp_away",
    "xG": "Season_xG_away",
    "xGA": "Season_xGA_away",
    "xGD": "Season_xGD_away",
    "xGD/90": "Season_xGD_per_90_away",
    "notes_league_table": "Notes_league_table_away",
}

combined_df_league_table = combined_df_league_table.join(league_table_df, away_table_match, "left")
for table_column, away_column in away_column_names.items():
    combined_df_league_table = combined_df_league_table.withColumnRenamed(table_column, away_column)

# Remove the join keys and the league-table columns that are not used as
# features. The rename of "Last_match_between_clubs" and the drops of the
# "*_hth" / "Matchup" columns were removed: no such columns exist in either
# side of this join, so those entries had no effect.
combined_df_league_table = combined_df_league_table.drop(
    "index_league_table", "season_league_table", "attendance_league_table",
    "Top Team Scorer", "Goalkeeper", "wk_league_table", "Squad",
)

In [58]:
# Restore the original row order of the results file.
combined_df_league_table = combined_df_league_table.sort("index")

In [59]:
# Preview the first ten rows of the combined results and league-table frame.
combined_df_league_table.show(10)

+-----+---+---+----------+----+---------------+-----+---------------+----------+---------------+------------+--------------------+-----+-------+-------+---------+----------+----------+----------+-----------------+------------------------+------------------------+-----------------+-----------------------------+-----------------------------+-------+-------+------+------+------+--------------+--------------+--------------+--------+---------------+-----------------------+--------------+---------------+---------------+----------------------+-------+-------+------+------+------+--------------+--------------+--------------+--------+---------------+-----------------------+--------------+---------------+---------------+----------------------+
|index| WK|Day|      Date|Time|           Home|Score|           Away|Attendance|          Venue|     Referee|        Match Report|Notes|xG_Home|xG_Away|   Season|Home_score|Away_score|Score_diff|Match_result_home|Home_form_last_5_matches|Away_form_last_5_m

In [60]:
# Collect the combined frame to the driver as a pandas DataFrame.
df_pandas = combined_df_league_table.toPandas()

# Write the result to "results_league_table.csv" (a relative path, so it lands
# in the current working directory) without the pandas index column.
df_pandas.to_csv("results_league_table.csv", index=False)