# Football Data Analysis of Italian Serie A

The data is taken from https://datahub.io/sports-data/italian-serie-a#resource-italian-serie-a_zip

## Imports

In [79]:
from pyspark import SparkContext,  SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# For the purposes of debugging

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

We shall load only the data for the 2018-2019 season, as the data for the other seasons is not complete and is too big to work with.
For the purposes of this notebook, it will suffice.

In [80]:
# Create Spark Context

conf = SparkConf().setAppName("Spark").setMaster("local[*]")

# Reuse the context already running in this kernel, or start a new one with `conf`.
# getOrCreate keeps the cell safe to re-run without a bare `except: pass`, which
# would also hide genuine start-up failures and could leave `sc` undefined.
sc = SparkContext.getOrCreate(conf=conf)

# Create Spark Session

spark = SparkSession.builder.appName("Spark").getOrCreate()

In [81]:
# Read the season CSV: the first row is used as the header and the column
# types are inferred from the values.

season_csv_path = "./data/season-1819.csv"
original_fb_data = spark.read.csv(season_csv_path, inferSchema=True, header=True)


In [82]:

# Print the column names and inferred types of the raw dataframe, as a quick check on the load

original_fb_data.printSchema()

root
 |-- Div: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- FTHG: integer (nullable = true)
 |-- FTAG: integer (nullable = true)
 |-- FTR: string (nullable = true)
 |-- HTHG: integer (nullable = true)
 |-- HTAG: integer (nullable = true)
 |-- HTR: string (nullable = true)
 |-- HS: integer (nullable = true)
 |-- AS: integer (nullable = true)
 |-- HST: integer (nullable = true)
 |-- AST: integer (nullable = true)
 |-- HF: integer (nullable = true)
 |-- AF: integer (nullable = true)
 |-- HC: integer (nullable = true)
 |-- AC: integer (nullable = true)
 |-- HY: integer (nullable = true)
 |-- AY: integer (nullable = true)
 |-- HR: integer (nullable = true)
 |-- AR: integer (nullable = true)
 |-- B365H: double (nullable = true)
 |-- B365D: double (nullable = true)
 |-- B365A: double (nullable = true)
 |-- BWH: double (nullable = true)
 |-- BWD: double (nullable = true)
 |-- BWA: double (nulla

### Exercise 2 

Eliminate the columns that are not needed for the analysis - ['Div', 'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA', 'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5', 'BbAH', 'BbAHh', 'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA', 'PSCH', 'PSCD', 'PSCA']

In [83]:
# Columns the exercise lists as not needed for the analysis

columns_to_drop = [
    'Div',
    'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA',
    'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5',
    'BbAH', 'BbAHh', 'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA',
    'PSCH', 'PSCD', 'PSCA',
]

# Build a new dataframe without those columns; the raw dataframe is left untouched
fb_data = original_fb_data.drop(*columns_to_drop)

# To inspect the remaining columns: fb_data.printSchema()


### Exercise 3

Create a columns.txt file that contains a map of the columns to their new names. Use this file to rename the columns.


In [84]:
# Map each abbreviated column name to the descriptive name it should get

new_names = {
    'FTHG': 'final_time_home_goals',
    'FTAG': 'final_time_away_goals',
    'FTR': 'final_time_result',
    'HTHG': 'half_time_home_goals',
    'HTAG': 'half_time_away_goals',
    'HTR': 'half_time_result',
    'HS': 'home_shots',
    'AS': 'away_shots',
    'HST': 'home_shots_on_target',
    'AST': 'away_shots_on_target',
    'HF': 'home_fouls',
    'AF': 'away_fouls',
    'HC': 'home_corners',
    'AC': 'away_corners',
    'HY': 'home_yellow_cards',
    'AY': 'away_yellow_cards',
    'HR': 'home_red_cards',
    'AR': 'away_red_cards',
}

# The columns to rename are exactly the keys of the map, in insertion order

columns_to_rename = list(new_names)

# Hand the map to the project helper together with the target directory.
# NOTE(review): presumably this writes ./data/columns.txt - confirm in utils.

from utils import create_columns

create_columns(new_names, "./data")









In [85]:
# Load the old-name -> new-name map from columns.txt.
# Each non-empty line is parsed as "<old name> -> <new name>".

column_map = {}

with open("./data/columns.txt", "r") as f:
    # Iterate the file lazily instead of materialising every line with readlines()
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line), which would
            # otherwise fail to unpack into a key/value pair.
            continue
        # Split on the first "->" only, so a value that itself contains "->"
        # does not break the unpacking. A line without "->" still raises ValueError.
        key, value = line.split("->", 1)
        column_map[key.strip()] = value.strip()

# Rename the mapped columns and keep every other column under its current name

fb_data_renamed = fb_data.select([col(c).alias(column_map.get(c, c)) for c in fb_data.columns])

# To inspect the renamed columns: fb_data_renamed.printSchema()

### Exercise 4

Change final_time_result to categories home, draw, away

In [86]:
# Change the values in the final_time_result column to home, draw, away

# Every result code is mapped explicitly and any other value (including null)
# is passed through unchanged. A catch-all `.otherwise("away")` would label
# missing or unexpected results as away wins and, if this cell is run a second
# time, turn every already-converted "home"/"draw" value into "away".
# NOTE(review): assumes away wins are coded "A" (football-data convention) - confirm.
ft_result = col("final_time_result")

fb_data_renamed = fb_data_renamed.withColumn(
    "final_time_result",
    when(ft_result == "H", "home")
    .when(ft_result == "D", "draw")
    .when(ft_result == "A", "away")
    .otherwise(ft_result),
)

### Exercise 5

Find all the games where the home team won

In [87]:
# Keep only the games whose full-time result is a home win

is_home_win = col("final_time_result") == "home"
home_wins = fb_data_renamed.filter(is_home_win)

### Exercise 6 

Convert the Date column from "dd/MM/yyyy" strings to a date type (displayed as "yyyy-MM-dd")

In [88]:
# Parse the "dd/MM/yyyy" strings in the Date column into a date type

parsed_date = to_date(col("Date"), "dd/MM/yyyy")
fb_data_renamed = fb_data_renamed.withColumn("Date", parsed_date)

# To inspect the result: fb_data_renamed.select("Date").show(5)


### Exercise 7

Add precision columns (shots on target / total shots) to all the games where the team with more shots on target lost

In [93]:

# Find all the games where the team with more shots on target has lost

home_on_target = col("home_shots_on_target")
away_on_target = col("away_shots_on_target")
match_result = col("final_time_result")

# One predicate per side; each is grouped explicitly before being OR-ed together
home_outshot_but_lost = (home_on_target > away_on_target) & (match_result == "away")
away_outshot_but_lost = (home_on_target < away_on_target) & (match_result == "home")

upper_shots_lost = fb_data_renamed.filter(home_outshot_but_lost | away_outshot_but_lost)

# Precision = shots on target divided by total shots, for each side
home_precision = home_on_target / col("home_shots")
away_precision = away_on_target / col("away_shots")

upper_shots_lost = (
    upper_shots_lost
    .withColumn("home_precision", home_precision)
    .withColumn("away_precision", away_precision)
)

# Show the first 5 rows of the teams and the precision columns

upper_shots_lost.select("HomeTeam", "AwayTeam", "home_precision", "away_precision").show(5)


+---------+--------+------------------+------------------+
| HomeTeam|AwayTeam|    home_precision|    away_precision|
+---------+--------+------------------+------------------+
|   Empoli|Cagliari|0.4444444444444444|0.4166666666666667|
|    Genoa|  Empoli|               0.4|0.6666666666666666|
| Atalanta|Cagliari|             0.625|              0.75|
|Sampdoria|  Napoli|0.5714285714285714|               0.5|
|    Inter|   Parma|0.4583333333333333|              0.75|
+---------+--------+------------------+------------------+
only showing top 5 rows



### Exercise 8

Find the "pure" wins: games where the winning team conceded no goals


In [95]:
# Find the wins in which the losing side scored no goals

match_result = col("final_time_result")

# Home win with zero away goals, or away win with zero home goals
home_clean_win = (match_result == "home") & (col("final_time_away_goals") == 0)
away_clean_win = (match_result == "away") & (col("final_time_home_goals") == 0)

pure_wins = fb_data_renamed.filter(home_clean_win | away_clean_win)

# Show the first 5 rows of the teams and the goals columns

pure_wins.select("HomeTeam", "AwayTeam", "final_time_home_goals", "final_time_away_goals").show(5)

+--------+---------+---------------------+---------------------+
|HomeTeam| AwayTeam|final_time_home_goals|final_time_away_goals|
+--------+---------+---------------------+---------------------+
| Bologna|     Spal|                    0|                    1|
|  Empoli| Cagliari|                    2|                    0|
|Sassuolo|    Inter|                    1|                    0|
|  Torino|     Roma|                    0|                    1|
|Atalanta|Frosinone|                    4|                    0|
+--------+---------+---------------------+---------------------+
only showing top 5 rows



### Exercise 9

Find the game (or games, if tied) with the most total goals

In [96]:
# Add the total number of goals scored in each game

games_with_totals = fb_data_renamed.withColumn(
    "total_goals", col("final_time_home_goals") + col("final_time_away_goals")
)

# Find the highest total with a single aggregation instead of sorting the whole
# dataframe just to read its first row. On an empty dataframe the aggregate is
# None rather than a missing row, so indexing the result does not raise.

max_total_goals = games_with_totals.agg({"total_goals": "max"}).first()[0]

# Get all the games where the total goals are equal to the highest total goals

highest_goals = games_with_totals.filter(col("total_goals") == max_total_goals)

# Show up to 5 of those games with their date, teams and total goals

highest_goals.select("Date", "HomeTeam", "AwayTeam", "total_goals").show(5)

+----------+--------+---------+-----------+
|      Date|HomeTeam| AwayTeam|total_goals|
+----------+--------+---------+-----------+
|2018-09-02|Sassuolo|    Genoa|          8|
|2018-12-29|Sassuolo| Atalanta|          8|
|2019-03-16|Sassuolo|Sampdoria|          8|
+----------+--------+---------+-----------+

