# Project to create a predictive model for football matches

## Import the necessary libraries

In [4]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn: preprocessing is imported as a module so the cell is valid
# Python; TODO narrow this to the specific classes once they are chosen.
from sklearn import preprocessing

#pipelines
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.compose import ColumnTransformer

#notebook settings
# show every column when a DataFrame is displayed instead of eliding the middle ones
pd.set_option('display.max_columns', None)


## Import the dataset

In [5]:
# Read the match records from the CSV file, then preview the first five rows.
data = pd.read_csv(filepath_or_buffer='data/football_data.csv')
data.head(n=5)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Season
0,13/08/05,Aston Villa,Bolton,2.0,2.0,D,2.0,2.0,D,M Riley,3.0,13.0,2.0,6.0,14.0,16.0,7.0,8.0,0.0,2.0,0.0,0.0,2.3,3.25,3.0,2005
1,13/08/05,Everton,Man United,0.0,2.0,A,0.0,1.0,A,G Poll,10.0,12.0,5.0,5.0,15.0,14.0,8.0,6.0,3.0,1.0,0.0,0.0,5.0,3.4,1.72,2005
2,13/08/05,Fulham,Birmingham,0.0,0.0,D,0.0,0.0,D,R Styles,15.0,7.0,7.0,4.0,12.0,13.0,6.0,6.0,1.0,2.0,0.0,0.0,2.37,3.25,2.87,2005
3,13/08/05,Man City,West Brom,0.0,0.0,D,0.0,0.0,D,C Foy,15.0,13.0,8.0,3.0,13.0,11.0,3.0,6.0,2.0,3.0,0.0,0.0,1.72,3.4,5.0,2005
4,13/08/05,Middlesbrough,Liverpool,0.0,0.0,D,0.0,0.0,D,M Halsey,4.0,16.0,2.0,7.0,17.0,11.0,5.0,0.0,2.0,3.0,1.0,0.0,2.87,3.2,2.4,2005


# EDA (Exploratory Data Analysis)

## Dealing with missing values

In [8]:
# Count the missing entries in each column.
data.isna().sum()

Date        1
HomeTeam    1
AwayTeam    1
FTHG        1
FTAG        1
FTR         1
HTHG        1
HTAG        1
HTR         1
Referee     1
HS          1
AS          1
HST         1
AST         1
HF          1
AF          1
HC          1
AC          1
HY          1
AY          1
HR          1
AR          1
B365H       1
B365D       1
B365A       1
Season      0
dtype: int64

In [9]:
# Show every row that has at least one missing entry.
data.loc[data.isna().any(axis='columns')]

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Season
3800,,,,,,,,,,,,,,,,,,,,,,,,,,2014


In [None]:
# drop rows with null values
class DropNullValues(BaseEstimator, TransformerMixin):
    """Transformer that removes every row containing a missing value.

    The transformer has no hyperparameters and learns nothing in ``fit``.

    NOTE: only ``X`` is filtered. ``transform`` never sees a separately
    held target, so any ``y`` kept outside ``X`` must be filtered by the
    caller to stay aligned with the surviving rows.
    """

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself.

        Parameters
        ----------
        X : pandas.DataFrame
            Ignored.
        y : ignored
            Accepted so the signature matches other transformers.
        """
        return self

    def transform(self, X):
        """Return a new frame holding only the rows of ``X`` without nulls.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is not modified.

        Returns
        -------
        pandas.DataFrame
            The complete rows of ``X``, keeping their original index labels.
        """
        # dropna() already returns a new object and leaves X untouched,
        # so a defensive copy beforehand would only duplicate the data.
        return X.dropna()

## Dealing with duplicated values

In [10]:
# Count the rows that exactly repeat an earlier row.
data.duplicated(keep='first').sum()

0

## Unique values per column

In [12]:
# Number of distinct values in each column (missing values are not counted).
data.nunique(dropna=True)

Date        1946
HomeTeam      43
AwayTeam      43
FTHG          10
FTAG          10
FTR            3
HTHG           6
HTAG           6
HTR            3
Referee       61
HS            40
AS            32
HST           24
AST           21
HF            29
AF            26
HC            21
AC            19
HY             8
AY            10
HR             3
AR             3
B365H        132
B365D         51
B365A        125
Season        19
dtype: int64

## Data type information

In [13]:
# Show the dtype of each column.
data.dtypes

Date         object
HomeTeam     object
AwayTeam     object
FTHG        float64
FTAG        float64
FTR          object
HTHG        float64
HTAG        float64
HTR          object
Referee      object
HS          float64
AS          float64
HST         float64
AST         float64
HF          float64
AF          float64
HC          float64
AC          float64
HY          float64
AY          float64
HR          float64
AR          float64
B365H       float64
B365D       float64
B365A       float64
Season        int64
dtype: object

In [None]:
# convert date column to datetime
class ConvertDateColumn(BaseEstimator, TransformerMixin):
    """Transformer that parses a text date column into datetime values.

    The match dates in this dataset are written day-first (for example
    ``13/08/05`` in the 2005 season is 13 August 2005), so parsing
    defaults to ``dayfirst=True``. With the pandas default
    (``dayfirst=False``) an ambiguous value such as ``01/02/06`` would be
    read as January 2 instead of 1 February.

    Parameters
    ----------
    column : str, default='Date'
        Name of the column to convert.
    dayfirst : bool, default=True
        Forwarded to ``pd.to_datetime``; set to ``False`` for month-first
        data.
    """

    def __init__(self, column='Date', dayfirst=True):
        # Stored under the same names as the arguments so that
        # get_params / set_params / clone can discover them.
        self.column = column
        self.dayfirst = dayfirst

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself.

        Parameters
        ----------
        X : pandas.DataFrame
            Ignored.
        y : ignored
            Accepted so the signature matches other transformers.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.column`` parsed to datetimes.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` whose ``self.column`` holds the parsed dates.

        Raises
        ------
        KeyError
            If ``self.column`` is not a column of ``X``.
        """
        # Work on a copy so the caller's frame keeps its original strings.
        X = X.copy()
        X[self.column] = pd.to_datetime(X[self.column], dayfirst=self.dayfirst)
        return X