# Machine learning and prediction


## Data Cleaning and Prep

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('df.csv') # our data is already sorted by date from before.

# Deleting columns we do not need.
del df["Unnamed: 0"]
del df["Squad"]
del df["Squad_opp"]

df["opp_code"] = df["team_opp"].astype("category").cat.codes # giving codes to the opponent team
df["day_code"] = df["Day"].astype("category").cat.codes # giving codes to the opponent team


In [3]:
df.columns

Index(['Day', 'Date', 'team', 'team_opp', 'season', 'score', 'score_opp',
       'home', 'result', '# Pl', 'Age', 'MP', 'Starts', 'Gls', 'Ast', 'G+A',
       'G-PK', 'PK', 'PKatt', 'Gls90', 'Ast90', 'G+A90', 'G-PK90', 'G+A-PK',
       'Min%', 'Subs', 'Mn/Sub', 'PPM', 'onG', 'onGA', '+/-', '# Pl.1', 'Min',
       'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS', 'CS%',
       'SoT', 'SoT/90', 'G/SoT', 'Fls', '# Pl_opp', 'Age_opp', 'MP_opp',
       'Starts_opp', 'Gls_opp', 'Ast_opp', 'G+A_opp', 'G-PK_opp', 'PK_opp',
       'PKatt_opp', 'Gls90_opp', 'Ast90_opp', 'G+A90_opp', 'G-PK90_opp',
       'G+A-PK_opp', 'season_opp', 'Min%_opp', 'Subs_opp', 'Mn/Sub_opp',
       'PPM_opp', 'onG_opp', 'onGA_opp', '+/-_opp', '# Pl_opp.1', 'Min_opp',
       'GA_opp', 'GA90_opp', 'SoTA_opp', 'Saves_opp', 'Save%_opp', 'W_opp',
       'D_opp', 'L_opp', 'CS_opp', 'CS%_opp', 'SoT_opp', 'SoT/90_opp',
       'G/SoT_opp', 'Fls_opp', 'opp_code', 'day_code'],
      dtype='object')

In [4]:
# Checking for any null values

nulls = pd.isnull(df).sum()
nulls[nulls>0] # no none null values

Series([], dtype: int64)

In [5]:
df

Unnamed: 0,Day,Date,team,team_opp,season,score,score_opp,home,result,# Pl,...,D_opp,L_opp,CS_opp,CS%_opp,SoT_opp,SoT/90_opp,G/SoT_opp,Fls_opp,opp_code,day_code
0,Tue,2010-09-14,Lyon,Schalke 04,2010,1,0,1,1,20,...,4,3,3,25.0,65,5.42,0.32,195,81,4
1,Tue,2010-09-14,Panathinaikos,Barcelona,2010,1,5,0,0,21,...,4,1,5,38.5,98,7.54,0.29,134,11,4
2,Tue,2010-09-14,Rubin Kazan,FC Copenhagen,2010,0,1,0,0,20,...,5,4,3,25.0,21,1.75,0.24,114,28,4
3,Tue,2010-09-14,Schalke 04,Lyon,2010,0,1,0,0,27,...,2,3,2,25.0,44,5.50,0.25,105,47,4
4,Tue,2010-09-14,Hapoel Tel Aviv,Benfica,2010,0,2,0,0,19,...,0,4,1,16.7,28,4.67,0.25,102,15,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2917,Tue,2022-05-03,Villarreal,Liverpool,2021,2,3,1,0,25,...,1,2,4,30.8,79,6.08,0.34,151,44,4
2918,Wed,2022-05-04,Manchester City,Real Madrid,2021,1,3,0,0,26,...,0,4,5,38.5,66,4.83,0.38,120,74,5
2919,Wed,2022-05-04,Real Madrid,Manchester City,2021,3,1,1,1,25,...,2,3,4,33.3,65,5.27,0.40,126,51,5
2920,Sat,2022-05-28,Liverpool,Real Madrid,2021,0,1,1,0,28,...,0,4,5,38.5,66,4.83,0.38,120,74,1


## Initial Prediction


In [6]:
# Checking the result column (the column we need to predict)

df["result"].value_counts()


1    1153
0    1153
2     616
Name: result, dtype: int64

We can see that the draw values are not balanced. We have to take care fo this after the initial stage. Probably using bagging (under/over Bagging) For now lets establish a baseline and see if we can beat it with a random forrest.

### Establishing  a Baseline

lets see what percentage of the results belong to home and awa games, that is, what percentage of the games are won at home, what percentage are drew ad what percentage are lost.

In [7]:
df.groupby('home').apply(lambda x: x[x['result']==1].shape[0]/x.shape[0])

home
0    0.318275
1    0.470910
dtype: float64

In [8]:
df.groupby('home').apply(lambda x: x[x['result']==2].shape[0]/x.shape[0])

home
0    0.210815
1    0.210815
dtype: float64

47% of the games are won at home, and 21% are drawn. Naturally 32% are lost with the home advantage.

### Standard Random Forrest Classifier 

In [45]:
from sklearn.preprocessing import MinMaxScaler # to scale

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit 
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.metrics import accuracy_score #if you predicted a win what percentage of the time the team actually won etc.. 
from sklearn.metrics  import precision_score 

All the training data has to come from THE PAST. As we have a time series data. 

In [10]:
train = df[df["season"]<2021]
test  = df[df["season"]==2021]

In [11]:
print(train.shape)
print(test.shape)

(2678, 88)
(244, 88)


In [12]:
df.columns

Index(['Day', 'Date', 'team', 'team_opp', 'season', 'score', 'score_opp',
       'home', 'result', '# Pl', 'Age', 'MP', 'Starts', 'Gls', 'Ast', 'G+A',
       'G-PK', 'PK', 'PKatt', 'Gls90', 'Ast90', 'G+A90', 'G-PK90', 'G+A-PK',
       'Min%', 'Subs', 'Mn/Sub', 'PPM', 'onG', 'onGA', '+/-', '# Pl.1', 'Min',
       'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS', 'CS%',
       'SoT', 'SoT/90', 'G/SoT', 'Fls', '# Pl_opp', 'Age_opp', 'MP_opp',
       'Starts_opp', 'Gls_opp', 'Ast_opp', 'G+A_opp', 'G-PK_opp', 'PK_opp',
       'PKatt_opp', 'Gls90_opp', 'Ast90_opp', 'G+A90_opp', 'G-PK90_opp',
       'G+A-PK_opp', 'season_opp', 'Min%_opp', 'Subs_opp', 'Mn/Sub_opp',
       'PPM_opp', 'onG_opp', 'onGA_opp', '+/-_opp', '# Pl_opp.1', 'Min_opp',
       'GA_opp', 'GA90_opp', 'SoTA_opp', 'Saves_opp', 'Save%_opp', 'W_opp',
       'D_opp', 'L_opp', 'CS_opp', 'CS%_opp', 'SoT_opp', 'SoT/90_opp',
       'G/SoT_opp', 'Fls_opp', 'opp_code', 'day_code'],
      dtype='object')

### Initial Predictor

In [13]:
predictors = ['Age','day_code','home','opp_code']

rf = RandomForestClassifier(n_estimators=200, min_samples_split=40, random_state=1)

rf.fit(train[predictors], train["result"])
preds = rf.predict(test[predictors])
accuracy = accuracy_score(test["result"], preds)

accuracy

0.45081967213114754

This is terrible! We need to improve on this as it is below the benchmark. 

In [66]:
def get_acc (model, train, test, predictors):
    model.fit(train[predictors], train["result"])
    preds = model.predict(test[predictors])

    accuracy = accuracy_score(test["result"], preds)
    percision = precision_score(test["result"], preds, average = 'weighted')
    print("accuracy is: ", accuracy)
    print("percision is: ", percision)
    
    return accuracy, percision

In [15]:
predictors = ['score', 'score_opp',
       'home', '# Pl', 'Age', 'MP', 'Starts', 'Gls', 'Ast', 'G+A',
       'G-PK', 'PK', 'PKatt', 'Gls90', 'Ast90', 'G+A90', 'G-PK90', 'G+A-PK',
       'Min%', 'Subs', 'Mn/Sub', 'PPM', 'onG', 'onGA', '+/-', '# Pl.1', 'Min',
       'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS', 'CS%',
       'SoT', 'SoT/90', 'G/SoT', 'Fls', '# Pl_opp', 'Age_opp', 'MP_opp',
       'Starts_opp', 'Gls_opp', 'Ast_opp', 'G+A_opp', 'G-PK_opp', 'PK_opp',
       'PKatt_opp', 'Gls90_opp', 'Ast90_opp', 'G+A90_opp', 'G-PK90_opp',
       'G+A-PK_opp', 'season_opp', 'Min%_opp', 'Subs_opp', 'Mn/Sub_opp',
       'PPM_opp', 'onG_opp', 'onGA_opp', '+/-_opp', '# Pl_opp.1', 'Min_opp',
       'GA_opp', 'GA90_opp', 'SoTA_opp', 'Saves_opp', 'Save%_opp', 'W_opp',
       'D_opp', 'L_opp', 'CS_opp', 'CS%_opp', 'SoT_opp', 'SoT/90_opp',
       'G/SoT_opp', 'Fls_opp', 'opp_code', 'day_code'
]

In [67]:
get_acc(rf,train, test, predictors)


accuracy is:  0.7950819672131147
percision is:  0.7786063852286867


(0.7950819672131147, 0.7786063852286867)

- Random Forest With Class Weighting

- Random Forest With Bootstrap Class Weighting


In [68]:
rf_bal = RandomForestClassifier(n_estimators=200, min_samples_split=40, random_state=1, class_weight='balanced')


get_acc(rf_bal,train, test, predictors)


accuracy is:  0.8360655737704918
percision is:  0.8439694367338286


(0.8360655737704918, 0.8439694367338286)

In [69]:
rf_bal_sub = RandomForestClassifier(n_estimators=200, min_samples_split=40, random_state=1, class_weight='balanced_subsample')


accuracy = get_acc(rf_bal_sub,train, test, predictors)
accuracy

accuracy is:  0.8442622950819673
percision is:  0.8552885905011535


(0.8442622950819673, 0.8552885905011535)

### Using imbalanced learn! 
- Random Forest With Random Undersampling
- Easy Ensemble for Imbalanced Classification


In [64]:
%%capture --no-display output

from imblearn.ensemble import BalancedRandomForestClassifier


bal_rf = BalancedRandomForestClassifier(n_estimators=200, random_state=1)

get_acc(bal_rf,train, test, predictors)


(0.8688524590163934, array([0.94444444, 0.95294118, 0.66666667]))

In [70]:
from imblearn.ensemble import EasyEnsembleClassifier


bal_rf = EasyEnsembleClassifier(n_estimators=200, random_state=1)

accuracy = get_acc(bal_rf,train, test, predictors)
accuracy

accuracy is:  0.680327868852459
percision is:  0.8721311475409836


(0.680327868852459, 0.8721311475409836)

Useful stuff:

```df.groupby('home').apply(lambda x: x[x['result']==1].shape[0]/x.shape[0])```

 Using groupby and lambda function to see what percentage o the games are won, ost or draw based on the home value (0 or 1). To establish a baseline. 