### Import Statements

In [79]:
import pandas as pd
import numpy as np
import math , chardet

from sklearn.metrics import accuracy_score


In [80]:
import mlflow
import mlflow.sklearn

In [81]:
base_path = './test_data/'
league_path_one = '17_compiled.csv'
league_path_two = '35_compiled.csv'
league_path_three = '10295_compiled.csv'

league_id = league_path_one.split('_')[0]

league_id = '17_35'

path_one = f'{base_path}{league_path_one}'
path_two = f'{base_path}{league_path_two}'
path_three = f'{base_path}{league_path_three}'

Set the mlflow tracking uri to a folder inside the main Predictor directory

In [82]:
uri = '../test_mlruns'
experiment_name = f'{league_id}_notebook_experiment'

mlflow.set_tracking_uri(uri=uri)

try:
    experiment_id = mlflow.create_experiment(experiment_name)
except mlflow.exceptions.MlflowException:
    search_result = mlflow.search_experiments(filter_string=f"name = '{experiment_name}'")
    experiment_id = search_result[0].experiment_id

In [83]:
type(experiment_id)

str

### Read in the transformed data in experiment_1

In [84]:
def read_data(data_path : str):

    # Detect encoding   
    with open(data_path, 'rb') as f:
        result = chardet.detect(f.read())
    
    # Read file with detected encoding
    df = pd.read_csv(data_path, encoding=result['encoding'] , index_col=False)
    
    return df

In [85]:
# trying it out with multiple league data

df_one = read_data(path_one)
df_two = read_data(path_two)
df_three = read_data(path_three)

df = pd.concat([df_one])

In [86]:
df.shape

(7427, 46)

- Remove the index column

In [87]:
df = df.drop(columns=[df.columns[0]])

### Perform basic feature selection

In [88]:
df.describe()

Unnamed: 0,start_time,id,home_score,away_score,result,home_attack_strength,home_defence_strength,away_attack_strength,away_defence_strength,home_score_avg,...,home_win,away_win,home_double,away_double,under0.5,under1.5,under2.5,under3.5,under4.5,draw
count,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,...,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0,7427.0
mean,1374058000.0,4277013.0,1.518648,1.162784,1.791975,0.944246,0.941432,0.941411,0.943328,1.463109,...,0.456443,0.295274,0.704726,0.543557,0.078767,0.255015,0.493066,0.712266,0.867376,0.248283
std,183164600.0,3660945.0,1.303529,1.150428,0.813434,0.503392,0.499868,0.536468,0.474306,0.291466,...,0.498133,0.456197,0.456197,0.498133,0.269392,0.435899,0.499986,0.452737,0.339191,0.432046
min,1061032000.0,40046.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1218979000.0,781514.5,1.0,0.0,1.0,0.675676,0.647994,0.615741,0.675676,1.428571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,1377353000.0,3964819.0,1.0,1.0,2.0,0.895522,0.944206,0.898876,0.954545,1.5,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1535206000.0,7827906.0,2.0,2.0,2.0,1.191481,1.230637,1.245847,1.21289,1.582143,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1693150000.0,11352520.0,9.0,9.0,3.0,5.0,5.0,5.217391,5.0,2.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


- Remove games with too little history time , currently I defined it as having at least 1 game both home and away

In [89]:
# drop games with little game info , i.e either the home team or away team has to little games in the current season
min_games = 3

dropped_df = df[(df['num_home_games'] >= min_games) & (df['num_away_games'] >= min_games)]

kept_ration = dropped_df.shape[0] / df.shape[0]
games_dropped = df.shape[0] - dropped_df.shape[0]

print(f'Games with less than {min_games} games for a team : {games_dropped}')
print(f'Ratio of kept games : {kept_ration}')

Games with less than 3 games for a team : 1237
Ratio of kept games : 0.8334455365558099


- Remove unnecessary columns

In [90]:
dropped_df.columns

Index(['start_time', 'id', 'home_team', 'away_team', 'home_score',
       'away_score', 'result', 'home_attack_strength', 'home_defence_strength',
       'away_attack_strength', 'away_defence_strength', 'home_score_avg',
       'away_score_avg', 'home_expected_goal', 'away_expected_goal',
       'home_elo', 'away_elo', '1', '2', 'x', '12', '1x', '2x', 'ov0.5',
       'un0.5', 'ov1.5', 'un1.5', 'ov2.5', 'un2.5', 'ov3.5', 'un3.5', 'ov4.5',
       'un4.5', 'num_home_games', 'num_away_games', 'home_win', 'away_win',
       'home_double', 'away_double', 'under0.5', 'under1.5', 'under2.5',
       'under3.5', 'under4.5', 'draw'],
      dtype='object')

In [91]:
# drop the id and start time columns
target_col = 'draw'
keep_columns = ['home_attack_strength', 'home_defence_strength', 'away_attack_strength',
       'away_defence_strength', 'home_expected_goal', 'away_expected_goal']
remove_columns = [column for column in dropped_df.columns if column not in keep_columns and column != target_col ]


dropped_df = dropped_df.drop(columns=remove_columns)

In [92]:
dropped_df.describe()

Unnamed: 0,home_attack_strength,home_defence_strength,away_attack_strength,away_defence_strength,home_expected_goal,away_expected_goal,draw
count,6190.0,6190.0,6190.0,6190.0,6190.0,6190.0,6190.0
mean,0.999824,0.999962,0.999927,0.999896,1.500032,1.164854,0.246365
std,0.396534,0.387611,0.437594,0.353672,0.804114,0.690907,0.430928
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.736358,0.727273,0.683003,0.754717,0.951064,0.672161,0.0
50%,0.928794,0.981595,0.942029,0.989899,1.364861,1.032856,0.0
75%,1.196346,1.235744,1.252367,1.224674,1.90236,1.523744,0.0
max,3.214286,3.20122,3.216374,2.727273,7.112947,5.751678,1.0


- Check for null values

In [93]:
dropped_df.isna().sum().to_dict()

{'home_attack_strength': 0,
 'home_defence_strength': 0,
 'away_attack_strength': 0,
 'away_defence_strength': 0,
 'home_expected_goal': 0,
 'away_expected_goal': 0,
 'draw': 0}

### 1.  Building models

- Splitting the data

In [94]:
predictor_cols = dropped_df.columns.tolist()
predictor_cols.remove(target_col)

# predictors
X = dropped_df.drop(columns=target_col)

# target
y = dropped_df.drop(columns=predictor_cols)

In [95]:
X.columns

Index(['home_attack_strength', 'home_defence_strength', 'away_attack_strength',
       'away_defence_strength', 'home_expected_goal', 'away_expected_goal'],
      dtype='object')

In [96]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.1 , random_state=100)

print(X_train.shape)
print(X_test.shape)


(5571, 6)
(619, 6)


- Creating  , training , evaluating and logging models and their parameters

1. Random Forest Classifier

In [97]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier 

run_name = f'{league_id}_{target_col}_randomForest'


with mlflow.start_run(run_name=run_name , experiment_id=experiment_id) as run:

    # create and log the params
    model_params = {'n_estimators' : 200 , 'random_state' : 42}
    mlflow.log_params(model_params)
    
    # initialize and train the model
    random_forest_cls = RandomForestClassifier(**model_params)
    random_forest_cls.fit(X_train , y_train)

    # prediction using random_forest
    random_forest_predictions = random_forest_cls.predict(X_test)
    random_forest_performance = accuracy_score(y_test , random_forest_predictions)

    # log the performance
    mlflow.log_metrics({'accuracy_score' : random_forest_performance})

    # log the sklearn model
    mlflow.sklearn.log_model(
        sk_model=random_forest_cls, 
        artifact_path="models/random_forest",
        input_example=X_train,
        registered_model_name=f"random_forest_{league_id}_{target_col}"
        )


  return fit_method(estimator, *args, **kwargs)


Successfully registered model 'random_forest_17_35_draw'.
Created version '1' of model 'random_forest_17_35_draw'.


2. SVM Classifier

In [98]:
# SVM classifier
from sklearn.svm import SVC

run_name = f'{league_id}_{target_col}_SVM'

with mlflow.start_run(run_name=run_name , experiment_id=experiment_id  ) as run:

    # create and log model params
    model_params = {'kernel' : 'rbf' , 'random_state' : 1}
    mlflow.log_params(model_params)

    # initialize and train the model
    svm_cls = SVC(**model_params)
    svm_cls.fit(X_train , y_train)
    
    # predictions using svm
    svm_predictions = svm_cls.predict(X_test)
    svm_performance = accuracy_score(y_test , svm_predictions)

    # log performance
    mlflow.log_metrics({'accuracy_score' : svm_performance})

    # log the model
    mlflow.sklearn.log_model(
        sk_model=svm_cls,
        artifact_path='models/svm',
        input_example=X_train,
        registered_model_name=f'svm_{league_id}_{target_col}'
    )

  y = column_or_1d(y, warn=True)
Successfully registered model 'svm_17_35_draw'.
Created version '1' of model 'svm_17_35_draw'.


3. Multi Layer Perceptron Classifier

In [111]:
# Multi Layer Classifier
from sklearn.neural_network import MLPClassifier

run_name = f'{league_id}_{target_col}_MLP'

with mlflow.start_run(run_name=run_name , experiment_id=experiment_id) as run:
    
    # create and log model parameters
    model_params = {'hidden_layer_sizes' : (40,),
                    'max_iter' : 100,
                    'alpha' : 1e-4,
                    'solver' : 'adam',
                    'random_state' : 1,
                    'learning_rate_init' : 0.2,
                    'learning_rate' : 'adaptive',
                    'early_stopping' : True}
    
    mlflow.log_params(model_params)

    # initialize and train the model
    mlp_cls = MLPClassifier(**model_params)
    mlp_cls.fit(X_train , y_train)

    # predictions using MLP 
    mlp_predictions = mlp_cls.predict(X_test)
    mlp_performance = accuracy_score(y_test , mlp_predictions)
    
    # log performance
    mlflow.log_metrics({'accuracy_score' : mlp_performance})

    # log the model
    mlflow.sklearn.log_model(
        sk_model=mlp_cls,
        artifact_path='models/mlp',
        input_example=X_train,
        registered_model_name=f'mlp_{league_id}_{target_col}'
    )



  y = column_or_1d(y, warn=True)
Registered model 'mlp_17_35_draw' already exists. Creating a new version of this model...
Created version '3' of model 'mlp_17_35_draw'.


4. Gradient Boosting Classifier

In [100]:
# Gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier

run_name = f'{league_id}_{target_col}_GDBoost'

with mlflow.start_run(run_name=run_name , experiment_id=experiment_id):
    
    # create and log model parameters
    model_params = {
        'n_estimators' : 350,
        'learning_rate' : 1,
        'max_depth' : 1,
        'random_state' : 1
    }

    mlflow.log_params(model_params)

    # initialize and train the model
    gb_cls = GradientBoostingClassifier(**model_params)
    gb_cls.fit(X_train , y_train)

    # predictions using Gradient Boosting Classifier
    gb_predictions = gb_cls.predict(X_test)
    gb_perfomance = accuracy_score(y_test , gb_predictions)

    # log performance
    mlflow.log_metrics({'accuracy_score' : gb_perfomance})

    # log the model
    mlflow.sklearn.log_model(
        sk_model=gb_cls,
        artifact_path='models/gradient_boosting',
        input_example=X_train,
        registered_model_name=f'graddient_boosting_{league_id}_{target_col}'
    )

  y = column_or_1d(y, warn=True)
Successfully registered model 'graddient_boosting_17_35_draw'.
Created version '1' of model 'graddient_boosting_17_35_draw'.


5. Passive Aggressive Classifier

In [101]:
# Passive Aggressive Classififer
from sklearn.linear_model import PassiveAggressiveClassifier

run_name = f'{league_id}_{target_col}_passiveAggressive'

with mlflow.start_run(run_name=run_name , experiment_id=experiment_id):

    # create and log model parameters
    model_params = {
        'max_iter' : 100,
        'random_state' : 42
    }

    mlflow.log_metrics(model_params)

    # initialize and train the model
    passive_aggressive_cls = PassiveAggressiveClassifier(**model_params)
    passive_aggressive_cls.fit(X_train , y_train)

    # predictions using Passive Aggressive Classifier
    pass_aggr_predictions = passive_aggressive_cls.predict(X_test)
    pass_aggr_performance = accuracy_score(y_test , pass_aggr_predictions)

    #log performance
    mlflow.log_metrics({'accuracy_score' : pass_aggr_performance})

    # log the model
    mlflow.sklearn.log_model(
        sk_model=passive_aggressive_cls,
        artifact_path='models/passive_aggressive',
        input_example=X_train,
        registered_model_name=f'passive_aggressive_{league_id}_{target_col}'
    )

  y = column_or_1d(y, warn=True)
Successfully registered model 'passive_aggressive_17_35_draw'.
Created version '1' of model 'passive_aggressive_17_35_draw'.


In [102]:
print(f'Random forrest : {random_forest_performance}')
print(f'SVM : {svm_performance}')
print(f'MLP : {mlp_performance}')
print(f'Gradient Boosting : {gb_perfomance}')
print(f'Passive Aggressive : {pass_aggr_performance}')

Random forrest : 0.7495961227786753
SVM : 0.7689822294022617
MLP : 0.7689822294022617
Gradient Boosting : 0.7673667205169629
Passive Aggressive : 0.2310177705977383


- Trying out different ensemble methods

In [103]:
classifiers = [('Random Forrest' , random_forest_cls), 
               ('SVM' , svm_cls),
               ('MLP' , mlp_cls),
               ('Gradient Boosting' , gb_cls),
               ('Passive Aggressive' , passive_aggressive_cls)]

6. Votting Ensemble

In [112]:
# Votting ensemble
from sklearn.ensemble import VotingClassifier

run_name = f'{league_id}_{target_col}_vottingEnsemble'

with mlflow.start_run(run_name=run_name  , experiment_id=experiment_id):

    # create and log model_parameters
    model_params = {
        'estimators' : classifiers
    }

    mlflow.log_params(model_params)

    # initialize and train the model
    votting_ensemble = VotingClassifier(**model_params , verbose=0)
    votting_ensemble.fit(X_train , y_train)

    # votting ensemble performance
    votting_predictions = votting_ensemble.predict(X_test)
    votting_performance = accuracy_score(y_test , votting_predictions)

    # log performance
    mlflow.log_metrics({'accuracy_score' : votting_performance})

    # log the model
    mlflow.sklearn.log_model(
        sk_model=votting_ensemble,
        artifact_path='models/votting_ensemble',
        input_example=X_train,
        registered_model_name=f'votting_ensemble_{league_id}_{target_col}'
    )

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Iteration 1, loss = 0.73221003
Validation score: 0.752688
Iteration 2, loss = 0.56155825
Validation score: 0.752688
Iteration 3, loss = 0.56094984
Validation score: 0.752688
Iteration 4, loss = 0.56058628
Validation score: 0.752688
Iteration 5, loss = 0.56109398
Validation score: 0.752688
Iteration 6, loss = 0.56066988
Validation score: 0.752688
Iteration 7, loss = 0.56100290
Validation score: 0.752688
Iteration 8, loss = 0.56038605
Validation score: 0.752688
Iteration 9, loss = 0.56099385
Validation score: 0.752688
Iteration 10, loss = 0.56129364
Validation score: 0.752688
Iteration 11, loss = 0.56380939
Validation score: 0.752688
Iteration 12, loss = 0.56724725
Validation score: 0.752688
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


Registered model 'votting_ensemble_17_35_draw' already exists. Creating a new version of this model...
Created version '3' of model 'votting_ensemble_17_35_draw'.


7. Stacking Ensemble

In [105]:
# Stacking ensemble
from sklearn.ensemble import StackingClassifier

run_name = f'{league_id}_{target_col}_stackingEnsemble'

with mlflow.start_run(run_name=run_name , experiment_id=experiment_id):

    # create and log model_parameters
    model_params = {
        'estimators' : classifiers,
        'final_estimator' : svm_cls
    }

    mlflow.log_params(model_params)

    # initialize and train the model
    stacking_ensemble = StackingClassifier(estimators=classifiers , final_estimator=svm_cls)
    stacking_ensemble.fit(X_train , y_train)

    # Stacking ensemble performace
    stacking_predictions = stacking_ensemble.predict(X_test)
    stacking_performance = accuracy_score(y_test , stacking_predictions)

    # log performance
    mlflow.log_metrics({'accuracy_score' : stacking_performance})

    # log model
    mlflow.sklearn.log_model(
        sk_model = stacking_ensemble,
        artifact_path='models/stacking_ensemble',
        input_example=X_train,
        registered_model_name=f'stacking_ensemble_{league_id}_{target_col}'
    )
    

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Iteration 1, loss = 0.73221003
Validation score: 0.752688
Iteration 2, loss = 0.56155825
Validation score: 0.752688
Iteration 3, loss = 0.56094984
Validation score: 0.752688
Iteration 4, loss = 0.56058628
Validation score: 0.752688
Iteration 5, loss = 0.56109398
Validation score: 0.752688
Iteration 6, loss = 0.56066988
Validation score: 0.752688
Iteration 7, loss = 0.56100290
Validation score: 0.752688
Iteration 8, loss = 0.56038605
Validation score: 0.752688
Iteration 9, loss = 0.56099385
Validation score: 0.752688
Iteration 10, loss = 0.56129364
Validation score: 0.752688
Iteration 11, loss = 0.56380939
Validation score: 0.752688
Iteration 12, loss = 0.56724725
Validation score: 0.752688
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.85689382
Validation score: 0.751121
Iteration 2, loss = 0.56099517
Validation score: 0.751121
Iteration 3, loss = 0.56005599
Validation score: 0.751121
Iteration 4, loss = 0.56024116
Val

Successfully registered model 'stacking_ensemble_17_35_draw'.
Created version '1' of model 'stacking_ensemble_17_35_draw'.


In [106]:
print(f'Votting Ensemble : {votting_performance}')
print(f'Stacking Ensemble : {stacking_performance}')

Votting Ensemble : 0.7689822294022617
Stacking Ensemble : 0.7689822294022617
