# Modeling

***
### Importing Dependencies

In [1]:
# Importing Dependencies
from sklearn.preprocessing import RobustScaler, MinMaxScaler, MaxAbsScaler, StandardScaler, OrdinalEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from IPython.core.display import HTML
from sklearn import set_config
from sklearn.svm import SVC

import pandas as pd
import numpy as np
import warnings
import logging
import pickle
import skopt
import yaml
import os

In [2]:
logging.basicConfig(filename = '/Users/kzeynalzade/Documents/Project/Logs/engine.log', filemode = 'w', format = '%(asctime)s - %(levelname)s - %(message)s', level = logging.INFO)
display(HTML('<style>.container{width:100% !important;}<style>'))
warnings.filterwarnings(action = 'ignore')
pd.options.display.max_columns = None
set_config(display = 'diagram')
np.random.seed(seed = 42)

Let's define a function to load a yaml file.

In [3]:
# Defining a function to load a yaml file
def load_yaml(filepath = None):
    """
    This is a function that will load a yaml file.
    
    Args:
        filepath: A path to a yaml file.
        
    Returns:
        A dictionary object.
    """
    # Loading the yaml file
    with open(file = filepath) as yaml_file:
        config = yaml.safe_load(stream = yaml_file)
    
    # Returning the yaml file
    return config

Let's load the yaml file and create global variables for batch model features and split date.

In [4]:
# Loading model properties from a yaml file
CONFIG = load_yaml(filepath = '/Users/kzeynalzade/Documents/Project/Configuration/config.yml')

# Creating a path to datasets
TARGET_PATH = '/Users/kzeynalzade/Documents/Project/Data/Converted data'

# Loading features for a batch model from model properties
FEATURES = CONFIG.get('features').get('batch_model_features')

# Loading the dependent variable for Loss modeling
LOSS_TARGET = CONFIG.get('target').get('loss_model')

# Loading the dependent variable for Draw modeling
DRAW_TARGET = CONFIG.get('target').get('draw_model')

# Loading the dependent variable for Win modeling
WIN_TARGET = CONFIG.get('target').get('win_model')

Let's define a function to load the dataset.

In [5]:
# Defining a function to load the dataset
def load_data(target_path = None, non_na_ratio = 0.6):
    """
    This is a function that is used to load datasets from a specified path and create a cohesive dataset.
    
    Args:
        target_path: A path for the datasets.
        non_na_ratio: A non na ratio for variables.
        
    Returns:
        A pandas data frame.
    """
    # Creating a list of datasets based on the specified path
    datasets = [f'{target_path}/{file}' for file in os.listdir(path = target_path) if file.endswith('.brotli')]
    
    # Loading the datasets using list comprehension
    data_frames = [pd.read_parquet(path = dataset, engine = 'fastparquet') for dataset in datasets]
    
    # Concatenating data frames to create a cohesive data frame
    data_frame = pd.concat(objs = data_frames, ignore_index = True)
    
    # Casting the data type of the match date variable from object to datetime
    data_frame.match_date = pd.to_datetime(arg = data_frame.match_date, yearfirst = True)
    
    # Sorting the observations based on match_date variable in ascending order
    data_frame.sort_values(by = 'match_date', inplace = True)
    
    # Reseting the index to maintain index order
    data_frame.reset_index(drop = True, inplace = True)
    
    # Removing potential duplicate observations
    data_frame.drop_duplicates(inplace = True, ignore_index = True)
    
    # Dropping features that exceed non na ratio
    data_frame.dropna(axis = 1, thresh = int(data_frame.shape[0] * non_na_ratio), inplace = True)
    
    # Returning the data frame
    return data_frame

Let's call the function to load the dataset.

In [6]:
# Calling the function to load the dataset
df = load_data(target_path = TARGET_PATH)

# Logging information to the log file
logging.info(msg = 'Data has been loaded')

# Displaying the first five records of the dataset
df.head()

Unnamed: 0,season,match_week,match_date,month,day,weekday,referee,home_team,away_team,stadium,attendance,h_position,a_position,goals_h,goals_a,possession_h,possession_a,shots_on_target_h,shots_on_target_a,shots_h,shots_a,touches_h,touches_a,passes_h,passes_a,tackles_h,tackles_a,clearances_h,clearances_a,corners_h,corners_a,offsides_h,offsides_a,yellow_cards_h,yellow_cards_a,red_cards_h,red_cards_a,fouls_conceded_h,fouls_conceded_a,formation_h,formation_a,result_h,result_a,points_h,points_a,total_n_matches_played_h,total_n_matches_played_a,total_max_points_h,total_max_points_a,total_points_h_cum,total_points_a_cum,total_points_dropped_h,total_points_dropped_a,total_goals_scored_h,total_goals_scored_a,total_goals_conceded_h,total_goals_conceded_a,total_avg_possession_h,total_avg_possession_a,total_avg_possession_last_3_h,total_avg_possession_last_3_a,total_avg_possession_last_5_h,total_avg_possession_last_5_a,total_avg_shots_on_target_h,total_avg_shots_on_target_a,total_avg_shots_on_target_last_3_h,total_avg_shots_on_target_last_3_a,total_avg_shots_on_target_last_5_h,total_avg_shots_on_target_last_5_a,total_avg_shots_h,total_avg_shots_a,total_avg_shots_last_3_h,total_avg_shots_last_3_a,total_avg_shots_last_5_h,total_avg_shots_last_5_a,total_avg_touches_h,total_avg_touches_a,total_avg_touches_last_3_h,total_avg_touches_last_3_a,total_avg_touches_last_5_h,total_avg_touches_last_5_a,total_avg_passes_h,total_avg_passes_a,total_avg_passes_last_3_h,total_avg_passes_last_3_a,total_avg_passes_last_5_h,total_avg_passes_last_5_a,total_avg_tackles_h,total_avg_tackles_a,total_avg_tackles_last_3_h,total_avg_tackles_last_3_a,total_avg_tackles_last_5_h,total_avg_tackles_last_5_a,total_avg_clearances_h,total_avg_clearances_a,total_avg_clearances_last_3_h,total_avg_clearances_last_3_a,total_avg_clearances_last_5_h,total_avg_clearances_last_5_a,total_avg_corners_h,total_avg_corners_a,total_avg_corners_last_3_h,total_avg_corners_last_3_a,total_avg_corners_last_5_h,total_avg_corners_last_5_a,total_avg_offsides_h,total_avg_offsides_a,total_avg_offsides_last_3_h,total_avg_offsides_last_3_a,total_avg_offsides_last_5_h,total_avg_offsides_last_5_a,total_avg_yellow_cards_h,total_avg_yellow_cards_a,total_avg_yellow_cards_last_3_h,total_avg_yellow_cards_last_3_a,total_avg_yellow_cards_last_5_h,total_avg_yellow_cards_last_5_a,total_avg_fouls_conceded_h,total_avg_fouls_conceded_a,total_avg_fouls_conceded_last_3_h,total_avg_fouls_conceded_last_3_a,total_avg_fouls_conceded_last_5_h,total_avg_fouls_conceded_last_5_a,n_matches_played_h,n_matches_played_a,max_points_h,max_points_a,points_h_cum,points_a_cum,points_dropped_h,points_dropped_a,goals_scored_h_cum,goals_scored_a_cum,goals_conceded_h_cum,goals_conceded_a_cum,avg_possession_h,avg_possession_a,avg_possession_last_3_h,avg_possession_last_3_a,avg_possession_last_5_h,avg_possession_last_5_a,avg_shots_on_target_h,avg_shots_on_target_a,avg_shots_on_target_last_3_h,avg_shots_on_target_last_3_a,avg_shots_on_target_last_5_h,avg_shots_on_target_last_5_a,avg_shots_h,avg_shots_a,avg_shots_last_3_h,avg_shots_last_3_a,avg_shots_last_5_h,avg_shots_last_5_a,avg_touches_h,avg_touches_a,avg_touches_last_3_h,avg_touches_last_3_a,avg_touches_last_5_h,avg_touches_last_5_a,avg_passes_h,avg_passes_a,avg_passes_last_3_h,avg_passes_last_3_a,avg_passes_last_5_h,avg_passes_last_5_a,avg_tackles_h,avg_tackles_a,avg_tackles_last_3_h,avg_tackles_last_3_a,avg_tackles_last_5_h,avg_tackles_last_5_a,avg_clearances_h,avg_clearances_a,avg_clearances_last_3_h,avg_clearances_last_3_a,avg_clearances_last_5_h,avg_clearances_last_5_a,avg_corners_h,avg_corners_a,avg_corners_last_3_h,avg_corners_last_3_a,avg_corners_last_5_h,avg_corners_last_5_a,avg_offsides_h,avg_offsides_a,avg_offsides_last_3_h,avg_offsides_last_3_a,avg_offsides_last_5_h,avg_offsides_last_5_a,avg_yellow_cards_h,avg_yellow_cards_a,avg_yellow_cards_last_3_h,avg_yellow_cards_last_3_a,avg_yellow_cards_last_5_h,avg_yellow_cards_last_5_a,avg_fouls_conceded_h,avg_fouls_conceded_a,avg_fouls_conceded_last_3_h,avg_fouls_conceded_last_3_a,avg_fouls_conceded_last_5_h,avg_fouls_conceded_last_5_a,is_boxing_day,finished_top_4_last_season_h,finished_top_4_last_season_a,won_carabao_cup_last_season_h,won_carabao_cup_last_season_a,won_fa_cup_last_season_h,won_fa_cup_last_season_a,won_epl_last_season_h,won_epl_last_season_a,was_in_ucl_last_season_h,was_in_ucl_last_season_a,was_in_uel_last_season_h,was_in_uel_last_season_a,is_in_ucl_this_season_h,is_in_ucl_this_season_a,is_in_uel_this_season_h,is_in_uel_this_season_a,traditional_top_6_h,traditional_top_6_a,newly_promoted_h,newly_promoted_a,total_goal_difference_h,total_goal_difference_a,goal_difference_h,goal_difference_a,positive_total_goal_difference_h,positive_total_goal_difference_a,positive_goal_difference_h,positive_goal_difference_a,is_derby,derby_name,ground_truth,home_win,draw,away_win,link
0,2016/17,1,2016-08-13,8,13,6,Kevin Friend,Middlesbrough,Stoke,Riverside Stadium,32110.0,12,15,1,1,45.6,54.4,2,1,12,12,538,611,349,420,17,16,20,16,9,6,1,2,3,5,0,0,18,14,4-2-3-1,4-2-3-1,draw,draw,1,1,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,No Derby,0,0,1,0,https://www.premierleague.com/match/14048
1,2016/17,1,2016-08-13,8,13,6,Mike Dean,Hull,Leicester,KCOM Stadium,21037.0,7,8,2,1,49.7,50.3,5,5,14,18,645,619,449,453,20,14,29,15,5,3,1,0,2,2,0,0,8,17,4-3-3,4-4-2,win,defeat,3,0,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,No Derby,1,1,0,0,https://www.premierleague.com/match/14046
2,2016/17,1,2016-08-13,8,13,6,Jonathan Moss,Burnley,Swansea,Turf Moor,19126.0,3,17,0,1,47.5,52.5,3,9,10,17,558,613,353,383,14,19,16,49,7,4,3,2,3,2,0,0,10,14,4-4-2,4-3-3,defeat,win,0,3,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,No Derby,-1,0,0,1,https://www.premierleague.com/match/14042
3,2016/17,1,2016-08-13,8,13,6,Craig Pawson,Crystal Palace,West Brom,Selhurst Park,24490.0,5,19,0,1,62.3,37.7,4,3,14,13,620,476,414,245,28,18,21,23,3,6,0,2,2,2,0,0,12,15,4-2-3-1,4-4-2,defeat,win,0,3,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,No Derby,-1,0,0,1,https://www.premierleague.com/match/14044
4,2016/17,1,2016-08-13,8,13,6,Robert Madley,Man City,Sunderland,Etihad Stadium,54362.0,10,16,2,1,76.9,23.1,4,3,16,7,864,395,681,189,5,17,26,35,9,6,1,2,1,2,0,0,11,14,4-2-3-1,4-2-3-1,win,defeat,3,0,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,50.0,50.0,50.0,50.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,No Derby,1,1,0,0,https://www.premierleague.com/match/14047


Let's define a custom preprocessor that reallocates features.

In [7]:
# Defining a custom transformer to reallocate variables
class FeatureReallocator(BaseEstimator, TransformerMixin):
    # Defining a function for fitting data to custom transformer
    def fit(self, X, y = None):
        # Creating a list of binary features
        binary_features = [feature for feature in FEATURES if X[feature].nunique() == 2]

        # Creating a list of date features
        date_features = ['match_week', 'month', 'day', 'weekday']

        # Creating a list of ordinal features
        ordinal_features = ['h_position', 'a_position']

        # Creating a list of numeric features
        left_out_features = [feature for feature in X.columns.tolist() if feature not in binary_features + date_features + ordinal_features]

        # Creating a list of reallocated features
        reallocated_features = date_features + ordinal_features + left_out_features + binary_features
        
        # Defining the list for feature reallocation
        self.reallocated_features = reallocated_features
        
        # Defining the list for ordinal features
        self.ordinal_features = ordinal_features
        
        # Returning the fitted and transformed data
        return self
    
    # Defining a function for transforming data with custom transformer
    def transform(self, X, y = None):
        # Reallocating variables
        X = X[[feature for feature in self.reallocated_features if feature in FEATURES]]
        
        # Converting the values of ordinal variables into string
        X[self.ordinal_features] = X[self.ordinal_features].applymap(func = lambda x: str(x))
        
        # Returning the transformed data
        return X

Let's define a customer preprocessor that drops features with high **VIF** values.

In [8]:
# Defining a custom transformer to remove multicollinearity
class VifDropper(BaseEstimator, TransformerMixin):
    # Initializing the default threshold for variance inflation factor (VIF)
    def __init__(self, threshold = 2.5):
        # Default VIF threshold
        self.threshold = threshold

    # Defining a function for fitting data to custom transformer
    def fit(self, X, y = None):
        # Creating a copy of a Numpy array as Pandas dataframe
        data = pd.DataFrame(data = X).copy()
        
        # Creating a Pandas dataframe
        vif_df = pd.DataFrame()
        
        # Assigning the names of columns to a feature variable
        vif_df['feature'] = data.columns
        
        # Calculating VIF values
        vif_df['VIF'] = [VIF(exog = data.values, exog_idx = i) for i in range(len(data.columns))]
        
        # Creating an empty list
        features_with_max_vif = []
        
        # Calculating VIF values of variables based on default threshold
        while vif_df.VIF.max() > self.threshold:
            feature_with_max_vif = vif_df.loc[vif_df.VIF == vif_df.VIF.max()].feature.values[0]
            data.drop(columns = feature_with_max_vif, inplace = True)
            features_with_max_vif.append(feature_with_max_vif)
 
            vif_df = pd.DataFrame()
            vif_df['feature'] = data.columns
            vif_df['VIF'] = [VIF(exog = data.values, exog_idx = i) for i in range(len(data.columns))]
        
        # Defining the list of variables with maximum VIF values
        self.features_with_max_vif = features_with_max_vif
        
        # Returning the fitted and transformed data
        return self 
    
    # Defining a function for transforming data with custom transformer
    def transform(self, X, y = None):
        # Returning the transformed data
        return pd.DataFrame(data = X).drop(columns = self.features_with_max_vif).values

Let's create train features and labels for each model.

In [14]:
# Creating a training set features for Loss modeling
X_train_loss = df.loc[df.season != '2022/23'].drop(columns = LOSS_TARGET).reset_index(drop = True)

# Creating a training set features for Draw modeling
X_train_draw = df.loc[df.season != '2022/23'].drop(columns = DRAW_TARGET).reset_index(drop = True)

# Creating a training set features for Win modeling
X_train_win = df.loc[df.season != '2022/23'].drop(columns = WIN_TARGET).reset_index(drop = True)

# Creating a training set labels for Loss modeling
y_train_loss = df.loc[df.season != '2022/23', LOSS_TARGET].reset_index(drop = True)

# Creating a training set labels for Draw modeling
y_train_draw = df.loc[df.season != '2022/23', DRAW_TARGET].reset_index(drop = True)

# Creating a training set labels for Win modeling
y_train_win = df.loc[df.season != '2022/23', WIN_TARGET].reset_index(drop = True)

***
### Modeling

Let's define a function to build a classifier pipeline.

In [18]:
# Defining a function to build a classifier pipeline
def build_pipeline(classifier = None, 
                   metric = 'balanced_accuracy', 
                   data_frame = None, 
                   train_features = None, 
                   train_labels = None, 
                   apply_bayesian_optimization = True, 
                   hyperparameters = None, 
                   n_iterations = 50, 
                   scale = True, 
                   scaler_type = None,
                   drop_high_vif_features = True, 
                   apply_feature_selection = True, 
                   feature_selection_method = None, 
                   verbosity = 0):
    """
    This function is used to build a classifier pipeline.
    
    Args:
        classifier: A classifier instance.
        metric: A classification metric based on which to optimize a model.
        data_frame: A pandas data frame
        train_features: Train features.
        train_labels: Train labels.
        apply_bayesian_optimization: Whether or not to apply Bayesian Optimization to find the best hyper parameters.
        hyperparameters: A dictionary of hyperparameters.
        n_iterations: The number of repetitions for a hyperparameter tuning technique.
        scale: Whether or not to apply feature scaling.
        scaler_type: A type of a feature scaler instance as a string.
        drop_high_vif_features: Whether or not to drop features with high variance inflation factor (VIF) value.
        apply_feature_selection: Whether or not to apply feature selection.
        feature_selection_method: A type of a feature selection technique.
        verbosity: A level of verbosity to display an output of Bayesian Optimization.
        
    Returns:
        Builds a classifier pipeline.
    """
    # Creating a list of positions as string values in a descending order
    positions = [str(x) for x in sorted([int(x) for x in data_frame.h_position.unique().tolist()])[::-1]]
    
    # Creating a list of ordinal features
    ordinal_features = ['h_position', 'a_position']
    
    # Creating a list of date features
    date_features = ['match_week', 'month', 'day', 'weekday']
    
    # Creating a list of binary features
    binary_features = [feature for feature in FEATURES if data_frame[feature].nunique() == 2]
    
    # Creating a list of numeric features
    numeric_features = [feature for feature in FEATURES if feature not in ordinal_features + date_features + binary_features]
    
    # Asserting the number of features to be equal to 192
    assert len(ordinal_features) + len(date_features) + len(binary_features) + len(numeric_features) == len(FEATURES)
    
    # Instantiating a cross validation technique
    skf = StratifiedKFold()
    
    # Pipeline for binary features
    binary_pipeline = Pipeline(steps = [('mode_imputer', SimpleImputer(strategy = 'most_frequent'))])
    
    # Pipeline for ordinal features
    ordinal_pipeline = Pipeline(steps = [('mode_imputer', SimpleImputer(strategy = 'most_frequent')), 
                                         ('ore', OrdinalEncoder(categories = [positions, positions], handle_unknown = 'use_encoded_value', unknown_value = -1))])
    
    # A condition to apply feature scaling
    if scale:
        # Creating a dictionary of feature scaler and transformer instances
        scalers_dict = {'robust':RobustScaler(), 'minmax':MinMaxScaler(), 'maxabs':MaxAbsScaler(), 'standard':StandardScaler()}
        
        # A condition to drop features with high variance inflation factor (VIF) valdues
        if drop_high_vif_features:
            # Pipeline for numeric features with variance inflation factor (VIF) and feature scaling included 
            numeric_pipeline = Pipeline(steps = [('median_imputer', SimpleImputer(strategy = 'median')), 
                                                 ('vif_dropper', VifDropper()), 
                                                 ('feature_scaler', scalers_dict.get(scaler_type))])
        else:
            # Pipeline for numeric features with only feature scaling included 
            numeric_pipeline = Pipeline(steps = [('median_imputer', SimpleImputer(strategy = 'median')), 
                                                 ('feature_scaler', scalers_dict.get(scaler_type))])
    else:
        # Pipeline for numeric features without feature scaling 
        numeric_pipeline = Pipeline(steps = [('median_imputer', SimpleImputer(strategy = 'median'))])
        
    # Feature transformer with combined pipelines
    feature_transformer = ColumnTransformer(transformers = [('binary_pipeline', binary_pipeline, binary_features),
                                                            ('ordinal_pipeline', ordinal_pipeline, ordinal_features),
                                                            ('numeric_pipeline', numeric_pipeline, numeric_features)], remainder = 'passthrough', n_jobs = -1)
    
    # Creating a condition to apply feature selection
    if apply_feature_selection:
        if feature_selection_method == 'wrapper':
            # Instantiating a wrapper feature selection instance
            feature_selector = SequentialFeatureSelector(estimator = classifier, scoring = metric, cv = skf, n_jobs = -1)
        elif feature_selection_method == 'boruta':
            # Instantiating a tree based feature selection instance
            feature_selector = BorutaPy(estimator = classifier, random_state = 42, verbose = 0)
        elif feature_selection_method == 'meta':
            # Instantiating a meta transformer feature selection instance
            feature_selector = SelectFromModel(estimator = classifier)
        elif feature_selection_method == 'statistical':
            # Instantiating a meta transformer feature selection instance
            feature_selector = SelectFpr()
        elif feature_selection_method == 'mutual_info':
            # Instantiating a meta transformer feature selection instance
            feature_selector = SelectPercentile()
        elif feature_selection_method == 'hybrid':
            # Instantiating a meta transformer feature selection instance
            feature_selector = RFECV(estimator = classifier, cv = skf, scoring = metric, n_jobs = -1)
        
        # Final classifier pipeline with feature selection
        pipe = Pipeline(steps = [('feature_reallocator', FeatureReallocator()), 
                                 ('feature_transformer', feature_transformer), 
                                 ('feature_selector', feature_selector), 
                                 ('classifier', classifier)])
    else:
        # Final classifier pipeline without feature selection
        pipe = Pipeline(steps = [('feature_reallocator', FeatureReallocator()), 
                                 ('feature_transformer', feature_transformer), 
                                 ('classifier', classifier)])
    
    # A condition to apply hyperparameter tuning with Bayesian Optimization
    if apply_bayesian_optimization:
        # Applying Bayesian Optimization to identify the best hyperparameters
        bayes_search = skopt.BayesSearchCV(estimator = pipe, 
                                           search_spaces = hyperparameters, 
                                           n_iter = n_iterations, 
                                           scoring = metric, 
                                           n_jobs = -1, 
                                           cv = skf, 
                                           verbose = verbosity, 
                                           random_state = 42)
        
        # Fitting the training features and labels
        bayes_search.fit(X = train_features, y = train_labels)
        
        # Extracting the pipeline with the best hyperparameters
        best_pipe = bayes_search.best_estimator_
        
        # Returning the classifier pipeline with the best hyperparameters
        return best_pipe
    else:
        # Fitting train features and labels to the pipeline
        pipe.fit(X = train_features, y = train_labels)
        
        # Returning the classifier pipeline with default hyperparameters
        return pipe

Let's build a classifier pipeline with the best hyperparameters for the **Win** model.

In [11]:
# Setting an operating level seed
np.random.seed(seed = 42)

# Creating a dictionary of hyperparameters
search_spaces = {}
search_spaces['feature_transformer__numeric_pipeline__feature_scaler'] = skopt.space.Categorical(categories = [RobustScaler(), MinMaxScaler(), MaxAbsScaler(), StandardScaler()])
search_spaces['feature_selector__threshold'] = skopt.space.Categorical(categories = ['mean', 'median'])
search_spaces['classifier__C'] = skopt.space.Real(low = 1e-6, high = 100, prior = 'log-uniform')
search_spaces['classifier__tol'] = skopt.space.Real(low = 1e-6, high = 100, prior = 'log-uniform')

# Building a classifier pipeline with the best hyperparameters
win_model = build_pipeline(classifier = SVC(kernel = 'linear', probability = True, random_state = 42), 
                           data_frame = df, 
                           train_features = X_train_win, 
                           train_labels = y_train_win, 
                           hyperparameters = search_spaces,
                           scaler_type = 'minmax',
                           drop_high_vif_features = False,
                           feature_selection_method = 'meta')

# Logging information to the log file
logging.info(msg = 'Win model has been built')

# Displaying the architecture of the classifier
win_model

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Let's build a classifier pipeline with the best hyperparameters for the **Loss** model.

In [12]:
# Setting an operating level seed
np.random.seed(seed = 42)

# Building a classifier pipeline with the best hyperparameters
loss_model = build_pipeline(classifier = SVC(kernel = 'linear', probability = True, random_state = 42), 
                            data_frame = df, 
                            train_features = X_train_loss, 
                            train_labels = y_train_loss, 
                            hyperparameters = search_spaces,
                            scaler_type = 'minmax',
                            drop_high_vif_features = False,
                            feature_selection_method = 'meta')

# Logging information to the log file
logging.info(msg = 'Loss model has been built')

# Displaying the architecture of the classifier
loss_model

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Let's build the **Draw** model with default parameters.

In [13]:
# Building a classifier pipeline with the best hyperparameters
draw_model = build_pipeline(classifier = MLPClassifier(learning_rate = 'adaptive', shuffle = False, random_state = 42, early_stopping = True, n_iter_no_change = 3), 
                            data_frame = df, 
                            train_features = X_train_draw, 
                            train_labels = y_train_draw, 
                            apply_bayesian_optimization = False,
                            scaler_type = 'minmax',
                            drop_high_vif_features = False,
                            feature_selection_method = 'wrapper')

# Logging information to the log file
logging.info(msg = 'Draw model has been built')

# Displaying the architecture of the classifier
draw_model

Let's save the tuned models.

In [14]:
# Saving the tuned Loss model as a pickle file
with open(file = '/Users/kzeynalzade/Documents/Project/Models/loss_model.pickle', mode = 'wb') as pickled_model:
    pickle.dump(obj = loss_model, file = pickled_model)

# Saving the tuned Draw model as a pickle file
with open(file = '/Users/kzeynalzade/Documents/Project/Models/draw_model.pickle', mode = 'wb') as pickled_model:
    pickle.dump(obj = draw_model, file = pickled_model)

# Saving the tuned Win model as a pickle file
with open(file = '/Users/kzeynalzade/Documents/Project/Models/win_model.pickle', mode = 'wb') as pickled_model:
    pickle.dump(obj = win_model, file = pickled_model)

# Logging information to the log file
logging.info(msg = 'Models were saved as pickle objects')