# Predictions
### Steps : 
* Train multiple models on the dataset  
* Evaluate them  
* Select the bests models
* Hyperparameter tuning
* Ensemble training

## Importing dataset and libraries

In [13]:
import pandas as pd
import numpy as np
### Preprocessing libraries
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
### Models
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
df = pd.read_csv('../assets/data/clean_data.csv')

### Hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
### Ensembles
from sklearn.ensemble import VotingClassifier, BaggingClassifier

## Prepare data for the training




### Split X and y

In [14]:
X = df.drop(["Div","Date","FTR", "FTHG", "FTAG", "HTHG", "HTAG", "HTR", "HS", "AS", "AST", "HF", "AF", "HC",'AC', 'HY', 'AY', 'HR', 'AR', "Numerical_ftr", "Numerical_htr"], axis=1)
y = df["FTR"]

Establish categorical columns and numerical columns

In [16]:
categorical_features = [col for col in X.select_dtypes(include='object')]
numerical_features = [col for col in X.select_dtypes(include='float64')]

Instantiate preprocessing pipelines

In [17]:
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore"))
full_pipeline = make_column_transformer((num_pipeline, numerical_features), (cat_pipeline, categorical_features))

Split train/test data

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

## Training the models

In [19]:
log_reg = LogisticRegression(max_iter=500)
logistic_regression = make_pipeline(full_pipeline, log_reg)

xgb = XGBClassifier(n_estimators=100)
xgb_model = make_pipeline(full_pipeline, xgb)

decision_tree = DecisionTreeClassifier(max_depth=20, min_samples_split=13)
decision_model = make_pipeline(full_pipeline, decision_tree)

models = [(xgb_model, "xgb"), (logistic_regression, "logistic_regression"), (decision_model, "decision tree")]

In [20]:
for model in models : 
  model[0].fit(X_train, y_train)
  score = model[0].score(X_test, y_test)
  print(score)
  print(model[1])



0.5175438596491229
xgb
0.5372807017543859
logistic_regression
0.43859649122807015
decision tree


The best models are XGB and logistic regression

## Hyperparameter tuning with grid-search

#### Logistic regression gridsearch

In [21]:
param_grid = [
              {"logisticregression__penalty" : ["l1", "l2", "elasticnet"], 
               "logisticregression__solver": ["newton-cg", "lbfgs", "liblinear", "sag", "sag "], 
               "logisticregression__C":[100, 10, 1.0, 0.1, 0.01], "logisticregression__max_iter":[500]}
]
log_model = LogisticRegression()
logistic_model = make_pipeline(full_pipeline, log_model)
grid_search = GridSearchCV(logistic_model, param_grid, cv=10)
grid_search.fit(X_train, y_train)


Traceback (most recent call last):
  File "/home/user/Documents/premier_league/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/user/Documents/premier_league/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/user/Documents/premier_league/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/user/Documents/premier_league/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/user/Documents/premier_league/lib/python3.8/site-packages/sklearn/m

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         ['HST',
                                                                          'B365H',
                                                                          'B365D',
                                                                          'B365A',
                                                   

In [22]:
grid_search.best_score_

0.5822134150003002

In [23]:
grid_search.best_params_

{'logisticregression__C': 0.01,
 'logisticregression__max_iter': 500,
 'logisticregression__penalty': 'l1',
 'logisticregression__solver': 'liblinear'}

In [24]:
best_logistic_regression = LogisticRegression(C=0.01, max_iter=500, penalty='l1', solver="liblinear")

#### Xgboost gridsearch


In [25]:
params = { 'xgbclassifier__max_depth': [3, 5, 6, 10, 15, 20, 25, 30],
           'xgbclassifier__learning_rate': [0.01, 0.1, 0.2, 0.3],
           'xgbclassifier__subsample': np.arange(0.5, 1.0, 0.1),
           'xgbclassifier__colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'xgbclassifier__colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'xgbclassifier__n_estimators': [100,300,400,500,600, 700, 800, 1000]}
xgb = XGBClassifier()
xgb_grid_model = make_pipeline(full_pipeline, xgb)
random_grid = RandomizedSearchCV(xgb_grid_model, params, cv=10)
random_grid.fit(X_train, y_train)

































KeyboardInterrupt: 

In [None]:
random_grid.best_params_

{'xgbclassifier__colsample_bylevel': 0.6,
 'xgbclassifier__colsample_bytree': 0.8999999999999999,
 'xgbclassifier__learning_rate': 0.01,
 'xgbclassifier__max_depth': 3,
 'xgbclassifier__n_estimators': 300,
 'xgbclassifier__subsample': 0.7999999999999999}

In [None]:
best_xgb_model = XGBClassifier(colsample_bylevel=0.6, 
                               colsample_bytree=0.8999999999999999, 
                               learning_rate=0.01, max_depth=3, 
                               n_estimators=300, 
                               subsample=0.7999999999999999)

## Ensemble training for the best models
* VotingClassifier with XGBClassifier and LogisticRegression
* BaggingClassifier With XGBClassifier / BaggingClassifier with LogisticRegression

In [None]:
model_xgb = make_pipeline(full_pipeline, best_xgb_model)
model_logistic = make_pipeline(full_pipeline, best_logistic_regression)
voting_classifier = VotingClassifier(estimators=[("xgb", model_xgb), ("loregr", model_logistic)], voting="hard" )
voting_classifier.fit(X_train, y_train)
voting_classifier.score(X_test, y_test)

0.5723684210526315

In [None]:
bagging_model = BaggingClassifier(base_estimator=best_xgb_model, n_estimators=300, bootstrap=True)
bagging_classifier = make_pipeline(full_pipeline, bagging_model)
bagging_classifier.fit(X_train, y_train)
bagging_classifier.score(X_test, y_test)


0.5460526315789473