# Predictions
### Steps
* Train multiple models on the dataset  
* Evaluate them  
* Select the best one 

## Importing dataset and libraries

In [56]:
import pandas as pd
### Preprocessing libraries
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
### Models
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Location of the cleaned dataset. Kept in one named constant so the notebook
# can be pointed at another copy of the file without hunting for the literal.
DATA_PATH = '/content/clean_data.csv'

# Load the cleaned dataset into the frame every later cell reads from.
df = pd.read_csv(DATA_PATH)

## Prepare the data for training

### Split X and y

In [65]:
# Feature matrix: every column of df except the ones listed below.
# The dropped set contains the target ("FTR") and its numeric encoding; the
# remaining labels are presumably identifiers and in-match statistics that are
# not known before kick-off — TODO confirm against the dataset's data dictionary.
# NOTE(review): "AST" is dropped but "HST" is not; if df has an "HST" column it
# stays in X as a feature — confirm this is intended.
X = df.drop(
    columns=[
        "Div", "Date",
        "FTR", "FTHG", "FTAG",
        "HTHG", "HTAG", "HTR",
        "HS", "AS", "AST",
        "HF", "AF",
        "HC", "AC",
        "HY", "AY",
        "HR", "AR",
        "Numerical_ftr", "Numerical_htr",
    ]
)

# Target: the "FTR" column.
y = df.loc[:, "FTR"]

Identify the categorical and numerical columns

In [66]:
# Column groups are derived from the full feature frame X. X_train is only
# created by the later train/test split cell, so reading it here fails on a
# fresh kernel; X has exactly the same columns and dtypes.
# Columns with dtype "object" are treated as categorical.
categorical_features = X.select_dtypes(include="object").columns.tolist()

# Only float64 columns are treated as numerical.
# NOTE(review): columns of any other dtype (e.g. integers) land in neither list
# — confirm none are expected in X.
numerical_features = X.select_dtypes(include="float64").columns.tolist()

Instantiate preprocessing pipelines

In [67]:
# Numerical branch: fill missing values with the column median, then standardise.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each column group to its branch.
full_pipeline = make_column_transformer(
    (num_pipeline, numerical_features),
    (cat_pipeline, categorical_features),
)

Split train/test data

In [68]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Training the models

In [93]:
# Each model is wrapped in a pipeline that preprocesses the features first.
# make_pipeline stores the transformer object it is given rather than copying
# it, so every pipeline receives its own unfitted clone of `full_pipeline`;
# otherwise all of them would share, and refit, a single transformer instance.
log_reg = LogisticRegression(max_iter=500)
logistic_regression = make_pipeline(clone(full_pipeline), log_reg)

# NOTE(review): recent xgboost releases expect integer-encoded class labels;
# if the target holds strings, fitting may fail — confirm against the
# installed version.
xgb = XGBClassifier(n_estimators=100)
xgb_model = make_pipeline(clone(full_pipeline), xgb)

decision_tree = DecisionTreeClassifier()
decision_model = make_pipeline(clone(full_pipeline), decision_tree)

# (pipeline, display name) pairs to be trained and compared.
# NOTE(review): decision_model is built above but not listed here, so code
# iterating over `models` skips it — confirm whether that is intended.
models = [(xgb_model, "xgb"), (logistic_regression, "logistic_regression")]

In [94]:
# Fit each pipeline on the training split and report its score on the held-out
# split, followed by the model's name.
for pipeline, label in models:
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    print(score)
    print(label)

0.5482456140350878
xgb
0.5394736842105263
logistic_regression
