# Establishing a baseline score for the model.
Before making our predictions, we need to establish a baseline score. Then we will be able to compare the real score of our model with this base score. 

## Importing libraries and the dataset

In [115]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier



# Load the cleaned match data from the project's assets folder.
df = pd.read_csv('../assets/data/clean_data.csv')
# Remove rows whose target (FTR) is missing. Calling dropna(inplace=True) on
# df["FTR"] only affects the extracted Series and leaves the rows in df, so
# the frame itself has to be filtered and reassigned.
df = df.dropna(subset=["FTR"])

## Defining X and y
X = Every feature available before the kick-off  

y = The Full time result

In [116]:
# Columns excluded from the feature matrix; per the notebook text, X should
# hold only what is available before kick-off, and FTR is the target.
excluded_columns = [
    "Div", "Date", "FTR",
    "FTHG", "FTAG", "HTHG", "HTAG", "HTR",
    "HS", "AS", "AST",
    "HF", "AF",
    "HC", 'AC',
    'HY', 'AY', 'HR', 'AR',
    "Numerical_ftr", "Numerical_htr",
]

# Features: every remaining column. Target: the full-time result.
X = df.drop(columns=excluded_columns)
y = df["FTR"]

## Data preprocessing
There are a few steps to take before training the dummy classifier. 
* Encoding categorical values
* Scaling the values
* Handling the null values in the dataset  



In [117]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

#### Separating categorical values and numerical values

In [118]:
# Column names routed to the categorical branch of the preprocessing
# (string/object-typed columns).
categorical_features = X.select_dtypes(include='object').columns.tolist()
# Column names routed to the numerical branch. "number" covers every numeric
# dtype (ints as well as floats); selecting only 'float64' would leave
# integer-typed columns out of both lists, and a column transformer drops
# unlisted columns by default.
numerical_features = X.select_dtypes(include='number').columns.tolist()

### Applying transformations to the data

##### Pipelines creation

In [119]:
# Numerical branch: fill missing values with the column median, then standardise.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Combine both branches, each applied to its own list of columns.
full_pipeline = make_column_transformer(
    (num_pipeline, numerical_features),
    (cat_pipeline, categorical_features),
)

##### Data transformation

In [120]:
# Standalone encoder instance; it is not used by the pipeline fitted below.
# NOTE(review): no later use is visible in this notebook -- confirm before removing.
encoder = OneHotEncoder(handle_unknown="ignore")

# Fit the preprocessing on the whole feature frame and overwrite X with the
# transformed output.
# NOTE(review): this runs before the train/test split, so the imputation and
# scaling statistics are computed on rows that later land in the test set.
# Also, because X is overwritten, this cell should only be run once per
# fresh definition of X.
X = full_pipeline.fit_transform(X)


### Splitting train/test set

In [121]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [122]:
# Sanity check: count missing vs. present values in the target.
y.isna().value_counts()

False    2280
Name: FTR, dtype: int64

## Making baseline predictions with the dummy classifier

In [130]:
# Score one DummyClassifier per strategy on the held-out set; these
# accuracies are the baseline a real model is compared against.
for strategy in ('stratified', 'most_frequent', 'prior', 'uniform'):
    # random_state pins the output of the strategies that sample at random.
    dummy = DummyClassifier(strategy=strategy, random_state=42).fit(X_train, y_train)
    accuracy = dummy.score(X_test, y_test)

    print(f" {strategy} : {accuracy}")

 stratified : 0.3267543859649123
 most_frequent : 0.4407894736842105
 prior : 0.4407894736842105
 uniform : 0.3223684210526316


## Conclusion
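The baseline score for our model is between 0.30 and 0.45: the dummy classifiers reach roughly 0.32 to 0.44 accuracy on the test set. The strongest baseline (`most_frequent` / `prior`) scores about 0.44, so a real model needs to beat that value to be considered useful.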