<a href="https://colab.research.google.com/github/OskarKrafft/Machine-Learning-Project/blob/main/03_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

/content/gdrive/MyDrive/Colab Notebooks


In [None]:
import io

import pandas as pd
from google.colab import files

# Open the Colab upload dialog; the uploaded content is looked up by file name below.
uploaded = files.upload()

# load auto mpg as dummy data set (read straight from the uploaded bytes)
eppis = pd.read_csv(io.BytesIO(uploaded['auto-mpg.csv']))

Saving auto-mpg.csv to auto-mpg.csv


**Preparing the data for modelling**

In [None]:
# create 1/0 dummy variable

def calc_dummy(row):
    """Return 1.0 if the row's 'mpg' value is at least 22, otherwise 0.0.

    `row` is anything indexable by the key 'mpg' (e.g. a DataFrame row
    passed in by `DataFrame.apply(..., axis=1)`).
    """
    meets_threshold = row['mpg'] >= 22
    return float(1) if meets_threshold else float(0)

# Add the binary target column by applying calc_dummy to every row.
eppis['vote_dummy'] = eppis.apply(calc_dummy, axis=1)

# A bare expression is only rendered when it is the last line of a cell,
# so the preview has to be displayed explicitly (head only, not the full frame).
display(eppis.head())

# Share of rows labelled 1.0 (class balance of the target).
eppis['vote_dummy'].mean()

0.5376884422110553

In [None]:
# Define X and y

print(eppis.head())

# Features: drop the target itself and also 'mpg', because vote_dummy is
# computed directly from mpg (mpg >= 22). Leaving mpg in X would leak the
# label into the features and inflate every score downstream.
X = eppis.drop(columns=['vote_dummy', 'mpg'])
y = eppis['vote_dummy'] # target variable which contains voted y/n

# 80/20 train-test split

from sklearn.model_selection import train_test_split

# random_state makes the split reproducible across kernel restarts;
# stratify=y keeps the class proportions equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1, stratify=y
)

    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   
3  16.0          8         304.0        150    3433          12.0          70   
4  17.0          8         302.0        140    3449          10.5          70   

   origin                   car name  vote_dummy  
0       1  chevrolet chevelle malibu         0.0  
1       1          buick skylark 320         0.0  
2       1         plymouth satellite         0.0  
3       1              amc rebel sst         0.0  
4       1                ford torino         0.0  


In [None]:
# Only the last expression of a cell is rendered, so the feature frame is
# displayed explicitly; y_train stays as the cell's output.
display(X_train.head())
y_train

333    1.0
111    0.0
377    1.0
230    0.0
135    0.0
      ... 
349    1.0
161    0.0
162    0.0
269    1.0
118    1.0
Name: vote_dummy, Length: 318, dtype: float64

In [None]:
# pre-processing pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column groups are chosen purely by the dtype of the training features:
# 64-bit ints/floats are treated as numeric, object/bool columns as categorical.
numeric_X_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_X_features = X_train.select_dtypes(include=['object', 'bool']).columns

# Numeric branch: standardisation only.
# TODO: still to be decided whether scaling is needed (likely not) or
# whether another transformer should be used instead.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encoding; handle_unknown="ignore" means
# categories not seen during fit do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_X_features),
        ("cat", categorical_transformer, categorical_X_features),
    ]
)

**Model 1 (Baseline): Logistic Regression**

In [None]:
# define logistic regression model

# Imported in this cell so it runs on a fresh kernel (Restart & Run All);
# without it, LogisticRegression is undefined at this point in the notebook.
from sklearn.linear_model import LogisticRegression

# Full model: shared preprocessing followed by a default logistic regression.
# The step name "classifier" is what the "classifier__*" grid keys refer to.
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

In [None]:
# hyperparameter tuning

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

# candidate values to be optimised, based on handbook and different online articles
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# search grid; the "classifier__" prefix addresses the pipeline's "classifier" step
grid = dict(
    classifier__solver=solvers,
    classifier__penalty=penalty,
    classifier__C=c_values,
)

# set-up for cross-validation: 10 stratified folds, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# tbd which metric for comparison, currently still set to accuracy
# NOTE: error_score=0 records a score of 0 for any parameter combination whose fit fails
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)

# fit the grid search model
grid_result = grid_search.fit(X_train, y_train)

# print the mean test score (currently accuracy), sd and the params that were used
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

# persist the best fitted pipeline to disk

from joblib import dump, load

estimator = grid_result.best_estimator_
dump(estimator, "best-logistic-regression.joblib")

# Later, when we compare our hypertuned models, load the saved pipeline with the code below
# estimator = load("best-logistic-regression.joblib")

Best: 0.966499 using {'classifier__C': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
0.966499 (0.036135) with: {'classifier__C': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
0.966499 (0.036135) with: {'classifier__C': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
0.962265 (0.035740) with: {'classifier__C': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
0.964382 (0.039506) with: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
0.964382 (0.039506) with: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
0.962298 (0.038325) with: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
0.960249 (0.039602) with: {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}
0.960249 (0.039602) with: {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
0.9

['best-logistic-regression.joblib']

**Model 2: Naive Bayes**

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['mpg', 'cylinders', 'displacement', 'weight', 'acceleration',
       'model year', 'origin'],
      dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['horsepower', 'car name'], dtype='object'))])),
                ('classifier', LogisticRegression(C=100, solver='newton-cg'))])