# Data modeling

In [75]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

We load the data previously cleaned in data_analysis.ipynb.

In [76]:
# Read the cleaned table, then separate the "Smoker" target column from the
# remaining feature columns.
data = pd.read_csv("data/clean_data.csv")
y = data["Smoker"]
x = data.drop(columns="Smoker")
# Preview the first rows of the full table (target included).
data.head()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Height,Weight,Number of siblings,Education,Only child,Smoker,Female,Right handed,Living in a city,Living in a flat
0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,1.0,...,0,0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0
1,4.0,4.0,2.0,1.0,1.0,1.0,2.0,3.0,5.0,4.0,...,0,1,2.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0
2,5.0,5.0,2.0,2.0,3.0,4.0,5.0,3.0,5.0,3.0,...,3,2,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0
3,5.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,2,1,1.0,3.0,1.0,1.0,1.0,1.0,1.0,0.0
4,5.0,3.0,4.0,3.0,2.0,4.0,3.0,5.0,3.0,1.0,...,1,1,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0


We standardize the features so that each column has zero mean and unit variance.

In [77]:
# Standardize every feature column to zero mean and unit variance, keeping the
# original column names and row index.
# Some columns (e.g. Height, Weight) appear as integers in the preview above;
# casting to float up front makes the dtype conversion explicit instead of
# leaving it to the scaler (presumably the source of the conversion warnings
# this cell used to print).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so statistics of the held-out rows influence the
# scaling. Consider fitting the scaler on the training portion only.
x = pd.DataFrame(
    data=StandardScaler().fit_transform(x.astype(float)),
    columns=x.columns,
    index=x.index,
)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


We split the data for training and testing.

In [78]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions of `y` the same in both the training and the test part;
# the fixed random_state makes the split reproducible.
split_parts = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=17,
    stratify=y,
)
x_train, x_test, y_train, y_test = split_parts
x_train.head()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Age,Height,Weight,Number of siblings,Education,Only child,Female,Right handed,Living in a city,Living in a flat
890,0.401057,-0.393689,-0.956114,-0.253446,0.812691,-1.567909,-0.607303,0.45208,-0.648204,-0.989025,...,0.868028,1.466189,1.445552,0.705455,0.91576,-0.581192,-1.209097,-3.003331,0.647506,0.828775
189,0.401057,-0.393689,-0.956114,2.382746,-0.117956,0.835525,1.772819,1.314906,-1.491711,-0.989025,...,2.963845,-1.337478,0.07392,0.705455,2.193742,-0.581192,0.827063,0.332964,-1.544388,-1.206599
31,-1.121135,-0.393689,0.757708,-1.132177,0.812691,-0.766764,0.186071,1.314906,-0.648204,-0.989025,...,0.868028,0.765272,1.445552,-0.290131,-0.362223,-0.581192,0.827063,0.332964,0.647506,0.828775
368,0.401057,0.805328,-0.099203,0.625284,-0.117956,0.835525,0.186071,1.314906,1.03881,0.466192,...,-0.179881,-0.636561,-0.611895,0.705455,-0.362223,-0.581192,0.827063,0.332964,-1.544388,-1.206599
925,0.401057,-0.393689,0.757708,-0.253446,-0.117956,-1.567909,0.186071,1.314906,-0.648204,-0.261416,...,-0.179881,-1.337478,-1.297711,-0.290131,-0.362223,-0.581192,0.827063,0.332964,0.647506,0.828775


In [79]:
# Number of training rows per target label, most frequent label first.
y_train.value_counts(sort=True, ascending=False)

0.0    510
1.0    291
Name: Smoker, dtype: int64

We define a baseline by using a dummy classifier that classifies everything as the most frequent class in the training dataset, "not smoker".

In [80]:
# Baseline: always predict the label that is most frequent in the training
# target, then report the accuracy of that constant prediction on the test set.
model_dummy = DummyClassifier(strategy="most_frequent").fit(x_train, y_train)
baseline_accuracy = model_dummy.score(x_test, y_test)
print(baseline_accuracy)

0.6368159203980099


We try a handful of different models to get a first impression of which approaches might work well.

In [81]:
# Display labels for the candidate models, in the same order as the
# `classifiers` list defined below.
# The support vector machine is built with kernel='rbf', so it is labelled
# "RBF SVM" (the previous label, "Linear SVM", did not match the estimator).
names = [
    "Nearest Neighbors",
    "RBF SVM",
    "Random Forest",
    "AdaBoost",
    "Naive Bayes",
    "MLPClassifier",
]

# Candidate estimators, one per entry of `names` and in the same order.
# Every estimator that accepts a seed uses random_state=17 so that repeated
# runs give the same results.
classifiers = [
    # k-nearest neighbours with 6 equally weighted neighbours.
    KNeighborsClassifier(
        n_neighbors=6,
        weights="uniform",
    ),
    # Support vector machine with an RBF kernel.
    SVC(
        kernel="rbf",
        gamma="scale",
        random_state=17,
    ),
    # Random forest of 110 trees, each limited to depth 6.
    RandomForestClassifier(
        n_estimators=110,
        max_depth=6,
        random_state=17,
    ),
    # AdaBoost with 70 estimators and a learning rate of 0.4.
    AdaBoostClassifier(
        learning_rate=0.4,
        n_estimators=70,
        random_state=17,
    ),
    # Gaussian naive Bayes with default settings.
    GaussianNB(),
    # Multi-layer perceptron: one hidden layer of 150 units, early stopping,
    # at most 400 iterations.
    MLPClassifier(
        hidden_layer_sizes=(150,),
        early_stopping=True,
        max_iter=400,
        random_state=17,
    ),
]

# One hyper-parameter grid per entry of `classifiers`, in the same order.
# Every grid is currently empty, so the grid search below only cross-validates
# each estimator with the settings it was constructed with. The disabled grids
# are kept as comments next to each entry for reference.
parameters = [
    # KNeighborsClassifier
    #   "n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8]
    #   "weights": ['uniform', 'distance']
    dict(),
    # SVC
    #   "C": [0.7, 0.8, 0.9]
    #   "kernel": ['linear', 'rbf', 'poly']
    dict(),
    # RandomForestClassifier
    #   "n_estimators": [100, 110, 120]
    #   "max_depth": [5, 6, 7]
    dict(),
    # AdaBoostClassifier
    #   "learning_rate": [0.3, 0.4, 0.5]
    #   "n_estimators": [50, 60, 70, 80]
    dict(),
    # GaussianNB (no grid)
    dict(),
    # MLPClassifier
    #   "alpha": [0.0001, 0.001, 0.01, 0.1, 1]
    #   "hidden_layer_sizes": [(150, ), (200, ), (250, )]
    dict(),
]

For each proposed model, we use cross-validation to estimate its accuracy on the training dataset.

In [82]:
# Run a 3-fold cross-validated grid search for every candidate on the training
# split, then print its best mean cross-validation score and the
# hyper-parameters that achieved it.
for position, (name, clf, par) in enumerate(zip(names, classifiers, parameters)):
    grid_obj = GridSearchCV(clf, par, cv=3).fit(x_train, y_train)
    # Store the best estimator back into `classifiers` so that later cells
    # evaluate the selected configuration. Merely rebinding the loop variable
    # (as was done before) leaves the list untouched.
    classifiers[position] = grid_obj.best_estimator_
    print("%s: %f" % (name, grid_obj.best_score_))
    print(grid_obj.best_params_)

Nearest Neighbors: 0.646692
{}
Linear SVM: 0.670412
{}
Random Forest: 0.669164
{}
AdaBoost: 0.689139
{}
Naive Bayes: 0.656679
{}
MLPClassifier: 0.680400
{}


In [83]:
# Fit each candidate on the whole training split and print its accuracy on
# the held-out test split.
for name, clf in zip(names, classifiers):
    score = clf.fit(x_train, y_train).score(x_test, y_test)
    print("%s: %f" % (name, score))

Nearest Neighbors: 0.651741
Linear SVM: 0.676617
Random Forest: 0.671642
AdaBoost: 0.681592
Naive Bayes: 0.611940
MLPClassifier: 0.636816
