# Data modeling

In [89]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

We load the data precedently cleaned in data_analysis.ipynb.

In [90]:
data = pd.read_csv("data/clean_data.csv")
x = data.drop("Smoking", axis=1)
y = data["Smoking"]

We scale data.

In [91]:
x = pd.DataFrame(data=StandardScaler().fit_transform(x), columns=x.columns, index=x.index)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


We split the data for training and testing.

In [92]:
# We use stratify to keep a balanced amount of values for the target categories.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=17, stratify=y)
x_train.head()

Unnamed: 0.1,Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,...,Age,Height,Weight,Number of siblings,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
890,1.347152,0.401057,-0.393689,-0.956114,-0.253446,0.812691,-1.567909,-0.607303,0.45208,-0.648204,...,0.868028,1.456317,1.445711,0.705455,-1.209097,-3.003331,0.91576,-0.581192,0.647506,0.828775
189,-1.073451,0.401057,-0.393689,-0.956114,2.382746,-0.117956,0.835525,1.772819,1.314906,-1.491711,...,2.963845,-1.335131,0.075226,0.705455,0.827063,0.332964,2.193742,-0.581192,-1.544388,-1.206599
31,-1.62203,-1.121135,-0.393689,0.757708,-1.132177,0.812691,-0.766764,0.186071,1.314906,-0.648204,...,0.868028,0.758455,1.445711,-0.290131,0.827063,0.332964,-0.362223,-0.581192,0.647506,0.828775
368,-0.459729,0.401057,0.805328,-0.099203,0.625284,-0.117956,0.835525,0.186071,1.314906,1.03881,...,-0.179881,-0.637269,-0.610016,0.705455,0.827063,0.332964,-0.362223,-0.581192,-1.544388,-1.206599
925,1.467153,0.401057,-0.393689,0.757708,-0.253446,-0.117956,-1.567909,0.186071,1.314906,-0.648204,...,-0.179881,-1.335131,-1.295259,-0.290131,0.827063,0.332964,-0.362223,-0.581192,0.647506,0.828775


In [93]:
y_train.value_counts()

0.0    510
1.0    291
Name: Smoking, dtype: int64

We define a baseline by using a dummy classifier that classifies everything as the most frequent class in the training dataset, "not smoker".

In [94]:
model_dummy = DummyClassifier(strategy="most_frequent")
model_dummy.fit(x_train, y_train)
print(model_dummy.score(x_test, y_test))

0.6368159203980099


We add a few models to try and get a first glance at what could be successful.

In [95]:
names = ["Nearest Neighbors",
         "Linear SVM",
         "Random Forest",
         "AdaBoost",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(),
    SVC(kernel="linear"),
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(),
    GaussianNB()]

parameters = [
    {"n_neighbors": [6, 7, 8]},
    {"C": [10, 12, 15]},
    {"n_estimators": [90, 100, 110]},
    {"learning_rate": [0.2, 0.3, 0.4]},
    {}
]

For each model proposed we use cross-validation to calculate its accuracy on the training dataset.

In [96]:
for name, clf, par in zip(names, classifiers, parameters):
    grid_obj = GridSearchCV(clf, par, cv=5)
    grid_obj = grid_obj.fit(x_train, y_train)
    # Set the clf to the best combination of parameters
    clf = grid_obj.best_estimator_
    # scores = cross_val_score(clf, x_train, y_train, cv=10)
    print("%s: %f" % (name, grid_obj.best_score_))
    print(grid_obj.best_params_)

Nearest Neighbors: 0.652934
{'n_neighbors': 7}
Linear SVM: 0.674157
{'C': 12}
Random Forest: 0.675406
{'n_estimators': 90}
AdaBoost: 0.689139
{'learning_rate': 0.3}
Naive Bayes: 0.662921
{}


In [97]:
for name, clf in zip(names, classifiers):
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print("%s: %f" % (name, score))

Nearest Neighbors: 0.661692
Linear SVM: 0.641791
Random Forest: 0.646766
AdaBoost: 0.651741
Naive Bayes: 0.611940
