# Data modeling

In [83]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

We load the data precedently cleaned in data_analysis.ipynb.

In [84]:
data = pd.read_csv("data/clean_data.csv")
x = data.drop("Smoking", axis=1)
x = pd.DataFrame(data=StandardScaler().fit_transform(x), columns=x.columns, index=x.index)
y = data["Smoking"]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


We split the data for training and testing.

In [85]:
# We use stratify to keep a balanced amount of values for the target categories.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=17, stratify=y)
x_train.head()

Unnamed: 0.1,Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,...,Gender_female,Gender_male,Left - right handed_left handed,Left - right handed_right handed,Only child_no,Only child_yes,Village - town_city,Village - town_village,House - block of flats_block of flats,House - block of flats_house/bungalow
368,-0.377914,0.401286,0.824163,-1.812488,-1.121607,0.819654,-1.568627,-1.407952,-2.162417,-0.647248,...,-1.202329,1.210269,-0.326623,0.332548,0.582243,-0.578981,0.646078,-0.641141,0.829898,-0.824448
389,-0.302319,0.401286,-0.395288,-0.092901,-1.121607,-1.044812,-1.568627,-1.407952,1.319581,1.049979,...,0.831719,-0.826262,-0.326623,0.332548,0.582243,-0.578981,-1.5478,1.55972,-1.204968,1.212932
909,1.608158,0.401286,0.824163,-0.952695,1.525386,0.819654,0.830051,-0.613982,-1.291917,1.049979,...,-1.202329,1.210269,-0.326623,0.332548,0.582243,-0.578981,0.646078,-0.641141,0.829898,-0.824448
357,-0.415711,0.401286,-0.395288,0.766892,0.643055,1.751887,-0.769067,-0.613982,1.319581,-2.344474,...,0.831719,-0.826262,-0.326623,0.332548,-1.717496,1.727172,-1.5478,1.55972,0.829898,-0.824448
745,1.006839,-2.716523,2.043614,-0.092901,-0.239276,-0.112579,0.830051,0.179989,1.319581,-2.344474,...,-1.202329,1.210269,3.061638,-3.007084,0.582243,-0.578981,0.646078,-0.641141,-1.204968,1.212932


In [86]:
y_train.value_counts()

1    322
0    157
4    142
3    134
Name: Smoking, dtype: int64

We define a baseline by using a dummy classifier that classifies everything as the most frequent class in the training dataset, "tried smoking".

In [87]:
model_dummy = DummyClassifier(strategy="most_frequent")
model_dummy.fit(x_train, y_train)
print(model_dummy.score(x_test, y_test))

0.42857142857142855


We add a few models to try and get a first glance at what could be successfull.

In [88]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Random Forest", "AdaBoost",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(),
    GaussianNB()]

In [89]:
for name, clf in zip(names, classifiers):
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print("%s: %f" % (name, score))

Nearest Neighbors: 0.338624
Linear SVM: 0.417989
RBF SVM: 0.428571
Gaussian Process: 0.211640
Random Forest: 0.460317
AdaBoost: 0.402116
Naive Bayes: 0.375661
