In [129]:
import pandas as pd
import numpy as np
from collections import Counter
import datetime

import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [130]:
df = pd.read_csv("../datasets/titanic_train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Part 2.
## Features extraction and basic DecisionTreeСlassifier

### 2.1
<i>There are a missing values in data - for example, for certain passengers age is missing.<br>
Drop all samples which contain nan values in any of these variables - 'Pclass', 'Fare', 'Age', 'Sex'
</i>

In [131]:
df.dropna(inplace=True, subset=['Pclass', 'Fare', 'Age', 'Sex'])

### 2.2
<i>Leave in dataset only 4 variables:<br> 
- a passenger's class (Pclass)
- price of a ticket (Fare)
- a passenger's age (Age)
- a passenger's sex (Sex) 
</i>

In [132]:
df2 = df[['Pclass', 'Fare', 'Age', 'Sex']].copy()
df2.head()

Unnamed: 0,Pclass,Fare,Age,Sex
0,3,7.25,22.0,male
1,1,71.2833,38.0,female
2,3,7.925,26.0,female
3,1,53.1,35.0,female
4,3,8.05,35.0,male


### 2.3
<i>Convert "Sex" variable from string to integer type</i>

In [133]:
df2['Sex'] = np.where(df2['Sex'] == 'male', 1, 0)
df2.head()

Unnamed: 0,Pclass,Fare,Age,Sex
0,3,7.25,22.0,1
1,1,71.2833,38.0,0
2,3,7.925,26.0,0
3,1,53.1,35.0,0
4,3,8.05,35.0,1


### 2.4
<i>Select the target variable — "Survived"</i>

In [134]:
target_variable = df['Survived']
target_variable.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

### 2.5 
<i>Let's for example teach a basic decision tree with random_state=241 and all other arguments as default<i>

<b>Link to "Understanding the decision tree structure":</b><br>
http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html<br>
<b>"Desition tree" in Scikit Learn docs</b><br>
http://scikit-learn.org/stable/modules/tree.html

In [135]:
estimator = DecisionTreeClassifier(random_state=241)
estimator.fit(df2, target_variable)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=241, splitter='best')

In [136]:
accuracy_score(target_variable, estimator.predict(df2))

0.98459383753501406

<b>Accuracy</b> of our model is <u>98%</u> which is very high.<br>
But it's important to understand that we performed testing of our model only on training data.<br> This score doen't tell us how our model works on a new data.<br>
<br>
Moreover, there is a high risk that our model is <b>overfitted</b> (or <b>overlearned</b>) which means that it corresponds too closely or exactly to our initial dataset.<br>
<br>
In the next part we will:
- <b>split our training set</b> into separate test and training datasets  
- perform <b>feature selection</b>, which will help us to reduce amount of variables by which we will train our model
- perform <b>cross validation</b>,  which will help us to more accurately count accuracy of trained model

# Part 3
## Applying machine learning

### 3.1
<i>Find two most important features in dataset.</i>

This task is called <b>feature selection</b>.<br>
Documentation on <b>SelectKBest</b>:<br>
http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection

In [137]:
print("Initial df2 shape: {}".format(df2.shape))
print("df2: \n{}".format(df2.head(10)))

# Create and fit selector
selector = SelectKBest(k=2)
selector.fit(df2, target_variable)

# Get ids of columns to keep
ids_selected = selector.get_support(indices=True)

# Create new dataframe with only desired columns, or overwrite existing
df2_reduced = df2.iloc[:,ids_selected]

print("New df2 shape: " + str(df2_reduced.shape))
df2_reduced.head()

Initial df2 shape: (714, 4)
df2: 
    Pclass     Fare   Age  Sex
0        3   7.2500  22.0    1
1        1  71.2833  38.0    0
2        3   7.9250  26.0    0
3        1  53.1000  35.0    0
4        3   8.0500  35.0    1
6        1  51.8625  54.0    1
7        3  21.0750   2.0    1
8        3  11.1333  27.0    0
9        2  30.0708  14.0    0
10       3  16.7000   4.0    0
New df2 shape: (714, 2)


Unnamed: 0,Pclass,Sex
0,3,1
1,1,0
2,3,0
3,1,0
4,3,1


In [138]:
df2_reduced.columns

Index(['Pclass', 'Sex'], dtype='object')

### Answer
<b>'Pclass' и 'Sex'</b> - two most important features based on method <b>SelectKBest</b> and statistics test <b>k-classif</b>

### 3.2
<i>Train a model using two main features which predicts, will a person survive a Titanic sinking or not.</i>

First we will <b>divide</b> our reduced dataframe df2_reduced (dataframe with only 2 main features) <b>into test and train datasets</b> in proportion, for example 70-30%.

Also let's make an <b>explicit random_state = 241</b> for sake of definite reproduction of results. 

In [139]:
my_rand_state = 241

In [140]:
X_train, X_test, y_train, y_test = train_test_split(df2_reduced, target_variable, 
                                                    test_size=0.3, random_state=my_rand_state)

Let's train <b>DecisionTreeClassifier</b> with random_state=241 once again, but this time using reduced dataset and testing its accuracy on testing dataset (which wasn't used in training).

In [141]:
estimator = DecisionTreeClassifier(random_state=my_rand_state)
estimator.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=241, splitter='best')

In [142]:
accuracy_score(y_test, estimator.predict(X_test))

0.8046511627906977

<u>80,4%</u> - not a perfect result, but it's more representative than 98%, which we got using training data as a test data. 

### 3.3
<i>Train a model using «KNearestNeighbors» and «LogisticRegression»

<b>Scikit-learn KNearestNeighbors:</b><br>
http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html<br>
<b>KNearestNeighbors in details:</b><br>
http://scikit-learn.org/stable/modules/neighbors.html<br>
<b>Scikit-learn LogisticRegression:</b><br>
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

<b>KNearestNeighbours</b> with amount of neighbors = 5

In [143]:
# Educate
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)

# Predict
y_pred = neigh.predict(X_test)

# Print report
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
accuracy_score(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.81      0.86      0.84       133
          1       0.75      0.67      0.71        82

avg / total       0.79      0.79      0.79       215

[[115  18]
 [ 27  55]]


0.79069767441860461

<b>LogisticRegression</b> with default parameters (L2 as a penalty; liblinear as a solver because it is recommended for small datasets)

In [144]:
# Educate
regr = LogisticRegression()
regr.fit(X_train, y_train)

# Predict
y_pred = regr.predict(X_test)

# Print report
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
accuracy_score(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.81      0.83      0.82       133
          1       0.71      0.70      0.70        82

avg / total       0.78      0.78      0.78       215

[[110  23]
 [ 25  57]]


0.77674418604651163

### 3.4
Check the accuracy of models with <b>cross validation</b>.<br>

In [145]:
scores_neigh = cross_val_score(neigh, df2_reduced, target_variable, cv=5)
print("Cross-validated scores for each step: \n{}".format(scores_neigh))

Cross-validated scores for each step: 
[ 0.78321678  0.81118881  0.77622378  0.82517483  0.79577465]


In [146]:
scores_regr = cross_val_score(regr, df2_reduced, target_variable, cv=5)
print("Cross-validated scores for each step: \n{}".format(scores_regr))

Cross-validated scores for each step: 
[ 0.78321678  0.81818182  0.77622378  0.72727273  0.79577465]


### 3.5
<i>Compare accuracy of «KNearestNeighbors» and «LogisticRegression» in percents.</i>

<b>«KNearestNeighbors» accuracy</b>

In [147]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_neigh.mean(), scores_neigh.std() * 2))

Accuracy: 0.80 (+/- 0.04)


<b>«LogisticRegression» accuracy</b>

In [148]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_regr.mean(), scores_regr.std() * 2))

Accuracy: 0.78 (+/- 0.06)


### Answer
<b>«KNearestNeighbors» with amount of neighbors = 5</b> gives a 2% more accuracy using k=5 cross validation than a <b>«LogisticRegression»</b>.

# Part 4
## Making predictions on test dataset (for a Kaggle competition)

### 4.1 
<i>Read test data, for which we will make predictions</i>

In [149]:
df_test = pd.read_csv("../datasets/titanic_test.csv")
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### 4.2
<i>Do some dataset preparation that we already did for the train data
- Leave only two most important features in dataset (according to SelectKBest, that we did in step 3.1)
- Drop every line that contains NaN in those two features (if there are any)
- Convert "Sex" from strings to integers
</i>

In [150]:
df_test.dropna(inplace=True, subset=['Pclass', 'Sex'])

df2_test = df_test[['Pclass', 'Sex']].copy()

df2_test['Sex'] = np.where(df2_test['Sex'] == 'male', 1, 0)
df2_test.head()

Unnamed: 0,Pclass,Sex
0,3,1
1,3,0
2,2,1
3,3,1
4,3,0


### 4.3
<i>Educate ML algorithms for classification:
- without explicitly stating a "random_state"
- on the whole train data</i>

In [151]:
# LogisticRegression
regr = LogisticRegression()
regr.fit(df2_reduced, target_variable)

# KNeighbors
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(df2_reduced, target_variable)

# DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(df2_reduced, target_variable)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

### 4.4
<i>Cross-validation to understand best performing algorithm</i>

In [152]:
scores_regr = cross_val_score(regr, df2_reduced, target_variable, cv=10)
scores_neigh = cross_val_score(neigh, df2_reduced, target_variable, cv=10)
scores_tree = cross_val_score(tree, df2_reduced, target_variable, cv=10)

print("Accuracy LogisticRegression: %0.2f (+/- %0.2f)" % (scores_regr.mean(), scores_regr.std() * 2))
print("Accuracy KNeighborsClassifier: %0.2f (+/- %0.2f)" % (scores_neigh.mean(), scores_neigh.std() * 2))
print("Accuracy DecisionTreeClassifier: %0.2f (+/- %0.2f)" % (scores_tree.mean(), scores_tree.std() * 2))

Accuracy LogisticRegression: 0.78 (+/- 0.08)
Accuracy KNeighborsClassifier: 0.76 (+/- 0.09)
Accuracy DecisionTreeClassifier: 0.79 (+/- 0.08)


### 4.5
<i>Make predictions with DecisionTreeClassifier because it was the most accurate method on train data based on cross-validation</i>

In [153]:
y_pred = tree.predict(df2_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

### 4.6
*Form a dataset and write it to a file*

In [154]:
d = {'Survived': y_pred}
df_to_submit = pd.DataFrame(data=d, index=df_test["PassengerId"])
df_to_submit.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [155]:
now_dt = datetime.datetime.now().strftime("%d:%m:%Y")
df_to_submit.to_csv("subm_decision_tree_{}.csv".format(now_dt))

**Make predictions based on another algorithms**

In [156]:
y_pred = neigh.predict(df2_test)
d = {'Survived': y_pred}
df_to_submit = pd.DataFrame(data=d, index=df_test["PassengerId"])
df_to_submit.head()
now_dt = datetime.datetime.now().strftime("%d:%m:%Y")
df_to_submit.to_csv("subm_neigh_{}.csv".format(now_dt))

y_pred = regr.predict(df2_test)
d = {'Survived': y_pred}
df_to_submit = pd.DataFrame(data=d, index=df_test["PassengerId"])
df_to_submit.head()
now_dt = datetime.datetime.now().strftime("%d:%m:%Y")
df_to_submit.to_csv("subm_regr_{}.csv".format(now_dt))

# Scores on Kaggle:
<b>0.76555</b> - LogisticRegression<br>
<b>0.75598</b> - DecisionTreeClassifier<br>
<b>0.70813</b> - KNeighborsClassifier(n_neighbors=5)

### Tuning the performance of methods 

In [157]:
df_for_experiments = df[['Pclass', 'Fare', 'Age', 'Sex', 'SibSp', 'Parch']].copy()
df_for_experiments['Sex'] = np.where(df_for_experiments['Sex'] == 'male', 1, 0)
df_for_experiments.head()

Unnamed: 0,Pclass,Fare,Age,Sex,SibSp,Parch
0,3,7.25,22.0,1,1,0
1,1,71.2833,38.0,0,1,0
2,3,7.925,26.0,0,0,0
3,1,53.1,35.0,0,1,0
4,3,8.05,35.0,1,0,0


In [158]:
target_variable = df['Survived']
target_variable.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [159]:
df_test = pd.read_csv("../datasets/titanic_test.csv")
# df_test.dropna(inplace=True, subset=['Pclass', 'Fare', 'Age', 'Sex', 'SibSp', 'Parch'])
df_test["Fare"].fillna(value=df_test["Fare"].mean(), inplace=True)
df_test["Age"].fillna(value=df_test["Age"].mean(), inplace=True)
df2_test = df_test[['Pclass', 'Fare', 'Age', 'Sex', 'SibSp', 'Parch']].copy()

df2_test['Sex'] = np.where(df2_test['Sex'] == 'male', 1, 0)
df2_test.head()

Unnamed: 0,Pclass,Fare,Age,Sex,SibSp,Parch
0,3,7.8292,34.5,1,0,0
1,3,7.0,47.0,0,1,0
2,2,9.6875,62.0,1,0,0
3,3,8.6625,27.0,1,0,0
4,3,12.2875,22.0,0,1,1


In [160]:
pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression())

pipe_lr.fit(df_for_experiments, target_variable)

# print(.isnull())

y_pred = pipe_lr.predict(df2_test)

# scores_neigh = cross_val_score(neigh, df_for_experiments, target_variable, cv=5)
print("Accuracy: %.3f" % pipe_lr.score(df2_test, y_pred))
len(y_pred)

Accuracy: 1.000


418

In [161]:
scores_pipe_regr = cross_val_score(pipe_lr, df_for_experiments, target_variable, cv=10)

print("Accuracy scores_pipe_regr: %0.2f (+/- %0.2f)" % (scores_pipe_regr.mean(), scores_pipe_regr.std() * 2))

Accuracy scores_pipe_regr: 0.79 (+/- 0.06)


In [162]:
def write_to_file(y_pred, name_prefix):
    d = {'Survived': y_pred}
    df_to_submit = pd.DataFrame(data=d, index=df_test["PassengerId"])
    df_to_submit.head()
    now_dt = datetime.datetime.now().strftime("%d:%m:%Y")
    df_to_submit.to_csv("{}_{}.csv".format(name_prefix, now_dt))

In [163]:
write_to_file(y_pred, "pipe_scaling_regr")

### SVC with standard scaler

In [165]:
pipe_svc = make_pipeline(StandardScaler(),
                        SVC())

param_range = [0.0001, 0.001, 0.01, 0.1, 
              1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range,
              'svc__kernel':['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel':['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                 param_grid=param_grid,
                 scoring='accuracy',
                 cv=10,
                 n_jobs=-1)

gs = gs.fit(df_for_experiments, target_variable)

In [166]:
print(gs.best_score_)
print(gs.best_params_)

0.824929971989
{'svc__C': 10.0, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}


In [167]:
clf = gs.best_estimator_
clf.fit(df_for_experiments, target_variable)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [168]:
scores_grid_svc = cross_val_score(clf, df_for_experiments, target_variable, cv=10)
print("Accuracy scores_pipe_regr: %0.2f (+/- %0.2f)" % (scores_grid_svc.mean(), scores_grid_svc.std() * 2))

Accuracy scores_pipe_regr: 0.82 (+/- 0.08)


In [169]:
y_pred = clf.predict(df2_test)
write_to_file(y_pred, "pipe_grid_svc")

### Kaggle Score
**0.77033**

### SVC with SelectKBest with standard scaler

In [177]:
pipe_svc_k_best = make_pipeline(StandardScaler(),
                         SelectKBest(),
                         SVC())

param_range = [0.0001, 0.001, 0.01, 0.1, 
              1.0, 10.0, 100.0, 1000.0]

param_k_best = [1,2,3,4]

param_grid = [{'svc__C': param_range,
              'svc__kernel':['linear'],
              'selectkbest__k':param_k_best},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel':['rbf'],
               'selectkbest__k':param_k_best}]

gs_with_k_best = GridSearchCV(estimator=pipe_svc_k_best,
                 param_grid=param_grid,
                 scoring='accuracy',
                 cv=10,
                 n_jobs=-1)

gs_with_k_best = gs_with_k_best.fit(df_for_experiments, target_variable)

In [178]:
print(gs_with_k_best.best_score_)
print(gs_with_k_best.best_params_)

0.805322128852
{'selectkbest__k': 3, 'svc__C': 100.0, 'svc__gamma': 1000.0, 'svc__kernel': 'rbf'}


In [179]:
clf = gs_with_k_best.best_estimator_
clf.fit(df_for_experiments, target_variable)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectkbest', SelectKBest(k=3, score_func=<function f_classif at 0x7f145d9e7f28>)), ('svc', SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1000.0, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [180]:
scores_grid_svc = cross_val_score(clf, df_for_experiments, target_variable, cv=10)
print("Accuracy scores_pipe_regr: %0.2f (+/- %0.2f)" % (scores_grid_svc.mean(), scores_grid_svc.std() * 2))

Accuracy scores_pipe_regr: 0.81 (+/- 0.08)


In [181]:
y_pred = clf.predict(df2_test)
write_to_file(y_pred, "pipe_grid_kbest_svc")

### Kaggle Score
**0.74162**

### SVC with SelectKBest(k=2) with standard scaler

In [182]:
pipe_svc_k_best_2 = make_pipeline(SelectKBest(k=2),
                                StandardScaler(),
                                SVC())

param_range = [0.0001, 0.001, 0.01, 0.1, 
              1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range,
              'svc__kernel':['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel':['rbf']}]

gs_with_k_best_2 = GridSearchCV(estimator=pipe_svc_k_best_2,
                 param_grid=param_grid,
                 scoring='accuracy',
                 cv=10,
                 n_jobs=-1)

gs_with_k_best_2 = gs_with_k_best_2.fit(df_for_experiments, target_variable)

In [183]:
print(gs_with_k_best_2.best_score_)
print(gs_with_k_best_2.best_params_)

0.791316526611
{'svc__C': 0.01, 'svc__gamma': 1.0, 'svc__kernel': 'rbf'}


In [184]:
clf = gs_with_k_best_2.best_estimator_
clf.fit(df_for_experiments, target_variable)

Pipeline(steps=[('selectkbest', SelectKBest(k=2, score_func=<function f_classif at 0x7f145d9e7f28>)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1.0, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [185]:
scores_grid_svc = cross_val_score(clf, df_for_experiments, target_variable, cv=10)
print("Accuracy scores_pipe_regr: %0.2f (+/- %0.2f)" % (scores_grid_svc.mean(), scores_grid_svc.std() * 2))

Accuracy scores_pipe_regr: 0.79 (+/- 0.08)


In [186]:
y_pred = clf.predict(df2_test)
write_to_file(y_pred, "pipe_grid_kbest_2_svc")

### Kaggle Score
**0.75598**