In [1]:
import numpy as np
import pandas as pd
import warnings
from time import time
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, log_loss
print('Load modules')

Load modules


In [2]:
warnings.filterwarnings('ignore')
t0 = time()
data = pd.read_csv('train.csv')
label = data['Survived']
data = data.drop(['Name', 'Ticket', 'Cabin', 'Survived', 'PassengerId'], axis=1)

data['Age'] = data['Age'].fillna(np.nanmean(data['Age']))        # Assign mean
data['Fare'] = data['Fare'].fillna(np.nanmean(data['Fare']))     # Assign mean
data['Embarked'] = data['Embarked'].fillna('U')
label_enc = LabelEncoder()
data['Sex'] = label_enc.fit_transform(data['Sex'])
data['Embarked'] = label_enc.fit_transform(data['Embarked'])


In [3]:
X_train, X_valid, y_train, y_valid = train_test_split(data, label, test_size=0.2)
print('X_train_shape = ', X_train.shape)
scale = MinMaxScaler()

X_train = scale.fit_transform(X_train)
X_valid = scale.transform(X_valid)


X_train_shape =  (712, 7)


In [4]:
svm = SVC(kernel='poly', degree=6, coef0=1, C=1, probability=True)
ada_svm = AdaBoostClassifier(base_estimator=svm, n_estimators=100, algorithm='SAMME', learning_rate=1)
bag_svm = BaggingClassifier(base_estimator=svm, n_estimators=100)

nbr = KNeighborsClassifier(n_neighbors=11, weights='distance', p=1)
bag_knn = BaggingClassifier(base_estimator=nbr, n_estimators=100)

neu = MLPClassifier(hidden_layer_sizes=(300, 75), activation='logistic', solver='lbfgs', alpha=0.001, early_stopping=True)
bag_neu = BaggingClassifier(base_estimator=neu, n_estimators=100)

dec = DecisionTreeClassifier(class_weight='balanced')
ada_dec = AdaBoostClassifier(base_estimator=dec, n_estimators=50, algorithm='SAMME.R', learning_rate=0.001)
bag_dec = BaggingClassifier(base_estimator=dec, n_estimators=150)
rad = RandomForestClassifier(n_estimators=50)

classifiers = {'Std SVM': svm, 'Std KNN': nbr, 'Std MLP': neu, 'Std DTL': dec, 'Std RTr': rad, 
               'Bag SVM': bag_svm, 'Bag KNN': bag_knn, 'Bag MLP': bag_neu, 'Bag DTL': bag_dec, 
               'Ada SVM': ada_svm, 'ADA DTL': ada_dec}

In [5]:
for name in classifiers:
    clf = classifiers[name]
    clf.fit(X_train, y_train)
    y_pro = clf.predict_proba(X_valid)
    y_ = (y_pro[:, 1] > 0.51)
    loss = log_loss(y_valid, y_pro)
    acc = accuracy_score(y_valid, y_)
    fsc = f1_score(y_valid, y_)
    print('%s\tAcc = %0.4f\tf-score = %0.4f\tlog_loss = %0.4f' %((name, acc, fsc, loss)))

Std SVM	Acc = 0.8268	f-score = 0.7597	log_loss = 0.4416
Std KNN	Acc = 0.8324	f-score = 0.7794	log_loss = 1.7082
Std MLP	Acc = 0.8101	f-score = 0.7344	log_loss = 0.4367
Std DTL	Acc = 0.7542	f-score = 0.7067	log_loss = 8.1348
Std RTr	Acc = 0.8324	f-score = 0.7887	log_loss = 0.8019
Bag SVM	Acc = 0.8268	f-score = 0.7597	log_loss = 0.4339
Bag KNN	Acc = 0.8324	f-score = 0.7794	log_loss = 0.6551
Bag MLP	Acc = 0.8324	f-score = 0.7581	log_loss = 0.3893
Bag DTL	Acc = 0.8380	f-score = 0.7972	log_loss = 0.9666
Ada SVM	Acc = 0.7989	f-score = 0.7313	log_loss = 0.5800
ADA DTL	Acc = 0.7709	f-score = 0.7172	log_loss = 6.1226


In [6]:
param = {'n_estimators': [25, 50, 75, 100, 125, 150]}
grid = GridSearchCV(rad, param, cv=3, verbose=3)
grid.fit(X_train, y_train)
print(pd.DataFrame(grid.cv_results_ ))
print(grid.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] n_estimators=25 .................................................
[CV] ........ n_estimators=25, score=0.7857142857142857, total=   0.0s
[CV] n_estimators=25 .................................................
[CV] ......... n_estimators=25, score=0.810126582278481, total=   0.0s
[CV] n_estimators=25 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ........ n_estimators=25, score=0.8354430379746836, total=   0.0s
[CV] n_estimators=50 .................................................
[CV] ......... n_estimators=50, score=0.773109243697479, total=   0.0s
[CV] n_estimators=50 .................................................
[CV] ......... n_estimators=50, score=0.810126582278481, total=   0.0s
[CV] n_estimators=50 .................................................
[CV] ........ n_estimators=50, score=0.8354430379746836, total=   0.0s
[CV] n_estimators=75 .................................................
[CV] ........ n_estimators=75, score=0.7857142857142857, total=   0.0s
[CV] n_estimators=75 .................................................
[CV] ......... n_estimators=75, score=0.810126582278481, total=   0.0s
[CV] n_estimators=75 .................................................
[CV] ........ n_estimators=75, score=0.8312236286919831, total=   0.1s
[CV] n_estimators=100 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    5.6s finished
