In [2]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import KFold
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [3]:
df = pd.read_csv('cleaneddata.csv')

X = df.iloc[:,1:]
Y = df.iloc[:,0]

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3)

In [4]:
df.shape

(891, 51)

In [5]:
clf_rf = RandomForestClassifier(n_estimators=10)
clf_rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [6]:
clf_rf.score(X_train,Y_train)

0.9085072231139647

In [7]:
clf_rf.score(X_test,Y_test)

0.8208955223880597

In [8]:
# Perform KFold on RandomForest Classifier

df = pd.read_csv('cleaneddata.csv')

X = df.iloc[:,1:]
Y = df.iloc[:,0]

score_train = [0]
score_cv = [0]

parameters = {"n_estimators":[10,100],
             "criterion":("gini","entropy"),
             #"max_features":('auto'),
             "n_jobs":[1,10]}

kf = KFold(n_splits=10, shuffle=True)
for train_index,test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    rf = RandomForestClassifier()
    clf_rf = GridSearchCV(rf,parameters)
    clf_rf.fit(X_train,Y_train)
    if clf_rf.score(X_test,Y_test) > max(score_cv):
        most_accurate = clf_rf.predict
    score_train.append(clf_rf.score(X_train,Y_train))
    score_cv.append(clf_rf.score(X_test,Y_test))

score_train,score_cv = np.array(score_train),np.array(score_cv)
print('train score: ',score_train[score_cv.argmax()])
print('cross-validation score: ',score_cv.max())


print("train score: ",score_train.max())
print("cross-validation score: ",score_cv[score_train.argmax()])


train score:  0.9039900249376559
cross-validation score:  0.8764044943820225
train score:  0.9226932668329177
cross-validation score:  0.7528089887640449


In [9]:
df_predict = pd.read_csv('cleaneddata_test.csv')

predict = most_accurate(df_predict)

PassengerId = np.arange(892,1310)

result = pd.DataFrame(PassengerId,columns=['PassengerId'])

result["Survived"] = pd.Series(predict)
result.to_csv('submit_rf.csv',index=False)

In [10]:
# Perform KFold on Gradient Boosting Classifier

df = pd.read_csv('cleaneddata.csv')

X = df.iloc[:,1:]
Y = df.iloc[:,0]

score_train = [0]
score_cv = [0]
most_accurate = 0

parameters = {"loss":("deviance","exponential"),
              "learning_rate":[0.01,1],
              "n_estimators":[90,150]}

kf = KFold(n_splits=10)
for train_index,test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    gb = GradientBoostingClassifier()
    clf_gb = GridSearchCV(gb,parameters)
    clf_gb.fit(X_train,Y_train)
    if clf_gb.score(X_test,Y_test) > max(score_cv):
        most_accurate = clf_gb.predict
    score_train.append(clf_gb.score(X_train,Y_train))
    score_cv.append(clf_gb.score(X_test,Y_test))
    
    
score_train,score_cv = np.array(score_train),np.array(score_cv)
print('train score: ',score_train[score_cv.argmax()])
print('cross-validation score: ',score_cv.max())


print("train score: ",score_train.max())
print("cross-validation score: ",score_cv[score_train.argmax()])


train score:  0.830423940149626
cross-validation score:  0.8764044943820225
train score:  0.9201995012468828
cross-validation score:  0.7752808988764045


In [11]:
df_predict = pd.read_csv('cleaneddata_test.csv')

predict = most_accurate(df_predict)

PassengerId = np.arange(892,1310)

result = pd.DataFrame(PassengerId,columns=['PassengerId'])

result["Survived"] = pd.Series(predict)
result.to_csv('submit_gb.csv',index=False)

# Perform KFold on Neural Network

df = pd.read_csv('cleaneddata.csv')

X = df.iloc[:,1:]
Y = df.iloc[:,0]

score_train = []
score_cv = []

kf = KFold(n_splits=9, shuffle=True)
for train_index,test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    clf_mlp = MLPClassifier(solver='lbfgs')
    clf_mlp.fit(X_train,Y_train)
    score_train.append(clf_mlp.score(X_train,Y_train))
    score_cv.append(clf_mlp.score(X_test,Y_test))
    
score_train,score_cv = np.array(score_train),np.array(score_cv)
print('train score: ',score_train.mean(),'cross-validation score: ',score_cv.mean())

In [None]:
# Perform KFold and GridSearch on SVC

df = pd.read_csv('cleaneddata.csv')

X = df.iloc[:,1:]
Y = df.iloc[:,0]

score_train = [0]
score_cv = [0]
parameters = {"kernel":("linear","poly","rbf","sigmoid"),
              "C":[1,10],
             "degree":[1,10]}

kf = KFold(n_splits=10, shuffle=True)
for train_index,test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    Y_train, Y_test = Y[train_index], Y[test_index]
    svc = SVC()
    clf_svc = GridSearchCV(svc,parameters)
    clf_svc.fit(X_train,Y_train)
    if clf_svc.score(X_test,Y_test) > max(score_cv):
        most_accurate = clf_svc.predict
    score_train.append(clf_svc.score(X_train,Y_train))
    score_cv.append(clf_svc.score(X_test,Y_test))
    
score_train,score_cv = np.array(score_train),np.array(score_cv)
print('cross-validation score: ',score_cv.max())


In [None]:

df_predict = pd.read_csv('cleaneddata_test.csv')

predict = most_accurate(df_predict)

PassengerId = np.arange(892,1310)

result = pd.DataFrame(PassengerId,columns=['PassengerId'])

result["Survived"] = pd.Series(predict)
result.to_csv('submit_svc.csv',index=False)