In [74]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from tpot import TPOTClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import  AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [64]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [65]:
# Summarize the training frame: per-column non-null counts and dtypes,
# which shows which columns contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [66]:
# Impute missing values.
#
# Every fill statistic is derived from the training set and then applied to
# both splits, so the test set is preprocessed with the same values the models
# are fit on (previously Age/Fare used test-set means while Embarked used the
# training mode).
age_fill = train['Age'].mean()
fare_fill = train['Fare'].mean()
# value_counts() sorts by descending frequency and skips NaN, so index[0] is
# the most common port. This avoids reset_index()['index'], whose column name
# changed in pandas 2.0.
embarked_fill = train['Embarked'].value_counts().index[0]

# Frame-level fillna with a per-column mapping replaces the chained
# `df[col].fillna(..., inplace=True)` calls, which are deprecated and do not
# update the parent frame under pandas Copy-on-Write.
train = train.fillna({'Age': age_fill, 'Embarked': embarked_fill})
test = test.fillna({'Age': age_fill, 'Fare': fare_fill, 'Embarked': embarked_fill})

In [67]:
# Columns fed to the models; 'Survived' is the prediction target.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_label = train['Survived']
train_features, test_features = train[features], test[features]

In [68]:
# Turn each row into a {column: value} dict and vectorize it into a dense
# numeric matrix; DictVectorizer one-hot encodes the string-valued columns.
# The vocabulary is learned on the training rows only and reused for test.
#
# The rows are read from train[features] / test[features] rather than from
# train_features / test_features so that re-running this cell still works
# after those names have been rebound to arrays.
# orient='records' is spelled out in full: the abbreviation 'record' is
# rejected by pandas >= 2.0.
dvec = DictVectorizer(sparse=False)
train_features = dvec.fit_transform(train[features].to_dict(orient='records'))
test_features = dvec.transform(test[features].to_dict(orient='records'))

In [69]:
# TPOT: evolutionary search over scikit-learn pipelines.
# random_state pins the otherwise stochastic search so that a fresh
# "Restart & Run All" reproduces the same best pipeline and CV score.
tpot = TPOTClassifier(generations=50, population_size=30, verbosity=2, random_state=42)
tpot.fit(train_features, train_label)
# Write the best pipeline found as a standalone Python script.
tpot.export('tpot_titanic_pipeline1.py')

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=1530, style=ProgressStyle(descrip…

Generation 1 - Current best internal CV score: 0.8439897056054233
Generation 2 - Current best internal CV score: 0.8439897056054233
Generation 3 - Current best internal CV score: 0.8439897056054233
Generation 4 - Current best internal CV score: 0.8439897056054233
Generation 5 - Current best internal CV score: 0.8439897056054233
Generation 6 - Current best internal CV score: 0.8439897056054233
Generation 7 - Current best internal CV score: 0.8439897056054233
Generation 8 - Current best internal CV score: 0.8439897056054233
Generation 9 - Current best internal CV score: 0.8439897056054233
Generation 10 - Current best internal CV score: 0.8439897056054233
Generation 11 - Current best internal CV score: 0.8439897056054233
Generation 12 - Current best internal CV score: 0.8439897056054233
Generation 13 - Current best internal CV score: 0.8439897056054233
Generation 14 - Current best internal CV score: 0.8439897056054233
Generation 15 - Current best internal CV score: 0.8439897056054233
Gene

In [75]:
# Naive Bayes (Bernoulli variant)
clf = BernoulliNB().fit(train_features, train_label)
results = clf.predict(test_features)
# Accuracy measured on the training set itself
acc_nb = round(clf.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_nb)
# Mean accuracy over 10-fold cross-validation
cv_scores_nb = cross_val_score(clf, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % cv_scores_nb.mean())

score准确率为 0.7868
cross_val_score准确率为 0.7867


In [77]:
# Logistic regression with the liblinear solver
lr = LogisticRegression(solver='liblinear', multi_class='auto').fit(train_features, train_label)
lr_predict = lr.predict(test_features)
# Accuracy measured on the training set itself
acc_lr = round(lr.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_lr)
# Mean accuracy over 10-fold cross-validation
cv_scores_lr = cross_val_score(lr, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % cv_scores_lr.mean())

score准确率为 0.8036
cross_val_score准确率为 0.7946


In [78]:
# CART decision tree.
# random_state is fixed because the tree permutes candidate features when
# choosing splits; without it the scores below can differ between runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_features, train_label)
dt_predict = dt.predict(test_features)
# Accuracy measured on the training set itself (an unpruned tree nearly
# memorizes the data, so compare against the cross-validated score below).
acc_dt = round(dt.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_dt)
# Mean accuracy over 10-fold cross-validation
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(dt, train_features, train_label, cv=10)))

score准确率为 0.9820
cross_val_score准确率为 0.7868


In [82]:
# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis().fit(train_features, train_label)
lda_predict = lda.predict(test_features)
# Accuracy measured on the training set itself
acc_lda = round(lda.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_lda)
# Mean accuracy over 10-fold cross-validation
cv_scores_lda = cross_val_score(lda, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % cv_scores_lda.mean())

score准确率为 0.7991
cross_val_score准确率为 0.7924


In [83]:
# Support vector classifier with an RBF kernel
svm0 = svm.SVC(
    kernel='rbf',
    C=1.0,
    gamma='auto',
).fit(train_features, train_label)
svm_predict = svm0.predict(test_features)
# Accuracy measured on the training set itself
acc_svm = round(svm0.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_svm)
# Mean accuracy over 10-fold cross-validation
cv_scores_svm = cross_val_score(svm0, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % cv_scores_svm.mean())

score准确率为 0.8900
cross_val_score准确率为 0.7285


In [84]:
# k-nearest neighbours with the library's default settings
knn = KNeighborsClassifier().fit(train_features, train_label)
knn_predict = knn.predict(test_features)
# Accuracy measured on the training set itself
acc_knn = round(knn.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_knn)
# Mean accuracy over 10-fold cross-validation
cv_scores_knn = cross_val_score(knn, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % cv_scores_knn.mean())

score准确率为 0.8193
cross_val_score准确率为 0.7094


In [85]:
# AdaBoost over depth-limited decision trees.
# Both the base tree and the ensemble are seeded so the scores are reproducible.
dt_stump = DecisionTreeClassifier(max_depth=5, min_samples_leaf=1, random_state=42)
# The ensemble fits its own copies of the base estimator, so this standalone
# fit is not required by AdaBoost; it is kept so `dt_stump` stays available
# as a fitted model.
dt_stump.fit(train_features, train_label)

n_estimators = 500
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` (old name removed in scikit-learn 1.4),
# while the first positional slot is the base estimator in both old and
# new releases.
ada = AdaBoostClassifier(dt_stump, n_estimators=n_estimators, random_state=42)
ada.fit(train_features, train_label)
ada_predict = ada.predict(test_features)
# Accuracy measured on the training set itself
acc_ada = round(ada.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_ada)
# Mean accuracy over 10-fold cross-validation
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(ada, train_features, train_label, cv=10)))

score准确率为 0.9820
cross_val_score准确率为 0.7980


In [86]:
# Gradient-boosted trees (XGBoost) with the library's default settings
xgb = XGBClassifier()
xgb.fit(train_features, train_label)
xgb_predict = xgb.predict(test_features)
# Accuracy measured on the training set itself
train_accuracy_xgb = xgb.score(train_features, train_label)
acc_xgb = round(train_accuracy_xgb, 6)
print(u'score准确率为 %.4lf' % acc_xgb)
# Mean accuracy over 10-fold cross-validation
cv_scores_xgb = cross_val_score(xgb, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % cv_scores_xgb.mean())

score准确率为 0.8721
cross_val_score准确率为 0.8227
