In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from tpot import TPOTClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import  AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [3]:
# Read the training and test CSVs from the working directory (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [4]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Impute missing values by plain assignment instead of chained
# `df[col].fillna(..., inplace=True)`: the chained form is deprecated in
# pandas 2.x and does not update the parent frame under Copy-on-Write.
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())

test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

# Most frequent embarkation value in the training data, reused for both frames.
# `value_counts()` sorts by descending frequency, so its first index entry is
# the value the original looked up; `reset_index()['index']` raises KeyError on
# pandas >= 2.0 because the column is now named after the Series.
most_common_port = train['Embarked'].value_counts().index[0]
train['Embarked'] = train['Embarked'].fillna(most_common_port)
test['Embarked'] = test['Embarked'].fillna(most_common_port)

# Index both frames by passenger so predictions can be aligned on PassengerId.
train = train.set_index('PassengerId')
test = test.set_index('PassengerId')

In [7]:
# Columns fed to the models, plus the target column taken from the training frame.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_label = train['Survived']
train_features, test_features = train[features], test[features]

In [8]:
# Turn each row into a {column: value} dict and vectorise it into a dense matrix.
# The vectoriser is fitted on the training rows only and then applied to the test rows.
dvec = DictVectorizer(sparse=False)
# `orient='records'` must be spelled out in full: the abbreviated 'record' is
# rejected by pandas >= 2.0. Reading from `train[features]` / `test[features]`
# rather than from `train_features` / `test_features` keeps this cell re-runnable,
# since those names are rebound to arrays below.
train_features = dvec.fit_transform(train[features].to_dict(orient='records'))
test_features = dvec.transform(test[features].to_dict(orient='records'))

In [9]:
# TPOT: automated search over candidate pipelines.
# The search is stochastic, so pin the seed to make the selected pipeline and
# its predictions reproducible across kernel restarts.
tpot = TPOTClassifier(generations=5, population_size=30, verbosity=2, random_state=42)
tpot.fit(train_features, train_label)
# Optional: uncomment to export the best pipeline as a standalone script.
#tpot.export('tpot_titanic_pipeline1.py')
tpot_predict = tpot.predict(test_features)
# Wrap the predictions in a frame indexed like `test` so they line up with the other models.
tpot_predict = pd.DataFrame(tpot_predict, index=test.index, columns=['tpot_predict'])

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=180.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.8204569706860838
Generation 2 - Current best internal CV score: 0.8260561170045821
Generation 3 - Current best internal CV score: 0.827185989580064
Generation 4 - Current best internal CV score: 0.831680371602536
Generation 5 - Current best internal CV score: 0.8361684765551439

Best pipeline: DecisionTreeClassifier(RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.7000000000000001, min_samples_leaf=3, min_samples_split=7, n_estimators=100), criterion=entropy, max_depth=3, min_samples_leaf=14, min_samples_split=16)


In [137]:
# Naive Bayes (Bernoulli variant)
clf = BernoulliNB()
clf.fit(train_features, train_label)
# Test-set predictions, indexed like `test`.
results = pd.DataFrame(clf.predict(test_features), index=test.index, columns=['NB_predict'])
# Accuracy measured on the training set itself
acc_nb = round(clf.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_nb)
# Mean accuracy over 10-fold cross-validation
nb_cv_scores = cross_val_score(clf, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % np.mean(nb_cv_scores))

score准确率为 0.7868
cross_val_score准确率为 0.7867


In [138]:
# Logistic Regression
# `multi_class` is deprecated since scikit-learn 1.5 and passing it emits a
# FutureWarning; for this binary target with the liblinear solver, omitting it
# fits the same model.
lr = LogisticRegression(solver='liblinear')
lr.fit(train_features, train_label)
lr_predict = lr.predict(test_features)
# Wrap the predictions in a frame indexed like `test`.
lr_predict = pd.DataFrame(lr_predict, index=test.index, columns=['lr_predict'])
# Accuracy measured on the training set itself
acc_lr = round(lr.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_lr)
# Mean accuracy over 10-fold cross-validation
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(lr, train_features, train_label, cv=10)))

score准确率为 0.8036
cross_val_score准确率为 0.7946


In [139]:
# CART decision tree with default settings
dt = DecisionTreeClassifier()
dt.fit(train_features, train_label)
# Test-set predictions, indexed like `test`.
dt_predict = pd.DataFrame(dt.predict(test_features), index=test.index, columns=['dt_predict'])
# Accuracy measured on the training set itself
acc_dt = round(dt.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_dt)
# Mean accuracy over 10-fold cross-validation
dt_cv_scores = cross_val_score(dt, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % np.mean(dt_cv_scores))

score准确率为 0.9820
cross_val_score准确率为 0.7756


In [140]:
# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis()
lda.fit(train_features, train_label)
# Test-set predictions, indexed like `test`.
lda_predict = pd.DataFrame(lda.predict(test_features), index=test.index, columns=['lda_predict'])
# Accuracy measured on the training set itself
acc_lda = round(lda.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_lda)
# Mean accuracy over 10-fold cross-validation
lda_cv_scores = cross_val_score(lda, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % np.mean(lda_cv_scores))

score准确率为 0.7991
cross_val_score准确率为 0.7924


In [141]:
# Support vector classifier with an RBF kernel
svm0 = svm.SVC(kernel='rbf', C=1.0, gamma='auto')
svm0.fit(train_features, train_label)
# Test-set predictions, indexed like `test`.
svm_predict = pd.DataFrame(svm0.predict(test_features), index=test.index, columns=['svm_predict'])
# Accuracy measured on the training set itself
acc_svm = round(svm0.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_svm)
# Mean accuracy over 10-fold cross-validation
svm_cv_scores = cross_val_score(svm0, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % np.mean(svm_cv_scores))

score准确率为 0.8900
cross_val_score准确率为 0.7285


In [142]:
# k-nearest neighbours with default settings
knn = KNeighborsClassifier()
knn.fit(train_features, train_label)
# Test-set predictions, indexed like `test`.
knn_predict = pd.DataFrame(knn.predict(test_features), index=test.index, columns=['knn_predict'])
# Accuracy measured on the training set itself
acc_knn = round(knn.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_knn)
# Mean accuracy over 10-fold cross-validation
knn_cv_scores = cross_val_score(knn, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % np.mean(knn_cv_scores))

score准确率为 0.8193
cross_val_score准确率为 0.7094


In [143]:
# AdaBoost over depth-5 decision trees
dt_stump = DecisionTreeClassifier(max_depth=5,min_samples_leaf=1)
# Standalone fit of the base tree, kept from the original cell so `dt_stump`
# remains a fitted model in its own right.
dt_stump.fit(train_features, train_label)

n_estimators=500
# Pass the base learner positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form is the one that works on both.
ada = AdaBoostClassifier(dt_stump, n_estimators=n_estimators)
ada.fit(train_features, train_label)
ada_predict = ada.predict(test_features)
# Wrap the predictions in a frame indexed like `test`.
ada_predict = pd.DataFrame(ada_predict, index=test.index, columns=['ada_predict'])
# Accuracy measured on the training set itself
acc_ada = round(ada.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_ada)
# Mean accuracy over 10-fold cross-validation
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(ada, train_features, train_label, cv=10)))

score准确率为 0.9820
cross_val_score准确率为 0.7913


In [144]:
# XGBoost classifier with default settings
xgb = XGBClassifier()
xgb.fit(train_features, train_label)
# Test-set predictions, indexed like `test`.
xgb_predict = pd.DataFrame(xgb.predict(test_features), index=test.index, columns=['xgb_predict'])
# Accuracy measured on the training set itself
acc_xgb = round(xgb.score(train_features, train_label), 6)
print(u'score准确率为 %.4lf' % acc_xgb)
# Mean accuracy over 10-fold cross-validation
xgb_cv_scores = cross_val_score(xgb, train_features, train_label, cv=10)
print(u'cross_val_score准确率为 %.4lf' % np.mean(xgb_cv_scores))

score准确率为 0.8721
cross_val_score准确率为 0.8227


In [145]:
# Line up every model's test-set predictions side by side, one column per model.
prediction_frames = [
    tpot_predict,
    results,
    lr_predict,
    dt_predict,
    lda_predict,
    svm_predict,
    knn_predict,
    ada_predict,
    xgb_predict,
]
predict_results = pd.concat(prediction_frames, axis=1)
print(predict_results)

             tpot_predict  NB_predict  lr_predict  dt_predict  lda_predict  \
PassengerId                                                                  
892                     0           0           0           0            0   
893                     0           1           0           0            0   
894                     0           0           0           1            0   
895                     0           0           0           1            0   
896                     1           1           1           1            1   
...                   ...         ...         ...         ...          ...   
1305                    0           0           0           0            0   
1306                    1           1           1           1            1   
1307                    0           0           0           0            0   
1308                    0           0           0           0            0   
1309                    0           0           0           0   