In [24]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer

In [11]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [2]:
# 数据加载
train_data = pd.read_csv('./titanic_train.csv')
test_data = pd.read_csv('./titanic_test.csv')

In [16]:
print(test_data.describe())

       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000
25%     996.250000    1.000000   21.000000    0.000000    0.000000    7.895800
50%    1100.500000    3.000000   27.000000    0.000000    0.000000   14.454200
75%    1204.750000    3.000000   39.000000    1.000000    0.000000   31.500000
max    1309.000000    3.000000   76.000000    8.000000    9.000000  512.329200


In [20]:
print(train_data.describe(include='O'))

                                    Name   Sex    Ticket    Cabin Embarked
count                                891   891       891      204      889
unique                               891     2       681      147        3
top     Najib, Miss. Adele Kiamie "Jane"  male  CA. 2343  B96 B98        S
freq                                   1   577         7        4      644


In [17]:
# 使用平均年龄来填充年龄中的nan值
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(),inplace=True)
# 使用票价的均值填充票价中的nan值
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)

In [21]:
print(train_data['Embarked'].value_counts())
# 使用登录最多的港口来填充登录港口的nan值
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S',inplace=True)

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [25]:
# 特征选择
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]
print('特征值')
print(train_features)

特征值
     Pclass     Sex        Age  SibSp  Parch      Fare Embarked
0         3    male  22.000000      1      0    7.2500        S
1         1  female  38.000000      1      0   71.2833        C
2         3  female  26.000000      0      0    7.9250        S
3         1  female  35.000000      1      0   53.1000        S
4         3    male  35.000000      0      0    8.0500        S
5         3    male  29.699118      0      0    8.4583        Q
6         1    male  54.000000      0      0   51.8625        S
7         3    male   2.000000      3      1   21.0750        S
8         3  female  27.000000      0      2   11.1333        S
9         2  female  14.000000      1      0   30.0708        C
10        3  female   4.000000      1      1   16.7000        S
11        1  female  58.000000      0      0   26.5500        S
12        3    male  20.000000      0      0    8.0500        S
13        3    male  39.000000      1      5   31.2750        S
14        3  female  14.000000      

In [26]:
dvec=DictVectorizer(sparse=False)
train_features=dvec.fit_transform(train_features.to_dict(orient='record'))
print(dvec.feature_names_)

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


In [27]:
# 构造ID3决策树
clf = DecisionTreeClassifier(criterion='entropy')
# 决策树训练
clf.fit(train_features, train_labels)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [28]:
test_features=dvec.transform(test_features.to_dict(orient='record'))
# 决策树预测
pred_labels = clf.predict(test_features)

In [29]:
# 得到决策树准确率(基于训练集)
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score准确率为 %.4lf' % acc_decision_tree)

score准确率为 0.9820


In [30]:
# 使用K折交叉验证 统计决策树准确率
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))

cross_val_score准确率为 0.7834


In [31]:
# 使用TPOT自动机器学习工具对MNIST进行分类
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

In [32]:
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(train_features, train_labels)
print(tpot.score(test_features, pred_labels))
tpot.export('tpot_titanic.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.824938798568828
Generation 2 - Current best internal CV score: 0.827185989580064
Generation 3 - Current best internal CV score: 0.8339212855439081
Generation 4 - Current best internal CV score: 0.8339212855439081
Generation 5 - Current best internal CV score: 0.8339212855439081

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.4, min_samples_leaf=5, min_samples_split=2, n_estimators=100)
0.8421052631578947


In [34]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# features = tpot_data.drop('target', axis=1)
# training_features, testing_features, training_target, testing_target = \
#             train_test_split(features, tpot_data['target'], random_state=None)

training_features, testing_features, training_target, testing_target= train_features,test_features,train_labels,pred_labels

# Average CV score on the training set was: 0.8339212855439081
exported_pipeline = RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.4, min_samples_leaf=5, min_samples_split=2, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [38]:
# 得到决策树准确率(基于训练集)
acc_decision_tree = round(exported_pipeline.score(train_features, train_labels), 6)
print(u'score准确率为 %.4lf' % acc_decision_tree)

score准确率为 0.8923


In [36]:
# 使用K折交叉验证 统计决策树准确率
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(exported_pipeline, train_features, train_labels, cv=10)))

cross_val_score准确率为 0.8306
