In [153]:
# 导入该项目所需的包
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

# 导入可视化代码 visuals.py
import visuals as vs

# 内嵌绘图
%matplotlib inline

# 加载数据集
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)

# 打印数据的前几项
display(full_data.head())


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [165]:
#删除有缺失值的行
full_data=full_data.replace(to_replace='?',value=np.nan)
full_data=full_data.dropna(how='any')


In [156]:
#将'Name','Ticket','Cabin','Embarked'从数据集中删除
data=full_data.drop(['Name','Ticket','Cabin','Embarked'], axis = 1)
#将性别字符串替换为数值，male改为1，female改为0
data=data.replace(to_replace='male',value=1)
data=data.replace(to_replace='female',value=0)
# 展示新数据集
display(data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
1,2,1,1,0,38.0,1,0,71.2833
3,4,1,1,0,35.0,1,0,53.1
6,7,0,1,1,54.0,0,0,51.8625
10,11,1,3,0,4.0,1,1,16.7
11,12,1,1,0,58.0,0,0,26.55


In [168]:
# 使用sklearn.model_selection里的train_test_split模块用于分割数据。
from sklearn.model_selection import train_test_split

# 创建特征列表。
column_names = ['PassengerId','Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

# 随机采样25%的数据用于测试，剩下的75%用于构建训练集合。
X_train, X_test, y_train, y_test = train_test_split(data[column_names[1:7]], data[column_names[1]], test_size=0.25, random_state=33)

# 成功分割数据
print("Training and testing split was successful.")


Training and testing split was successful.


In [169]:
# 从sklearn.preprocessing里导入数据标准化模块。
from sklearn.preprocessing import StandardScaler

# 从sklearn.svm里导入基于线性假设的支持向量机分类器LinearSVC。
from sklearn.svm import LinearSVC

# 对训练和测试的特征数据进行标准化。
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [170]:
# 从sklearn.neighbors里选择导入KNeighborsClassifier，即K近邻分类器。
from sklearn.neighbors import KNeighborsClassifier

# 使用K近邻分类器对测试数据进行类别预测，预测结果储存在变量y_predict中。
knc = KNeighborsClassifier()
knc.fit(X_train, y_train.astype('int'))
y_predict = knc.predict(X_test)

In [179]:
# 使用模型自带的评估函数进行准确性测评。
print ('The accuracy of K-Nearest Neighbor Classifier is', knc.score(X_test, y_test) )
# 输出更加详细的分类性能。
print(classification_report(y_predict, y_test, target_names = ['died', 'survived']))

The accuracy of K-Nearest Neighbor Classifier is 0.9347826086956522
              precision    recall  f1-score   support

        died       1.00      1.00      1.00        15
    survived       1.00      1.00      1.00        31

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46



In [171]:
# 从sklearn.tree中导入决策树分类器。
from sklearn.tree import DecisionTreeClassifier
# 使用默认配置初始化决策树分类器。
dtc = DecisionTreeClassifier()
# 使用分割到的训练数据进行模型学习。
dtc.fit(X_train, y_train.astype('int'))
# 用训练好的决策树模型对测试特征数据进行预测。
y_predict = dtc.predict(X_test)

In [172]:
# 从sklearn.metrics导入classification_report。
from sklearn.metrics import classification_report
# 输出预测准确性。
print( dtc.score(X_test,y_test))
# 输出更加详细的分类性能。
print(classification_report(y_predict, y_test, target_names = ['died', 'survived']))

1.0
              precision    recall  f1-score   support

        died       1.00      1.00      1.00        15
    survived       1.00      1.00      1.00        31

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46



In [173]:
# 从 sklearn.linear_model 里选择导入LogisticRegression。
from sklearn.linear_model import LogisticRegression 
#消除警告
import warnings
warnings.filterwarnings("ignore")
# 初始化LogisticRegression
lr = LogisticRegression()
# 调用LogisticRegression中的fit函数/模块用来训练模型参数。
lr.fit(X_train, y_train.astype('int'))                
# 使用训练好的模型lr对X_test进行预测，结果储存在变量lr_y_predict中。
lr_y_predict = lr.predict(X_test) 

In [174]:
# 从sklearn.metrics里导入classification_report模块。
from sklearn.metrics import classification_report
# 使用逻辑斯蒂回归模型自带的评分函数score获得模型在测试集上的准确性结果。
print( 'Accuracy of LR Classifier:', lr.score(X_test, y_test))
# 利用classification_report模块获得LogisticRegression其他三个指标的结果。
print (classification_report(y_test, lr_y_predict, target_names=['died', 'survived']))


Accuracy of LR Classifier: 1.0
              precision    recall  f1-score   support

        died       1.00      1.00      1.00        15
    survived       1.00      1.00      1.00        31

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46



In [175]:
# 从 sklearn.linear_model 里选择导入SGDClassifier。
from sklearn.linear_model import SGDClassifier
# 初始化SGDClassifier。
sgdc = SGDClassifier()
# 调用SGDClassifier中的fit函数/模块用来训练模型参数。
sgdc.fit(X_train, y_train.astype('int'))             
# 使用训练好的模型sgdc对X_test进行预测，结果储存在变量sgdc_y_predict中。
sgdc_y_predict = sgdc.predict(X_test)   

In [176]:
 # 使用随机梯度下降模型自带的评分函数score获得模型在测试集上的准确性结果。
print ('Accuarcy of SGD Classifier:', sgdc.score(X_test, y_test))
# 利用classification_report模块获得SGDClassifier其他三个指标的结果。
print (classification_report(y_test, sgdc_y_predict, target_names=['died', 'survived']))

Accuarcy of SGD Classifier: 1.0
              precision    recall  f1-score   support

        died       1.00      1.00      1.00        15
    survived       1.00      1.00      1.00        31

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46

