In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the labelled training split and the unlabelled test split from the
# current working directory (train.csv is read first, then test.csv).
train_data, test_data = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [81]:
# Preview the first rows to see which columns exist and how their values look.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


每列属性的含义
- Survived：是否幸存(0代表没有，1代表有)
- Pclass:客票级别
- Name, Sex, Age:名字，性别，年龄
- SibSp:乘客兄弟姐妹/配偶的个数(整数值)
- Parch:乘客父母/孩子的个数(整数值)
- Ticket:票号(字符串)
- Fare：乘客所持票的价格(浮点数，0-500不等)
- Cabin：乘客所在船舱(有缺失)
- Embarked：乘客登船港口:S、C、Q(有缺失)

In [82]:
# List every column's dtype and non-null count to reveal where values are missing.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


好的，“Age”、“Cabin”和“Embarked”属性存在空值（非空值数量少于891），尤其是“Cabin”（约77％为空）。 我们暂时忽略Cabin，专注于其余属性。 Age属性约有20％的空值（177/891），因此我们需要决定如何处理它们。 用年龄的中位数替换空值似乎是合理的。


“Name”和“Ticket”属性可能有一定价值，但是将它们转换为模型可以使用的数值特征会有些棘手。 所以现在，我们将忽略它们。

In [83]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_data.describe()




Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


数据预处理：填充缺失值，并将含有字符的特征转换为数值型

In [84]:
# Impute the missing values of the Age column with the median age.
age_median = train_data["Age"].median()
train_data["Age"] = train_data["Age"].fillna(age_median)

In [85]:
# Re-check the non-null counts now that Age has been imputed.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


将性别中的男女设置为0/1值，把机器学习不能处理的字符值转换成能处理的数值。
loc用于定位满足条件的行，例如把train_data['Sex'] == 'male'的样本的Sex值改为0（这是下面被注释掉的写法；后文实际采用的编码是male=1、female=0）。

In [86]:

# train_data.loc[train_data["Sex"] == "male","Sex"] = 0
# train_data.loc[train_data["Sex"] == "female","Sex"] = 1



In [87]:
# print(train_data["Sex"].unique)

In [88]:
# Show the distinct Embarked values; NaN appears in the result when entries are missing.
print(train_data["Embarked"].unique())

['S' 'C' 'Q' nan]


In [89]:
# #通过统计三个登船地点人数最多的填充缺失值
# train_data["Embarked"] = train_data["Embarked"].fillna("S")
# #将登船地点同样转换成数值
# train_data.loc[train_data["Embarked"] == "S","Embarked"] = 0
# train_data.loc[train_data["Embarked"] == "C","Embarked"] = 1
# train_data.loc[train_data["Embarked"] == "Q","Embarked"] = 2


In [90]:
# Inspect the frame again: the encoding cells above are commented out, so Sex and
# Embarked are still object columns and Embarked still has missing entries here.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [91]:
# 3.1 Drop columns with too many missing values and columns that are not
# related to the prediction target.
unused_columns = ["Cabin", "Name", "Ticket", "PassengerId"]
train_data = train_data.drop(unused_columns, axis="columns")

In [92]:
# Confirm which columns remain after the drop and which are still non-numeric.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [93]:
# Encode Sex as a binary indicator: 1 for male, 0 for female.
train_data["Sex"] = (train_data["Sex"] == "male").astype(int)

# Encode the embarkation port with an explicit code table instead of the order
# in which values happen to appear in the data. This is the same table that
# clean_data() applies to the test set (S -> 0, C -> 1, Q -> 2), so train and
# test features are guaranteed to agree.
embarked_codes = {"S": 0, "C": 1, "Q": 2}

# Missing ports are imputed with the most frequent port rather than becoming a
# separate category that never occurs in the test features.
most_common_port = train_data["Embarked"].mode()[0]
train_data["Embarked"] = (
    train_data["Embarked"]
    .fillna(most_common_port)
    .map(embarked_codes)
    # A port outside the code table maps to NaN and makes this cast fail
    # loudly instead of silently producing an unusable feature.
    .astype("int64")
)


In [94]:
# Display the encoded Sex column to confirm it now holds 0/1 integers.
train_data['Sex']

0      1
1      0
2      0
3      0
4      1
5      1
6      1
7      1
8      0
9      0
10     0
11     0
12     1
13     1
14     0
15     0
16     1
17     1
18     0
19     0
20     1
21     1
22     0
23     1
24     0
25     0
26     1
27     1
28     0
29     1
      ..
861    1
862    0
863    0
864    1
865    0
866    0
867    1
868    1
869    1
870    1
871    0
872    1
873    1
874    0
875    0
876    1
877    1
878    1
879    0
880    0
881    1
882    0
883    1
884    1
885    0
886    1
887    0
888    0
889    1
890    1
Name: Sex, Length: 891, dtype: int32

In [95]:
# Verify that every remaining column is numeric and has no missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int32
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int32(1), int64(5)
memory usage: 52.3 KB


测试集和训练集划分

In [96]:
# Separate the predictors from the target. y stays a single-column DataFrame,
# exactly as before, so downstream cells see the same shapes.
X = train_data.drop("Survived", axis="columns")
y = train_data[["Survived"]]

# Hold out 30% of the rows. The split is seeded (same seed as the classifier
# below) so that re-running the notebook reproduces the same partition.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, y, test_size=0.3, random_state=20
)

# Renumber the rows of each part from 0 so their indices are contiguous again.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True) for part in (Xtrain, Xtest, Ytrain, Ytest)
)


In [97]:
# Peek at the first ten training feature rows after the split and re-indexing.
Xtrain.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,1,42.0,0,0,13.0,0
1,1,1,37.0,1,0,53.1,0
2,1,1,46.0,1,0,61.175,0
3,3,1,28.0,0,0,7.225,1
4,3,1,26.0,0,0,7.8958,0
5,3,0,29.0,0,2,15.2458,1
6,2,0,27.0,1,0,13.8583,1
7,2,0,50.0,0,0,10.5,0
8,1,0,29.0,0,0,211.3375,0
9,1,1,4.0,0,2,81.8583,0


In [98]:
# Peek at the first ten training labels; rows line up with Xtrain by position.
Ytrain.head(10)

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,0
5,1
6,1
7,1
8,1
9,1


In [99]:
# Fit a shallow decision tree on the training split.
# DecisionTreeClassifier is already imported in the notebook's first cell, so
# it is not imported again here.
clf = DecisionTreeClassifier(
    random_state=20,        # fixed seed so the randomised splitter is repeatable
    criterion='gini',       # split quality measured by Gini impurity
    max_depth=4,            # cap the tree depth
    min_samples_leaf=1,
    splitter='random',      # random split selection instead of 'best'
)
clf = clf.fit(Xtrain, Ytrain)



In [100]:
# Mean accuracy of the fitted tree on the held-out 30% split (Xtest / Ytest),
# which is otherwise created but never used.
score_1 = clf.score(Xtest, Ytest)
score_1

In [101]:
# Read the raw test split into its own frame; its unprocessed PassengerId
# column is used later to label the predictions.
test = pd.read_csv('test.csv')
# Check dtypes and non-null counts of the test columns.
test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [102]:
# Preprocessing for the test set, wrapped in a reusable function.
def clean_data(data, age_fill=None, fare_fill=None):
    """Turn a raw passenger frame into the numeric feature matrix used by the model.

    Every input row is kept, so the result can always be matched back to the
    passengers it came from. The input frame itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least the columns Cabin, Name, Ticket,
        PassengerId, Sex, Age, Fare and Embarked.
    age_fill : float, optional
        Value used for missing ages. Defaults to the median age of ``data``,
        matching the median imputation applied to the training data.
    fare_fill : float, optional
        Value used for missing fares. Defaults to the mean fare of ``data``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the Cabin, Name, Ticket and PassengerId columns,
        with Sex encoded as 1 (male) / 0 (otherwise) and Embarked encoded as
        S -> 0, C -> 1, Q -> 2. The index of ``data`` is preserved.
    """
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

    # drop() returns a new frame, so the caller's frame stays untouched.
    data = data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)

    if age_fill is None:
        age_fill = data['Age'].median()
    if fare_fill is None:
        fare_fill = data['Fare'].mean()
    data['Age'] = data['Age'].fillna(age_fill)
    data['Fare'] = data['Fare'].fillna(fare_fill)

    # Impute a missing port with the most frequent one instead of dropping the
    # row: a dropped row would leave that passenger without a prediction.
    port_mode = data['Embarked'].mode()
    if not port_mode.empty:
        data['Embarked'] = data['Embarked'].fillna(port_mode[0])

    data['Sex'] = (data['Sex'] == 'male').astype('int')
    # Ports outside the code table become NaN rather than an arbitrary code.
    data['Embarked'] = data['Embarked'].map(embarked_codes)
    return data



In [103]:
# Preprocess the raw test frame. This rebinds test_data (until now the raw CSV
# read in the first cell) to the cleaned feature matrix.
test_data = clean_data(test)

In [104]:
# Confirm the cleaned test features are all numeric with no missing values.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null int32
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Embarked    418 non-null int64
dtypes: float64(2), int32(1), int64(4)
memory usage: 24.5 KB


In [105]:
# Predict the Survived label (0 or 1) for every row of the cleaned test features.
clf.predict(test_data)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [106]:
# Give the predictions the same index as the feature rows they were computed
# from, and select PassengerId by that index. concat(axis=1) aligns on the
# index, so each prediction is paired with its own passenger even if the
# feature frame's index is not simply 0..n-1.
predictions = pd.DataFrame(clf.predict(test_data), index=test_data.index)
res = pd.concat([test.loc[test_data.index, 'PassengerId'], predictions], axis=1)



In [107]:
# Name the two columns and write the predictions as a comma-separated file
# without the index column.
submission_columns = ['PassengerId', 'Survived']
output_path = "result.csv"
res.columns = submission_columns
res.to_csv(output_path, sep=',', index=False)

