In [1]:
import pandas as pd
import numpy as np

In [2]:
#read_excel() 读取tinanic3.xls数据集，并存储到dataframe
all_df = pd.read_excel('titanic3.xls')

In [3]:
print(all_df[:2])

   pclass  survived                            name     sex      age  sibsp  \
0       1         1   Allen, Miss. Elisabeth Walton  female  29.0000      0   
1       1         1  Allison, Master. Hudson Trevor    male   0.9167      1   

   parch  ticket      fare    cabin embarked boat  body  \
0      0   24160  211.3375       B5        S    2   NaN   
1      2  113781  151.5500  C22 C26        S   11   NaN   

                         home.dest  
0                     St Louis, MO  
1  Montreal, PQ / Chesterville, ON  


In [4]:
#提取表格中有用的列
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]

In [5]:
print(all_df[:2])

   survived                            name  pclass     sex      age  sibsp  \
0         1   Allen, Miss. Elisabeth Walton       1  female  29.0000      0   
1         1  Allison, Master. Hudson Trevor       1    male   0.9167      1   

   parch      fare embarked  
0      0  211.3375        S  
1      2  151.5500        S  


In [6]:
#去除与生还率无关的name列，预测时用不到
df = all_df.drop(['name'], axis = 1)

In [7]:
#对数据集进行处理，将字段为空的数据重新处理，用平均值替代
#查看当前数据集的空字段，找出并统计个数
df.isnull().sum()

survived      0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [8]:
#将空字段内容用平均值填充
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)

fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)

In [9]:
#female：0，male:1
#pandas中的map()方法可以利用字典的方法将key替换为value
df['sex'] = df['sex'].map({'famale':0, 'male':1})

In [10]:
#pandas中也提到了一个一位有效编码的转换工具get_dumies()
OneHot_df = pd.get_dummies(data = df, columns = ['embarked'])

In [11]:
print(OneHot_df[:2])

   survived  pclass  sex      age  sibsp  parch      fare  embarked_C  \
0         1       1  NaN  29.0000      0      0  211.3375           0   
1         1       1  1.0   0.9167      1      2  151.5500           0   

   embarked_Q  embarked_S  
0           0           1  
1           0           1  


In [12]:
#将dataframe转换为array数组
ndarray = OneHot_df.values

In [13]:
print(ndarray.shape)
print(ndarray[:2])

(1309, 10)
[[  1.       1.          nan  29.       0.       0.     211.3375   0.
    0.       1.    ]
 [  1.       1.       1.       0.9167   1.       2.     151.55     0.
    0.       1.    ]]


In [14]:
#当前ndarrayl同时包含数据特征和标签，所以需要将特征和标签进行分割
Label = ndarray[:, 0]
Features = ndarray[:, 1:]

In [15]:
print(Label[:2])
print(Features[:2])

[1. 1.]
[[  1.          nan  29.       0.       0.     211.3375   0.       0.
    1.    ]
 [  1.       1.       0.9167   1.       2.     151.55     0.       0.
    1.    ]]


In [16]:
#将数据集中的内容进行归一化处理，因为成员之间的量纲单位不同相差很大
#将所有数据去除量纲，落在0到1之间
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler((0,1))
scaledFeatures = minmax_scale.fit_transform(Features)

In [17]:
print(scaledFeatures[0:2])

[[0.                nan 0.36116884 0.         0.         0.41250333
  0.         0.         1.        ]
 [0.         0.         0.00939458 0.125      0.22222222 0.2958059
  0.         0.         1.        ]]


In [18]:
#将数据集分为训练数据80%和测试数据20%
msk = np.random.random(len(all_df)) < 0.8
print(msk)
train_df = all_df[msk]
test_df = all_df[msk]
print('total = ', len(all_df),'train = ', len(train_df),'test = ', len(test_df))

[ True  True False ...  True  True  True]
total =  1309 train =  1052 test =  1052
