In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

# 1 数据清理分析

In [2]:
# Raw string literal: in a normal string '\M' is an invalid escape sequence,
# which Python reports with a DeprecationWarning/SyntaxWarning. The resulting
# path text is unchanged.
# NOTE(review): absolute, machine-specific folder - point it at your own data.
csv_path=r'D:\MyBlogs\Machine Learning Project Simulation'
train_set_original=pd.read_csv(os.path.join(csv_path,'train.csv'))
# Work on a copy so the untouched training frame stays available later.
train_set=train_set_original.copy()
test_set=pd.read_csv(os.path.join(csv_path,'test.csv'))

先把Name,Cabin和Ticket属性去掉

In [3]:
# Name, Cabin and Ticket are removed in a single call.
# errors='ignore' skips columns that are already gone, so the cell can be
# re-run safely without a bare `except` that would also hide unrelated errors
# (and, previously, could leave later columns undropped after the first failure).
train_set=train_set.drop(['Cabin','Ticket','Name'],axis=1,errors='ignore')
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB


Embarked属性是分类属性，存在少量的缺失，可以把缺失Embarked的记录删除

In [4]:
# Remove the few records whose Embarked value is missing.
# No try/except here: dropna only fails when the 'Embarked' column itself is
# absent, and that would mean an earlier cell went wrong - it should surface
# instead of being silently swallowed.
train_set=train_set.dropna(subset=['Embarked'])
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Sex            889 non-null object
Age            712 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Fare           889 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB


对于Age属性，用中位数填充空值

In [5]:
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# on newer versions the replacement is `sklearn.impute.SimpleImputer`.
imputer=Imputer(strategy='median')
# The imputer cannot compute a median for non-numeric columns, so the two text
# attributes are left out. The drop is no longer wrapped in `except: pass`:
# if it failed, the next line would either hit a NameError or silently reuse a
# stale frame from a previous run.
train_set_WithoutDiscreteValues=train_set.drop(['Sex','Embarked'],axis=1)
imputer.fit(train_set_WithoutDiscreteValues)
# One learned median per remaining column, in column order.
print(imputer.statistics_)

[446.       0.       3.      28.       0.       0.      14.4542]


In [6]:
# Fill the missing values with the medians learned above. transform() hands
# back a bare array, so the column labels are captured first and re-attached;
# the rebuilt frame gets a fresh 0..n-1 row index.
numeric_column_labels = train_set_WithoutDiscreteValues.columns
X = imputer.transform(train_set_WithoutDiscreteValues)
train_set_WithoutDiscreteValues = pd.DataFrame(data=X, columns=numeric_column_labels)
train_set_WithoutDiscreteValues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 7 columns):
PassengerId    889 non-null float64
Survived       889 non-null float64
Pclass         889 non-null float64
Age            889 non-null float64
SibSp          889 non-null float64
Parch          889 non-null float64
Fare           889 non-null float64
dtypes: float64(7)
memory usage: 48.7 KB


将SibSp和Parch属性转成二值属性

In [7]:
# Turn SibSp into a binary flag: 1 when the count is greater than 0, else 0.
# A vectorised comparison replaces the row-by-row iterrows() loop; tolist()
# keeps `sibsp` a plain list of Python ints, as before.
sibsp=(train_set_WithoutDiscreteValues['SibSp']>0).astype(int).tolist()
# Swap the original column for the flag, placed at column position 0.
train_set_WithoutDiscreteValues=train_set_WithoutDiscreteValues.drop('SibSp',axis=1)
train_set_WithoutDiscreteValues.insert(0,'SibSp',sibsp)
train_set_WithoutDiscreteValues['SibSp'].value_counts()

0    606
1    283
Name: SibSp, dtype: int64

In [23]:
# Turn Parch into a binary flag: 1 when the count is greater than 0, else 0.
# A vectorised comparison replaces the row-by-row iterrows() loop; tolist()
# keeps `parch` a plain list of Python ints, as before.
parch=(train_set_WithoutDiscreteValues['Parch']>0).astype(int).tolist()
# Swap the original column for the flag, placed at column position 0.
train_set_WithoutDiscreteValues=train_set_WithoutDiscreteValues.drop('Parch',axis=1)
train_set_WithoutDiscreteValues.insert(0,'Parch',parch)
train_set_WithoutDiscreteValues['Parch'].value_counts()

0    676
1    213
Name: Parch, dtype: int64

处理文本属性和分类属性，转成独热编码('Sex','Embarked'是文本属性，'Pclass','SibSp','Parch'是分类属性)

In [77]:
# Step 1: text categories -> integer codes.
# Each column gets its own LabelEncoder so that its fitted `classes_` mapping
# survives; refitting one shared instance overwrites the mapping of the
# previous column and makes it impossible to encode new data consistently.
sex_label_encoder=LabelEncoder()
embarked_label_encoder=LabelEncoder()
train_set_Sex=sex_label_encoder.fit_transform(train_set['Sex'])
train_set_Embarked=embarked_label_encoder.fit_transform(train_set['Embarked'])

# Numeric categorical attributes are taken from the imputed frame as arrays.
train_set_Pclass=train_set_WithoutDiscreteValues['Pclass'].values
train_set_SibSp=train_set_WithoutDiscreteValues['SibSp'].values
train_set_Parch=train_set_WithoutDiscreteValues['Parch'].values

# Step 2: integer codes -> one-hot columns, again with one fitted encoder per
# attribute. reshape(-1,1) turns each 1-D array into a single-column 2-D input.
onehot_encoders={name:OneHotEncoder() for name in ('Sex','Embarked','Pclass','SibSp','Parch')}
train_set_Sex=onehot_encoders['Sex'].fit_transform(train_set_Sex.reshape(-1,1))
train_set_Embarked=onehot_encoders['Embarked'].fit_transform(train_set_Embarked.reshape(-1,1))
train_set_Pclass=onehot_encoders['Pclass'].fit_transform(train_set_Pclass.reshape(-1,1))
train_set_SibSp=onehot_encoders['SibSp'].fit_transform(train_set_SibSp.reshape(-1,1))
train_set_Parch=onehot_encoders['Parch'].fit_transform(train_set_Parch.reshape(-1,1))

# Backward-compatible aliases: the old names still refer to the encoders that
# were fitted last (Embarked labels, Parch one-hot), exactly as before.
encoder=embarked_label_encoder
encoder2=onehot_encoders['Parch']

# 2 编写Pipe

合并上面的代码，分别为数值属性和分类属性自定义转换器并组成pipe，将原始数据转换成合适的格式

## 1 首先定义一个可去除指定属性为空的记录的转换器RemoveNullPropertyRecords

In [7]:
class RemoveNullPropertyRecords(BaseEstimator, TransformerMixin):
    """Transformer that discards records with missing values in chosen columns.

    Parameters
    ----------
    attribute_names : list
        Column labels passed to ``dropna(subset=...)``; a record is removed
        when any of these columns is null.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the records that have nulls in the chosen columns.

        ``X`` must provide ``dropna`` (a pandas DataFrame); the result is
        still a DataFrame.
        """
        records_without_nulls = X.dropna(subset=self.attribute_names)
        return records_without_nulls
#removeNullPropertyRecords=RemoveNullPropertyRecords(['Embarked'])
#x=removeNullPropertyRecords.fit_transform(train_set_original.copy())
#x.info()

## 2 然后定义一个数据子集提取器DataFrameSelector，输入属性名，返回只包含这些属性的数据(NumPy数组)

In [8]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the requested columns of a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Column labels to extract.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns through ``.values``.

        Note that ``.values`` yields a NumPy array, not a DataFrame, so
        DataFrame-only methods such as ``info()`` are unavailable on the result.
        """
        selected_columns = X[self.attribute_names]
        return selected_columns.values
#dataFrameSelector=DataFrameSelector(['Age','Fare'])
#x=dataFrameSelector.fit_transform(x)
#x.info()


## 3 为数值属性定义一个数值处理pipe，其流程是：去除指定属性为空的记录->提取数值属性集->中位数补齐->标准化

In [9]:
rv_null_attribs = ['Embarked']
num_attribs = ['Age', 'Fare']

# Only the first two stages are active in this preview, so the printed array
# still contains the raw (possibly NaN) values; the imputing and scaling
# stages are left disabled here.
num_steps = [
    ('remove', RemoveNullPropertyRecords(rv_null_attribs)),
    ('selector', DataFrameSelector(num_attribs)),
    #('imputer',Imputer(strategy='median')),
    #('std_scaler',StandardScaler())
]
num_pipeline = Pipeline(num_steps)

# Run on a copy so the raw training frame is never modified.
x = num_pipeline.fit_transform(train_set_original.copy())
print(x)
print(x.shape)

[[22.      7.25  ]
 [38.     71.2833]
 [26.      7.925 ]
 ...
 [    nan 23.45  ]
 [26.     30.    ]
 [32.      7.75  ]]
(889, 2)


## 4 为字符属性定义pipe, 其流程是: 去除指定属性为空的记录->提取字符属性集->字符转数字并独热编码

In [132]:
# Text attributes; each one gets its own pipeline and is encoded separately.
str_attribs_sex, str_attribs_embarked = ['Sex'], ['Embarked']

class MyLabelBinarizer(TransformerMixin):
    """Adapter that lets a LabelBinarizer be used as a pipeline step.

    According to the original note, placing LabelBinarizer directly in a
    pipeline raises an error; this wrapper exposes ``fit``/``transform``
    methods that accept the extra ``y`` argument and forward only ``x``.
    NOTE(review): presumably because LabelBinarizer's own methods take a
    single data argument while Pipeline passes ``(X, y)`` - confirm.

    All constructor arguments are forwarded unchanged to LabelBinarizer.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped binarizer on ``x``; ``y`` is accepted but ignored."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` encoded by the wrapped binarizer; ``y`` is ignored."""
        binarized = self.encoder.transform(x)
        return binarized


def _make_str_pipeline(attribs):
    """Build one text-attribute pipeline: drop null records -> select -> binarize."""
    return Pipeline([
        ('remove', RemoveNullPropertyRecords(rv_null_attribs)),
        ('selector', DataFrameSelector(attribs)),
        ('label_binarizer', MyLabelBinarizer()),
    ])


# Both pipelines share the same stages and differ only in the selected column.
# To inspect one of them:
#   x = str_pipeline_sex.fit_transform(train_set_original.copy()); print(x.shape)
str_pipeline_sex = _make_str_pipeline(str_attribs_sex)
str_pipeline_embarked = _make_str_pipeline(str_attribs_embarked)

## 5 为数字分类属性定义pipe，其流程是：去除指定属性为空的记录->提取分类属性集->独热编码

In [131]:
class MyOneHotEncoder(TransformerMixin):
    """Adapter that wraps a OneHotEncoder for use as a pipeline step.

    According to the original note, using OneHotEncoder directly in the
    pipeline raised an error, hence this wrapper whose ``fit``/``transform``
    accept an extra ``y`` argument and forward only ``x``.
    NOTE(review): OneHotEncoder may already work in a Pipeline unchanged -
    confirm whether the wrapper is still needed.

    All constructor arguments are forwarded unchanged to OneHotEncoder.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = OneHotEncoder(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x``; ``y`` is accepted but ignored."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` encoded by the wrapped encoder; ``y`` is ignored."""
        one_hot = self.encoder.transform(x)
        return one_hot

def _make_cat_pipeline(attribs):
    """Build one categorical pipeline: drop null records -> select -> one-hot encode."""
    return Pipeline([
        ('remove', RemoveNullPropertyRecords(rv_null_attribs)),
        ('selector', DataFrameSelector(attribs)),
        ('ont_not_encoder', MyOneHotEncoder()),
    ])


# Numeric categorical attributes; one pipeline per attribute, identical except
# for the selected column. To inspect one of them:
#   x = cat_pipeline_pclass.fit_transform(train_set_original.copy()); print(x.shape)
cat_attribs_pclass = ['Pclass']
cat_attribs_sibsp = ['SibSp']
cat_attribs_parch = ['Parch']

cat_pipeline_pclass = _make_cat_pipeline(cat_attribs_pclass)
cat_pipeline_sibsp = _make_cat_pipeline(cat_attribs_sibsp)
cat_pipeline_parch = _make_cat_pipeline(cat_attribs_parch)

## 6 定义一个组合Pipe

In [140]:
# Every branch receives the same input; FeatureUnion places their outputs
# side by side in the order listed here.
union_branches = [
    ('num_pipeline', num_pipeline),
    ('str_pipeline_sex', str_pipeline_sex),
    ('str_pipeline_embarked', str_pipeline_embarked),
    ('cat_pipeline_pclass', cat_pipeline_pclass),
    ('cat_attribs_sibsp', cat_pipeline_sibsp),
    ('cat_pipeline_parch', cat_pipeline_parch),
]
full_pipeline = FeatureUnion(transformer_list=union_branches)

## 7 运行pipe

In [53]:
class RemoveNullPropertyRecords(BaseEstimator, TransformerMixin):
    """Transformer that deletes the records whose given attributes are null.

    Parameters
    ----------
    attribute_names : list
        Column labels handed to ``dropna(subset=...)``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Stateless: there is nothing to fit, so just return ``self``."""
        return self

    def transform(self, X):
        """Drop the records with nulls in the configured columns.

        ``X`` must provide ``dropna`` (a pandas DataFrame); a DataFrame is
        returned.
        """
        complete_records = X.dropna(subset=self.attribute_names)
        return complete_records
    
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Data-subset transformer: extracts the named columns from a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Column labels to extract.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Stateless: there is nothing to fit, so just return ``self``."""
        return self

    def transform(self, X):
        """Return the chosen columns as the array given by ``.values``."""
        column_subset = X[self.attribute_names]
        return column_subset.values

class MyLabelBinarizer(TransformerMixin):
    """Pipeline-friendly wrapper around LabelBinarizer.

    The original note says LabelBinarizer fails when used directly in a
    pipeline; the wrapper's ``fit``/``transform`` take the additional ``y``
    argument and pass only ``x`` on to the wrapped encoder.
    NOTE(review): presumably a signature mismatch with Pipeline's ``(X, y)``
    calls - confirm.

    Constructor arguments go straight to LabelBinarizer.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped binarizer on ``x`` (``y`` is unused)."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Encode ``x`` with the wrapped binarizer (``y`` is unused)."""
        encoded = self.encoder.transform(x)
        return encoded
    
class MyOneHotEncoder(TransformerMixin):
    """Pipeline-friendly wrapper around OneHotEncoder.

    The original note says OneHotEncoder failed when used directly in the
    pipeline; the wrapper's ``fit``/``transform`` take the additional ``y``
    argument and pass only ``x`` on to the wrapped encoder.
    NOTE(review): OneHotEncoder may already be usable in a Pipeline as-is -
    confirm whether this wrapper is required.

    Constructor arguments go straight to OneHotEncoder.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = OneHotEncoder(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` (``y`` is unused)."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Encode ``x`` with the wrapped encoder (``y`` is unused)."""
        encoded = self.encoder.transform(x)
        return encoded

# Define the individual pipelines.
# Column groups: records with a null in rv_null_attribs are dropped by every
# branch; the other lists name the column(s) each branch selects.
rv_null_attribs = ['Embarked']
num_attribs = ['Age', 'Fare']
str_attribs_sex = ['Sex']
str_attribs_embarked = ['Embarked']
cat_attribs_pclass = ['Pclass']
cat_attribs_sibsp = ['SibSp']
cat_attribs_parch = ['Parch']


def _attribute_pipeline(attribs, *tail_steps):
    """Build a pipeline: drop null records -> select `attribs` -> `tail_steps`."""
    shared_head = [
        ('remove', RemoveNullPropertyRecords(rv_null_attribs)),
        ('selector', DataFrameSelector(attribs)),
    ]
    return Pipeline(shared_head + list(tail_steps))


# Numeric attributes: fill gaps with the median, then standardise.
num_pipeline = _attribute_pipeline(
    num_attribs,
    ('imputer', Imputer(strategy='median')),
    ('std_scaler', StandardScaler()),
)

# Text attributes: encode with the label-binarizer wrapper.
str_pipeline_sex = _attribute_pipeline(
    str_attribs_sex, ('label_binarizer', MyLabelBinarizer()))
str_pipeline_embarked = _attribute_pipeline(
    str_attribs_embarked, ('label_binarizer', MyLabelBinarizer()))

# Numeric categorical attributes: encode with the one-hot wrapper.
cat_pipeline_pclass = _attribute_pipeline(
    cat_attribs_pclass, ('ont_not_encoder', MyOneHotEncoder()))
cat_pipeline_sibsp = _attribute_pipeline(
    cat_attribs_sibsp, ('ont_not_encoder', MyOneHotEncoder()))
cat_pipeline_parch = _attribute_pipeline(
    cat_attribs_parch, ('ont_not_encoder', MyOneHotEncoder()))

# Combine all branches; their outputs are concatenated in this order.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('str_pipeline_sex', str_pipeline_sex),
    ('str_pipeline_embarked', str_pipeline_embarked),
    ('cat_pipeline_pclass', cat_pipeline_pclass),
    ('cat_attribs_sibsp', cat_pipeline_sibsp),
    ('cat_pipeline_parch', cat_pipeline_parch),
])

# Load the data.
# Raw string literal: in a normal string '\M' is an invalid escape sequence,
# which Python reports with a DeprecationWarning/SyntaxWarning. The resulting
# path text is unchanged.
# NOTE(review): absolute, machine-specific folder - point it at your own data.
csv_path=r'D:\MyBlogs\Machine Learning Project Simulation'
train_set_original=pd.read_csv(os.path.join(csv_path,'train.csv'))
# Work on a copy so the untouched training frame stays available.
train_set=train_set_original.copy()
test_set=pd.read_csv(os.path.join(csv_path,'test.csv'))

# First replace SibSp and Parch in the raw data by binary flags
# (1 when the count is greater than 0, else 0).
# Vectorised comparisons replace the two row-by-row iterrows() loops; tolist()
# keeps `sibsp` and `parch` plain lists of Python ints, as before.
sibsp=(train_set['SibSp']>0).astype(int).tolist()
train_set=train_set.drop('SibSp',axis=1)
train_set.insert(0,'SibSp',sibsp)

parch=(train_set['Parch']>0).astype(int).tolist()
train_set=train_set.drop('Parch',axis=1)
# Inserted at position 0 as well, so Parch ends up in front of SibSp.
train_set.insert(0,'Parch',parch)

# Run the combined pipeline to obtain the processed feature matrix train_X.
train_X = full_pipeline.fit_transform(train_set)
print(train_X.shape)

# The labels go through the same kind of pipeline: the same null-record
# removal is applied before the 'Survived' column is extracted as train_Y.
label_steps = [
    ('remove', RemoveNullPropertyRecords(rv_null_attribs)),
    ('selector', DataFrameSelector(['Survived'])),
]
label_pipeline = Pipeline(label_steps)
train_Y = label_pipeline.fit_transform(train_set)
print(train_Y.shape)

# Save the processed training data so the next chapter can start from it.
# train_X is converted with toarray() before being wrapped in a DataFrame.
# NOTE(review): to_csv also writes the row index by default, so reading the
# files back with pd.read_csv('train_X.csv') / pd.read_csv('train_Y.csv')
# yields one extra leading column - confirm this is what the reader expects.
data_X = pd.DataFrame(data=train_X.toarray())
data_X.to_csv(path_or_buf='train_X.csv')

data_Y = pd.DataFrame(data=train_Y)
data_Y.to_csv(path_or_buf='train_Y.csv')

(889, 13)
(889, 1)
