已经用训练集准备好了模型，接着就是使用测试集评估最终模型。

对于测试集，同样需要进行预处理

In [15]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.externals import joblib
from sklearn.svm import SVC

In [14]:
class RemoveNullPropertyRecords(BaseEstimator,TransformerMixin):
    """Transformer that drops records whose selected attributes are missing.

    Parameters
    ----------
    attribute_names
        Column label(s) passed to ``DataFrame.dropna(subset=...)``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` without the rows that have a missing selected attribute."""
        required_columns = self.attribute_names
        # dropna returns a DataFrame, so later steps still see a DataFrame.
        remaining_rows = X.dropna(subset=required_columns)
        return remaining_rows
    
class DataFrameSelector(BaseEstimator,TransformerMixin):
    """Transformer that selects a subset of DataFrame columns.

    Parameters
    ----------
    attribute_names
        Column labels used to index the incoming DataFrame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` via the ``.values`` accessor."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

class MyLabelBinarizer(TransformerMixin):
    """Pipeline-friendly wrapper around ``LabelBinarizer``.

    Per the original author's note, using ``LabelBinarizer`` directly inside
    a pipeline raises an error. This wrapper exposes ``fit``/``transform``
    methods that accept an extra ``y`` argument and ignore it.

    All constructor arguments are forwarded to ``LabelBinarizer``.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped binarizer on ``x`` (``y`` is ignored); return ``self``."""
        wrapped = self.encoder
        wrapped.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` encoded by the wrapped binarizer (``y`` is ignored)."""
        binarized = self.encoder.transform(x)
        return binarized
    
class MyOneHotEncoder(TransformerMixin):
    """Pipeline-friendly wrapper around ``OneHotEncoder``.

    Per the original author's note, using ``OneHotEncoder`` directly inside
    a pipeline raises an error. This wrapper exposes ``fit``/``transform``
    methods that accept an extra ``y`` argument and ignore it.

    All constructor arguments are forwarded to ``OneHotEncoder``.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = OneHotEncoder(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` (``y`` is ignored); return ``self``."""
        wrapped = self.encoder
        wrapped.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` encoded by the wrapped encoder (``y`` is ignored)."""
        one_hot = self.encoder.transform(x)
        return one_hot

# Define the individual pipelines.
# Column groups, one per preprocessing branch.
num_attribs = ['Age', 'Fare']
str_attribs_sex = ['Sex']
str_attribs_embarked = ['Embarked']
cat_attribs_pclass = ['Pclass']
cat_attribs_sibsp = ['SibSp']
cat_attribs_parch = ['Parch']


def _select_then(columns, step_name, transformer):
    """Return a two-step Pipeline: select `columns`, then apply `transformer`."""
    return Pipeline([
        ('selector', DataFrameSelector(columns)),
        (step_name, transformer),
    ])


# Numeric columns: fill missing values with the median, then standardize.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# String columns: binarize the labels. Each branch gets its own encoder.
str_pipeline_sex = _select_then(
    str_attribs_sex, 'label_binarizer', MyLabelBinarizer())
str_pipeline_embarked = _select_then(
    str_attribs_embarked, 'label_binarizer', MyLabelBinarizer())

# Categorical columns: one-hot encode. The step name 'ont_not_encoder' is
# kept exactly as written because step names are part of the pipeline's
# parameter keys.
cat_pipeline_pclass = _select_then(
    cat_attribs_pclass, 'ont_not_encoder', MyOneHotEncoder())
cat_pipeline_sibsp = _select_then(
    cat_attribs_sibsp, 'ont_not_encoder', MyOneHotEncoder())
cat_pipeline_parch = _select_then(
    cat_attribs_parch, 'ont_not_encoder', MyOneHotEncoder())

# Combine the outputs of all branches into a single feature matrix.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('str_pipeline_sex', str_pipeline_sex),
    ('str_pipeline_embarked', str_pipeline_embarked),
    ('cat_pipeline_pclass', cat_pipeline_pclass),
    ('cat_attribs_sibsp', cat_pipeline_sibsp),
    ('cat_pipeline_parch', cat_pipeline_parch),
])

# Load the raw test data.
# Raw string: the Windows path contains backslashes, and "\M" is an invalid
# escape sequence in a normal string literal (a warning today, an error in
# future Python versions). The resulting path value is unchanged.
csv_path = r'D:\MyBlogs\Machine Learning Project Simulation'
test_set = pd.read_csv(os.path.join(csv_path, 'test.csv'))
test_set.info()

# Replace SibSp and Parch with binary flags (1 if the count is > 0, else 0)
# before running the pipeline. Each flag is re-inserted as the first column,
# matching the column order the row-by-row version produced. A missing
# value compares as False and therefore becomes 0.
sibsp = (test_set['SibSp'] > 0).astype('int64')
test_set = test_set.drop('SibSp', axis=1)
test_set.insert(0, 'SibSp', sibsp)

parch = (test_set['Parch'] > 0).astype('int64')
test_set = test_set.drop('Parch', axis=1)
test_set.insert(0, 'Parch', parch)

# Run the pipeline to obtain the preprocessed test_X.
# NOTE(review): fit_transform re-fits the imputer, scaler and encoders on the
# test set itself. The medians, scaling statistics and category columns
# should normally come from the pipeline fitted on the training data
# (i.e. call transform() on that fitted pipeline) so the feature layout is
# guaranteed to match what the loaded model was trained on — confirm and
# switch once the fitted training pipeline is available here.
test_X = full_pipeline.fit_transform(test_set)
print(test_X.shape)
test_X



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
(418, 13)


<418x13 sparse matrix of type '<class 'numpy.float64'>'
	with 2774 stored elements in Compressed Sparse Row format>

载入训练好的模型

In [19]:
# Load the previously trained model from 'my_model.pkl' (path is relative to
# the current working directory).
# NOTE(review): joblib.load unpickles the file, so only load model files
# from a trusted source.
svc=joblib.load('my_model.pkl')

In [33]:
# Predict labels for the preprocessed test features. test_X is a sparse
# matrix, so it is converted to a dense array before being passed to the
# model. The printed shape should have one entry per test row.
test_pred=svc.predict(test_X.toarray())
print(test_pred.shape)

(418,)


TypeError: 'CClass' object is not callable

按照Kaggle的要求生成指定格式的提交文件

In [39]:
# Build the submission table: one row per test passenger, pairing its
# PassengerId with the predicted Survived label.
passagerIds = test_set['PassengerId'].values
submission = pd.DataFrame(
    data=np.c_[passagerIds, test_pred],
    columns=['PassengerId', 'Survived'],
)
submission.info()
# index=False: without it to_csv also writes the DataFrame index as an extra
# unnamed first column, so the file would contain three columns instead of
# exactly PassengerId and Survived.
submission.to_csv('submission.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int64
dtypes: int64(2)
memory usage: 6.6 KB
