## 用pandas进行数据预处理

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train.isnull().sum().sort_values(ascending=False)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
FireplaceQu     690
               ... 
ExterQual         0
Exterior2nd       0
Exterior1st       0
RoofMatl          0
SalePrice         0
Length: 81, dtype: int64

### 预处理思路: 

- 去掉id列
- 去掉NaN数值较多的列（可能会有副作用）
- 分别处理数字类型的features/需要进行Ordinal编码的features/需要进行OneHotEncoder编码的features
    - Ordinal编码：针对values之间有等级之分的列（譬如：excellent, good, bad ......）
    - OneHotEncoder编码：针对values之间没有等级之分的列(譬如：male,female)

In [5]:
#定义一下要对train_set和test_set都进行数据预处理的步骤
def preprocess1(df):
    #去掉Id列
    df.drop(columns=['Id'],inplace=True)
    
    #去掉缺失值较多的列 >=1000
    df.drop(columns=['PoolQC','MiscFeature','Alley','Fence'],inplace=True)
    
    #提取numeric_columns
    numeric_features = list(df.dtypes[df.dtypes != 'object'].index)
    
    #提取需要进行OrdinalEncoder的列
    object_features_ore = ['ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','HeatingQC','KitchenQual','FireplaceQu',
#                                                              # 'GarageFinish',
                            'GarageQual','GarageCond','SaleCondition']
    '''
    GarageFinish特例
    '''
    
    #提取需要进行OneHotEncoder的列
    object_features_ohe = list(df.dtypes[df.dtypes == 'object'].index)
    for each in object_features_ore: 
        object_features_ohe.remove(each)
    
    return numeric_features,object_features_ore,object_features_ohe

In [6]:
#数据预处理
X = train.iloc[:,:-1]
y = train.SalePrice
train_numeric_features,object_features_ore,train_object_features_ohe = preprocess1(X)

In [7]:
#检查处理完后的列向量是否有遗漏
len(train_numeric_features),len(object_features_ore),len(train_object_features_ohe)

(36, 11, 28)

- 对需要Ordinal Encoder的列中的值进行预处理

In [8]:
def ORE(x):
    if x =="Ex":
        return 5 
    elif x == "Gd":
        return 4
    elif x == "TA":
        return 3
    elif x == "Fa":
        return 2
    elif x == "Po":
        return 1
    elif x == "NA":
        return 0
    else:
        return 0
    return 

- 整体编码

In [9]:
#定义进行编码操作
def encode(df,n,ore,ohe):
    #对numeric_features
    df[n] = df[n].apply(
    lambda x: (x - x.mean()) / (x.std()))
    
    # 在标准化数据之后，所有均值消失，因此我们可以将缺失值设置为0
    df[n] = df[n].fillna(0)
    
    #对ore列
    df[ore] = df[ore].fillna('missing')
    df[ore] = df[ore].applymap(ORE)
    #df = df[ore].apply(lambda x: if x==""))
    
    #对ohe列
    df = pd.get_dummies(df, dummy_na=True, columns=train_object_features_ohe)
    return df

In [10]:
X_new_1 = encode(X,train_numeric_features,object_features_ore,train_object_features_ohe)

- 在test数据集中，有些columns中的某些values未出现一次，比如test中的Utilities中没有NoSeWa值，这在预测test数据集时会造成报错（因为列向量个数不匹配）；所以将这些“冗余”的列从X_new_1编码转化完成后的数据集中剔除

In [11]:
X_new_1.shape
X_new_1.drop(columns=['Utilities_NoSeWa',
 'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'HouseStyle_2.5Fin',
 'RoofMatl_ClyTile',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'Exterior1st_ImStucc',
 'Exterior1st_Stone',
 'Exterior2nd_Other',
 'Heating_Floor',
 'Heating_OthW',
 'Electrical_Mix'],inplace=True)

### 模型预测思路: 

- 试模型，调参......

In [12]:
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor,AdaBoostRegressor,RandomForestRegressor,GradientBoostingRegressor,HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [13]:
sgd = SGDRegressor()
gbr = GradientBoostingRegressor() #0.89187? ----> overfitting ----> actually:>0.14
hgbr = HistGradientBoostingRegressor()
svr = SVR()
abr = AdaBoostRegressor()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor(n_estimators=500)
#vr = VotingRegressor([('rfr',rfr),('gbr',gbr),('hgbr',hgbr)],weights=(2,1,1))
#vr = VotingRegressor([('rfr',rfr),('gbr',gbr)],weights=(2,1)) #0.8753 ----> actually: >0.14
vr = VotingRegressor([('rfr',rfr),('sgd',sgd)],weights=(2,1))

In [14]:
from sklearn.model_selection import cross_val_score
cross_val_score(vr, X_new_1, y, scoring='r2').mean()

0.8668796856288166

- GridSearchCV太耗时间了，这里就不演示了

In [15]:
# from sklearn.model_selection import GridSearchCV
# params = { 'weights':[(2,1), (3,1)],
#            'rfr__n_estimators':[120, 500],
#            'rfr__min_samples_leaf':[1, 2]}

In [16]:
# grid = GridSearchCV(vr, params)
# grid.fit(X_new_1, y)

In [17]:
# grid.best_params_,grid.best_score_

- fit模型

In [18]:
vr.fit(X_new_1,y)

VotingRegressor(estimators=[('rfr', RandomForestRegressor(n_estimators=500)),
                            ('sgd', SGDRegressor())],
                weights=(2, 1))

### 对test数据集进行相同预处理后，predict

In [19]:
test = pd.read_csv('test.csv')
test_numeric_features,test_object_features_ore,test_object_features_ohe = preprocess1(test)

In [20]:
#检查与train中的是否一致
len(test_numeric_features),len(test_object_features_ore),len(test_object_features_ohe)

(36, 11, 28)

In [21]:
test_new_1 = encode(test,test_numeric_features,test_object_features_ore,test_object_features_ohe)

In [22]:
#检查train（较大数据集）中是否有test中（较小数据集）不存在的columns
new_list = [item for item in X_new_1 if item not in test_new_1]
new_list

[]

In [23]:
result = vr.predict(test_new_1)
result.shape

(1459,)

In [24]:
test = pd.read_csv('test.csv')
pd.DataFrame({'id':test.Id,'SalePrice':result}).set_index('id').to_csv('sub_houseprice_ver2.0_rft_sgd_2.0.csv')

- LMSE:0.133