In [72]:
'''数据处理'''
import pandas as pd
import numpy as np

train = pd.read_csv('../data/Etrain.csv')  # 导入训练集
test = pd.read_csv('../data/Etest.csv')  #导入测试集


#数值型和类别型分开处理
num_columns = []
cate_columns = []
for column in test.columns:
    if test.dtypes[column] != np.dtype('object'):
        num_columns.append(column)
    else:
        cate_columns.append(column)
print(num_columns)   #数值型数据的列名称

['Id', 'OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea', 'FullBath', 'GarageCars']


In [73]:
print(cate_columns)   #类别型数据的列名称

['SaleType']


In [74]:
label = train.pop('SalePrice')

In [75]:
#label2 = train.pop('OverallCond')

In [76]:
#缺失值填充
# 数值型用中值填充
for column in num_columns:
    train[column] = train[column].fillna(train[column].median())
    test[column] = test[column].fillna(test[column].median())

# # 类别型用最多的填充 
# for column in cate_columns:
#     train[column] = train[column].fillna(train[column].mode())
#     test[column] = test[column].fillna(test[column].mode())
    
# 类别型填充'NaN'
for column in cate_columns:
    train[column] = train[column].fillna('NaN')
    test[column] = test[column].fillna('NaN')
    
#类别型哑变量处理 将不能够定量处理的变量量化

data = pd.concat([train,test],axis=0) #训练集要和测试集放一起
for column in cate_columns:
    
    t = pd.get_dummies(data[column],prefix=column)
    train = pd.concat([train,t[:len(train)]],axis=1)
    train.drop(column,axis=1,inplace=True)
    
    test = pd.concat([test,t[len(train):]],axis=1)
    test.drop(column,axis=1,inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [77]:
#回归
from sklearn.linear_model import Lasso,LinearRegression,Ridge,ElasticNet,TheilSenRegressor,HuberRegressor,RANSACRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import itertools

X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.33, random_state=42)

regs = [
    ['Lasso',Lasso()],   #套索回归
    ['LinearRegression',LinearRegression()],  #线性回归
    ['Ridge',Ridge()],    #岭回归    
    ['TheilSenRegressor',TheilSenRegressor()],  #泰尔森回归
    ['RANSACRegressor',RANSACRegressor()],   #随机抽样一致性   
    ['DecisionTreeRegressor',DecisionTreeRegressor()],    #决策树回归
    ['RandomForestRegressor',RandomForestRegressor(n_estimators=150)],     #随机森林回归
    ['XGBRegressor',XGBRegressor(n_estimators=150)],
]

In [78]:
preds = []
for reg_name,reg in regs:
    print(reg_name)
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)
    if np.sum(y_pred<0) > 0:
        print('y_pred have',np.sum(y_pred<0),'negative values, we fill it with np.median(y_pred)')
        y_pred[y_pred<0] = np.median(y_pred)
    score = np.sqrt(mean_squared_error(np.log(y_test),np.log(y_pred)))
    preds.append([reg_name,y_pred])
    

final_results = []
for comb_length in range(1,len(regs)+1):
    print('Model Amount :',comb_length)
    results = []
    for comb in itertools.combinations(preds,comb_length):
        pred_sum = 0
        model_name = []
        for reg_name,pred in comb:
            pred_sum += pred
            model_name.append(reg_name)
        pred_sum /= comb_length
        model_name = '+'.join(model_name)
        score = np.sqrt(mean_squared_error(np.log(y_test),np.log(pred_sum)))
        results.append([model_name,score])
    results = sorted(results,key=lambda x:x[1])
    for model_name,score in results:
        print(model_name,score)
    print()
    final_results.append(results[0])

Lasso
y_pred have 1 negative values, we fill it with np.median(y_pred)
LinearRegression
y_pred have 1 negative values, we fill it with np.median(y_pred)
Ridge
y_pred have 1 negative values, we fill it with np.median(y_pred)
TheilSenRegressor
RANSACRegressor
DecisionTreeRegressor
RandomForestRegressor
XGBRegressor
Model Amount : 1
RandomForestRegressor 0.16358252197097084
XGBRegressor 0.16652750584015594
RANSACRegressor 0.1814725922097048
Ridge 0.1975075594882399
Lasso 0.21140993127755417
LinearRegression 0.2124312121467681
DecisionTreeRegressor 0.2171803779684804
TheilSenRegressor 0.285784711987325

Model Amount : 2
RANSACRegressor+RandomForestRegressor 0.16186277104066768
RandomForestRegressor+XGBRegressor 0.1619637886967034
RANSACRegressor+XGBRegressor 0.16432054536057575
Ridge+RandomForestRegressor 0.1667622380788972
Lasso+RandomForestRegressor 0.16872318620269217
LinearRegression+RandomForestRegressor 0.16882496261310453
Ridge+XGBRegressor 0.1702778376469939
RANSACRegressor+Decisio

  if getattr(data, 'base', None) is not None and \


 0.1642934811411625
Lasso+RANSACRegressor+DecisionTreeRegressor+XGBRegressor 0.16460230816373939
LinearRegression+RANSACRegressor+DecisionTreeRegressor+XGBRegressor 0.1646334747226157
Lasso+Ridge+RandomForestRegressor+XGBRegressor 0.16867490938873633
LinearRegression+Ridge+RandomForestRegressor+XGBRegressor 0.16871763866560932
Lasso+Ridge+DecisionTreeRegressor+RandomForestRegressor 0.1697886166037339
LinearRegression+Ridge+DecisionTreeRegressor+RandomForestRegressor 0.16982580602834427
Lasso+LinearRegression+RandomForestRegressor+XGBRegressor 0.1701008882362313
Lasso+Ridge+DecisionTreeRegressor+XGBRegressor 0.17087126739878658
LinearRegression+Ridge+DecisionTreeRegressor+XGBRegressor 0.17091457610230512
Lasso+LinearRegression+DecisionTreeRegressor+RandomForestRegressor 0.17099129802555987
Lasso+Ridge+RANSACRegressor+RandomForestRegressor 0.17153481934728843
LinearRegression+Ridge+RANSACRegressor+RandomForestRegressor 0.17157266853706385
Lasso+LinearRegression+DecisionTreeRegressor+XGBR

In [79]:
final_results = sorted(final_results,key=lambda x:x[1])
for model_name,score in final_results:
    print(model_name,score)

RANSACRegressor+RandomForestRegressor+XGBRegressor 0.16039796522040947
Ridge+RANSACRegressor+DecisionTreeRegressor+RandomForestRegressor+XGBRegressor 0.16141486603971528
RANSACRegressor+DecisionTreeRegressor+RandomForestRegressor+XGBRegressor 0.16172029733434257
RANSACRegressor+RandomForestRegressor 0.16186277104066768
RandomForestRegressor 0.16358252197097084
Lasso+Ridge+RANSACRegressor+DecisionTreeRegressor+RandomForestRegressor+XGBRegressor 0.1640318356875474
Lasso+LinearRegression+Ridge+RANSACRegressor+DecisionTreeRegressor+RandomForestRegressor+XGBRegressor 0.16711504223464746
Lasso+LinearRegression+Ridge+TheilSenRegressor+RANSACRegressor+DecisionTreeRegressor+RandomForestRegressor+XGBRegressor 0.17402839475610643


In [81]:
#最终输出
[b for b in zip(itertools.count(),[a[0] for a in regs])]

pred = np.mean(list(map(lambda x:regs[x][1].predict(test),[0,5,6])),axis=0)

sub = pd.DataFrame({'Id':test['Id'],'SalePrice':pred})
sub.to_csv('D:/Desktop/predict.csv',index=None)