In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [37]:
train.head()
train.shape

(4459, 4993)

In [38]:
test.head()
test.shape

(49342, 4992)

In [39]:
train.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [40]:
test_ID = test['ID']
target = train['target']
target = np.log1p(target)

In [41]:
train.drop('ID',axis=1,inplace=True)

In [42]:
train.drop('target',axis=1,inplace=True)

In [43]:
test.drop('ID',axis=1,inplace=True)

In [44]:
cols_with_one_var = train.columns[train.nunique()==1]
cols_with_one_var.size

256

In [45]:
train.drop(cols_with_one_var,axis=1,inplace=True)
test.drop(cols_with_one_var,axis=1,inplace=True)

In [46]:
NUM_OF_DECIMALS = 32

In [47]:
## 将精度归为32位
train = train.round(NUM_OF_DECIMALS)
test = test.round(NUM_OF_DECIMALS)

In [48]:
colsToRemove = []
columns = train.columns

In [49]:
# 移除重复列
for i  in range(len(columns)-1):
    v = train[columns[i]].values
    dupCols = []
    for j in range(i+1,len(columns)):
        if np.array_equal(v,train[columns[j]].values):
            colsToRemove.append(columns[j])
            
train.drop(colsToRemove,axis=1,inplace=True)
test.drop(colsToRemove,axis=1,inplace=True)
train.shape

(4459, 4730)

### 2. 使用随机森林挑选特征

In [51]:
from sklearn import model_selection
from sklearn import ensemble
NUM_OF_FEATURES = 1000
def rmsle(y,pred):
    return np.sqrt(np.mean(np.power(y-pred,2)))

x_train,x_test,y_train,y_test = model_selection.train_test_split(train,target.values,test_size=0.2,random_state=5)
rf = ensemble.RandomForestRegressor(n_jobs=6,random_state=5)
rf.fit(x_train,y_train)
print(rmsle(y_test,rf.predict(x_test)))

1.51707194481


In [52]:
col = pd.DataFrame({'importance':rf.feature_importances_,'feature':train.columns}).sort_values(by=['importance'],ascending=False)[:NUM_OF_FEATURES]['feature'].values

In [53]:
train = train[col]
test = test[col]
train.shape

(4459, 1000)

### 3.测试训练集和测试集是否同分布

In [54]:
from scipy.stats import ks_2samp

THRESHOLD_P_VALUE = 0.01
THRESHOLD_STATISTIC = 0.3

diff_col = []

for col in train.columns:
    statistic,pvalue = ks_2samp(train[col].values,test[col].values)
    if pvalue<= THRESHOLD_P_VALUE and np.abs(statistic)>THRESHOLD_STATISTIC:
        diff_col.append(col)
        


In [55]:
train = train.drop(diff_col,axis=1)
test = test.drop(diff_col,axis=1)

In [56]:
train.shape

(4459, 1000)

### 4. 添加一些统计特征，添加了特征的低维表示

In [57]:
from sklearn import random_projection
ntrain = len(train)
ntest = len(test)
tmp = pd.concat([train,test])#RandomProjection
weight = ((train != 0).sum()/len(train)).values
tmp_train = train[train!=0]
tmp_test = test[test!=0]
train["weight_count"] = (tmp_train*weight).sum(axis=1)
test["weight_count"] = (tmp_test*weight).sum(axis=1)
train["count_not0"] = (train != 0).sum(axis=1)
test["count_not0"] = (test != 0).sum(axis=1)
train["sum"] = train.sum(axis=1)
test["sum"] = test.sum(axis=1)
train["var"] = tmp_train.var(axis=1)
test["var"] = tmp_test.var(axis=1)
train["median"] = tmp_train.median(axis=1)
test["median"] = tmp_test.median(axis=1)
train["mean"] = tmp_train.mean(axis=1)
test["mean"] = tmp_test.mean(axis=1)
train["std"] = tmp_train.std(axis=1)
test["std"] = tmp_test.std(axis=1)
train["max"] = tmp_train.max(axis=1)
test["max"] = tmp_test.max(axis=1)
train["min"] = tmp_train.min(axis=1)
test["min"] = tmp_test.min(axis=1)
train["skew"] = tmp_train.skew(axis=1)
test["skew"] = tmp_test.skew(axis=1)
train["kurtosis"] = tmp_train.kurtosis(axis=1)
test["kurtosis"] = tmp_test.kurtosis(axis=1)
del(tmp_train)
del(tmp_test)
NUM_OF_COM = 100 #need tuned

In [58]:
transformer = random_projection.SparseRandomProjection(n_components = NUM_OF_COM)
RP = transformer.fit_transform(tmp)

In [59]:
rp = pd.DataFrame(RP)
columns = ["RandomProjection{}".format(i) for i in range(NUM_OF_COM)]
rp.columns = columns

rp_train = rp[:ntrain]
rp_test = rp[ntrain:]
rp_test.index = test.index

#concat RandomProjection and raw data
train = pd.concat([train,rp_train],axis=1)
test = pd.concat([test,rp_test],axis=1)

In [60]:
del(rp_train)
del(rp_test)
train.shape

(4459, 1111)

In [61]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
#define evaluation method for a given model. we use k-fold cross validation on the training set. 
#the loss function is root mean square logarithm error between target and prediction
#note: train and y_train are feeded as global variables
NUM_FOLDS = 5 #need tuned



In [62]:
#ensemble method: model averaging
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    # the reason of clone is avoiding affect the original base models
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]  
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([ model.predict(X) for model in self.models_ ])
        return np.mean(predictions, axis=1)


In [63]:
train.shape

(4459, 1111)

In [65]:
target.shape

(4459,)

In [69]:

def rmsle_cv(model):
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
    rmse= np.sqrt(-cross_val_score(model, train, target, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)



    
    
model_xgb = xgb.XGBRegressor(colsample_bytree=0.055, colsample_bylevel =0.5, 
                             gamma=1.5, learning_rate=0.02, max_depth=32, 
                             objective='reg:linear',
                             min_child_weight=57, n_estimators=1000, reg_alpha=0, 
                             reg_lambda = 0, subsample=0.7, 
                             silent=1, nthread = -1)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=144,
                              learning_rate=0.005, n_estimators=720, max_depth=13,
                              metric='rmse',is_training_metric=True,
                              max_bin = 55, bagging_fraction = 0.8,verbose=-1,
                              bagging_freq = 5, feature_fraction = 0.9) 
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))
averaged_models = AveragingModels(models = (model_xgb, model_lgb))
score = rmsle_cv(averaged_models)
print("averaged score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

Xgboost score: 1.3564 (0.0257)

LGBM score: 1.3490 (0.0250)

averaged score: 1.3442 (0.0237)



In [70]:
averaged_models.fit(train.values, target)
pred = np.expm1(averaged_models.predict(test.values))
ensemble = pred
sub = pd.DataFrame()
sub['ID'] = test_ID
sub['target'] = ensemble
sub.to_csv('submission.csv',index=False)