In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
train.head()
train.shape

(4459, 4993)

In [4]:
test.head()
test.shape

(49343, 4992)

In [5]:
train.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [6]:
test_ID = test['ID']
y_train = train['target']
y_train = np.log1p(y_train)

In [7]:
train.drop('ID',axis=1,inplace=True)

In [8]:
train.drop('target',axis=1,inplace=True)

In [9]:
test.drop('ID',axis=1,inplace=True)

In [10]:
cols_with_one_var = train.columns[train.nunique()==1]
cols_with_one_var.size

256

In [11]:
train.drop(cols_with_one_var,axis=1,inplace=True)
test.drop(cols_with_one_var,axis=1,inplace=True)

In [12]:
NUM_OF_DECIMALS = 32

In [31]:
## 将精度归为32位
train = train.round(NUM_OF_DECIMALS)
test = test.round(NUM_OF_DECIMALS)

In [14]:
colsToRemove = []
columns = train.columns

In [15]:
# 移除重复列
for i  in range(len(columns)-1):
    v = train[columns[i]].values
    dupCols = []
    for j in range(i+1,len(columns)):
        if np.array_equal(v,train[columns[j]].values):
            colsToRemove.append(columns[j])
            
train.drop(colsToRemove,axis=1,inplace=True)
test.drop(colsToRemove,axis=1,inplace=True)
train.shape

(4459, 4730)

### 2. 使用随机森林挑选特征

In [16]:
from sklearn import model_selection
from sklearn import ensemble
NUM_OF_FEATURES = 1000
def rmsle(y,pred):
    return np.sqrt(np.mean(np.power(y-pred,2)))

x_train,x_test,y_train,y_test = model_selection.train_test_split(train,y_train.values,test_size=0.2,random_state=5)
rf = ensemble.RandomForestRegressor(n_jobs=4,random_state=5)
rf.fit(x_train,y_train)
print(rmsle(y_test,rf.predict(x_test)))

1.51707194481


In [17]:
col = pd.DataFrame({'importance':rf.feature_importances_,'feature':train.columns}).sort_values(by=['importance'],ascending=False)[:NUM_OF_FEATURES]['feature'].values

In [18]:
train = train[col]
test = test[col]
train.shape

(4459, 1000)

### 3.测试训练集和测试集是否同分布

In [19]:
from scipy.stats import ks_2samp

THRESHOLD_P_VALUE = 0.01
THRESHOLD_STATISTIC = 0.3

diff_col = []

for col in train.columns:
    statistic,pvalue = ks_2samp(train[col].values,test[col].values)
    if pvalue<= THRESHOLD_P_VALUE and np.abs(statistic)>THRESHOLD_STATISTIC:
        diff_col.append(col)
        


In [20]:
train = train.drop(diff_col,axis=1)
test = test.drop(diff_col,axis=1)

In [21]:
train.shape

(4459, 1000)

### 4. 添加一些统计特征，添加了特征的低维表示

In [29]:
from sklearn import random_projection
ntrain = len(train)
ntest = len(test)
tmp = pd.concat([train,test])#RandomProjection
weight = ((train != 0).sum()/len(train)).values
tmp_train = train[train!=0]
tmp_test = test[test!=0]
train["weight_count"] = (tmp_train*weight).sum(axis=1)
test["weight_count"] = (tmp_test*weight).sum(axis=1)
train["count_not0"] = (train != 0).sum(axis=1)
test["count_not0"] = (test != 0).sum(axis=1)
train["sum"] = train.sum(axis=1)
test["sum"] = test.sum(axis=1)
train["var"] = tmp_train.var(axis=1)
test["var"] = tmp_test.var(axis=1)
train["median"] = tmp_train.median(axis=1)
test["median"] = tmp_test.median(axis=1)
train["mean"] = tmp_train.mean(axis=1)
test["mean"] = tmp_test.mean(axis=1)
train["std"] = tmp_train.std(axis=1)
test["std"] = tmp_test.std(axis=1)
train["max"] = tmp_train.max(axis=1)
test["max"] = tmp_test.max(axis=1)
train["min"] = tmp_train.min(axis=1)
test["min"] = tmp_test.min(axis=1)
train["skew"] = tmp_train.skew(axis=1)
test["skew"] = tmp_test.skew(axis=1)
train["kurtosis"] = tmp_train.kurtosis(axis=1)
test["kurtosis"] = tmp_test.kurtosis(axis=1)
del(tmp_train)
del(tmp_test)
NUM_OF_COM = 100 #need tuned

In [32]:
transformer = random_projection.SparseRandomProjection(n_components = NUM_OF_COM)
RP = transformer.fit_transform(tmp)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').