In [1]:
import numpy as np
from config import *
import datetime, time, json
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Fraction of rows that train_test_split holds out as the validation set.
TEST_SPLIT = 0.2
# Seed handed to train_test_split so the train/validation split is repeatable.
RNG_SEED = 1996

### 融合 DL 特征 + NLP 特征(Combine deep-learning and NLP features)

In [3]:
# Load the saved deep-learning feature arrays for the train and test sets.
# The context managers close each file as soon as its array has been read;
# previously both handles stayed open for the lifetime of the kernel.
with open('data/tmp/train_dl_feature', 'rb') as train_file:
    F_train = np.load(train_file)
with open('data/tmp/test_dl_feature', 'rb') as test_file:
    F_test = np.load(test_file)

In [4]:
# Sanity check: show the shape of each deep-learning feature array, one per line.
print(F_train.shape, F_test.shape, sep='\n')

(808580, 576)
(2345796, 576)


### PCA 降维(PCA dimensionality reduction)

In [5]:
from sklearn.decomposition import PCA

# Number of principal components kept from the deep-learning features.
N_PCA_COMPONENTS = 100

# Fit the projection on the training features only, then apply the same
# projection to both sets so train and test live in the same reduced space.
# random_state pins the result when scikit-learn picks its randomized SVD
# solver; without it the components can differ from run to run.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=RNG_SEED)
pca.fit(F_train)
# NOTE: F_train / F_test are overwritten with their reduced versions, so this
# cell must not be re-run without first reloading the original features.
F_train = pca.transform(F_train)
F_test = pca.transform(F_test)

In [6]:
# Confirm the column count of both arrays after the PCA projection.
print(F_train.shape, F_test.shape, sep='\n')

(808580, 100)
(2345796, 100)


In [7]:
# --- training matrix ---------------------------------------------------------
nlp_feature_train = pd.read_csv(FEATURE_TRAIN).values
# The NLP feature block is stacked on top of itself (row count doubles) before
# being joined column-wise with the DL features.
# NOTE(review): presumably F_train holds every training pair twice (e.g. both
# question orders) - confirm against the notebook that produced it.
feature_train = np.concatenate((nlp_feature_train, nlp_feature_train))
X_all = np.concatenate((feature_train, F_train), axis=1)

# Labels are duplicated the same way so they stay aligned with X_all.
# The file is opened in a context manager so the handle is released right
# after loading (it was previously left open).
with open(LABEL_TRAINING_DATA_FILE, 'rb') as label_file:
    y = np.load(label_file)
y_all = np.concatenate((y, y))
assert len(y_all) == X_all.shape[0], (
    'label count %d does not match feature rows %d' % (len(y_all), X_all.shape[0])
)

# --- test matrix -------------------------------------------------------------
nlp_feature_test = pd.read_csv(FEATURE_TEST).values
X_test = np.concatenate((nlp_feature_test, F_test), axis=1)

KeyboardInterrupt: 

In [None]:
# Report the shapes of the assembled training features, labels and test features.
print(X_all.shape, y_all.shape, X_test.shape, sep='\n')

### rebalance dataset

In [None]:
# Target share of positive (label == 1) rows after rebalancing.
p = 0.165

pos_train = X_all[y_all == 1]
neg_train = X_all[y_all == 0]

# Oversample the negative class until positives make up a share `p` of the data.
# Solving pos / (pos + neg) == p for neg gives the number of negatives needed.
# The earlier loop doubled the already-enlarged array on every pass and used
# pos_share / p - 1 as the extra fraction, so the printed share did not
# generally equal `p`; a negative fraction also appended most of the negatives
# through a negative slice bound.
n_neg_target = int(round(len(pos_train) * (1 - p) / p))
if n_neg_target > len(neg_train) > 0:
    # Cycle through the negatives in their original order until the target
    # count is reached (whole copies first, then a leading partial copy).
    neg_train = neg_train[np.arange(n_neg_target) % len(neg_train)]
print(len(pos_train) / (len(pos_train) + len(neg_train)))

X_all = np.vstack([pos_train, neg_train])
# Labels follow the stacking order above: all positives first, then negatives.
# Kept as an ndarray (not a Python list) so `y_all == 1` still works if this
# cell is executed again.
y_all = np.concatenate([np.ones(len(pos_train)), np.zeros(len(neg_train))])
del pos_train, neg_train

In [None]:
# Row positions 0..n-1 of the rebalanced matrix (not consumed by the split below).
indices = np.arange(len(X_all))

# Seeded random hold-out split into training and validation parts.
# NOTE(review): X_all contains repeated rows after oversampling, so copies of
# the same row can land on both sides of this split - confirm this is acceptable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_all,
    y_all,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)

In [None]:
class Ensemble(object):
    """Two-level stacking ensemble.

    Every base model is fitted inside a stratified K-fold loop; its
    out-of-fold predictions form one column of the level-2 training matrix
    and the mean of its per-fold predictions on the target set forms one
    column of the level-2 test matrix. The stacker is then fitted on the
    level-2 training matrix.
    """

    def __init__(self, n_folds, stacker, base_models, random_state=1):
        """Store the ensemble configuration.

        Args:
            n_folds: number of stratified folds used for out-of-fold prediction.
            stacker: level-2 estimator exposing ``fit`` and ``predict``.
            base_models: list of level-1 estimators exposing ``fit`` and ``predict``.
            random_state: seed for the fold shuffling (defaults to the value
                that used to be hard-coded).
        """
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state

    def fit_predict(self, X, y, T):
        """Fit the base models and the stacker, then predict for ``T``.

        Args:
            X: training features, array-like of shape (n_samples, n_features).
            y: training targets, array-like of length n_samples.
            T: features to predict for, array-like with the same columns as ``X``.

        Returns:
            The stacker's predictions for ``T``.

        Note:
            The same estimator instance is refitted on every fold, so after
            this call each base model holds the fit from its last fold only.
        """
        # asarray avoids copying inputs that are already ndarrays; nothing
        # below modifies them in place.
        X = np.asarray(X)
        y = np.asarray(y)
        T = np.asarray(T)
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True,
                              random_state=self.random_state)

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))
        S_test = np.zeros((T.shape[0], n_models))

        for i, clf in enumerate(self.base_models):
            # One column per fold: this model's predictions for T.
            S_test_i = np.zeros((T.shape[0], self.n_folds))
            for j, (train_idx, holdout_idx) in enumerate(skf.split(X, y)):
                clf.fit(X[train_idx], y[train_idx])
                # Out-of-fold predictions: each training row is predicted by a
                # model that did not see it during fitting.
                S_train[holdout_idx, i] = clf.predict(X[holdout_idx])
                S_test_i[:, j] = clf.predict(T)
                print('fold '+str(j+1) +' finished')
            S_test[:, i] = S_test_i.mean(1)
            print('model '+str(i+1) +' finished')

        self.stacker.fit(S_train, y)
        return self.stacker.predict(S_test)

    def info(self, X_valid, y_valid):
        """Print the log loss of every base model on a validation set.

        The models are used in whatever fitted state the last ``fit_predict``
        call left them in.

        Args:
            X_valid: validation features.
            y_valid: validation labels.
        """
        for i, model in enumerate(self.base_models):
            y_predict = model.predict(X_valid).astype(np.float64)
            # log_loss expects probabilities; predict() output is not bounded
            # here, so clamp it into [0, 1] before scoring.
            y_predict = np.clip(y_predict, 0.0, 1.0)
            print('model '+str(i)+' logloss : '+ str(log_loss(y_valid,y_predict)))

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold


# stacking
# Hyper-parameters shared by the level-2 stacker and the XGBoost base model.
xgb_settings = dict(
    seed=2016,
    n_estimators=100,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.85,
)

# Level-2 model that combines the base models' predictions.
stacker = XGBRegressor(**xgb_settings)

# Level-1 models: three scikit-learn tree ensembles plus one XGBoost regressor,
# all seeded with 2016.
base_models = [
    RandomForestRegressor(random_state=2016),
    ExtraTreesRegressor(random_state=2016),
    GradientBoostingRegressor(random_state=2016),
    XGBRegressor(**xgb_settings),
]

ensemble = Ensemble(n_folds=5, stacker=stacker, base_models=base_models)

# Train on the training split and predict the validation split.
y_predict = ensemble.fit_predict(X_train, y_train, X_valid).astype(np.float64)

In [None]:
# Per-model validation log loss.
ensemble.info(X_valid,y_valid)
# The stacker is a regressor, so its output is not guaranteed to stay inside
# [0, 1]; clamp it before handing it to log_loss, which expects probabilities.
print('ensemble logloss : ',log_loss(y_valid,np.clip(y_predict, 0.0, 1.0)))
# Disabled: refit on the full training data and predict the test set.
# y_test = ensemble.fit_predict(X_all,y_all,X_test)

In [None]:
# Prints each base model's log loss on the validation split (the same call also
# appears in the previous cell).
ensemble.info(X_valid,y_valid)

In [None]:
# Refit the ensemble on the full rebalanced training data and score the test set.
# Without this assignment `y_test` is undefined when the notebook is run top to
# bottom (its only other assignment is commented out), and the DataFrame
# construction below fails with a NameError.
# Note: this refits every base model and the stacker, replacing their earlier fits.
y_test = ensemble.fit_predict(X_all, y_all, X_test)

# Pair each prediction with its test_id and write the submission file.
df_test = pd.read_csv('data/test.csv')
sub = pd.DataFrame({'test_id': df_test['test_id'], 'is_duplicate': y_test})
sub.to_csv('submission.csv', index=False)
sub.head()

### XGBOOST

In [None]:
import xgboost as xgb

In [None]:
# Wrap the training and validation splits, with their labels, as XGBoost DMatrix objects.
dTrain = xgb.DMatrix(data=X_train, label=y_train)
dVal = xgb.DMatrix(data=X_valid, label=y_valid)

In [None]:
# Booster configuration for a binary classifier scored by log loss.
xgb_params = dict(
    objective='binary:logistic',
    booster='gbtree',
    eval_metric='logloss',
    eta=0.1,
    max_depth=9,
    subsample=0.9,
    # 1 / sqrt(number of columns in F_train).
    # NOTE(review): this uses the width of F_train, not of X_train - confirm intended.
    colsample_bytree=1 / F_train.shape[1]**0.5,
    min_child_weight=5,
    silent=1,
    seed=2016,
)

# Data sets evaluated after each boosting round; the validation set is listed last.
watchlist = [(dTrain, 'train'), (dVal, 'val')]

# Up to 1000 rounds, with a 10-round early-stopping window and a progress line
# every 10 rounds.
bst = xgb.train(
    params=xgb_params,
    dtrain=dTrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=10,
)

In [None]:
# Read the test ids and score the test features with the trained booster,
# limited to the tree count recorded as best_ntree_limit.
df_test = pd.read_csv('data/test.csv')
dTest = xgb.DMatrix(X_test)
test_pred = bst.predict(dTest, ntree_limit=bst.best_ntree_limit)

# Two-column submission frame: id and predicted value.
df_sub = pd.DataFrame({
    'test_id': df_test['test_id'].values,
    'is_duplicate': test_pred,
})

In [None]:
# Write the XGBoost predictions to 'submission.csv' without the index column.
# NOTE(review): the ensemble cell earlier in this notebook writes to the same
# path, so this call replaces that file - confirm that is intended.
df_sub.to_csv('submission.csv', index=False)
# Show the first rows of the submission as the cell output.
df_sub.head()