In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
import multiprocessing
import difflib
import time
import gc

import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



In [16]:
def get_train():
    keras_q1 = np.load('../../data/transformed/keras_tokenizer/train_q1_transformed.npy')
    keras_q2 = np.load('../../data/transformed/keras_tokenizer/train_q2_transformed.npy')
    xgb_feats = pd.read_csv('../../data/features/the_1owl/owl_train.csv')
    abhishek_feats = pd.read_csv('../../data/features/abhishek/train_features.csv',
                              encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/other_features/text_features_train.csv',
                            encoding = 'ISO-8859-1')
    img_feats = pd.read_csv('../../data/features/other_features/img_features_train.csv')
    srk_feats = pd.read_csv('../../data/features/srk/SRK_grams_features_train.csv')

    xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis = 1, inplace = True)
    y_train = xgb_feats['is_duplicate']
    xgb_feats = xgb_feats.iloc[:, 8:]
    
    X_train2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)
    #X_train2 = np.concatenate([xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)
    for i in range(X_train2.shape[1]):
        if np.sum(X_train2[:, i] == y_train.values) == X_train2.shape[0]:
            print('LEAK FOUND')
    
    X_train2 = X_train2.astype('float32')
    X_train2 = pd.DataFrame(X_train2)
    X_train2['is_duplicate'] = y_train
    print('Training data shape:', X_train2.shape)
    return X_train2, y_train

def get_test():
    keras_q1 = np.load('../../data/transformed/keras_tokenizer/test_q1_transformed.npy')
    keras_q2 = np.load('../../data/transformed/keras_tokenizer/test_q2_transformed.npy')
    xgb_feats = pd.read_csv('../../data/features/the_1owl/owl_test.csv')
    abhishek_feats = pd.read_csv('../../data/features/abhishek/test_features.csv',
                              encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/other_features/text_features_test.csv',
                            encoding = 'ISO-8859-1')
    img_feats = pd.read_csv('../../data/features/other_features/img_features_test.csv')
    srk_feats = pd.read_csv('../../data/features/srk/SRK_grams_features_test.csv')

    xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis = 1, inplace = True)
    xgb_feats = xgb_feats.iloc[:, 5:]
    
    X_test2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)
    #X_test2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats], axis = 1)
    
    X_test2 = X_test2.astype('float32')
    X_test2 = pd.DataFrame(X_test2)
    print('Test data shape:', X_test2.shape)
    return X_test2

def predict_test(model_name):
    X_test = get_test()
    gbm = xgb.Booster(model_file = 'saved_models/XGB/{}.txt'.format(model_name))
    test_preds = gbm.predict(xgb.DMatrix(X_test))

    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    sample_sub = pd.read_csv(sub_src + 'sample_submission.csv')
    sample_sub['is_duplicate'] = test_preds
    sample_sub.to_csv(sub_src + '{}.csv'.format(model_name), index = False)
    return

In [3]:
def oversample():
    print('Oversampling negative y according to anokas method')
    X_train, y_train = get_train()
    pos_train = X_train[X_train['is_duplicate'] == 1]
    neg_train = X_train[X_train['is_duplicate'] == 0]
    p = 0.165
    scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
    while scale > 1:
        neg_train = pd.concat([neg_train, neg_train])
        scale -=1
    neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
    X_train = pd.concat([pos_train, neg_train])
    y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

    X_train = X_train.astype('float32')
    X_train.drop(['is_duplicate'], axis = 1, inplace = True)
    return X_train, y_train

def oversample2(X_train):
    print('Oversampling negative y according to SRK method')
    y_train = np.array(X_train["is_duplicate"])
    X_train.drop(['is_duplicate'], axis = 1, inplace = True)
    X_train_dup = X_train[y_train==1]
    X_train_non_dup = X_train[y_train==0]

    X_train = np.vstack([X_train_non_dup, X_train_dup, X_train_non_dup, X_train_non_dup])
    y_train = np.array([0]*X_train_non_dup.shape[0] + [1]*X_train_dup.shape[0] + [0]*X_train_non_dup.shape[0] + [0]*X_train_non_dup.shape[0])
    del X_train_dup
    del X_train_non_dup
    print("Mean target rate : ",y_train.mean())
    X_train = X_train.astype('float32')
    return X_train, y_train

def kappa(preds, y):
    score = []
    a = 0.165 / 0.37
    b = (1 - 0.165) / (1 - 0.37)
    for pp,yy in zip(preds, y.get_label()):
        score.append(a * yy * np.log (pp) + b * (1 - yy) * np.log(1-pp))
    score = -np.sum(score) / len(score)
    return 'kappa', score

def get_temporal_pattern(df2):
    df = df2.copy()
    df["qmax"] = df.apply( lambda row: max(row["qid1"], row["qid2"]), axis=1 )
    df = df.sort_values(by=["qmax"], ascending=True)
    df["dupe_rate"] = df.is_duplicate.rolling(window=500, min_periods=500).mean()
    df["timeline"] = np.arange(df.shape[0]) / float(df.shape[0])
    return df

In [4]:
def train_xgb(cv = False):
    
    t = time.time()
    params = {
    'seed': 1337,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'eta': 0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'min_child_weight': 1,
    'nthread': 6,
    'tree_method': 'hist',
    }

    
    X_train, y_train = get_train()
    #X_train = X_train.astype('float32')
    #X_train.drop(['is_duplicate'], axis = 1, inplace = True)
    X_train, y_train = oversample2(X_train)
    if cv:
        dtrain = xgb.DMatrix(X_train, y_train)
        hist = xgb.cv(params, dtrain, num_boost_round = 100000, nfold = 5,
                      stratified = True, early_stopping_rounds = 350, verbose_eval = 250,
                      seed = 1337)
        del X_train, y_train
        gc.collect()
        print('Time it took to train in CV manner:', time.time() - t)
        return hist
    
    else:
        X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, stratify = y_train,
                                                    test_size = 0.2, random_state = 111)
        del X_train, y_train
        gc.collect()
        dtrain = xgb.DMatrix(X_tr, label = y_tr)
        dval = xgb.DMatrix(X_val, label = y_val)
        watchlist = [(dtrain, 'train'), (dval, 'valid')]

        print('Start training...')
        gbm = xgb.train(params, dtrain, 100000, watchlist, 
                        early_stopping_rounds = 350, verbose_eval = 250)

        print('Start predicting...')
        val_pred = gbm.predict(xgb.DMatrix(X_val), ntree_limit=gbm.best_ntree_limit)
        score = log_loss(y_val, val_pred)
        print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
        
        del X_tr, X_val, y_tr, y_val
        gc.collect()
        return gbm
    

def run_xgb(model_name, train = True, test = False, cv = False):
    if cv:
        gbm_hist = train_xgb(True)
        return gbm_hist
    if train:
        gbm = train_xgb()
        gbm.save_model('saved_models/XGB/{}.txt'.format(model_name))
        if test:
            predict_test('{}'.format(model_name))
        return gbm

In [5]:
def get_train():
    keras_q1 = np.load('../../data/transformed/keras_tokenizer/train_q1_transformed.npy')
    keras_q2 = np.load('../../data/transformed/keras_tokenizer/train_q2_transformed.npy')
    xgb_feats = pd.read_csv('../../data/features/the_1owl/owl_train.csv')
    abhishek_feats = pd.read_csv('../../data/features/abhishek/train_features.csv',
                              encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/other_features/text_features_train.csv',
                            encoding = 'ISO-8859-1')
    img_feats = pd.read_csv('../../data/features/other_features/img_features_train.csv')
    srk_feats = pd.read_csv('../../data/features/srk/SRK_grams_features_train.csv')

    xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis = 1, inplace = True)
    y_train = xgb_feats['is_duplicate']
    xgb_feats = xgb_feats.iloc[:, 8:]
    
    #X_train2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)
    #X_train2 = np.concatenate([xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)
    X_train2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, img_feats], axis = 1)
    for i in range(X_train2.shape[1]):
        if np.sum(X_train2[:, i] == y_train.values) == X_train2.shape[0]:
            print('LEAK FOUND')
    
    X_train2 = X_train2.astype('float32')
    X_train2 = pd.DataFrame(X_train2)
    X_train2['is_duplicate'] = y_train
    print('Training data shape:', X_train2.shape)
    return X_train2, y_train

In [6]:
input_folder = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/'

df_train = pd.read_csv(input_folder + 'train.csv')
X_train, y_train = get_train()

X_train['qid1'] = df_train['qid1']
X_train['qid2'] = df_train['qid2']
X_traintemp = get_temporal_pattern(X_train)

Training data shape: (404290, 210)


In [7]:
X_tr = X_traintemp.iloc[:360000, :]
X_val = X_traintemp.iloc[360000:, :]

X_tr.drop(['qid1', 'qid2', 'qmax', 'dupe_rate'], axis = 1, inplace = True)
X_val.drop(['qid1', 'qid2', 'qmax', 'dupe_rate'], axis = 1, inplace = True)

y_tr = X_tr['is_duplicate']
X_tr.drop(['is_duplicate'], axis = 1, inplace = True)

y_val = X_val['is_duplicate']
X_val.drop(['is_duplicate'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [8]:
params = {
    'seed': 1337,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'eta': 0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'min_child_weight': 1,
    'nthread': 6,
    'tree_method': 'hist',
    }

t = time.time()
dtrain = xgb.DMatrix(X_tr, label = y_tr)
dval = xgb.DMatrix(X_val, label = y_val)
watchlist = [(dtrain, 'train'), (dval, 'valid')]

print('Start training...')
gbm = xgb.train(params, dtrain, 100000, watchlist, 
                early_stopping_rounds = 350, verbose_eval = 250)

print('Start predicting...')
val_pred = gbm.predict(xgb.DMatrix(X_val), ntree_limit=gbm.best_ntree_limit)
score = log_loss(y_val, val_pred)
print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)

Start training...
[0]	train-logloss:0.674218	valid-logloss:0.67017
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 350 rounds.
[250]	train-logloss:0.392507	valid-logloss:0.324946
[500]	train-logloss:0.374015	valid-logloss:0.315331
[750]	train-logloss:0.36287	valid-logloss:0.310121
[1000]	train-logloss:0.353967	valid-logloss:0.307269
[1250]	train-logloss:0.346235	valid-logloss:0.304677
[1500]	train-logloss:0.339551	valid-logloss:0.303332
[1750]	train-logloss:0.333347	valid-logloss:0.301864
[2000]	train-logloss:0.327589	valid-logloss:0.3009
[2250]	train-logloss:0.322191	valid-logloss:0.300018
[2500]	train-logloss:0.317015	valid-logloss:0.299543
[2750]	train-logloss:0.311972	valid-logloss:0.298753
[3000]	train-logloss:0.307197	valid-logloss:0.298148
[3250]	train-logloss:0.30253	valid-logloss:0.297568
[3500]	train-logloss:0.29814	valid-logloss:0.2971
[3750]	train-logloss:0.293853	valid-logloss:0.296

In [None]:
predict_test('XGB_nooversampling')