In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
import multiprocessing
import difflib
import time
import gc

import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import lightgbm as lgb

import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
def get_train():
    keras_q1 = np.load('../../data/transformed/keras_tokenizer/train_q1_transformed.npy')
    keras_q2 = np.load('../../data/transformed/keras_tokenizer/train_q2_transformed.npy')
    xgb_feats = pd.read_csv('../../data/features/the_1owl/owl_train.csv')
    abhishek_feats = pd.read_csv('../../data/features/abhishek/train_features.csv',
                              encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/other_features/text_features_train.csv',
                            encoding = 'ISO-8859-1')
    img_feats = pd.read_csv('../../data/features/other_features/img_features_train.csv')
    srk_feats = pd.read_csv('../../data/features/srk/SRK_grams_features_train.csv')

    xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis = 1, inplace = True)
    y_train = xgb_feats['is_duplicate']
    xgb_feats = xgb_feats.iloc[:, 8:]
    
    X_train2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)
    #X_train2 = np.concatenate([xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)
    #X_train2 = np.concatenate([xgb_feats], axis = 1)
    for i in range(X_train2.shape[1]):
        if np.sum(X_train2[:, i] == y_train.values) == X_train2.shape[0]:
            print('LEAK FOUND')
    
    X_train2 = X_train2.astype('float32')
    X_train2 = pd.DataFrame(X_train2)
    X_train2['is_duplicate'] = y_train
    print('Training data shape:', X_train2.shape)
    return X_train2, y_train

def get_test():
    keras_q1 = np.load('../../data/transformed/keras_tokenizer/test_q1_transformed.npy')
    keras_q2 = np.load('../../data/transformed/keras_tokenizer/test_q2_transformed.npy')
    xgb_feats = pd.read_csv('../../data/features/the_1owl/owl_test.csv')
    abhishek_feats = pd.read_csv('../../data/features/abhishek/test_features.csv',
                              encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/other_features/text_features_test.csv',
                            encoding = 'ISO-8859-1')
    img_feats = pd.read_csv('../../data/features/other_features/img_features_test.csv')
    srk_feats = pd.read_csv('../../data/features/srk/SRK_grams_features_test.csv')

    xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis = 1, inplace = True)
    xgb_feats = xgb_feats.iloc[:, 5:]
    
    X_test2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)
    #X_test2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats], axis = 1)
    
    X_test2 = X_test2.astype('float32')
    X_test2 = pd.DataFrame(X_test2)
    print('Test data shape:', X_test2.shape)
    return X_test2

def predict_test(model_name):
    X_test = get_test()
    gbm = lgb.Booster(model_file='saved_models/LGBM/{}.txt'.format(model_name))
    test_preds = gbm.predict(lgb.Dataset(X_test))

    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    sample_sub = pd.read_csv(sub_src + 'sample_submission.csv')
    sample_sub['is_duplicate'] = test_preds
    sample_sub.to_csv(sub_src + '{}.csv'.format(model_name), index = False)
    return

In [3]:
def oversample(X_train, y_train):
    print('Oversampling negative y according to anokas method')
    pos_train = X_train[X_train['is_duplicate'] == 1]
    neg_train = X_train[X_train['is_duplicate'] == 0]
    p = 0.165
    scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
    while scale > 1:
        neg_train = pd.concat([neg_train, neg_train])
        scale -=1
    neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
    X_train = pd.concat([pos_train, neg_train])
    y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

    X_train = X_train.astype('float32')
    X_train.drop(['is_duplicate'], axis = 1, inplace = True)
    return X_train, y_train

def oversample2(X_train):
    print('Oversampling negative y according to SRK method')
    y_train = np.array(X_train["is_duplicate"])
    X_train.drop(['is_duplicate'], axis = 1, inplace = True)
    X_train_dup = X_train[y_train==1]
    X_train_non_dup = X_train[y_train==0]

    X_train = np.vstack([X_train_non_dup, X_train_dup, X_train_non_dup, X_train_non_dup])
    y_train = np.array([0]*X_train_non_dup.shape[0] + [1]*X_train_dup.shape[0] + [0]*X_train_non_dup.shape[0] + [0]*X_train_non_dup.shape[0])
    del X_train_dup
    del X_train_non_dup
    print("Mean target rate : ",y_train.mean())
    X_train = X_train.astype('float32')
    return X_train, y_train

def kappa(preds, y):
    score = []
    a = 0.165 / 0.37
    b = (1 - 0.165) / (1 - 0.37)
    for pp,yy in zip(preds, y.get_label()):
        score.append(a * yy * np.log (pp) + b * (1 - yy) * np.log(1-pp))
    score = -np.sum(score) / len(score)
    return 'kappa', score

def get_temporal_pattern(df2):
    df = df2.copy()
    df["qmax"] = df.apply( lambda row: max(row["qid1"], row["qid2"]), axis=1 )
    df = df.sort_values(by=["qmax"], ascending=True)
    df["dupe_rate"] = df.is_duplicate.rolling(window=500, min_periods=500).mean()
    df["timeline"] = np.arange(df.shape[0]) / float(df.shape[0])
    return df

In [4]:
def train_lgb(cv = False):
    
    t = time.time()
    params = {
        'task' : 'train',
        'boosting_type' : 'gbdt',
        'objective' : 'binary',
        'metric' : {'binary_logloss'},
        'learning_rate' : 0.05,
        'feature_fraction' : 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 100,
        'num_leaves' : 200,
        'max_depth': 4,
        'min_data_in_leaf': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'silent': 1,
        'random_state': 1337,
        'verbose': 1,
        'nthread': 6,
    }
    
    X_train, _ = get_train()
    X_train, y_train = oversample2(X_train)
    if cv:
        lgb_train = lgb.Dataset(X_train, y_train)
        hist = lgb.cv(params, lgb_train, num_boost_round = 100000, nfold = 5,
                      stratified = True, early_stopping_rounds = 350, verbose_eval = 250,
                      seed = 1337)
        del X_train, y_train
        gc.collect()
        print('Time it took to train in CV manner:', time.time() - t)
        return hist
    
    else:
        X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, stratify = y_train,
                                                    test_size = 0.2, random_state = 111)
        del X_train, y_train
        gc.collect()
        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_val = lgb.Dataset(X_val, y_val)

        print('Start training...')
        gbm = lgb.train(params, lgb_train, num_boost_round = 100000, valid_sets = lgb_val,
                        early_stopping_rounds = 350, verbose_eval = 500)

        print('Start predicting...')
        val_pred = gbm.predict(lgb.Dataset(X_val), num_iteration=gbm.best_iteration)
        score = log_loss(y_val, val_pred)
        print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
        
        del X_tr, X_val, y_tr, y_val
        gc.collect()
        return gbm
    

def run_lgbm(model_name, train = True, test = False, cv = False):
    if cv:
        gbm_hist = train_lgb(True)
        return gbm_hist
    if train:
        gbm = train_lgb()
        gbm.save_model('saved_models/LGBM/{}.txt'.format(model_name))
        if test:
            predict_test('{}'.format(model_name))
        return gbm

In [None]:
gbm = run_lgbm(train = True)

In [5]:
input_folder = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/'

df_train = pd.read_csv(input_folder + 'train.csv')
X_train, y_train = get_train()

X_train['qid1'] = df_train['qid1']
X_train['qid2'] = df_train['qid2']
X_traintemp = get_temporal_pattern(X_train)

Training data shape: (404290, 247)


In [10]:
X_tr = X_traintemp.iloc[:360000, :]
X_val = X_traintemp.iloc[:360000, :]

X_tr.drop(['qid1', 'qid2', 'qmax', 'dupe_rate'], axis = 1, inplace = True)
X_val.drop(['qid1', 'qid2', 'qmax', 'dupe_rate'], axis = 1, inplace = True)

X_tr, y_tr = oversample2(X_tr)
y_val = X_val['is_duplicate']
X_val.drop(['is_duplicate'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Oversampling negative y according to SRK method


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Mean target rate :  0.175685821108


In [None]:
params = {
        'task' : 'train',
        'boosting_type' : 'gbdt',
        'objective' : 'binary',
        'metric' : {'binary_logloss'},
        'learning_rate' : 0.05,
        'feature_fraction' : 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 100,
        'num_leaves' : 200,
        'max_depth': 4,
        'min_data_in_leaf': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'silent': 1,
        'random_state': 1337,
        'verbose': 1,
        'nthread': 6,
    }

t = time.time()
lgb_train = lgb.Dataset(X_tr, y_tr)
lgb_val = lgb.Dataset(X_val, y_val)

print('Start training...')
gbm = lgb.train(params, lgb_train, num_boost_round = 100000, valid_sets = lgb_val,
                early_stopping_rounds = 350, verbose_eval = 500)

print('Start predicting...')
val_pred = gbm.predict(lgb.Dataset(X_val), num_iteration=gbm.best_iteration)
score = log_loss(y_val, val_pred)
print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)