In [None]:
import difflib
import gc
import multiprocessing
import os
import time
from collections import Counter

import lightgbm as lgb
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Kept last so that any names it re-exports shadow the imports above, as before.
from models_utils_fe import *

%matplotlib inline

In [None]:
# Directories holding the precomputed feature files.
# NOTE(review): these are absolute, machine-specific paths; adjust them to the
# local copy of the data before running.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'


def _read_float32(csv_name):
    """Read the feature CSV `csv_name` from `src` and cast every column to float32."""
    return pd.read_csv(src + csv_name).astype('float32')


wmd = _read_float32('train_WMD_cleaned_stemmed.csv')
# +inf values in this file are capped at a large finite sentinel.
wmd = wmd.replace(np.inf, 1000)

skip_thought = _read_float32('train_skipthoughts_Alex_distances.csv')
compression = _read_float32('train_LZMAcompression_distance.csv')
edit = _read_float32('train_EDITdistance.csv')
moments = _read_float32('train_doc2vec_moments.csv')
networks_NER = _read_float32('train_networkfeats_NER.csv')

# Only the label column is used from this file, so avoid parsing (and holding
# in memory) every other column.
y_train = pd.read_csv(feats_src + '/the_1owl/owl_train.csv', usecols=['is_duplicate'])

# NOTE(review): unpickling executes arbitrary code; only load a pickle that was
# produced locally / comes from a trusted source.
X_train = pd.read_pickle('Xtrain_500bestCols.pkl')

# A column-wise concat aligns on the index and silently pads with NaN when the
# frames differ in length, so verify the row counts explicitly first.
extra_feats = [wmd, skip_thought, compression, edit, moments, networks_NER]
row_counts = {len(frame) for frame in [X_train, y_train] + extra_feats}
if len(row_counts) != 1:
    raise ValueError('Feature files and labels disagree on row count: {}'.format(sorted(row_counts)))

X_train = pd.concat([X_train] + extra_feats, axis=1)

# Release the individual frames now that they have been merged into X_train.
del wmd, skip_thought, compression, edit, moments, networks_NER, extra_feats, row_counts
gc.collect()

In [None]:
# Columns used as the basis for the derived features built below.
best_cols = [
    'min_pagerank_sp_network_weighted',
    'norm_wmd',
    'word_match',
    '1wl_tfidf_l2_euclidean',
    'm_vstack_svd_q1_q1_euclidean',
    '1wl_tfidf_cosine',
    'sk_bi_skew_q2vec',
    'm_q1_q2_tf_svd0',
    'sk_bi_skew_q1vec',
    'skew_q2vec',
    'trigram_tfidf_cosine',
    'sk_uni_skew_q2vec',
    'sk_bi_canberra_distance',
    'question1_3',
    'sk_uni_skew_q1vec',
    'sk_uni_kur_q2vec',
    'min_eigenvector_centrality_np_network_weighted',
    'avg_world_len2',
    'z_word_match',
    'sk_uni_kur_q1vec',
    'skew_doc2vec_pretrained_lemmat']

# Set to True to min-max scale the derived features before appending them.
rescale = False

# The four helpers below are not defined in this notebook (presumably they come
# from `models_utils_fe` — verify); their exact semantics are not visible here.
X_bin = bin_numerical(X_train, best_cols, 0.1)
X_grouped = group_featbyfeat(X_train, best_cols, 'mean')
X_grouped2 = group_featbyfeat(X_train, best_cols, 'sum')
# Only the first five columns are passed to the combination helper.
X_combinations = feature_combinations(X_train, best_cols[:5])

X_additional = pd.concat([X_bin, X_grouped, X_grouped2, X_combinations], axis = 1)
X_additional = drop_duplicate_cols(X_additional)
# Replace +inf and missing values with finite sentinels (in that order).
# NOTE(review): -inf is left untouched here — confirm it cannot occur.
X_additional = X_additional.replace(np.inf, 999).replace(np.nan, -999)

if rescale:
    # fit_transform returns a bare array; rebuild the frame with the ORIGINAL
    # index and column labels so the column-wise concat below stays row-aligned
    # even when X_train does not carry a default 0..n-1 index.
    X_additional = pd.DataFrame(
        MinMaxScaler().fit_transform(X_additional),
        index = X_additional.index,
        columns = X_additional.columns,
    )

# NOTE: this rebinds X_train, so re-running the cell appends the derived
# columns a second time; re-run the loading cell first.
X_train = pd.concat([X_train, X_additional], axis = 1)
print('Final training data shape:', X_train.shape)

# Free the intermediates now that they are part of X_train.
del X_bin, X_grouped, X_grouped2, X_combinations, X_additional
gc.collect()

In [None]:
# LightGBM training parameters.
# The former 'subsample' and 'colsample_bytree' entries were removed: they are
# aliases of 'bagging_fraction' and 'feature_fraction', and LightGBM keeps the
# primary name when both are given, so the alias values (0.7 / 0.5) were
# ignored while suggesting a different configuration.
params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : {'binary_logloss'},
    'learning_rate' : 0.02,
    'feature_fraction' : 0.7,
    'bagging_fraction': 0.9,
    'bagging_freq': 100,
    'num_leaves' : 255,
    'max_depth': 12,
    'min_data_in_leaf': 20,
    'silent': 1,
    'random_state': 1337,
    'verbose': 1,
    'nthread': 9,
}

# Create the output directory BEFORE training so that a missing folder cannot
# make save_model fail after the (long) training run has finished.
model_dir = 'saved_models/LGBM'
os.makedirs(model_dir, exist_ok = True)

t = time.time()
# Stratified 80/20 hold-out split with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, stratify = y_train,
                                            test_size = 0.2, random_state = 111)
y_tr_values = y_tr.is_duplicate.values
y_val_values = y_val.is_duplicate.values
lgb_train = lgb.Dataset(X_tr, y_tr_values)
lgb_val = lgb.Dataset(X_val, y_val_values)

print('Start training...')
# The round limit is deliberately huge; early stopping on the validation set
# (150 rounds without improvement) decides when training actually ends.
gbm = lgb.train(params, lgb_train, num_boost_round = 100000, valid_sets = lgb_val,
                early_stopping_rounds = 150, verbose_eval = 100)

print('Start predicting...')
val_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
# Score against the same 1-D label array that was given to the Dataset.
score = log_loss(y_val_values, val_pred)
print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
# The validation loss is embedded in the file name.
gbm.save_model(os.path.join(model_dir, 'LGBM_500bestexperiments_loss{:.5f}.txt'.format(score)))