In [1]:
import gc
import glob
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import utils
import utils_fe
from gbm_pipeline import GBMPipeline
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
src_features = '/home/w/Projects/Toxic/data/features/'
run_kfold = True
split_for_validation = True

if run_kfold:
    prefix = 'KFold'
else:
    prefix = 'Bag'


train = pd.read_pickle("../data/train_basic_clean.pkl")
test = pd.read_pickle("../data/test_basic_clean.pkl")
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


data_tokenized = pd.read_pickle(src_features + 'data_TokenizedSentences196.pkl')
data_badwords300 = pd.read_pickle(src_features + 'data_Binary300Badwords.pkl')
data_badwordsCount = pd.read_pickle(src_features + 'data_BadwordsCount.pkl')
data_textStatistics = pd.read_pickle(src_features + 'data_TextStatistics.pkl')
data_transformations = pd.read_pickle(src_features + 'data_TransformationsFeats20dim_SVDLSA.pkl')

X = pd.concat([data_tokenized, data_badwords300, data_textStatistics, data_transformations], axis=1)
X['badwordsCount'] = data_badwordsCount

X_train = X.iloc[:train.shape[0], :]
X_test = X.iloc[train.shape[0]:, :]

features = np.setdiff1d(X_train.columns, target_columns)


del X, test
del data_tokenized, data_badwords300, data_badwordsCount
del data_textStatistics, data_transformations
gc.collect()

41

In [None]:
X_train = X_train.iloc[:train.shape[0], :][:1000]
train = train[:1000]

In [5]:
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 10,
    'min_child_weight': 20,
    'subsample': 0.8,
    'lambda': 0,
    'tree_method': 'hist',
    'nthread': 10,
    'silent': True,
}

lgb_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 255,
    'max_depth': 10,
    'min_child_weight': 20,
    'subsample': 0.8,
    'reg_lambda': 0,
    'nthread': 10,
}


if split_for_validation:
    
    train_params = {
        'boost_round': 10000,
        'stopping_rounds': 50,
        'verbose_eval': False,
    }
    
else:
    
    train_params = {
        'boost_round': 161,
        'stopping_rounds': 50,
        'verbose_eval': False,
    }

    
pipeline_params = {
    'use_lgb': True,
    'predict_test': True,
    'seed': 1337,
    'shuffle': True,
    'verbose': True,
    'run_save_name': 'LGB_{}_Stats&Transforms'.format(prefix),
    'save_model': False,
    'save_history': False,
    'save_statistics': False,
    'output_statistics': True,
    'output_importance': True,
}


XGB_pipeline = GBMPipeline(
    use_lgb=pipeline_params['use_lgb'],
    predict_test=pipeline_params['predict_test'],
    seed=pipeline_params['seed'],
    shuffle=pipeline_params['shuffle'],
    verbose=pipeline_params['verbose'],
    run_save_name=pipeline_params['run_save_name'],
    save_model=pipeline_params['save_model'],
    save_history=pipeline_params['save_history'],
    save_statistics=pipeline_params['save_statistics'],
    output_statistics=pipeline_params['output_statistics'],
    output_importance=pipeline_params['output_importance'],
)


In [6]:
if pipeline_params['use_lgb']:
    gbm_params = lgb_params
else:
    gbm_params = xgb_params


if pipeline_params['predict_test']:
    val_preds, test_preds, gbm = XGB_pipeline.fold_run(X_train[features], y_train=train[target_columns],
                                           X_test=X_test[features],
                                           model_params=gbm_params,
                                           train_params=train_params,
                                           output_submission=True)
else:
    val_preds, gbm = XGB_pipeline.fold_run(X_train[features], y_train=X_train[targets],
                               model_params=gbm_params,
                               train_params=train_params)


utils.save_parameter_dict(
    'checkpoints/{0}/{0}_gbm_parameters.txt'.format(pipeline_params['run_save_name']), gbm_params)
utils.save_parameter_dict('checkpoints/{0}/{0}_train_parameters.txt'.format(
    pipeline_params['run_save_name']), train_params)
utils.save_parameter_dict('checkpoints/{0}/{0}_pipeline_parameters.txt'.format(
    pipeline_params['run_save_name']), pipeline_params)

Using LightGBM
OOF train predictions shape: (95851, 6)
X_train shape: (95851, 589)
OOF test predictions shape: (226998, 6, 5)
X_test shape: (226998, 589)
Running KFold run with 5 folds
Start training with parameters: {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss', 'learning_rate': 0.05, 'num_leaves': 255, 'max_depth': 10, 'min_child_weight': 20, 'subsample': 0.8, 'reg_lambda': 0, 'nthread': 10} 
 

Training model for column: toxic
Minimum validation split loss for current fold/bag: 0.1462012888106474 

Seconds it took to train the model: 22.12356400489807 

Best iterations: [453] 

Visualize model feature importance.
Predicting on test data.
Training model for column: severe_toxic
Minimum validation split loss for current fold/bag: 0.024794101812459085 

Seconds it took to train the model: 31.673904180526733 

Best iterations: [453, 173] 

Visualize model feature importance.
Predicting on test data.
Training model for column: obscene
Minimu

Predicting on test data.
Training model for column: severe_toxic
Minimum validation split loss for current fold/bag: 0.025027475998357838 

Seconds it took to train the model: 361.18121314048767 

Best iterations: [453, 173, 272, 315, 319, 232, 492, 256, 299, 378, 360, 210, 459, 215, 314, 301, 259, 200, 353, 214, 370, 205, 345, 252, 562, 224] 

Visualize model feature importance.
Predicting on test data.
Training model for column: obscene
Minimum validation split loss for current fold/bag: 0.06486630813522588 

Seconds it took to train the model: 376.1468462944031 

Best iterations: [453, 173, 272, 315, 319, 232, 492, 256, 299, 378, 360, 210, 459, 215, 314, 301, 259, 200, 353, 214, 370, 205, 345, 252, 562, 224, 245] 

Visualize model feature importance.
Predicting on test data.
Training model for column: threat
Minimum validation split loss for current fold/bag: 0.012478882900373788 

Seconds it took to train the model: 388.1970884799957 

Best iterations: [453, 173, 272, 315, 319, 232