In [1]:
import gc
import glob
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import utils
import utils_fe
from gbm_pipeline import GBMPipeline
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
# Run configuration flags.
use_lgb = False  # forwarded to GBMPipeline; also decides which parameter dict is trained with
run_kfold = True  # in the visible cells this only selects the run-name prefix
split_for_validation = True  # chooses between the two train_params presets

# Tag the run name with the training scheme.
prefix = 'KFold' if run_kfold else 'Bag'

# NOTE(review): the name always starts with 'XGB_', even when use_lgb is True.
run_name = 'XGB_{}_TfidfBasic'.format(prefix)
# NOTE(review): absolute, machine-specific path; it is not referenced by the
# visible cells (the loads below use relative paths).
src = '/home/w/Projects/Toxic/data/features/'

# Training frame; only its row count and the label columns are used here.
# Pickle files should only be loaded from trusted locations.
train = pd.read_pickle("../data/train_basic_clean.pkl")
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Combined feature matrix.
# Presumably the train rows come first, followed by the test rows - TODO confirm.
X = pd.read_pickle("../data/features/data_Tfidf_2GramWord.pkl")

# The first len(train) rows are treated as train features, the remainder as test.
n_train_rows = train.shape[0]
X_train, X_test = X[:n_train_rows, :], X[n_train_rows:, :]

# Drop the name of the combined matrix and run a garbage-collection pass.
del X
gc.collect()

11

In [None]:
# Hyperparameter values that both booster configurations share.
LEARNING_RATE = 0.05
MAX_DEPTH = 10
MIN_CHILD_WEIGHT = 20
SUBSAMPLE = 0.8
L2_REGULARISATION = 0

# XGBoost configuration (selected when use_lgb is False).
# NOTE(review): 'silent' is documented as deprecated in favour of 'verbosity'
# in recent XGBoost releases - confirm against the installed version.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': LEARNING_RATE,
    'max_depth': MAX_DEPTH,
    'min_child_weight': MIN_CHILD_WEIGHT,
    'subsample': SUBSAMPLE,
    'lambda': L2_REGULARISATION,
    'tree_method': 'hist',
    'nthread': 4,
    'silent': True,
}

# LightGBM configuration (selected when use_lgb is True).
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# the bagging frequency is non-zero - confirm whether GBMPipeline sets it.
# NOTE(review): thread count differs from the XGBoost config (10 vs 4) -
# confirm this is intended.
lgb_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': LEARNING_RATE,
    'num_leaves': 255,
    'max_depth': MAX_DEPTH,
    'min_child_weight': MIN_CHILD_WEIGHT,
    'subsample': SUBSAMPLE,
    'reg_lambda': L2_REGULARISATION,
    'nthread': 10,
}


# Training-loop settings passed to the pipeline as train_params.
# With a validation split: a 10000-round cap and a log line every 50 rounds.
# Without one: a fixed 161 rounds and verbose_eval disabled.
# NOTE(review): 161 is a magic number - presumably the best iteration found in
# an earlier validated run; confirm and document its origin.
train_params = {
    'boost_round': 10000 if split_for_validation else 161,
    'stopping_rounds': 50,
    'verbose_eval': 50 if split_for_validation else False,
}

    
# Constructor options for GBMPipeline. The dict is also read when picking the
# booster parameters and is written out alongside the other parameter files.
pipeline_params = {
    'use_lgb': use_lgb,
    'predict_test': True,
    'seed': 1337,
    'shuffle': True,
    'verbose': True,
    'run_save_name': run_name,
    'save_model': False,
    'save_history': False,
    'save_statistics': False,
    'output_statistics': True,
    'output_importance': True,
}

# Every key above is a GBMPipeline keyword argument of the same name, so the
# dict is unpacked directly instead of repeating each entry.
XGB_pipeline = GBMPipeline(**pipeline_params)


In [None]:
# Pick the booster parameter set that matches the pipeline's use_lgb flag.
gbm_params = lgb_params if pipeline_params['use_lgb'] else xgb_params


# Run the pipeline on the training features, using the label columns of
# `train` as targets. When predict_test is set, the test features are passed
# as well and an extra `test_preds` result is returned.
# NOTE(review): bag_run is called regardless of run_kfold, which only changes
# the run-name prefix - confirm whether a k-fold entry point should be used
# when run_kfold is True.
if pipeline_params['predict_test']:
    val_preds, test_preds, gbm = XGB_pipeline.bag_run(
        X_train, y_train=train[target_columns],
        X_test=X_test,
        model_params=gbm_params,
        train_params=train_params,
        output_submission=True)
else:
    # The labels come from `train`: X_train holds features only, and the name
    # `targets` used here previously is not defined anywhere in the notebook.
    val_preds, gbm = XGB_pipeline.bag_run(
        X_train, y_train=train[target_columns],
        model_params=gbm_params,
        train_params=train_params)


# Hand each parameter dict used for this run to utils.save_parameter_dict,
# with a target path under checkpoints/<run_save_name>/.
run_save_name = pipeline_params['run_save_name']
parameter_dumps = (
    ('checkpoints/{0}/{0}_gbm_parameters.txt', gbm_params),
    ('checkpoints/{0}/{0}_train_parameters.txt', train_params),
    ('checkpoints/{0}/{0}_pipeline_parameters.txt', pipeline_params),
)
for path_template, parameter_dict in parameter_dumps:
    utils.save_parameter_dict(path_template.format(run_save_name), parameter_dict)

Using XGBoost
Running bagging (currently just one bag).
Start training with parameters: {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'eta': 0.05, 'max_depth': 10, 'min_child_weight': 20, 'subsample': 0.8, 'lambda': 0, 'tree_method': 'hist', 'nthread': 4, 'silent': True} 
 

X_train shape: (95851, 668367)
X_test shape: (226998, 668367)
Splitting data - validation split size: 0.2, split seed: 1337
Training model for column: toxic
[0]	train-logloss:0.654975	valid-logloss:0.655032
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.199078	valid-logloss:0.202288
