In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib
import lightgbm as lgb
import scipy
from scipy import sparse
from pandas.core.common import SettingWithCopyWarning
import scipy.stats as sp
import pandas as pd
import numpy as np
from collections import Counter
import warnings
import time
import sys
import random
import os
import gc
import datetime

import matplotlib.pyplot as plt

In [2]:
# Directory layout, relative to the notebook's working directory.
path = '../data/'                 # label / id CSV files
data_path = '../trainTestData/'   # sparse feature matrices stored as .npz
middle_path = '../model/'         # intended location for model / submission artefacts

# Passing `names=` without `header=` makes pandas treat the first line of each
# file as data, i.e. the CSVs are expected to have no header row.
train_labels = pd.read_csv(path + 'age_train.csv', names=['uid', 'label'])
sub = pd.read_csv(path + 'age_test.csv', names=['uid'])

train_csr = sparse.load_npz(data_path + 'trainData30.npz')
test_csr = sparse.load_npz(data_path + 'testData30.npz')

# Only the label column is needed downstream; keep it as a NumPy array so it
# can be indexed with the fold index arrays produced by StratifiedKFold.
# The DataFrame is kept under its own name so `train_y` always means one thing.
train_y = train_labels['label'].values

# Fail fast on misaligned inputs instead of discovering the problem after
# (or in the middle of) an expensive training run.
assert train_csr.shape[0] == train_y.shape[0], \
    'feature rows ({}) != label rows ({})'.format(train_csr.shape[0], train_y.shape[0])
assert test_csr.shape[0] == len(sub), \
    'test feature rows ({}) != test ids ({})'.format(test_csr.shape[0], len(sub))
assert train_csr.shape[1] == test_csr.shape[1], \
    'train/test feature counts differ: {} vs {}'.format(train_csr.shape[1], test_csr.shape[1])

# Quick smoke test: uncomment to work on a small subset (train_y is already an
# array at this point, so it is sliced directly).
# train_csr, train_y = train_csr[:200], train_y[:200]
# print(train_csr.shape, test_csr.shape)

In [3]:
# One LightGBM multiclass classifier instance, re-fitted from scratch on every
# fold. `n_estimators` is only an upper bound: the early stopping configured in
# `fit` below decides how many trees are actually used.
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    metrics='multi_error',
    num_class=6,
    n_estimators=20000,
    learning_rate=0.1,
    num_leaves=512,
    max_depth=-1,
    subsample=0.95,
    colsample_bytree=0.5,
    subsample_freq=1,
    reg_alpha=1,
    reg_lambda=1,
    random_state=42,
    n_jobs=48,  # NOTE(review): presumably sized for the original training machine

    # GPU training; -1 for the platform/device ids selects LightGBM's defaults.
    gpu_platform_id=-1,
    gpu_device_id=-1,
    device='gpu',
    max_bin=255,
)

# Out-of-fold class probabilities for the training rows and fold-averaged
# class probabilities for the test rows (one column per class).
oof = np.zeros((train_csr.shape[0], 6))
sub_preds = np.zeros((test_csr.shape[0], 6))
skf = StratifiedKFold(n_splits=5, random_state=812, shuffle=True)
t = time.time()

# Make sure the output directory exists *before* training, so a missing
# directory cannot throw away a finished fold at dump time.
os.makedirs('../model/', exist_ok=True)

for index, (train_index, test_index) in enumerate(skf.split(train_csr, train_y)):
    print('Fold {}'.format(index + 1))

    # Slice each fold once and reuse the results: row-indexing a large sparse
    # matrix is not free, and the original code repeated it for every use.
    # NOTE(review): LightGBM's scikit-learn wrapper appears to treat an
    # eval_set entry that is the very same object as the training data as the
    # training set itself (no second Dataset) - confirm for the installed version.
    fold_train_x, fold_train_y = train_csr[train_index], train_y[train_index]
    fold_valid_x, fold_valid_y = train_csr[test_index], train_y[test_index]

    # NOTE(review): `early_stopping_rounds` / `verbose` are `fit` keywords in
    # the LightGBM version this notebook was run with; LightGBM 4.x expects
    # callbacks instead. Kept as-is to match the environment in use.
    lgb_model.fit(fold_train_x, fold_train_y,
                  eval_set=[(fold_train_x, fold_train_y),
                            (fold_valid_x, fold_valid_y)],
                  eval_names=['train', 'valid'],
                  early_stopping_rounds=200, verbose=10)

    # Predict with the best iteration found by early stopping.
    oof[test_index] = lgb_model.predict_proba(fold_valid_x, num_iteration=lgb_model.best_iteration_)
    sub_preds += lgb_model.predict_proba(test_csr, num_iteration=lgb_model.best_iteration_) / skf.n_splits

    # Persist the fitted fold model so predictions can be rebuilt without retraining.
    joblib.dump(lgb_model, '../model/lgb_zl_13100_2'+str(index)+'_model.pkl')

print(oof.shape, train_y.shape)
print('elapsed: {:.1f}s'.format(time.time() - t))


Fold 1
Training until validation scores don't improve for 200 rounds.
[10]	train's multi_error: 0.599182	valid's multi_error: 0.60391
[20]	train's multi_error: 0.568929	valid's multi_error: 0.577846
[30]	train's multi_error: 0.556034	valid's multi_error: 0.568211
[40]	train's multi_error: 0.54878	valid's multi_error: 0.564065
[50]	train's multi_error: 0.54332	valid's multi_error: 0.561458
[60]	train's multi_error: 0.538996	valid's multi_error: 0.559841
[70]	train's multi_error: 0.535531	valid's multi_error: 0.558614
[80]	train's multi_error: 0.532387	valid's multi_error: 0.557866
[90]	train's multi_error: 0.529215	valid's multi_error: 0.557371
[100]	train's multi_error: 0.526478	valid's multi_error: 0.557087
[110]	train's multi_error: 0.523807	valid's multi_error: 0.556761
[120]	train's multi_error: 0.521263	valid's multi_error: 0.556821
[130]	train's multi_error: 0.518757	valid's multi_error: 0.556632
[140]	train's multi_error: 0.516285	valid's multi_error: 0.556219
[150]	train's mult

Fold 4
Training until validation scores don't improve for 200 rounds.
[10]	train's multi_error: 0.599075	valid's multi_error: 0.604159
[20]	train's multi_error: 0.568899	valid's multi_error: 0.577774
[30]	train's multi_error: 0.556091	valid's multi_error: 0.568828
[40]	train's multi_error: 0.548706	valid's multi_error: 0.563604
[50]	train's multi_error: 0.543367	valid's multi_error: 0.560965
[60]	train's multi_error: 0.539044	valid's multi_error: 0.559706
[70]	train's multi_error: 0.535604	valid's multi_error: 0.558614
[80]	train's multi_error: 0.532318	valid's multi_error: 0.557958
[90]	train's multi_error: 0.529277	valid's multi_error: 0.557199
[100]	train's multi_error: 0.526392	valid's multi_error: 0.556963
[110]	train's multi_error: 0.523783	valid's multi_error: 0.5569
[120]	train's multi_error: 0.521247	valid's multi_error: 0.556652
[130]	train's multi_error: 0.518753	valid's multi_error: 0.556659
[140]	train's multi_error: 0.516395	valid's multi_error: 0.556206
[150]	train's mul

In [4]:
# Column k of the probability matrices is mapped to label k + 1 (labels are
# treated as starting at 1), so the predicted label is argmax + 1.
oof_labels = oof.argmax(axis=1) + 1
cv_final = accuracy_score(train_y, oof_labels)
print('\ncv acc:', cv_final)

test_labels = sub_preds.argmax(axis=1) + 1
sub['label'] = test_labels
# sub.to_csv(middle_path + 'sub_{}.csv'.format(cv_final), index=False)


# Start over with empty prediction buffers and a fresh splitter configured
# with the same number of splits and seed as before.
n_train_rows = train_csr.shape[0]
n_test_rows = test_csr.shape[0]
oof = np.zeros((n_train_rows, 6))
sub_preds = np.zeros((n_test_rows, 6))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=812)
t = time.time()


cv acc: 0.44456517412935326


In [5]:
# Rebuild the out-of-fold and test predictions from the fold models saved on
# disk, then persist the raw probability matrices.
#
# The buffers are (re)created here because `sub_preds` is accumulated with
# `+=`: without a local reset, re-running this cell - or running it while the
# buffers still hold earlier predictions - would save inflated probabilities.
oof = np.zeros((train_csr.shape[0], 6))
sub_preds = np.zeros((test_csr.shape[0], 6))

# `skf` uses a fixed random_state, so iterating it again yields the same
# train/validation partition that was used when the models were fitted
# (provided it has not been reconfigured in between).
for index, (train_index, test_index) in enumerate(skf.split(train_csr, train_y)):
    print('Fold {}'.format(index + 1))
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files that this notebook (or a trusted source) produced.
    lgb_model = joblib.load('../model/lgb_zl_13100_2'+str(index)+'_model.pkl')

    fold_valid_x = train_csr[test_index]
    oof[test_index] = lgb_model.predict_proba(fold_valid_x, num_iteration=lgb_model.best_iteration_)
    sub_preds += lgb_model.predict_proba(test_csr, num_iteration=lgb_model.best_iteration_) / skf.n_splits


# Column k corresponds to label k + 1, hence the "+ 1" after argmax.
cv_final = accuracy_score(train_y, np.argmax(oof, axis=1)+1)
print('\ncv acc:', cv_final)

# Create the output directory if needed so np.savetxt cannot fail on a
# missing path after all predictions have been computed.
os.makedirs('../processed/', exist_ok=True)
np.savetxt('../processed/lgboost_val_13100.txt', oof, fmt='%s', delimiter=',', newline='\n')
np.savetxt('../processed/lgboost_test_13100.txt', sub_preds, fmt='%s', delimiter=',', newline='\n')

sub['label'] = np.argmax(sub_preds, axis=1) + 1
# sub.to_csv(middle_path + 'sub_{}.csv'.format(cv_final), index=False)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5

cv acc: 0.44456517412935326
