In [1]:
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from lightgbm import LGBMClassifier
import optuna

In [2]:
# Load the 20 newsgroups corpus through scikit-learn's fetcher.
data = fetch_20newsgroups()

# Keep only the first 5000 documents and their labels to keep the search fast.
X, y = data['data'][:5000], data['target'][:5000]

In [3]:
# Text-classification pipeline: TF-IDF features feeding a LightGBM classifier.
# Hyperparameters of both steps are addressed as '<step>__<param>' via set_params.
#
# The objective is spelled out as 'multiclass'. The previous value, 'f1_score',
# is not a LightGBM objective (F1 is an evaluation metric, not a training loss);
# depending on the LightGBM version it is either silently replaced by
# 'multiclass' for multi-class targets or rejected outright.
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    # n_jobs=1 keeps each classifier single-threaded; presumably because the
    # cross-validation that evaluates this pipeline is itself run in parallel.
    ('lgbc', LGBMClassifier(objective='multiclass', class_weight='balanced', n_jobs=1))])

In [4]:
def objective(trial):
    """Optuna objective: negated mean 8-fold CV score of the pipeline for one trial.

    Samples one hyperparameter configuration for the TF-IDF vectorizer and the
    LightGBM classifier, applies it to the shared ``model`` pipeline and scores
    it with 8-fold cross-validation on ``X`` / ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ``-mean(cross_val_score(...))``. The score is negated so that a
        minimizing study maximizes the cross-validation score. No ``scoring``
        argument is passed, so the estimator's default scorer is used.
    """
    # Checkpoint the study that owns this trial before the (slow) evaluation.
    # Using trial.study instead of a notebook-level `study` variable means the
    # function does not depend on a global defined in a later cell and always
    # pickles the study that is actually driving it.
    joblib.dump(trial.study, 'study.pkl')

    # Keys follow the Pipeline '<step>__<param>' convention; the suggest calls
    # run in the order written, one per hyperparameter.
    params = {
        'tfidf__analyzer': trial.suggest_categorical('tfidf__analyzer', ['word', 'char', 'char_wb']),
        'tfidf__lowercase': trial.suggest_categorical('tfidf__lowercase', [False, True]),
        'tfidf__max_features': trial.suggest_int('tfidf__max_features', 500, 10_000),
        'lgbc__num_leaves': trial.suggest_int('lgbc__num_leaves', 2, 150),
        'lgbc__max_depth': trial.suggest_int('lgbc__max_depth', 2, 100),
        'lgbc__n_estimators': trial.suggest_int('lgbc__n_estimators', 10, 200),
        'lgbc__subsample_for_bin': trial.suggest_int('lgbc__subsample_for_bin', 2000, 300_000),
        'lgbc__min_child_samples': trial.suggest_int('lgbc__min_child_samples', 20, 500),
        'lgbc__reg_alpha': trial.suggest_uniform('lgbc__reg_alpha', 0.0, 1.0),
        'lgbc__colsample_bytree': trial.suggest_uniform('lgbc__colsample_bytree', 0.6, 1.0),
        'lgbc__learning_rate': trial.suggest_loguniform('lgbc__learning_rate', 1e-3, 1e-1),
    }

    # Note: this reconfigures the shared module-level pipeline in place.
    model.set_params(**params)

    # Folds are evaluated in parallel across all available cores (n_jobs=-1).
    return -np.mean(cross_val_score(model, X, y, cv=8, n_jobs=-1))

In [5]:
# Resume a previously checkpointed study if one exists, otherwise start fresh.
if os.path.isfile('study.pkl'):
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code - only load a 'study.pkl' that this notebook produced.
    study = joblib.load('study.pkl')
else:
    study = optuna.create_study()

try:
    # Keep running trials until the timeout (in seconds) expires.
    study.optimize(objective, timeout=3600)
finally:
    # The objective only checkpoints at the *start* of each trial, so without
    # this final dump the result of the last trial would never reach disk.
    # `finally` also covers a manual interrupt of the cell.
    joblib.dump(study, 'study.pkl')

[I 2019-02-25 21:01:12,960] Finished a trial resulted in value: -0.40009081233137944. Current best value is -0.40009081233137944 with parameters: {'tfidf__analyzer': 'char_wb', 'tfidf__lowercase': False, 'tfidf__max_features': 8674, 'lgbc__num_leaves': 136, 'lgbc__max_depth': 57, 'lgbc__n_estimators': 188, 'lgbc__subsample_for_bin': 113091, 'lgbc__min_child_samples': 336, 'lgbc__reg_alpha': 0.06067536960030162, 'lgbc__colsample_bytree': 0.710570015765403, 'lgbc__learning_rate': 0.05996568102968543}.
[I 2019-02-25 21:02:41,766] Finished a trial resulted in value: -0.4746442321600257. Current best value is -0.4746442321600257 with parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 8671, 'lgbc__num_leaves': 125, 'lgbc__max_depth': 48, 'lgbc__n_estimators': 144, 'lgbc__subsample_for_bin': 140009, 'lgbc__min_child_samples': 199, 'lgbc__reg_alpha': 0.4490141244341166, 'lgbc__colsample_bytree': 0.6721984237674553, 'lgbc__learning_rate': 0.0943053801787786

[I 2019-02-25 21:23:35,573] Finished a trial resulted in value: -0.29386696092105635. Current best value is -0.7156505349501154 with parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 9969, 'lgbc__num_leaves': 65, 'lgbc__max_depth': 29, 'lgbc__n_estimators': 19, 'lgbc__subsample_for_bin': 9309, 'lgbc__min_child_samples': 22, 'lgbc__reg_alpha': 0.7097473879672915, 'lgbc__colsample_bytree': 0.7875704223990483, 'lgbc__learning_rate': 0.03605652961575837}.
[I 2019-02-25 21:24:23,192] Finished a trial resulted in value: -0.30128933881698994. Current best value is -0.7156505349501154 with parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 9969, 'lgbc__num_leaves': 65, 'lgbc__max_depth': 29, 'lgbc__n_estimators': 19, 'lgbc__subsample_for_bin': 9309, 'lgbc__min_child_samples': 22, 'lgbc__reg_alpha': 0.7097473879672915, 'lgbc__colsample_bytree': 0.7875704223990483, 'lgbc__learning_rate': 0.03605652961575837}.
[I 2019-02

[I 2019-02-25 21:36:06,919] Finished a trial resulted in value: -0.4061360938676233. Current best value is -0.7190499407225581 with parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 7020, 'lgbc__num_leaves': 119, 'lgbc__max_depth': 17, 'lgbc__n_estimators': 38, 'lgbc__subsample_for_bin': 32671, 'lgbc__min_child_samples': 20, 'lgbc__reg_alpha': 0.8551167951265838, 'lgbc__colsample_bytree': 0.7852068815310277, 'lgbc__learning_rate': 0.023043160872099802}.
[I 2019-02-25 21:37:38,459] Finished a trial resulted in value: -0.3383337351765082. Current best value is -0.7190499407225581 with parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 7020, 'lgbc__num_leaves': 119, 'lgbc__max_depth': 17, 'lgbc__n_estimators': 38, 'lgbc__subsample_for_bin': 32671, 'lgbc__min_child_samples': 20, 'lgbc__reg_alpha': 0.8551167951265838, 'lgbc__colsample_bytree': 0.7852068815310277, 'lgbc__learning_rate': 0.023043160872099802}.
[I 201

[I 2019-02-25 21:54:42,867] Finished a trial resulted in value: -0.6212739489742806. Current best value is -0.7190499407225581 with parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 7020, 'lgbc__num_leaves': 119, 'lgbc__max_depth': 17, 'lgbc__n_estimators': 38, 'lgbc__subsample_for_bin': 32671, 'lgbc__min_child_samples': 20, 'lgbc__reg_alpha': 0.8551167951265838, 'lgbc__colsample_bytree': 0.7852068815310277, 'lgbc__learning_rate': 0.023043160872099802}.
[I 2019-02-25 21:55:32,785] Finished a trial resulted in value: -0.36249154672996553. Current best value is -0.7190499407225581 with parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 7020, 'lgbc__num_leaves': 119, 'lgbc__max_depth': 17, 'lgbc__n_estimators': 38, 'lgbc__subsample_for_bin': 32671, 'lgbc__min_child_samples': 20, 'lgbc__reg_alpha': 0.8551167951265838, 'lgbc__colsample_bytree': 0.7852068815310277, 'lgbc__learning_rate': 0.023043160872099802}.
[I 20

In [6]:
# Best objective value recorded by the study. The objective returns the negated
# mean cross-validation score, so the number printed here is negative.
print('best_value:', study.best_value)

best_value: -0.7190499407225581


In [7]:
# Apply the best hyperparameters found by the study, then refit the pipeline
# on all of X / y. set_params returns the pipeline itself, so the calls chain
# and the fitted pipeline is still the cell's displayed value.
model.set_params(**study.best_params).fit(X, y)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=7020, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=32671,
        subsample_freq=0))])