1. データの読み込み

In [63]:
import math
import random
import time
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import transformers as T
from pathlib import Path
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import tqdm
from sklearn.feature_selection import SelectKBest, chi2
from scipy.optimize import minimize_scalar
from sklearn import naive_bayes

In [64]:
warnings.filterwarnings("ignore")

In [65]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [66]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 471
seed_torch(seed)

In [67]:
DATA_DIR = './dataset/data4'

In [68]:
def text_process(mess):
    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', 'im', 'dont', 'doin', 'ure']
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return ' '.join([word.lower() for word in nopunc.split() if word.lower() not in STOPWORDS])
#データの読み込み
df_train = pd.read_csv(DATA_DIR  +"/train.csv", index_col=0).fillna(" ")
df_test = pd.read_csv(DATA_DIR + "/test.csv", index_col=0).fillna(" ")
sub = pd.read_csv(DATA_DIR + "/sample_submit.csv", index_col=0)
df_train.title = train.text.apply(text_process)
df_test.title = test.text.apply(text_process)

In [93]:
from scipy.misc import derivative

def original_init_score(y):
    y = y.mean()
    return np.log(y/(1-y))

class focal_loss:
    def __init__(self, alpha, gamma, balance=False):
        self.alpha = alpha
        self.gamma = gamma
        self.balance = balance
        
    def focal_loss_objective(self, y_pred, trn_data):
        y_true = trn_data.get_label()
        def fl(x,t):
            p = 1/(1+np.exp(-x))
            if self.balance:
                return -(t*(1-p)**self.gamma * np.log(p) * self.alpha + p ** self.gamma * (1-t) * np.log(1-p) * (1-self.alpha)) 
            else:
                return -(t*(1-p)**self.gamma * np.log(p) + p ** self.gamma * (1-t) * np.log(1-p))
        partial_fl = lambda x: fl(x, y_true)
        grad = derivative(partial_fl, y_pred, n=1, dx=1e-6)
        hess = derivative(partial_fl, y_pred, n=2, dx=1e-6)
        return grad, hess
    def original_binary_logloss_metric(self, y_pred, trn_data):
        y_train = trn_data.get_label()
        pred = 1/(1+np.exp(-y_pred))
        loss = -(y_train * np.log(pred) + (1-y_train)*np.log(1-pred))    
        return 'original_binary_logloss', np.mean(loss), False

In [69]:
sub_prob = []
for fold in tqdm.tqdm(range(5), total=5):
#     sub = np.zeros()
    train = df_train[df_train.fold!=fold]
    val = df_train[df_train.fold==fold]
    X_train, y_train = train.loc[:, 'text'], train.loc[:, 'judgement']
    X_val, y_val = val.loc[:, 'text'], val.loc[:, 'judgement']
    X_test = df_test.loc[:, 'text']
    rus = RandomUnderSampler(random_state=71)
    pipline = Pipeline([('tfidf', TfidfVectorizer()), ('svd', TruncatedSVD(n_components=500, random_state=0)), 
            ('sc',MinMaxScaler())])
    X_train = pipline.fit_transform(X_train)
    X_val = pipline.transform(X_val)
    X_test = pipline.transform(X_test)
    X_train, y_train = rus.fit_resample(X_train,y_train)
    max_score = 0
    columns_num = X_train.shape[1]
    for k in range(columns_num):
        select_num = columns_num-k
        select = SelectKBest(score_func = chi2, k=select_num)
        X_tr_ = select.fit_transform(X_train, y_train)
        X_val_ = select.transform(X_val)
        X_tes_ = select.transform(X_test)
        print(X_tr_.shape)
        model = LogisticRegression()
        model.fit(X_tr_, y_train)
        prob = model.predict_proba(X_val_)[:,1]
        def calc_thres(x:float):
            predict =  np.where(prob < x, 0, 1)
            return -fbeta_score(y_val, predict, beta=7)
        thres = minimize_scalar(calc_thres, method='Bounded', bounds=(0.01, 0.99))
        score = -thres.fun
        print(score,max_score)
        if score < max_score:
            break
        else:
            max_score = score
            X_tes_fix = X_tes_
            model_fix = model
    prediction = model_fix.predict_proba(X_tes_fix)[:,1]
    test_prob =  np.where(prediction < thres.x, 0, 1)
    sub_prob.append(test_prob)
    print('fold{}::{}::{}'.format(fold, thres.x, -thres.fun))

  0%|                                                                                                                                                                                        | 0/5 [00:00<?, ?it/s]

(1012, 500)
0.8585265629586147 0
(1012, 499)


 20%|███████████████████████████████████▏                                                                                                                                            | 1/5 [00:24<01:38, 24.61s/it]

0.8566865108174821 0.8585265629586147
fold0::0.40123557940930576::0.8566865108174821
(1012, 500)
0.8629515869533422 0
(1012, 499)


 40%|██████████████████████████████████████████████████████████████████████▍                                                                                                         | 2/5 [00:49<01:15, 25.01s/it]

0.8633304067895815 0.8629515869533422
(1012, 498)
0.8616912516430553 0.8633304067895815
fold1::0.42228417588658657::0.8616912516430553


 40%|██████████████████████████████████████████████████████████████████████▍                                                                                                         | 2/5 [00:58<01:28, 29.39s/it]


KeyboardInterrupt: 

In [105]:
import optuna.integration.lightgbm as lgb_o
import lightgbm as lgb_o
params = dict(n_estimators=10000,
              metric='binary',
                        num_leaves=31,
                        learning_rate=0.01,
                        colsample_bytree=0.3, 
                        importance_type="gain")

focal = focal_loss(alpha=0.25, gamma=0.2, balance=False)
sub_prob = []
for fold in tqdm.tqdm(range(5), total=5):
#     sub = np.zeros()
    train = df_train[df_train.fold!=fold]
    val = df_train[df_train.fold==fold]
    X_train, y_train = train.loc[:, 'text'], train.loc[:, 'judgement']
    X_val, y_val = val.loc[:, 'text'], val.loc[:, 'judgement']
    X_test = df_test.loc[:, 'text']
    pipline = Pipeline([('tfidf', TfidfVectorizer()), ('svd', TruncatedSVD(n_components=500, random_state=0)), 
            ('sc',MinMaxScaler())])
    X_train = pipline.fit_transform(X_train)
    X_val = pipline.transform(X_val)
    X_test = pipline.transform(X_test)
    rus = RandomUnderSampler(random_state=71)
    X_train, y_train = rus.fit_resample(X_train,y_train)
    lgb_train = lgb_o.Dataset(X_train, y_train)
    lgb_eval = lgb_o.Dataset(X_val, y_val)
    evals_result = {}
                         
    model = lgb_o.train(params,train_set=lgb_train,
                 # 評価データ
                 valid_sets = lgb_eval,
                evals_result=evals_result,
                 early_stopping_rounds=1000)
    prob = model.predict(X_val)
    def calc_thres(x:float):
        predict =  np.where(prob < x, 0, 1)
        return -fbeta_score(y_val, predict, beta=7)
    thres = minimize_scalar(calc_thres, method='Bounded', bounds=(0.01, 0.99))
    score = -thres.fun
    prediction = model.predict(X_test)
    test_prob =  np.where(prediction < thres.x, 0, 1)
    sub_prob.append(test_prob)
    print('fold{}::{}::{}'.format(fold, thres.x, -thres.fun))

  0%|                                                                                                                                                                                        | 0/5 [00:00<?, ?it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 1012, number of used features: 500
[LightGBM] [Info] Start training from score 0.500000
[1]	valid_0's binary_logloss: 0.688978
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's binary_logloss: 0.685783
[3]	valid_0's binary_logloss: 0.68128
[4]	valid_0's binary_logloss: 0.678231
[5]	valid_0's binary_logloss: 0.674769
[6]	valid_0's binary_logloss: 0.671282
[7]	valid_0's binary_logloss: 0.668057
[8]	valid_0's binary_logloss: 0.664195
[9]	valid_0's binary_logloss: 0.660053
[10]	valid_0's binary_logloss: 0.656685
[11]	valid_0's binary_logloss: 0.652368
[12]	valid_0's binary_logloss: 0.649218
[13]	valid_0's binary_logloss: 0.646215
[14]	valid_0's binary_logloss: 0.642912
[15]	valid_0's binary_logloss: 0.639386
[16]	valid_0's binary_logloss: 0.636204
[17]	valid_0's binary_logloss: 0.633382
[18]	valid_0's binary_loglo

 20%|███████████████████████████████████▏                                                                                                                                            | 1/5 [00:28<01:53, 28.37s/it]

fold0::0.4151697934475248::0.865580448065173
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 1012, number of used features: 500
[LightGBM] [Info] Start training from score 0.500000
[1]	valid_0's binary_logloss: 0.688888
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's binary_logloss: 0.684832
[3]	valid_0's binary_logloss: 0.680998
[4]	valid_0's binary_logloss: 0.67746
[5]	valid_0's binary_logloss: 0.67411
[6]	valid_0's binary_logloss: 0.670593
[7]	valid_0's binary_logloss: 0.667411
[8]	valid_0's binary_logloss: 0.663575
[9]	valid_0's binary_logloss: 0.659829
[10]	valid_0's binary_logloss: 0.656097
[11]	valid_0's binary_logloss: 0.651706
[12]	valid_0's binary_logloss: 0.648394
[13]	valid_0's binary_logloss: 0.644667
[14]	valid_0's binary_logloss: 0.640994
[15]	valid_0's binary_logloss: 0.638003
[16]	valid_0's binary_logloss: 0.6347
[17]	valid_0's binary_log

 40%|██████████████████████████████████████████████████████████████████████▍                                                                                                         | 2/5 [01:00<01:31, 30.66s/it]

fold1::0.4080008553991677::0.8697527314548592
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 1012, number of used features: 500
[LightGBM] [Info] Start training from score 0.500000
[1]	valid_0's binary_logloss: 0.688576
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's binary_logloss: 0.684416
[3]	valid_0's binary_logloss: 0.679579
[4]	valid_0's binary_logloss: 0.675447
[5]	valid_0's binary_logloss: 0.67236
[6]	valid_0's binary_logloss: 0.669086
[7]	valid_0's binary_logloss: 0.665609
[8]	valid_0's binary_logloss: 0.662209
[9]	valid_0's binary_logloss: 0.658214
[10]	valid_0's binary_logloss: 0.654401
[11]	valid_0's binary_logloss: 0.650052
[12]	valid_0's binary_logloss: 0.647016
[13]	valid_0's binary_logloss: 0.643316
[14]	valid_0's binary_logloss: 0.640115
[15]	valid_0's binary_logloss: 0.637535
[16]	valid_0's binary_logloss: 0.634214
[17]	valid_0's binary

 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 3/5 [01:29<00:59, 29.82s/it]

fold2::0.4311392941152049::0.8700102354145343
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 1010, number of used features: 500
[LightGBM] [Info] Start training from score 0.500000
[1]	valid_0's binary_logloss: 0.688683
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's binary_logloss: 0.684937
[3]	valid_0's binary_logloss: 0.680609
[4]	valid_0's binary_logloss: 0.677703
[5]	valid_0's binary_logloss: 0.674948
[6]	valid_0's binary_logloss: 0.671973
[7]	valid_0's binary_logloss: 0.669082
[8]	valid_0's binary_logloss: 0.665367
[9]	valid_0's binary_logloss: 0.661561
[10]	valid_0's binary_logloss: 0.657927
[11]	valid_0's binary_logloss: 0.653632
[12]	valid_0's binary_logloss: 0.650302
[13]	valid_0's binary_logloss: 0.646639
[14]	valid_0's binary_logloss: 0.643076
[15]	valid_0's binary_logloss: 0.640468
[16]	valid_0's binary_logloss: 0.636703
[17]	valid_0's binar

 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 4/5 [02:01<00:30, 30.64s/it]

fold3::0.39885821565575097::0.8453933228256199
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 1010, number of used features: 500
[LightGBM] [Info] Start training from score 0.500000
[1]	valid_0's binary_logloss: 0.689088
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's binary_logloss: 0.685099
[3]	valid_0's binary_logloss: 0.68092
[4]	valid_0's binary_logloss: 0.677362
[5]	valid_0's binary_logloss: 0.673929
[6]	valid_0's binary_logloss: 0.670916
[7]	valid_0's binary_logloss: 0.667804
[8]	valid_0's binary_logloss: 0.664345
[9]	valid_0's binary_logloss: 0.659996
[10]	valid_0's binary_logloss: 0.656113
[11]	valid_0's binary_logloss: 0.65192
[12]	valid_0's binary_logloss: 0.648777
[13]	valid_0's binary_logloss: 0.64565
[14]	valid_0's binary_logloss: 0.641692
[15]	valid_0's binary_logloss: 0.638951
[16]	valid_0's binary_logloss: 0.635905
[17]	valid_0's binary_

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:29<00:00, 29.92s/it]

fold4::0.4258707885792814::0.8668015024559375





In [106]:
from scipy import stats
mode_2, count_2 = stats.mode(np.stack(sub_prob), axis=0)
sub = pd.read_csv(DATA_DIR + "/sample_submit.csv", header = None)
sub.columns = ["id", "judgement"]
sub["judgement"] = mode_2[0]
#保存先のディレクトリ作成
save_dir = "./result/result9"
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
sub.to_csv(save_dir + '/submission.csv', header=None, index=None)