1. データの読み込み

In [1]:
import math
import random
import time
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import transformers as T
from pathlib import Path
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
import tqdm

In [2]:
warnings.filterwarnings("ignore")

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 471
seed_torch(seed)

In [5]:
DATA_DIR = './dataset/data2'
OUTPUT_DIR = './result/result1'
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [6]:
train = pd.read_csv(DATA_DIR  +"/train.csv",  index_col=0)
test = pd.read_csv(DATA_DIR + "/test.csv", index_col=0)
sub = pd.read_csv(DATA_DIR + "/sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [16]:
train.head()

Unnamed: 0,id,title,abstract,judgement,fold,title_vector_0,title_vector_1,title_vector_2,title_vector_3,title_vector_4,...,title_vector_758,title_vector_759,title_vector_760,title_vector_761,title_vector_762,title_vector_763,title_vector_764,title_vector_765,title_vector_766,title_vector_767
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,0,-0.607368,0.214609,0.491671,0.585819,-0.481558,...,-0.346558,-0.154007,-0.117931,-0.730297,0.508722,-0.853626,-0.103609,-0.367437,0.437865,-0.956179
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,1,-0.668105,0.257561,0.392438,0.400021,-0.309336,...,-0.290643,0.016189,-0.041532,-0.404358,0.5532,-0.726713,-0.378171,-0.17393,0.417677,-0.867161
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,4,-0.418311,0.380539,0.42832,0.422149,-0.010591,...,-0.279595,-0.057832,-0.185633,-0.344625,0.426403,-0.636598,-0.22035,-0.417752,0.389819,-0.433953
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,3,-0.516953,0.167355,0.391707,0.479743,-0.377085,...,-0.420219,-0.101646,-0.24892,-0.573272,0.490884,-1.071567,-0.13399,-0.370881,0.267396,-0.936951
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,1,-0.591694,0.274961,0.413193,0.453942,-0.142811,...,-0.475409,-0.040903,-0.233842,-0.449259,0.592372,-0.901793,-0.269731,-0.337088,0.322325,-0.929041


In [45]:
from imblearn.under_sampling import RandomUnderSampler
positive_count_train = train['judgement'].sum()
rus = RandomUnderSampler(random_state=71)
X, y = train.drop(columns=['id', 'title', 'abstract', 'judgement', 'fold']), train['judgement']
X_resample, y_resample = rus.fit_resample(X,y)
train_resample = pd.concat([X_resample, y_resample], axis=1)
X_test = test.drop(columns=['id', 'title', 'abstract'])

In [34]:
from pycaret.classification import *
exp1 = setup(train_resample, target = 'judgement')

Unnamed: 0,Description,Value
0,session_id,1248
1,Target,judgement
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(1264, 769)"
5,Missing Values,False
6,Numeric Features,768
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [35]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7917,0.0,0.771,0.8052,0.786,0.5835,0.5862,0.017
lr,Logistic Regression,0.7612,0.852,0.7348,0.7775,0.7542,0.5224,0.5249,0.239
lightgbm,Light Gradient Boosting Machine,0.7329,0.8109,0.6984,0.7511,0.7224,0.4657,0.4682,1.147
gbc,Gradient Boosting Classifier,0.7308,0.8078,0.6983,0.7471,0.7209,0.4615,0.4635,1.353
et,Extra Trees Classifier,0.7171,0.7806,0.7032,0.7235,0.7121,0.4343,0.4357,0.077
rf,Random Forest Classifier,0.7036,0.768,0.6714,0.7179,0.6925,0.4071,0.4093,0.121
ada,Ada Boost Classifier,0.7036,0.7882,0.7165,0.6982,0.7064,0.4073,0.4084,0.286
knn,K Neighbors Classifier,0.6731,0.7214,0.6804,0.6704,0.6742,0.3462,0.3473,0.103
svm,SVM - Linear Kernel,0.6538,0.0,0.5637,0.8136,0.5539,0.3059,0.3904,0.035
dt,Decision Tree Classifier,0.6335,0.6334,0.6325,0.6346,0.6322,0.2668,0.2681,0.065


RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=1248,
                solver='auto', tol=0.001)

In [43]:
ridge = create_model('ridge')
tuned_ridge = tune_model(ridge)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8202,0.0,0.7778,0.8537,0.814,0.6408,0.6434
1,0.8764,0.0,0.8409,0.9024,0.8706,0.7526,0.7543
2,0.8202,0.0,0.75,0.8684,0.8049,0.6399,0.6458
3,0.7978,0.0,0.7955,0.7955,0.7955,0.5955,0.5955
4,0.875,0.0,0.8409,0.9024,0.8706,0.75,0.7517
5,0.7841,0.0,0.8182,0.766,0.7912,0.5682,0.5695
6,0.7386,0.0,0.6591,0.7838,0.716,0.4773,0.4834
7,0.7386,0.0,0.75,0.7333,0.7416,0.4773,0.4774
8,0.7386,0.0,0.7273,0.7442,0.7356,0.4773,0.4774
9,0.8068,0.0,0.8636,0.7755,0.8172,0.6136,0.6176


In [62]:
ridge_predict = predict_model(tuned_ridge, X_test)

In [63]:
sub['judgement'] = ridge_predict['Label']

In [64]:
#保存先のディレクトリ作成
save_dir = "./result/result3"
os.makedirs(save_dir)
sub.to_csv(save_dir + '/submission.csv', header=None, index=None)

In [66]:
evaluate_model(tuned_ridge)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…