1. データの読み込み

In [1]:
import math
import random
import time
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import transformers as T
from pathlib import Path
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
import tqdm

In [2]:
warnings.filterwarnings("ignore")

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 471
seed_torch(seed)

In [5]:
DATA_DIR = './dataset/data3'
OUTPUT_DIR = './result/result4'
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [35]:
train = pd.read_csv(DATA_DIR  +"/train.csv",  index_col=0)
test = pd.read_csv(DATA_DIR + "/test.csv", index_col=0)
sub = pd.read_csv(DATA_DIR + "/sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [7]:
train.head()

Unnamed: 0_level_0,title,abstract,judgement,len_title,len_abstract,title_token,abstract_token,title_count_detection,title_count_lewy,title_count_bodies,...,abstract_count_reference,abstract_count_amyloid,abstract_count_endoscopic,abstract_count_ctl,abstract_count_metaiodobenzylguanidine,abstract_count_detected,abstract_count_easytouse,abstract_count_sampling,abstract_count_truth,abstract_count_readings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,58,1321,oneyear age changes mri brain volumes older ad...,longitudinal studies indicate declines cogniti...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,182,1361,supportive csf biomarker evidence enhance nati...,present study undertaken validate measurement ...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,59,1047,occurrence basal ganglia germ cell tumors with...,objective report case series basal ganglia cal...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,87,2686,new developments diagnosis therapy crohns dise...,etiology pathogenesis idiopathic chronicinflam...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,107,1,prolonged shedding sarscov elderly liver trans...,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from imblearn.under_sampling import RandomUnderSampler
positive_count_train = train['judgement'].sum()
rus = RandomUnderSampler(random_state=71)
X, y = train.drop(columns=['title', 'abstract', 'judgement', 'title_token', 'abstract_token']), train['judgement']
X_resample, y_resample = rus.fit_resample(X,y)
train_resample = pd.concat([X_resample, y_resample], axis=1)
X_test = test.drop(columns=['title', 'abstract', 'title_token', 'abstract_token'])

In [21]:
from pycaret.classification import *
exp1 = setup(train_resample, target = 'judgement')

Unnamed: 0,Description,Value
0,session_id,5972
1,Target,judgement
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(1264, 1003)"
5,Missing Values,False
6,Numeric Features,2
7,Categorical Features,1000
8,Ordinal Features,False
9,High Cardinality Features,False


In [22]:
from sklearn.metrics import fbeta_score
def fbeta_score(x, y):
    score = fbeta_score(x, y, beta=7.0)
    return score
add_metric('fbeta', 'fbeta', fbeta_score, target = 'pred_proba', greater_is_better=True)
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,fbeta,TT (Sec)
lr,Logistic Regression,0.8834,0.9512,0.8487,0.9152,0.8791,0.7668,0.7713,0.0,0.822
et,Extra Trees Classifier,0.8789,0.9413,0.8396,0.9126,0.8727,0.7578,0.7628,0.0,0.113
rf,Random Forest Classifier,0.8733,0.9504,0.8218,0.9195,0.8657,0.7467,0.7539,0.0,0.097
ridge,Ridge Classifier,0.8586,0.0,0.7948,0.9157,0.8483,0.7173,0.7267,0.0,0.039
gbc,Gradient Boosting Classifier,0.8563,0.9484,0.8014,0.9033,0.8465,0.7126,0.7207,0.0,0.167
ada,Ada Boost Classifier,0.8552,0.9378,0.8059,0.8998,0.8469,0.7105,0.7189,0.0,0.069
lightgbm,Light Gradient Boosting Machine,0.8484,0.9428,0.8354,0.8627,0.8462,0.6967,0.7014,0.0,0.039
nb,Naive Bayes,0.8438,0.8855,0.8126,0.8682,0.8384,0.6877,0.6905,0.0,0.02
dt,Decision Tree Classifier,0.819,0.8185,0.7653,0.8629,0.8072,0.6381,0.6474,0.0,0.02
lda,Linear Discriminant Analysis,0.7069,0.6887,0.5936,0.7766,0.6688,0.414,0.4295,0.0,0.21


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=5972, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
et = create_model('et')
tuned_et = tune_model(et)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,fbeta
0,0.809,0.9217,0.7778,0.8333,0.8046,0.6182,0.6196,0.0
1,0.7978,0.9409,0.6889,0.8857,0.775,0.5965,0.6121,0.0
2,0.8315,0.948,0.8222,0.8409,0.8315,0.663,0.6631,0.0
3,0.8764,0.9707,0.7727,0.9714,0.8608,0.7522,0.7682,0.0
4,0.8636,0.9582,0.9318,0.82,0.8723,0.7273,0.7341,0.0
5,0.9091,0.9711,0.9318,0.8913,0.9111,0.8182,0.819,0.0
6,0.8182,0.945,0.7045,0.9118,0.7949,0.6364,0.6535,0.0
7,0.8182,0.9248,0.7273,0.8889,0.8,0.6364,0.6472,0.0
8,0.8182,0.8778,0.75,0.8684,0.8049,0.6364,0.6424,0.0
9,0.8409,0.9592,0.7045,0.9688,0.8158,0.6818,0.7087,0.0


In [26]:
et_predict = predict_model(tuned_et, X_test)

In [62]:
sub = pd.read_csv(DATA_DIR + "/sample_submit.csv", header = None)
sub.columns = ["id", "judgement"]
sub["judgement"] = et_predict['Label'].values

In [63]:
#保存先のディレクトリ作成
save_dir = "./result/result4"
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
sub.to_csv(save_dir + '/submission.csv', header=None, index=None)

In [30]:
evaluate_model(tuned_ridge)

NameError: name 'tuned_ridge' is not defined

In [47]:
sub = pd.read_csv(DATA_DIR + "/sample_submit.csv", header=None)