1. データの読み込み

In [1]:
import math
import random
import time
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import transformers as T
from pathlib import Path
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
import tqdm

In [2]:
class config:
    """Static settings shared by the cells of this notebook."""

    # Hugging Face checkpoint identifier.
    model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
    # Directory the input CSV files are read from.
    DATA_DIR = "./dataset/data1"
    # Directory that is created for this notebook's output.
    OUTPUT_DIR = "./dataset/data5"

In [3]:
# Install a global filter that ignores all warnings (this also hides deprecation notices).
warnings.filterwarnings("ignore")

In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [6]:
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) with one value.

    Also exports the value as ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


# Notebook-wide seed; it is also read later when the CV folds are built.
seed = 471
seed_torch(seed)

In [8]:
# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory is already there, without a separate
# check-then-create step.
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

In [10]:
# Read the train and test CSVs from the configured data directory.
train = pd.read_csv(f"{config.DATA_DIR}/train.csv")
test = pd.read_csv(f"{config.DATA_DIR}/test.csv")

# The sample submission is read without a header row, so its columns are named here.
sub = pd.read_csv(f"{config.DATA_DIR}/sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [11]:
def get_train_data(train, n_splits=5):
    """Attach a stratified cross-validation fold number to every training row.

    The folds are stratified on the ``judgement`` column and shuffled with the
    module-level ``seed``. The ``fold`` column is written into ``train`` itself
    and the same object is returned.

    Args:
        train: DataFrame with a ``judgement`` column to stratify on.
        n_splits: Number of folds (defaults to 5).

    Returns:
        ``train`` with an added ``fold`` column of dtype ``uint8``.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # split() yields positional row numbers. Collect them in a positional array
    # instead of writing through label-based .loc, so the assignment is also
    # correct when the frame does not carry a default 0..n-1 index.
    folds = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(splitter.split(train, train["judgement"])):
        folds[val_index] = n
    train["fold"] = folds

    return train

In [12]:
def get_test_data(test):
    """Return the test data unchanged.

    Args:
        test: The test data.

    Returns:
        The same object that was passed in; no processing is applied.
    """
    return test

In [13]:
# Add the cross-validation "fold" column. get_train_data writes the column into
# the frame it is given and returns that same frame.
train = get_train_data(train)

In [14]:
class get_embedding_representaion():
    """Turn the ``title`` column of a DataFrame into fixed-size BERT vectors.

    The titles are tokenized once in ``__init__``; ``get_format`` then runs the
    encoder batch by batch and keeps the hidden state at the first token
    position of every title.
    """

    # Longest token sequence fed to the encoder; longer titles are truncated.
    MAX_LENGTH = 512

    def __init__(self, df, model_name):
        """Tokenize ``df["title"]`` and load the encoder.

        Args:
            df: DataFrame with a ``title`` column (assumed to hold strings).
            model_name: Name or path handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        tokenizer = T.BertTokenizer.from_pretrained(model_name)
        self.title = df["title"].tolist()
        # Truncate inside the tokenizer rather than slicing the ids afterwards,
        # so over-long titles are shortened by the tokenizer's own rules.
        self.encoded = tokenizer.batch_encode_plus(
            self.title,
            padding=True,
            truncation=True,
            max_length=self.MAX_LENGTH,
            add_special_tokens=True,
        )
        self.df = df
        self.model = T.BertModel.from_pretrained(model_name)

    def get_format(self, device, num=64):
        """Encode all titles and return one embedding row per title.

        Args:
            device: Torch device the model and each batch are moved to.
            num: Batch size (number of titles per forward pass).

        Returns:
            DataFrame with one row per title, in input order, whose columns are
            named ``title_vector_0``, ``title_vector_1``, ...

        Raises:
            ValueError: If ``num`` is not positive.
        """
        if num <= 0:
            raise ValueError(f"num must be a positive batch size, got {num}")

        self.model = self.model.to(device)
        self.model.eval()

        n_rows = len(self.encoded["input_ids"])
        if n_rows == 0:
            # Nothing to encode: return an empty frame with the usual columns.
            empty = torch.empty((0, self.model.config.hidden_size)).numpy()
            return pd.DataFrame(empty).rename(columns=lambda x: f"title_vector_{x}")

        # Keep the full tensors on the CPU and move only one batch at a time.
        input_ids = torch.tensor(self.encoded["input_ids"])
        attention_mask = torch.tensor(self.encoded["attention_mask"])

        vec = []
        with torch.no_grad():
            # Stepping by the batch size visits every row exactly once and never
            # produces an empty trailing batch; tqdm reads the total from range.
            for start in tqdm.tqdm(range(0, n_rows, num)):
                end = start + num
                batch_ids = input_ids[start:end].to(device)
                # The mask keeps padding positions out of the attention, so an
                # embedding does not depend on how much padding was added.
                batch_mask = attention_mask[start:end].to(device)
                outputs = self.model(input_ids=batch_ids, attention_mask=batch_mask)
                # outputs[0] is indexed as (row, token position, feature);
                # position 0 is the first token of each title.
                vec.append(outputs[0][:, 0, :].cpu())
        vec = torch.cat(vec, dim=0).numpy()
        df_vec = pd.DataFrame(vec).rename(columns=lambda x: f"title_vector_{x}")
        return df_vec

In [15]:
# Embed the training titles.
# NOTE(review): the checkpoint is hard-coded to 'bert-base-uncased' while
# config.model_name names a different checkpoint — confirm which one is intended.
get_embedding = get_embedding_representaion(train, 'bert-base-uncased')
traindf_vec = get_embedding.get_format(device = device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
425it [00:58,  7.22it/s]                                                        


In [16]:
# Embed the test titles; this rebinds get_embedding to a new encoder instance.
# NOTE(review): the checkpoint is hard-coded to 'bert-base-uncased' while
# config.model_name names a different checkpoint — confirm which one is intended.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# testdf_vec only exists after the cell has been run to completion.
get_embedding = get_embedding_representaion(test, 'bert-base-uncased')
testdf_vec = get_embedding.get_format(device = device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  9%|███▋                                      | 56/638 [00:09<01:40,  5.80it/s]


KeyboardInterrupt: 

In [20]:
# Put the embedding columns to the right of the original columns.
# concat matches rows on their index labels.
train_df = pd.concat(
    [train, traindf_vec],
    axis="columns",
)
test_df = pd.concat(
    [test, testdf_vec],
    axis="columns",
)

In [26]:
# Write both feature tables into the output directory declared on config.
# config has no save_dir attribute; OUTPUT_DIR is the output location it defines.
os.makedirs(config.OUTPUT_DIR, exist_ok=True)
train_df.to_csv(os.path.join(config.OUTPUT_DIR, 'train.csv'))
test_df.to_csv(os.path.join(config.OUTPUT_DIR, 'test.csv'))