# 分析の方針②について

今回の目標：  
基礎研究における分析方針②を実行する

今回やること：  
前処理によって得られたテキストデータと株価のデータを用いて、BERTモデルのファインチューニングを行う。  
①株価のデータ処理(株価の上昇率)  
②特徴量の抽出  
③LightGBMを最適化できるようにする  


今回やらないこと：  
テキストデータの前処理

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments, AdamW
from torch.optim import lr_scheduler
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Load the daily dataset (the first CSV column becomes the index), then keep
# only the last three rows of every index-level group.
data = pd.read_csv('Daily_data.csv', index_col=0).groupby(level=0).tail(3)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-sample encodings with per-sample labels.

    Args:
        encodings: Mapping from feature name to an indexable collection that
            holds one entry per sample (it must provide ``.items()``).
        labels: Sized, indexable collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every feature in ``encodings`` is converted to a tensor under its own
        name; the label is added under the key ``'labels'``.
        """
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

    def get_labels(self):
        """Return the label collection exactly as it was passed in."""
        return self.labels

In [None]:
# Replace the raw 'stock' values with the fractional change from the previous row.
data['stock'] = data['stock'].pct_change()
# The first row has no predecessor and becomes NaN; dropna removes every row
# that contains a NaN in any column.
data.dropna(inplace=True)

# Plain Python lists for the later split: texts, stock changes, and index values.
x, y = data['text'].tolist(), data['stock'].tolist()
date = data.index.tolist()

In [None]:
# Randomly split texts, targets and index values into train / test partitions
# (train_test_split's default proportions are used).
X_train, X_test, y_train, y_test, date_train, date_test = train_test_split(x, y, date, shuffle = True)

# Load the pretrained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode each partition, padded / truncated to exactly 128 tokens.
train_encodings = tokenizer(X_train, max_length = 128, truncation = True, padding = "max_length")
# Fix: encode the held-out texts. The original encoded X_train a second time,
# which paired training texts with y_test labels in the validation set.
test_encodings = tokenizer(X_test, max_length = 128, truncation = True, padding = "max_length")

# Wrap encodings and targets in Dataset objects
train_dataset = Dataset(train_encodings, y_train)
val_dataset = Dataset(test_encodings, y_test)


# Batch iterators (batch size 8; no extra shuffling at the loader level)
train_loader = DataLoader(train_dataset, batch_size=8)
validation_loader = DataLoader(val_dataset, batch_size=8)

In [None]:
class BertClassifier(nn.Module):
    """Pretrained ``bert-base-uncased`` encoder followed by a dropout + linear head.

    Args:
        linear_size: Number of output features of the final linear layer.
    """

    def __init__(self, linear_size = 1):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head: dropout (p=0.5) and a 768 -> linear_size projection.
        head_layers = [
            nn.Dropout(p=0.5),
            nn.Linear(in_features=768, out_features=linear_size),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Run the encoder and apply the head to the first sequence position.

        The three arguments are forwarded unchanged to the BERT encoder.
        Returns the output of ``self.classifier``.
        """
        encoded = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        # Index 0 along the second axis of the last hidden state is fed to the head.
        first_position_state = encoded['last_hidden_state'][:, 0, :]
        return self.classifier(first_position_state)

    def _set_bert_trainable(self, trainable):
        """Set ``requires_grad`` on every parameter of ``self.bert``."""
        for weight in self.bert.parameters():
            weight.requires_grad = trainable

    def freeze_bert(self):
        """Stop gradient computation for all encoder parameters."""
        self._set_bert_trainable(False)

    def unfreeze_bert(self):
        """Re-enable gradient computation for all encoder parameters."""
        self._set_bert_trainable(True)

In [None]:
# Build the model
model = BertClassifier()

# Use the GPU when CUDA is available, otherwise the CPU, and move the model there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [None]:
# --- Hyper-parameters ---
Decay_factor = 0.9
learning_rate = 2.0e-5
epochs = 3
freeze_epochs = [4, 5]        # both values exceed `epochs`
unfreeze_epochs = [1, 2, 3]

# --- Per-group learning rates ---
# Fifteen rates, learning_rate * Decay_factor**i, one for each of: the
# embedding layer, encoder layers 0-11, the pooling layer and the classifier.
# NOTE(review): index 0 (the largest rate) goes to the embeddings and index 14
# (the smallest) to the classifier; layer-wise decay is commonly applied in the
# opposite direction - confirm this ordering is intended.
LearingRate_list = [learning_rate * Decay_factor ** i for i in range(15)]
Embedding_lr = [
    {'params': model.bert.embeddings.parameters(), 'lr': LearingRate_list[0], 'weight_decay': 0.01},
]
Layers_lr = [
    {'params': layer.parameters(), 'lr': LearingRate_list[i], 'weight_decay': 0.01}
    for i, layer in enumerate(model.bert.encoder.layer, start=1)
]
Pooling_lr = [
    {'params': model.bert.pooler.parameters(), 'lr': LearingRate_list[13], 'weight_decay': 0.01},
]
Classifier_lr = [
    {'params': model.classifier.parameters(), 'lr': LearingRate_list[14], 'weight_decay': 0.01},
]

# --- Optimizer: one parameter group per entry above ---
optimizer = AdamW([*Embedding_lr, *Layers_lr, *Pooling_lr, *Classifier_lr], lr=learning_rate)

# --- Loss and schedule ---
# Mean-squared-error loss (no per-label weighting is configured here).
loss_func = nn.MSELoss()
# Each scheduler step scales every group's base rate by 0.1 ** (step count).
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.1 ** epoch)

# --- Bookkeeping ---
history = []
best_acc = 0

In [None]:
# Number of mini-batches whose gradients are accumulated before one optimizer step.
Accumulation_steps = 8
# Mean training loss recorded every 100 batches.
total_loss = []
running_loss = 0

for epoch in range(1, epochs + 1):
    print("=== Epoch: ", epoch, " / ", epochs, " ===")

    # Decide per epoch whether the BERT encoder itself is trained.
    if epoch in freeze_epochs:  # stop training bert
        print("Freeze BERT")
        model.freeze_bert()

    if epoch in unfreeze_epochs:  # train bert
        print("Unfreeze BERT")
        model.unfreeze_bert()

    # ===== train =====
    model.train()  # train mode
    num_train_batches = len(train_loader)

    # tqdm wraps the loader itself (not the enumerate object) so the progress
    # bar can read the total number of batches.
    for i, batch in enumerate(tqdm(train_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}

        input_ids = batch['input_ids']
        token_type_ids = batch['token_type_ids']
        attention_mask = batch['attention_mask']
        y_batch = batch['labels']

        probas = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

        # Reshape targets to a column and match the prediction dtype so the
        # loss compares element-wise instead of broadcasting.
        targets = y_batch.view(-1, 1).to(probas.dtype)
        batch_loss = loss_func(probas, targets)
        running_loss += batch_loss.item()
        batch_loss = batch_loss / Accumulation_steps  # average over the accumulation window
        batch_loss.backward()  # calculate gradient per (learnable) weight

        is_last_batch = (i + 1) == num_train_batches
        if (i + 1) % Accumulation_steps == 0 or is_last_batch:
            optimizer.step()  # update (learnable) weights
            if is_last_batch:
                # Update the learning rate once per epoch, after the final optimizer step.
                scheduler.step()
                print("学習率の確認：", optimizer.param_groups[0]["lr"])

            optimizer.zero_grad()  # reset the gradients

        if (i + 1) % 100 == 0:
            print('学習のロス', running_loss / 100)
            total_loss.append(running_loss / 100)
            running_loss = 0

    print("-----------トレーニング終了-----------------------")

    # ===== validate =====
    model.eval()  # evaluation mode
    # These two metrics are never updated in this loop; they are kept so the
    # printed output and the `history` keys stay as before.
    acc_val_total = 0
    fbeta_val_total = 0

    # metrics on val
    accumulated_val_loss = 0.0
    for batch in validation_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        x_batch = batch['input_ids']
        token_type_ids = batch['token_type_ids']
        attention_mask = batch['attention_mask']
        y_batch = batch['labels']

        with torch.no_grad():  # no gradients, because no update is performed
            probas = model(input_ids=x_batch, token_type_ids=token_type_ids, attention_mask=attention_mask)
            # Fix: reshape targets to (batch, 1) exactly as in training. The
            # original passed the flat label vector, which broadcasts against
            # the (batch, 1) predictions and yields a wrong loss value.
            loss = loss_func(probas, y_batch.view(-1, 1).to(probas.dtype))
        # Accumulate a Python float rather than a device tensor.
        accumulated_val_loss += loss.item()

    num_val_batches = len(validation_loader)
    # Guard against an empty validation loader instead of dividing by zero.
    avg_val_loss = accumulated_val_loss / num_val_batches if num_val_batches else float('nan')

    print("Val Loss:", avg_val_loss, "\n")
    print("Val Acc:", acc_val_total, "\n")
    print("fbeta Acc:", fbeta_val_total, "\n")
    if acc_val_total > best_acc:
        best_acc = acc_val_total

    # Existing keys are preserved; the validation loss is added alongside them.
    history.append({"acc_val": acc_val_total, "fbeta_val": fbeta_val_total, "loss_val": avg_val_loss})

=== Epoch:  1  /  3  ===
Unfreeze BERT


KeyboardInterrupt: 