In [1]:
!pip install datasets lightning --quiet

In [2]:
import datetime
import os
import gc
from pprint import pprint

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModel
from datasets import Dataset, load_dataset

import lightning as L

In [3]:
import google.colab
google.colab.drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/projects/FinText/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
config = {
    'CommunityModel': 'beomi/KcELECTRA-base-v2022',
    'ArticleModel': 'psyche/kolongformer-4096',
    'num_epoch': 30,
    'batch_size': 32,
    'learning_rate': 2e-5,
    'weight_decay': 1e-5
}

In [35]:
class TokenizerMapWrapper:
    def __init__(self):
        self.community_tokenizer = AutoTokenizer.from_pretrained(config['CommunityModel'])
        self.article_tokenizer = AutoTokenizer.from_pretrained(config['ArticleModel'])

    def __call__(self, batch):
        community_texts = batch['CommunityText']
        article_texts = batch['ArticleText']
        if community_texts is not None:
            for i, community_text in enumerate(community_texts):
                if community_text is None:
                    community_text = ' '
                community_text = self.community_tokenizer(community_text, padding='max_length', truncation=True, max_length=512)
                community_texts[i] = community_text
        else:
            community_texts = []

        if article_texts is not None:
            for i, article_text in enumerate(article_texts):
                if article_text is None:
                    article_text = ' '
                article_input_ids = self.article_tokenizer(article_text, padding='max_length', truncation=True, max_length=4096)
                article_texts[i] = article_text
        else:
            article_texts = []

        batch['CommunityText'] = community_texts
        batch['ArticleText'] = article_texts
        return batch

In [43]:
class FinTextDataModule(L.LightningDataModule):
    def __init__(self, tokenizer_map_wrapper, data_path='./data-dir/data-df.pkl'):
        super().__init__()
        self.dataset = Dataset.from_pandas(pd.read_pickle(data_path))
        self.tokenizer_map_wrapper = tokenizer_map_wrapper

    def prepare_data(self, stage=None):
        dataset_dict = self.dataset.train_test_split(test_size=0.1, shuffle=False)
        self.train_dataset = dataset_dict['train']
        self.test_dataset = dataset_dict['test']

    def setup(self, stage=None):
        if stage == "fit":
            self.train_dataset = self.train_dataset.map(self.tokenizer_map_wrapper, batched=False)
        if stage == "test":
            self.test_dataset = self.test_dataset.map(self.tokenizer_map_wrapper, batched=False)

    def train_dataloader(self):
        return DataLoader(self.train_dataset.with_format('torch'), batch_size=config['batch_size'], shuffle=False, drop_last=True)

    def test_dataloader(self):
        return DataLoader(self.train_dataset.with_format('torch'), batch_size=config['batch_size'], shuffle=False, drop_last=True)

In [7]:
class FinTextModel(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.community_model = AutoModel.from_pretrained(config['CommunityModel'])
        self.article_model = AutoModel.from_pretrained(config['ArticleModel'])
        self.stock_lstm = nn.LSTM(input_size=4, hidden_size=10, num_layers=5, batch_first=True)
        self.stock_linear = nn.Linear(in_features=10, out_features=4)
        self.total_model = nn.Sequential(
            nn.Linear(in_features=768 * 2 + 4, out_features=768 + 4),
            nn.Linear(in_features=768 + 4, out_features=1),
            nn.Sigmoid()
        )

        self.criterion = nn.CrossEntropyLoss()

    def configure_optimizers(self):
        self.optimizer = optim.AdamW(self.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=config['num_epoch'])
        return [self.optimizer], [self.scheduler]

    def forward(self, batch, batch_idx):
        # batch['text']: list
        community_outputs = self.community_model(**batch['CommunityText'])
        community_outputs = community_outputs.pooler_output

        article_outputs = self.article_model(**batch['ArticleText'])
        article_outputs = article_outputs.pooler_output

        stock_input = batch['Stock']
        stock_outputs, _ = self.stock_lstm(stock_input)
        stock_outputs = self.stock_linear(stock_outputs)

        total_input = torch.cat([community_outputs, article_outputs, stock_outputs], dim=1)
        total_outputs = self.total_model(total_input)
        return total_outputs

    def training_step(self, batch, batch_idx):
        total_outputs = self.forward(batch, batch_idx)
        loss = self.criterion(total_outputs, batch['Label'])
        self.log('train_loss', loss)
        return loss


In [44]:
datamodule = FinTextDataModule(TokenizerMapWrapper())

In [45]:
model = FinTextModel()

Some weights of LongformerModel were not initialized from the model checkpoint at psyche/kolongformer-4096 and are newly initialized: ['longformer.pooler.dense.weight', 'longformer.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
trainer = L.Trainer(max_epochs=config['num_epoch'])

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [47]:
trainer.fit(model, datamodule=datamodule)

Map:   0%|          | 0/118 [00:00<?, ? examples/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name            | Type             | Params
-----------------------------------------------------
0 | community_model | ElectraModel     | 127 M 
1 | article_model   | LongformerModel  | 148 M 
2 | stock_lstm      | LSTM             | 4.2 K 
3 | stock_linear    | Linear           | 44    
4 | total_model     | Sequential       | 1.2 M 
5 | criterion       | CrossEntropyLoss | 0     
-----------------------------------------------------
277 M     Trainable params
0         Non-trainable params
277 M     Total params
1,108.161 Total estimated model params size (MB)
INFO:lightning.pytorch.callbacks.model_summary:
  | Name            | Type             | Params
-----------------------------------------------------
0 | community_model | ElectraModel     | 127 M 
1 | article_model   | LongformerModel  | 148 M 
2 | stock_lstm      | LSTM             | 4

Training: |          | 0/? [00:00<?, ?it/s]

RuntimeError: ignored