### Generating Features for Analysis

In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch
import numpy as np
import pandas as pd

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [2]:
from datasets import load_dataset

# Load the 'sentences_50agree' configuration of the financial_phrasebank dataset.
financial_news = load_dataset(
    path='financial_phrasebank',
    name='sentences_50agree',
)

In [3]:
# Display the loaded DatasetDict to inspect its splits, feature names and row counts.
financial_news

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4846
    })
})

In [4]:
# Materialise the 'train' split (the only split shown above) as a pandas DataFrame
# with one row per sentence, then preview the first rows.
data = pd.DataFrame(financial_news['train'])
data.head()

Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2


In [5]:
# Name of the pretrained BERT checkpoint whose tokenizer is used throughout.
pre_trained_model_name = 'bert-base-cased'

# Fetch the matching tokenizer files (downloaded on first use, see progress bars below).
tokenizer = BertTokenizer.from_pretrained(pre_trained_model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Sanity check: tokenize the first sentence and map each token to its vocabulary id.
# No special tokens ([CLS]/[SEP]) are added at this stage.
tokenizer.convert_tokens_to_ids( tokenizer.tokenize( data.sentence[0]) )

[1792,
 1106,
 13529,
 117,
 1103,
 1419,
 1144,
 1185,
 2714,
 1106,
 1815,
 1155,
 1707,
 1106,
 2733,
 117,
 1780,
 1115,
 1110,
 1187,
 1103,
 1419,
 1110,
 2898,
 119]

In [7]:
# Show the separator special token and its vocabulary id.
tokenizer.sep_token, tokenizer.sep_token_id

('[SEP]', 102)

### Implementing and Encoding

In [8]:
# Encode the first sentence as a single example: the text is passed as an
# already-tokenized list, special tokens are added, and PyTorch tensors are returned.
# NOTE(review): for a single sequence, padding=True (pad to the longest sequence in
# the call) adds no padding, so the result is shorter than max_length=32 -- the
# token listing further down contains no [PAD]. padding='max_length' would be
# needed to pad up to 32; confirm which behaviour is intended.
encoding = tokenizer.encode_plus(
    tokenizer.tokenize( data.sentence[0]),
    max_length=32,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

In [9]:
# Only input_ids and attention_mask are present because return_token_type_ids=False.
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [10]:
# Map the encoded ids back to tokens to confirm [CLS] and [SEP] wrap the sentence.
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['[CLS]',
 'According',
 'to',
 'Gran',
 ',',
 'the',
 'company',
 'has',
 'no',
 'plans',
 'to',
 'move',
 'all',
 'production',
 'to',
 'Russia',
 ',',
 'although',
 'that',
 'is',
 'where',
 'the',
 'company',
 'is',
 'growing',
 '.',
 '[SEP]']

### Developing the Dataset

In [11]:
class FinPhraseDataset(Dataset):
    """Map-style dataset that tokenizes one text per item into fixed-length tensors.

    Args:
        reviews: Indexable sequence of texts; each item is converted with ``str``.
        targets: Indexable sequence of integer class labels aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Number of tokens every encoded example is padded/truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Encode the text at ``idx``.

        Returns:
            dict with keys ``review_text`` (the raw string), ``input_ids`` and
            ``attention_mask`` (1-D tensors of length ``max_len``) and
            ``targets`` (scalar ``torch.long`` tensor holding the label).
        """
        review = str(self.reviews[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # padding=True pads only to the longest sequence in the call, which is
            # a no-op for a single text. 'max_length' gives every item the same
            # length so a DataLoader's default collate can stack them into a batch.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # The tokenizer returns shape (1, max_len); drop the leading batch axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }

    

In [12]:
from sklearn.model_selection  import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [13]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    data,
    test_size=0.1,
    random_state=391,
)

In [14]:
# Confirm the row/column counts of the train and test partitions.
df_train.shape, df_test.shape

((4361, 2), (485, 2))

In [15]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    """Build a DataLoader over the sentences and labels of ``df``.

    Args:
        df: DataFrame with a ``sentence`` (text) and a ``label`` (class id) column.
        tokenizer: Tokenizer forwarded to ``FinPhraseDataset``.
        max_len: Maximum token length forwarded to ``FinPhraseDataset``.
        batch_size: Number of examples per batch.

    Returns:
        A ``DataLoader`` iterating over ``df`` in order, using 4 worker processes.
    """
    # Read from the frame that was passed in, not the notebook-level `data`;
    # otherwise every loader would cover the full dataset and the train and
    # test loaders would contain the same rows.
    # to_numpy() drops the pandas index so positional lookup works after a shuffle.
    ds = FinPhraseDataset(
        reviews=df.sentence.to_numpy(),
        targets=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(ds, batch_size=batch_size, num_workers=4)

In [16]:
# Tokenization length and batch size shared by the train and test loaders.
max_len, batch_size = 90, 16

In [17]:
# Build one loader per partition with identical tokenizer and batching settings.
train_loader, test_loader = (
    create_data_loader(split_df, tokenizer, max_len, batch_size)
    for split_df in (df_train, df_test)
)