In [1]:
!pip install smart_pytorch

Collecting smart_pytorch
  Downloading smart_pytorch-0.0.4-py3-none-any.whl (3.8 kB)
Collecting data-science-types>=0.2 (from smart_pytorch)
  Downloading data_science_types-0.2.23-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.7/42.7 kB[0m [31m549.5 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.6->smart_pytorch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.6->smart_pytorch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.6->smart_pytorch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.6->smart_pytorch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-

In [3]:
import pandas as pd
import os
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaConfig

from smart_pytorch import SMARTLoss, kl_loss, sym_kl_loss
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.utils.data import (
    Dataset,
    DataLoader,
    RandomSampler,
    SequentialSampler
)
import math
from transformers.optimization import (
    AdamW,
    get_linear_schedule_with_warmup
)
from sklearn.metrics import (
    confusion_matrix,
    matthews_corrcoef,
    accuracy_score,
    roc_curve,
    auc,
    average_precision_score,
    f1_score,
)
from scipy.special import softmax
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

In [4]:
# Environment diagnostics: CUDA build version, torch version, GPU visibility, cuDNN flag.
print(torch.version.cuda)
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.backends.cudnn.enabled)


# Checkpoint identifier used for the config and (via tokenizer_name) the tokenizer.
model_name = "roberta-large"

# Number of output classes of the classification head.
num_labels = 3
# Use the GPU when one is visible, otherwise fall back to CPU so the notebook
# still runs end to end on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer_name = model_name

# Every text is padded / truncated to this many tokens by MyClassificationDataset.
max_seq_length = 128
train_batch_size = 8
# Batch size of the inference DataLoader.
test_batch_size = 8
# Training hyper-parameters; not referenced by the inference cells visible in this notebook.
warmup_ratio = 0.06
weight_decay=0.0
gradient_accumulation_steps = 1
num_train_epochs = 20
learning_rate = 1e-05
adam_epsilon = 1e-08


class SMARTRobertaClassificationModel(nn.Module):
    """Sequence classifier whose loss is cross-entropy plus a weighted SMART term.

    Args:
        model: Wrapped classifier. ``forward`` uses its ``roberta`` attribute
            (which must expose an ``embeddings`` sub-module) and its
            ``classifier`` attribute.
        weight: Multiplier applied to the SMART loss before it is added to
            the cross-entropy loss.
    """

    def __init__(self, model, weight = 0.02):
        super().__init__()
        self.model = model
        self.weight = weight

    def forward(self, input_ids, attention_mask, labels):
        """Compute logits and the combined loss for one batch.

        Args:
            input_ids: Token ids passed to ``self.model.roberta.embeddings``.
            attention_mask: Forwarded unchanged to ``self.model.roberta``.
            labels: Target class ids; flattened before the cross-entropy call.

        Returns:
            ``(logits, loss)`` where ``logits`` are the unperturbed classifier
            outputs and ``loss`` is cross-entropy plus ``self.weight`` times
            the SMART loss.
        """
        # Embeddings of the input tokens; these are what SMARTLoss receives.
        embed = self.model.roberta.embeddings(input_ids)

        # Maps embeddings to logits. Deliberately not called `eval`, which
        # would shadow the Python builtin inside this method.
        def logits_from_embed(embed):
            outputs = self.model.roberta(inputs_embeds=embed, attention_mask=attention_mask)
            # First element of the encoder output is handed to the
            # classification head as-is.
            encoder_output = outputs[0]
            return self.model.classifier(encoder_output)

        smart_loss_fn = SMARTLoss(eval_fn = logits_from_embed, loss_fn = kl_loss, loss_last_fn = sym_kl_loss)

        # Logits of the unperturbed input.
        state = logits_from_embed(embed)

        # Class count is read from the logits rather than hard-coded, so the
        # wrapper works for any number of labels.
        loss = F.cross_entropy(state.view(-1, state.size(-1)), labels.view(-1))
        loss = loss + self.weight * smart_loss_fn(embed, state)

        return state, loss

# The tokenizer is loaded from `tokenizer_name` and the weights from
# `model_name`, so each setting controls exactly the artefact it is named after.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

config = RobertaConfig.from_pretrained(model_name, num_labels=num_labels)
model = AutoModelForSequenceClassification.from_pretrained(model_name,config=config)

# Wrap the classifier so that forward() returns (logits, loss).
model_smart = SMARTRobertaClassificationModel(model)


class MyClassificationDataset(Dataset):
    """Map-style dataset that pairs pre-tokenised texts with integer labels."""

    def __init__(self, data, tokenizer):
        """Tokenise every text once, up front.

        Args:
            data: ``(texts, labels)`` pair.
            tokenizer: Callable invoked with keyword arguments; its result is
                stored as ``self.examples`` and must be iterable over field
                names and indexable by field name.
        """
        texts, targets = data
        # Pad / truncate everything to the notebook-wide max_seq_length.
        encoding_options = dict(
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
            return_tensors="pt",
        )
        self.examples = tokenizer(text=texts, text_pair=None, **encoding_options)
        self.labels = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        """Number of encoded examples (rows of ``input_ids``)."""
        return len(self.examples["input_ids"])

    def __getitem__(self, index):
        """Return ``(features, label)`` for the example at ``index``."""
        features = {}
        for field in self.examples:
            features[field] = self.examples[field][index]
        return features, self.labels[index]


def get_inputs_dict(batch):
    """Flatten a ``(features, labels)`` batch into one dict of tensors on ``device``.

    Each feature tensor has dimension 1 squeezed (a no-op unless that dimension
    has size 1) and is moved to the notebook-level ``device``; the labels are
    added under the ``"labels"`` key.
    """
    features, targets = batch[0], batch[1]
    inputs = {}
    for name, tensor in features.items():
        inputs[name] = tensor.squeeze(1).to(device)
    inputs["labels"] = targets.to(device)
    return inputs


def compute_metrics(preds, model_outputs, labels, eval_examples=None, multi_label=False):
    """Score class predictions against reference labels.

    Args:
        preds: Predicted class ids, one per example.
        model_outputs: Not used by this function; kept so existing calls keep working.
        labels: Reference class ids, same length as ``preds``.
        eval_examples: Not used by this function.
        multi_label: Not used by this function.

    Returns:
        ``(scores, con_m)``: ``scores`` is a dict with keys ``"mcc"``,
        ``"acc"`` and ``"f1"`` (macro-averaged); ``con_m`` is the confusion
        matrix computed over the label set ``[0, 1, 2]``.

    Raises:
        AssertionError: If ``preds`` and ``labels`` differ in length.
    """
    assert len(preds) == len(labels)
    scores = {
        "mcc": matthews_corrcoef(labels, preds),
        "acc": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average='macro'),
    }
    con_m = confusion_matrix(labels, preds, labels=[0, 1, 2])
    return scores, con_m

def print_confusion_matrix(result):
    """Print a 2x2 confusion matrix as an ASCII table.

    Args:
        result: Mapping with integer counts under the keys ``'tn'``, ``'fp'``,
            ``'fn'`` and ``'tp'``.
    """
    def count(key):
        # Right-align each count in a 5-character column.
        return format(result[key], '5d')

    def table_rows():
        # Rows are produced lazily so each count is looked up only when its
        # own row is about to be printed.
        yield ('confusion matrix:',)
        yield ('            predicted    ',)
        yield ('          0     |     1',)
        yield ('    ----------------------',)
        yield ('   0 | ', count('tn'), ' | ', count('fp'))
        yield ('gt -----------------------',)
        yield ('   1 | ', count('fn'), ' | ', count('tp'))
        yield ('---------------------------------------------------',)

    for parts in table_rows():
        print(*parts)

12.1
2.2.1+cu121
True
True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Load the pre-processed YouTube summaries that are to be scored.
df_sample =  pd.read_csv("../Data pre-processing/youtube_preprocessed_2024.csv")

# Placeholder labels (all zeros): the dataset class requires labels even though
# this data is unlabelled. Sized from the frame itself instead of a hard-coded
# row count, so the cell keeps working when the CSV changes.
new_labels = np.zeros(len(df_sample))

# NOTE(review): astype(str) turns a missing Summary into the literal text "nan",
# which still receives a prediction downstream — consider filtering such rows.
sample_examples = (df_sample['Summary'].astype(str).tolist(),new_labels)
sample_dataset = MyClassificationDataset(sample_examples,tokenizer)

# shuffle=False keeps predictions in the same order as the rows of df_sample.
sample_dataloader = DataLoader(sample_dataset,shuffle=False,batch_size=test_batch_size)

print(sample_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7ab25a43a260>


In [6]:
torch.cuda.empty_cache()

model_smart.to(device)

# Checkpoint (training epoch) whose weights are used for scoring.
epoch = 8
PATH = "SMART_RoBERTa_Large_FinancialPhraseBank/AllAgree/epoch"+str(epoch)
# map_location lets a checkpoint that was saved from a GPU load onto whatever
# `device` currently is.
model_smart.load_state_dict(torch.load(PATH, map_location=device))
model_smart.eval()

eval_loss = 0.0
nb_eval_steps = 0
preds = np.empty((len(sample_dataset), num_labels))
out_label_ids = np.empty((len(sample_dataset)))

# NOTE(review): the loop is intentionally not wrapped in torch.no_grad():
# model_smart's forward also evaluates the SMART loss, which presumably relies
# on autograd internally — confirm before adding it.
start_index = 0
for test_batch in sample_dataloader:
    # get_inputs_dict already moves every tensor to `device`.
    test_batch = get_inputs_dict(test_batch)
    logits, tmp_eval_loss = model_smart(
        test_batch['input_ids'],
        attention_mask=test_batch['attention_mask'],
        labels=test_batch['labels'],
    )
    eval_loss += tmp_eval_loss.item()
    nb_eval_steps += 1

    # Slice bounds follow the actual batch size, so a short final batch needs
    # no special case.
    end_index = start_index + logits.shape[0]
    preds[start_index:end_index] = logits.detach().cpu().numpy()
    out_label_ids[start_index:end_index] = test_batch["labels"].detach().cpu().numpy()
    start_index = end_index

eval_loss = eval_loss / nb_eval_steps
model_outputs = preds
preds = np.argmax(preds, axis=1)
# The labels here are all-zero placeholders, so eval_loss, result and con_m are
# not meaningful quality measures; only the predicted classes are.
result, con_m = compute_metrics(preds, model_outputs, out_label_ids)
pred_final = preds

In [7]:
# Attach the predicted class to every row. assign() overwrites an existing
# 'Sentiment' column, so re-running this cell does not fail on an overlapping
# column name; pred_final is in the same row order as df_sample because the
# DataLoader was built with shuffle=False.
df_sample = df_sample.assign(Sentiment=pred_final)
df_sample

Unnamed: 0.1,Unnamed: 0,Video ID,Title,Channel,Upload date,Stock,Summary,Company,Symbol,Sentiment
0,0,YEoJq_PcOgc,What can reignite Apple shares?,CNBC Television,2024-02-28 17:55:38+00:00,1,i don't expect anything to happen at the share...,Apple,AAPL,2
1,1,BOm0zNiaNjg,Alphabet's AI problems: Stock falls 4%,CNBC Television,2024-02-26 19:01:28+00:00,3,The stock of alphabet is down 4%. The stock ne...,Alphabet,GOOGL,0
2,2,cTncTPylZQ8,Apple shares touch a 4-month low,CNBC Television,2024-03-05 17:54:20+00:00,1,i'm not particularly very bullish on the eye w...,Apple,AAPL,1
3,3,qAITe2Hn8Hc,"Three-Stock Lunch: Broadcom, Kroger & Costco",CNBC Television,2024-03-07 20:47:58+00:00,217,Kroger is up 23% over the past month. This has...,Kroger,KR,0
4,4,qAITe2Hn8Hc,"Three-Stock Lunch: Broadcom, Kroger & Costco",CNBC Television,2024-03-07 20:47:58+00:00,23,"finally, costco reporting results after the be...",Costco,COST,1
...,...,...,...,...,...,...,...,...,...,...
213,213,xbU_b4Pwank,"Target foresees sales rebound, plans new store...",Reuters,2024-03-06 06:46:05+00:00,107,target is aiming for a better year in 2024. on...,Target,TGT,2
214,214,q1DoDWjQkDk,Tesla steps up EV price war in China | REUTERS,Reuters,2024-03-01 12:34:11+00:00,11,tesla is stepping up a price war over electric...,Tesla,TSLA,2
215,215,gUwKeUkdBVw,Meta resolves issue after thousands report out...,NBC News,2024-03-05 21:30:02+00:00,5,thousands of meta users were not able to sign ...,Meta,META,0
216,216,xOqh-4THhF8,Texas Gov. Greg Abbott on border crisis and Tr...,ABC News,2024-03-06 03:30:01+00:00,37,,Abbott,ABT,1


In [8]:
# Write the scored frame next to the notebook, without the pandas index column.
output_csv = "youtube_sentiments_2024.csv"
df_sample.to_csv(output_csv, index=False)