In [None]:
# Automatic reload of external .py scripts
# Loads IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Set up the working directory

Ana

In [None]:
# Local directory
# Switch the notebook's working directory to the project checkout on the
# local machine so that the relative paths used later resolve.
# NOTE(review): absolute, user-specific path; run either this cell or the
# server-directory cell, depending on the machine.
%cd /home/abaric/TakeLab/projects/retriever-sentiment/

In [None]:
# Server directory
# Switch the notebook's working directory to the project checkout on the
# server so that the relative paths used later resolve.
# NOTE(review): absolute, user-specific path; run either this cell or the
# local-directory cell, depending on the machine.
%cd /home/abaric/retriever-sentiment/

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

import torch
import torch.utils.data
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from models.bertic_nn import Bertic_NN
from models.fine_tune_framework import DLFramework

from dataset_class.token_dataset import TokenDataset
from data_transformation.label_transformation import label_transformation
from data_transformation.token_transformation import *

# Fine-tuning BERTić model
__________________________________________________________________________________________________________________________________________________________________________________

## Set-up GPU

In [None]:
# Set up GPU
#
# Selects the torch device used for the model: a CUDA device when one is
# available, otherwise the CPU.

if torch.cuda.is_available():
    # Keep preferring the second GPU (cuda:1), but fall back to cuda:0 on
    # single-GPU machines, where cuda:1 does not exist and moving a model
    # to it would fail.
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
# Display the selected torch device as the cell output.
device

## Import data

In [None]:
# Load the gold-label dataset from CSV. The path is relative, so it depends
# on the working directory chosen by the %cd cells earlier in the notebook.
data_path = 'data/gold_label/stone_gold_label.csv'
data = pd.read_csv(data_path)
# Display the loaded frame as the cell output.
data

Sentiment label distribution

In [None]:
# Number of rows per distinct value of the 'aggregated_sentiment' column.
data['aggregated_sentiment'].value_counts()

Tone label distribution

In [None]:
# Number of rows per distinct value of the 'aggregated_tone' column.
data['aggregated_tone'].value_counts()

### Mask NER

In [None]:
# Replace named entities with [MASK]
#
# For each row, the entity surface string is taken from the character span
# [ner_begin, ner_end) of the text and substituted with the literal '[MASK]'.
# NOTE(review): str.replace substitutes every occurrence of that surface
# string in the text, not only the annotated span - confirm this is intended.

masked_target = [
    sentence.replace(sentence[span_start:span_end], '[MASK]')
    for sentence, span_start, span_end in zip(
        data['text'], data['ner_begin'], data['ner_end']
    )
]

In [None]:
# Attach the masked texts to `data` (modifies the frame in place).
data['masked_target_text'] = masked_target
# Every row gets the literal '[MASK]' as its target entity.
data['target_entity']  = '[MASK]'
# Constant 0 for every row - presumably the occurrence index of the target
# within the text; TODO confirm against the token transformation.
data['target_order'] = 0

## Split dataset into train and test sets

In [None]:
import random 

def split_dataset(X, y, test_proportion, random_state=None):
    """Randomly split features and labels into train and test partitions.

    Args:
        X: Feature collection accepted by ``train_test_split``.
        y: Labels aligned with ``X``.
        test_proportion: Value forwarded as ``test_size`` to
            ``train_test_split`` (previously ignored in favour of a
            hard-coded 0.3).
        random_state: Optional seed for a reproducible split. When ``None``
            (the default), a fresh seed in [0, 1000] is drawn on every call,
            which matches the original behaviour.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    if random_state is None:
        # No seed requested: keep the original "different split per call".
        random_state = random.randint(0, 1000)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_proportion, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Feature columns handed to the dataset. Note that the original 'text' and
# 'ner' columns are selected here, not 'masked_target_text'.
X = data[['document_id', 'text', 'ner', 'target_order', 'ner_type']]

# Set sentiment labels as target labels
label = 'aggregated_sentiment'
y = data[label]

# Request a 30% test partition from split_dataset.
X_train, X_test, y_train, y_test = split_dataset(X, y, 0.3)

In [None]:
# Train and test shapes
# Print the shape of each partition, then the label counts per partition.
print('Train and test shapes:\n')
for _caption, _part in (('X_train dimension = ', X_train),
                        ('y_train dimension = ', y_train)):
    print(_caption, _part.shape)

print()

for _caption, _part in (('X_test dimension = ', X_test),
                        ('y_test dimension = ', y_test)):
    print(_caption, _part.shape)

print('..........................................')

# Label distribution
print(f'Train label distribution:\n\n{y_train.value_counts()}')
print('\n----------------------------------------')
print(f'Test label distribution:\n\n{y_test.value_counts()}')

## Dataset setup

In [None]:
# Display the training features as the cell output.
X_train

In [None]:
# Show the text of the training row at position 585.
# NOTE(review): the position is hard-coded while the split is drawn with a
# random seed, so the row shown can differ between runs, and the lookup
# raises IndexError if the train partition has fewer than 586 rows.
X_train.iloc[585]['text']

In [None]:
# Wrap the train partition in the project's TokenDataset, passing the
# token and label transformation callables imported at the top.
train_dataset = TokenDataset(X_train, y_train,
                            token_transformation,
                            label_transformation)


# Same construction for the test partition.
test_dataset = TokenDataset(X_test, y_test,
                            token_transformation,
                            label_transformation)

## Feature transformation

In [None]:
from feature_selection.token_selection import *
from feature_selection.token_aggregation import *
from feature_selection.layer_strategy import *

from data_transformation.feature_transformation import FeatureTransformation

In [None]:
# Feature transformation configs
# Each config combines a token selection, a token aggregation and a layer
# strategy (in that argument order).

# Only target tokens - 1
ft_11 = FeatureTransformation(only_target, average_aggregation, last_layer)
ft_12 = FeatureTransformation(only_target, average_aggregation, second_to_last)
ft_13 = FeatureTransformation(only_target, average_aggregation, sum_all)
# NOTE(review): ft_14 is built with exactly the same arguments as ft_11
# (last_layer) - possibly a copy-paste slip for a different layer strategy;
# confirm the intended strategy.
ft_14 = FeatureTransformation(only_target, average_aggregation, last_layer)
ft_15 = FeatureTransformation(only_target, average_aggregation, mean_all)

ft_1 = [ft_11, ft_12, ft_13, ft_14, ft_15]

# TODO: Add masked and only_target + NER_type

## Train model

### Set model config

In [None]:
# CONFIG_1
# Training hyper-parameters handed to the training framework.
config = {
    'batch_size': 16,
    'epochs': 50,
    'gradient_clipping': True,
}

# Classification objective used for fine-tuning.
loss = nn.CrossEntropyLoss()

### Set model + framework

In [None]:
# Init model
# NOTE(review): 769 and 3 are passed positionally - presumably the input
# feature size and the number of output classes; confirm against Bertic_NN.
# The model uses the ft_14 feature transformation.
bertic = Bertic_NN(ft_14, 769, 3)
# Move the model to the selected device.
bertic.to(device)


# Init train/eval module
bertic_module = DLFramework(bertic, loss, config, device)

### Train and eval model

In [None]:
# Run the framework on the train and test datasets; it returns the training
# statistics (indexed by 'train_loss', 'val_loss' and 'val_f1' in the next
# cell) and a second result stored as clf_reports.
bertic_stats, clf_reports = bertic_module.run(train_dataset, test_dataset)

In [None]:
# Training history returned by the run (presumably one value per epoch).
train_loss = bertic_stats['train_loss']
test_loss = bertic_stats['val_loss']
f1 = bertic_stats['val_f1']

# Derive the x-axis from the recorded history instead of the configured epoch
# count, so plotting still works if the history length differs from
# config['epochs'].
epochs = range(1, len(train_loss) + 1)

# Loss curves
fig, ax = plt.subplots()
ax.plot(epochs, train_loss, label='Train loss')
ax.plot(range(1, len(test_loss) + 1), test_loss, label='Test loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Train and test loss')
ax.legend(loc="upper left")
plt.show()

# F1 curve
fig, ax = plt.subplots()
ax.plot(range(1, len(f1) + 1), f1, label='Test F1')
ax.set_xlabel('Epoch')
ax.set_ylabel('F1')
ax.set_title('Test F1')
ax.legend(loc="upper left")
plt.show()