In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, ClassLabel, Features, Value
from transformers import AutoModelForSequenceClassification
import evaluate
from transformers import TrainingArguments, Trainer
from huggingface_hub import login
from scipy.special import softmax

from scripts import preprocessing as pr
from scripts import utils
from scripts import lstm_model as lm
from scripts import gru_model as gr

In [2]:
# Reading the Parquet file using PyArrow
with open('.\\data\\output_speech_us_central_bank_v2.parquet', 'rb') as handle:
    text = pq.read_table(handle).to_pandas()

In [3]:
preprocessor = pr.TextPreprocessor(
    remove_stopwords=False,
    apply_pos_tagging=True,      # Enable POS tagging
    apply_lemmatization=False     # Enable lemmatization
)

text[['speech_text_processed_text', 'speech_text_word_tokens',
      'speech_text_sent_tokens', 'speech_text_word_tokens_wo_stopwords',
      'speech_text_pos_tags']] = preprocessor.preprocess_dataframe(text, 'speech_text')

text.dropna(inplace=True)

text['processed_speech_text'] = text['speech_text_word_tokens'].apply(' '.join)  # speech_text_word_tokens_wo_stopwords

## 1. LR

In [4]:
data = text[['speech_text_processed_text', 'target_label']]

# Initialize the vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# Fit and transform the text data
X = vectorizer.fit_transform(data['speech_text_processed_text'])

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the sentiment labels
data['sentiment_encoded'] = label_encoder.fit_transform(data['target_label'])
y = data['sentiment_encoded']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment_encoded'] = label_encoder.fit_transform(data['target_label'])


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, data.index, 
                                                                                 stratify=y, test_size=0.2, 
                                                                                 random_state=42)

In [6]:
# Train the Logistic Regression model
model_lr = LogisticRegression(class_weight='balanced')
model_lr.fit(X_train, y_train)

# Make predictions
pred_prob_lr = model_lr.predict_proba(X_test)[:, 1]
pred_label_lr = (pred_prob_lr >= 0.5).astype(int)

# Evaluate the model
display(utils.test_report(Y_test=y_test, labels=pred_label_lr, probs=pred_prob_lr))
print(confusion_matrix(y_test, pred_label_lr))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg,balanced accuracy,auc
precision,0.467532,0.875862,0.734234,0.671697,0.776539,0.71131,0.819775
recall,0.666667,0.755952,0.734234,0.71131,0.734234,0.71131,0.819775
f1-score,0.549618,0.811502,0.734234,0.68056,0.7478,0.71131,0.819775
support,54.0,168.0,0.734234,222.0,222.0,0.71131,0.819775


[[ 36  18]
 [ 41 127]]


## 2. Rule Based using VADER 

In [45]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER Lexicon
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

## Setting up function for prediction
def rule_based_sentiment(text):
    score = sia.polarity_scores(text)
    compound = score['compound']
    if compound > 0:
        return 1, compound  # Positive
    else:
        return 0, compound  # Negative

# Storing test set using test indices
vader_test_X = data.loc[indices_test, 'speech_text_processed_text']
vader_test_y = data.loc[indices_test, 'sentiment_encoded']

# Predicting on the test set
pred_vader = vader_test_X.apply(rule_based_sentiment)

pred_label_vader = pd.DataFrame(pred_vader.tolist())[0]
pred_prob_vader = pd.DataFrame(pred_vader.tolist())[1]

# Since vader model provides a score between -1 and 1, we will rescale it for a range between 0 and 1
pred_prob_vader = (pred_prob_vader + 1) / 2


# Evaluate the model
display(utils.test_report(Y_test=vader_test_y, labels=pred_label_vader, probs=pred_prob_vader))
print(confusion_matrix(vader_test_y, pred_label_vader))

[nltk_data] Downloading package vader_lexicon to C:\Users\MOHAMMED
[nltk_data]     USAMA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,0,1,accuracy,macro avg,weighted avg,balanced accuracy,auc
precision,0.333333,0.761905,0.738739,0.547619,0.657658,0.513228,0.565256
recall,0.074074,0.952381,0.738739,0.513228,0.738739,0.513228,0.565256
f1-score,0.121212,0.846561,0.738739,0.483886,0.670125,0.513228,0.565256
support,54.0,168.0,0.738739,222.0,222.0,0.513228,0.565256


[[  4  50]
 [  8 160]]


In [None]:
pred_label_vader.value_counts()

0
1    210
0     12
Name: count, dtype: int64

## 3. NB

In [None]:
# Initialize and train the Naive Bayes model
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

# Make predictions
y_pred_nb = model_nb.predict(X_test)

# Make predictions
pred_prob_nb = model_nb.predict_proba(X_test)[:, 1]
pred_label_nb = (pred_prob_nb >= 0.5).astype(int)

# Evaluate the model
display(utils.test_report(Y_test=y_test, labels=pred_label_nb, probs=pred_prob_nb))
print(confusion_matrix(y_test, pred_label_nb))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,0,1,accuracy,macro avg,weighted avg,balanced accuracy,auc
precision,0.0,0.756757,0.756757,0.378378,0.572681,0.5,0.613095
recall,0.0,1.0,0.756757,0.5,0.756757,0.5,0.613095
f1-score,0.0,0.861538,0.756757,0.430769,0.651975,0.5,0.613095
support,54.0,168.0,0.756757,222.0,222.0,0.5,0.613095


[[  0  54]
 [  0 168]]


## 4. RF

In [None]:
rf_tuning = False

if rf_tuning:
    # Tuning
    model_rf = RandomForestClassifier()

    param_grid = {
        'n_estimators': [50, 70, 100, 120],
        'max_depth': [None, 10, 15, 20, 30],
        'min_samples_split': [5, 10, 15, 20, 30],
        'min_samples_leaf': [4, 8, 10, 15],
        'bootstrap': [True],
        'class_weight': ['balanced'],
        'criterion': ['log_loss'],  # 'entropy'
        'max_features': [None]
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Random search
    r_search = RandomizedSearchCV(estimator=model_rf, param_distributions=param_grid, cv=skf, 
                                n_iter=10, scoring='f1_macro', n_jobs=-1, verbose=1,
                                refit=False)

    r_search.fit(X_train, y_train)

    # Save best parameters
    best_params = r_search.best_params_
    with open(f'./model/rf/rf_best_params.pickle', 'wb') as handle:
        pickle.dump(best_params, handle)
else:
    with open('./model/rf/rf_best_params.pickle', 'rb') as handle:
        best_params = pickle.load(handle)

# Random Forest Model
model_rf = RandomForestClassifier(**best_params, n_jobs=-1)
model_rf.fit(X_train, y_train)

# Make predictions
pred_prob_rf = model_rf.predict_proba(X_test)[:, 1]
pred_label_rf = (pred_prob_rf >= 0.5).astype(int)

# Evaluate the model
display(utils.test_report(Y_test=y_test, labels=pred_label_rf, probs=pred_prob_rf))
print(confusion_matrix(y_test, pred_label_rf))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg,balanced accuracy,auc
precision,0.666667,0.830688,0.806306,0.748677,0.790791,0.670966,0.816138
recall,0.407407,0.934524,0.806306,0.670966,0.806306,0.670966,0.816138
f1-score,0.505747,0.879552,0.806306,0.692649,0.788626,0.670966,0.816138
support,54.0,168.0,0.806306,222.0,222.0,0.670966,0.816138


[[ 22  32]
 [ 11 157]]


## 5. LSTM

In [None]:
# Word vocabulary
word_tokens = text['speech_text_word_tokens'].dropna().tolist()
all_word_tokens = [token for sublist in word_tokens for token in sublist]
word2idx = lm.build_vocab(all_word_tokens, max_vocab_size=50000, min_freq=2)
vocab_size = len(word2idx)

print(f"Vocabulary size: {vocab_size}")

# POS vocabulary
pos_tags = text['speech_text_pos_tags'].dropna().tolist()
all_pos_tags = [tag for sublist in pos_tags for tag in sublist]

unique_pos_tags = sorted(set(all_pos_tags))
pos_tag2idx = {tag: idx for idx, tag in enumerate(unique_pos_tags, start=1)}
pos_tag2idx['<PAD>'] = 0  # Padding token

print(f"Number of unique POS tags: {len(pos_tag2idx)}")

# Encode tokens and POS tags
text['encoded_text'] = text['speech_text_word_tokens'].apply(lambda x: lm.encode_text(x, word2idx))
text['encoded_pos_tags'] = lm.encode_pos_tags(text['speech_text_pos_tags'].dropna().tolist(), pos_tag2idx)

# Encode labels
label_mapping = {'Positive': 0, 'Negative': 1}
text['label'] = list(map(label_mapping.get, text['target_label']))

print(f"Label distribution: {text['label'].value_counts()}")

# Final dataset
data = text[['encoded_text', 'encoded_pos_tags', 'label']]

Vocabulary size: 19682
Number of unique POS tags: 1
Label distribution: label
0    840
1    269
Name: count, dtype: int64


In [None]:
# Split data into train / test set
train_df, test_df = data.loc[indices_train], data.loc[indices_test]

# Slice train set into windows of fixed length
slice_data = True
window_size = 150

if slice_data:
  # Slice train set
  sliced_rows = []
  for _, row in train_df.iterrows():
      sliced_rows.extend(lm.slice_lists(row, window_size))

  train_df = pd.DataFrame(sliced_rows, columns=['encoded_text', 'encoded_pos_tags', 'label'])

print(f"Training set size: {train_df.shape}")
print(f"Test set size: {test_df.shape}")

Training set size: (17012, 3)
Test set size: (222, 3)


In [None]:
# Create Datasets
train_dataset = lm.SpeechDataset(
    train_df['encoded_text'].tolist(),
    train_df['encoded_pos_tags'].tolist(),
    train_df['label'].tolist()
)

# valid_dataset = SpeechDataset(
#     valid_df['encoded_text'].tolist(),
#     valid_df['encoded_pos_tags'].tolist(),
#     valid_df['label'].tolist()
# )

test_dataset = lm.SpeechDataset(
    test_df['encoded_text'].tolist(),
    test_df['encoded_pos_tags'].tolist(),
    test_df['label'].tolist()
)

# Create DataLoaders
BATCH_SIZE = 64

train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=lm.collate_fn
)

# valid_loader = DataLoader(
#     valid_dataset, 
#     batch_size=BATCH_SIZE, 
#     shuffle=False, 
#     collate_fn=collate_fn
# )

test_loader = DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=lm.collate_fn
)

In [None]:
# Hyperparameters
HIDDEN_DIM = 128
CELL_STATE_INFO = 'linear'  # 'ignore', 'add', 'linear'
OUTPUT_DIM = 1  # Binary Classification
N_LAYERS = 2
DROPOUT_LSTM = 0.2
DROPUT_H_C_STATES = 0.05
PAD_IDX = word2idx['<PAD>']
POS_PAD_IDX = pos_tag2idx['<PAD>']
VOCAB_SIZE = len(word2idx)
POS_VOCAB_SIZE = len(pos_tag2idx)
EMBEDDING_DIM = 50
POS_EMBEDDING_DIM = 32
FREEZE_EMBEDDINGS = False
POS_EMBEDDINGS = False

# Load pretrained embeddings
## !wget http://nlp.stanford.edu/data/glove.6B.zip
## !unzip glove.6B.zip

# url = "http://nlp.stanford.edu/data/glove.6B.zip"
# response = requests.get(url, stream=True)

# with open("./data/embeddings/glove.6B.zip", "wb") as file:
#     for chunk in response.iter_content(chunk_size=1024*1024):
#         file.write(chunk)

# archive  = zipfile.ZipFile("data/embeddings/glove.6B.zip", "r")

# for file in archive.namelist():
#     if file.startswith('glove.6B.50d.txt'):
#         archive.extract(file, 'data/embeddings/')

# Create Embedding Matrix
embedding_index = {}
with open('data/embeddings/glove.6B.50d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeffs

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word2idx.items():  # iterates through each word in the vocabulary (which is created before)
    if i >= vocab_size:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Initialize model
model_lstm = lm.SentimentLSTM(
    hidden_dim=HIDDEN_DIM,
    cell_state_info=CELL_STATE_INFO,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    dropout_lstm=DROPOUT_LSTM,
    dropout_h_c_states=DROPUT_H_C_STATES,
    pad_idx=PAD_IDX,
    pos_pad_idx=POS_PAD_IDX,
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    pos_vocab_size=POS_VOCAB_SIZE,
    pos_embedding_dim=POS_EMBEDDING_DIM,
    pretrained_embeddings=embedding_matrix,
    freeze_embeddings=FREEZE_EMBEDDINGS,
    pos_embeddings=POS_EMBEDDINGS
)

# Define loss and optimizer
target_count = data['label'].value_counts()
neg = target_count[0]
pos = target_count[1]

criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(neg / pos))
optimizer = optim.AdamW(model_lstm.parameters(), lr=1e-4, weight_decay=1e-4)

# Move model and criterion to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_lstm = model_lstm.to(device)
criterion = criterion.to(device)

print(f"Using device: {device}")

Using device: cpu


# 6. GRU

In [None]:
CELL_STATE_INFO = 'linear'
DROPUT_H_C_STATES=0.05

In [None]:
model_gru = gr.SentimentGRU(
    hidden_dim=HIDDEN_DIM,
    cell_state_info=CELL_STATE_INFO,  # Add this
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    dropout_gru=DROPOUT_LSTM,  # Use the same dropout for GRU
    dropout_h_c_states=DROPUT_H_C_STATES,  # Add this
    pad_idx=PAD_IDX,
    pos_pad_idx=POS_PAD_IDX,
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    pos_vocab_size=POS_VOCAB_SIZE,
    pos_embedding_dim=POS_EMBEDDING_DIM,
    pretrained_embeddings=embedding_matrix,
    freeze_embeddings=FREEZE_EMBEDDINGS,
    pos_embeddings=POS_EMBEDDINGS
)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_gru = model_gru.to(device)

In [None]:
# Create Datasets
train_dataset = lm.SpeechDataset(
    train_df['encoded_text'].tolist(),
    train_df['encoded_pos_tags'].tolist(),
    train_df['label'].tolist()
)

# valid_dataset = SpeechDataset(
#     valid_df['encoded_text'].tolist(),
#     valid_df['encoded_pos_tags'].tolist(),
#     valid_df['label'].tolist()
# )

test_dataset = lm.SpeechDataset(
    test_df['encoded_text'].tolist(),
    test_df['encoded_pos_tags'].tolist(),
    test_df['label'].tolist()
)

# Create DataLoaders
BATCH_SIZE = 64

train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=lm.collate_fn
)

# valid_loader = DataLoader(
#     valid_dataset, 
#     batch_size=BATCH_SIZE, 
#     shuffle=False, 
#     collate_fn=collate_fn
# )

test_loader = DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=lm.collate_fn
)

In [None]:
optimizer = torch.optim.AdamW(model_gru.parameters(), lr=1e-4, weight_decay=1e-4)
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(neg / pos).to(device))
NUM_EPOCHS = 25
# # Training loop
# for epoch in range(NUM_EPOCHS):
#     train_loss, train_metrics = gr.train_model_gru(
#         model_gru, train_loader, criterion, optimizer, device
#     )
#     print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train F1: {train_metrics['f1_score']:.4f}")


In [None]:
gru_training = True

if gru_training:
    NUM_EPOCHS = 25

    train_f1 = []
    test_f1 = []

    # Training Loop Without Early Stopping
    for epoch in range(NUM_EPOCHS):
        train_loss, train_metrics = gr.train_model_gru(
            model_gru,
            train_loader,
            criterion,
            optimizer,
            device
        )

        # Evaluate on Test Set
        test_loss, test_metrics, _ = gr.test_model_gru(model_gru, test_loader, criterion, device)

        print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
        print(f'\tTrain Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f}')
        print(f'\tTrain F1-Score: {train_metrics["f1_score"]:.4f} | Test F1-Score: {test_metrics["f1_score"]:.4f}')

        train_f1.append(train_metrics["f1_score"])
        test_f1.append(test_metrics["f1_score"])

        # Save the model
        gr.save_checkpoint({
            'epoch': epoch + 1,
            'model_state_dict': model_gru.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, filename=r'model/gru/best_model.pth')
else:
    # Load the best model
    gr.load_checkpoint(r'model/gru/best_model.pth', model_gru, optimizer, device=device)

# Make predictions
loss_hru, metrics_hru, preds_gru = gr.test_model_gru(model_gru, test_loader, criterion, device)
pred_prob_gru, pred_label_gru, _ = preds_gru

# Evaluate the model
display(utils.test_report(Y_test=y_test, labels=1 - np.array(pred_label_gru), probs=1 - np.array(pred_prob_gru)))
print(confusion_matrix(y_test, 1 - np.array(pred_label_gru)))

Epoch 1/25
	Train Loss: 1.0982 | Test Loss: 1.0137
	Train F1-Score: 0.2982 | Test F1-Score: 0.5853
Epoch 2/25
	Train Loss: 1.0660 | Test Loss: 0.9724
	Train F1-Score: 0.4790 | Test F1-Score: 0.5775
Epoch 3/25
	Train Loss: 1.0422 | Test Loss: 0.9577
	Train F1-Score: 0.5150 | Test F1-Score: 0.5577
Epoch 4/25
	Train Loss: 1.0214 | Test Loss: 1.0368
	Train F1-Score: 0.5397 | Test F1-Score: 0.5800
Epoch 5/25
	Train Loss: 1.0050 | Test Loss: 0.9390
	Train F1-Score: 0.5785 | Test F1-Score: 0.5975
Epoch 6/25
	Train Loss: 0.9884 | Test Loss: 1.0955
	Train F1-Score: 0.5949 | Test F1-Score: 0.6074
Epoch 7/25
	Train Loss: 0.9757 | Test Loss: 0.8919
	Train F1-Score: 0.6055 | Test F1-Score: 0.6541
Epoch 8/25
	Train Loss: 0.9648 | Test Loss: 0.8939
	Train F1-Score: 0.6196 | Test F1-Score: 0.6441
Epoch 9/25
	Train Loss: 0.9515 | Test Loss: 0.8933
	Train F1-Score: 0.6298 | Test F1-Score: 0.6858
Epoch 10/25
	Train Loss: 0.9425 | Test Loss: 0.8552
	Train F1-Score: 0.6348 | Test F1-Score: 0.6476
Epoch 11/

Unnamed: 0,0,1,accuracy,macro avg,weighted avg,balanced accuracy,auc
precision,0.708333,0.813131,0.801802,0.760732,0.78764,0.636574,0.799824
recall,0.314815,0.958333,0.801802,0.636574,0.801802,0.636574,0.799824
f1-score,0.435897,0.879781,0.801802,0.657839,0.77181,0.636574,0.799824
support,54.0,168.0,0.801802,222.0,222.0,0.636574,0.799824


[[ 17  37]
 [  7 161]]


## 7. Transformer based Model

### Model used
This model is a fine-tuned version of distilroberta-base on the financial_phrasebank dataset.  
Source: https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

In [None]:
# Import Necessary Libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import torch
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np
from datasets import Dataset, ClassLabel, Features, Value
from sklearn.model_selection import train_test_split
import pyarrow as pa
import pyarrow.parquet as pq


from scripts import preprocessing as pr

In [None]:
# Reading the Parquet file using PyArrow
with open(r'data/output_speech_us_central_bank_v2.parquet', 'rb') as handle:
    text = pq.read_table(handle).to_pandas()

In [None]:
preprocessor = pr.TextPreprocessor(
    remove_stopwords=True,
    apply_pos_tagging=True,      # Enable POS tagging
    apply_lemmatization=True     # Enable lemmatization
)

text[['speech_text_processed_text', 'speech_text_word_tokens',
      'speech_text_sent_tokens', 'speech_text_word_tokens_wo_stopwords',
      'speech_text_pos_tags']] = preprocessor.preprocess_dataframe(text, 'speech_text')

text.dropna(inplace=True)

In [None]:
text['processed_speech_text_2'] = text['speech_text_word_tokens_wo_stopwords'].apply(' '.join)

In [None]:
data = text[[ 'target_label','speech_text_processed_text']]


from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
data['target_label'] = label_encoder.fit_transform(data['target_label'])

data = data.rename(columns={'speech_text_processed_text':'text','target_label':'label'})

y = data['label']
X = data['text']

# # Split the data into training and testing sets
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, data.index,
                                                                                 stratify=y, test_size=0.2,
                                                                                 random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target_label'] = label_encoder.fit_transform(data['target_label'])


In [None]:
# Since we want the same train test split for all the models so loading the indices of the train and test sets
indices = pd.read_excel(r'data/idx.xlsx')
indices_train = indices[0]
indices_test = indices[indices[1].notna()][1]

In [None]:
train_dataset  = data.loc[indices_train].copy()
val_dataset   = data.loc[indices_test].copy()

#### Approach 1:
Since the max token size accepted by the transformer is 512 tokens, we will split the text with higher than 500 tokens into chunks and train the model

In [None]:
def split_text_into_chunks(text, label, chunk_size=500):
    words = text.split()  # Split the text into words
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return [{"text": chunk, "label": label} for chunk in chunks]

In [None]:
# Apply the function to each row of the DataFrame
result = []
for _, row in train_dataset.iterrows():
    result.extend(split_text_into_chunks(row["text"], row["label"]))

# Convert the result into a new DataFrame
split_df = pd.DataFrame(result)

split_df['word_count'] = split_df['text'].apply(lambda x: len(x.split()))
# New train dataset
train_dataset = split_df[['label','text']]

In [None]:
# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_dataset)
val_dataset = Dataset.from_pandas(val_dataset)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Training samples: 5208
Validation samples: 222


In [None]:
### Creating copy of val dataset just to check if the loaded models is reproducible
test_dataset = data.loc[indices_test].copy()
test_dataset = Dataset.from_pandas(test_dataset)

print(f"Test samples: {len(test_dataset)}")

Test samples: 222


In [None]:
# Setting up GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [None]:
# Tokenization
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        #max_length=128  # Adjust based on your data
    )

# Tokenizing the training set
train_dataset = train_dataset.map(tokenize_function, batched=True)

# Tokenizing the validation set
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5208 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained model for sequence classification
# Since the transformer is intially for 3 labels, we are setting up new labels

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # Binary classification
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1},
    ignore_mismatched_sizes=True
)

# Move the model to the appropriate device
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [None]:
#!pip install evaluate

import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch",report_to="none",)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
from huggingface_hub import login
# import google.generativeai as genai
# # Used to securely store your API key
# from google.colab import userdata


# Hugging_Face_NLP=userdata.get('Hugging_Face_NLP')


from scripts import config  #This scripts is for Hugging face API. Please create the relevant API key file

# Login using your API key
login(token=config.Hugging_Face_NLP)

In [None]:
trainer.train()

  0%|          | 0/1953 [00:00<?, ?it/s]

{'loss': 0.3682, 'grad_norm': 41.941734313964844, 'learning_rate': 3.719918074756785e-05, 'epoch': 0.77}


  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_loss': 0.7911430597305298, 'eval_accuracy': 0.7792792792792793, 'eval_runtime': 7.1853, 'eval_samples_per_second': 30.896, 'eval_steps_per_second': 3.897, 'epoch': 1.0}
{'loss': 0.3177, 'grad_norm': 51.35104751586914, 'learning_rate': 2.439836149513569e-05, 'epoch': 1.54}


  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_loss': 0.9518962502479553, 'eval_accuracy': 0.7567567567567568, 'eval_runtime': 7.1266, 'eval_samples_per_second': 31.151, 'eval_steps_per_second': 3.929, 'epoch': 2.0}
{'loss': 0.2275, 'grad_norm': 92.83016204833984, 'learning_rate': 1.1597542242703534e-05, 'epoch': 2.3}


  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_loss': 1.1411126852035522, 'eval_accuracy': 0.7567567567567568, 'eval_runtime': 7.3337, 'eval_samples_per_second': 30.271, 'eval_steps_per_second': 3.818, 'epoch': 3.0}
{'train_runtime': 2187.7731, 'train_samples_per_second': 7.142, 'train_steps_per_second': 0.893, 'train_loss': 0.27990975406801033, 'epoch': 3.0}


TrainOutput(global_step=1953, training_loss=0.27990975406801033, metrics={'train_runtime': 2187.7731, 'train_samples_per_second': 7.142, 'train_steps_per_second': 0.893, 'total_flos': 2069670636601344.0, 'train_loss': 0.27990975406801033, 'epoch': 3.0})

In [None]:
predictions = trainer.predict(val_dataset)

  0%|          | 0/28 [00:00<?, ?it/s]

In [None]:
import numpy as np
import pandas as pd
from scipy.special import softmax
logits = predictions.predictions  # Shape: (num_samples, num_classes)

# Apply SoftMax to convert logits to probabilities
probabilities = softmax(logits, axis=1)
predicted_labels = np.argmax(probabilities, axis=1)

prob_df = pd.DataFrame(probabilities, columns=[0,1])
probabilities = prob_df[1].values

In [None]:
from scripts import utils
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model
display(utils.test_report(Y_test=val_dataset['label'], labels=predicted_labels, probs=probabilities))
print(confusion_matrix(val_dataset['label'], predicted_labels))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg,balanced accuracy,auc
precision,0.5,0.816667,0.756757,0.658333,0.73964,0.631944,0.765542
recall,0.388889,0.875,0.756757,0.631944,0.756757,0.631944,0.765542
f1-score,0.4375,0.844828,0.756757,0.641164,0.745748,0.631944,0.765542
support,54.0,168.0,0.756757,222.0,222.0,0.631944,0.765542


[[ 21  33]
 [ 21 147]]


#### Approach 2:
Alternatively, if we don't set the max tokens parameters and keep the input data as it is then it truncates the inputs accordlingly and process it for the model 

In [None]:
# Reset the train and validation dataset
train_dataset  = data.loc[indices_train].copy()
val_dataset   = data.loc[indices_test].copy()

In [None]:
# Tokenization
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        #max_length=128  # Adjust based on your data
    )

# Tokenizing the training set
train_dataset = train_dataset.map(tokenize_function, batched=True)

# Tokenizing the validation set
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5208 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained model for sequence classification
# Since the transformer is intially for 3 labels, we are setting up new labels

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # Binary classification
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1},
    ignore_mismatched_sizes=True
)

# Move the model to the appropriate device
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch",report_to="none",)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
from huggingface_hub import login
from scripts import config

# Login using your API key
login(token=config.Hugging_Face_NLP)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(val_dataset)
logits = predictions.predictions  # Shape: (num_samples, num_classes)

# Apply SoftMax to convert logits to probabilities
probabilities = softmax(logits, axis=1)
predicted_labels = np.argmax(probabilities, axis=1)

prob_df = pd.DataFrame(probabilities, columns=[0,1])
probabilities = prob_df[1].values

In [None]:
from scripts import utils
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model
display(utils.test_report(Y_test=val_dataset['label'], labels=predicted_labels, probs=probabilities))
print(confusion_matrix(val_dataset['label'], predicted_labels))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg,balanced accuracy,auc
precision,0.75,0.854839,0.837838,0.802419,0.829337,0.723214,0.876213
recall,0.5,0.946429,0.837838,0.723214,0.837838,0.723214,0.876213
f1-score,0.6,0.898305,0.837838,0.749153,0.825744,0.723214,0.876213
support,54.0,168.0,0.837838,222.0,222.0,0.723214,0.876213


[[ 27  27]
 [  9 159]]
