In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
df = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv')
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [None]:
encoder = LabelEncoder()
scaler = StandardScaler()
df['Sentiment'] = encoder.fit_transform(df['Sentiment'])
# df['Sentiment'] = scaler.fit_transform(df[['Sentiment']])
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,2
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",0
2,"For the last quarter of 2010 , Componenta 's n...",2
3,According to the Finnish-Russian Chamber of Co...,1
4,The Swedish buyout firm has sold its remaining...,1


In [None]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

df['preprocessed_text'] = df['Sentence'].apply(preprocess_text)
df.head()

Unnamed: 0,Sentence,Sentiment,preprocessed_text
0,The GeoSolutions technology will leverage Bene...,2,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",0,esi lows bk real possibility
2,"For the last quarter of 2010 , Componenta 's n...",2,last quarter componenta net sales doubled eurm...
3,According to the Finnish-Russian Chamber of Co...,1,according finnishrussian chamber commerce majo...
4,The Swedish buyout firm has sold its remaining...,1,swedish buyout firm sold remaining percent sta...


In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['preprocessed_text'])

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['Sentiment'], test_size=0.2, random_state=42)

In [None]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'Multinomial NB': MultinomialNB(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(logging_level='Silent')
}

param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'Multinomial NB': {
        'alpha': [0.1, 0.5, 1.0, 2.0]  # Smoothing parameter
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'max_depth': [-1, 10, 20],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2]
    }
}

In [None]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"\n--- {name} Results ---")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)


--- Logistic Regression Results ---
Accuracy: 0.699743370402053
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.15      0.22       175
           1       0.69      0.90      0.78       622
           2       0.77      0.63      0.69       372

    accuracy                           0.70      1169
   macro avg       0.63      0.56      0.57      1169
weighted avg       0.68      0.70      0.67      1169


--- Random Forest Results ---
Accuracy: 0.6638152266894782
Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.15      0.18       175
           1       0.68      0.83      0.75       622
           2       0.78      0.63      0.70       372

    accuracy                           0.66      1169
   macro avg       0.57      0.54      0.54      1169
weighted avg       0.65      0.66      0.65      1169


--- Gradient Boosting Results ---
Accuracy: 0.6612489307100086
Cla

In [None]:
for name, model in models.items():
    param_grid = param_grids[name]
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"\n--- {name} Results ---")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

Fitting 5 folds for each of 20 candidates, totalling 100 fits

--- Logistic Regression Results ---
Accuracy: 0.699743370402053
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.15      0.22       175
           1       0.69      0.90      0.78       622
           2       0.77      0.63      0.69       372

    accuracy                           0.70      1169
   macro avg       0.63      0.56      0.57      1169
weighted avg       0.68      0.70      0.67      1169

Fitting 5 folds for each of 216 candidates, totalling 1080 fits

--- Random Forest Results ---
Accuracy: 0.6928999144568007
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.11      0.18       175
           1       0.69      0.91      0.78       622
           2       0.73      0.61      0.66       372

    accuracy                           0.69      1169
   macro avg       0.64      0.54      0.54      1

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load data
df = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv')
encoder = LabelEncoder()
df['Sentiment'] = encoder.fit_transform(df['Sentiment'])

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['Sentence', 'Sentiment']].rename(columns={'Sentence': 'text', 'Sentiment': 'label'}))
test_dataset = Dataset.from_pandas(test_df[['Sentence', 'Sentiment']].rename(columns={'Sentence': 'text', 'Sentiment': 'label'}))

# Load FinBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    report = classification_report(labels, predictions, output_dict=True)
    return {'accuracy': accuracy, 'report': report}

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='none'  # Disable wandb logging
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
print("Training model...")
trainer.train()

# Evaluate
print("Evaluating model...")
eval_results = trainer.evaluate()
accuracy = eval_results['eval_accuracy']
print(f"\n--- FinBERT Results ---")
print(f"Accuracy: {accuracy}")

# Predict and get detailed report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=-1)
y_test = test_dataset['label']
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

Training model...


Epoch,Training Loss,Validation Loss,Accuracy,Report
1,0.5068,0.566238,0.773311,"{'0': {'precision': 0.5043478260869565, 'recall': 0.6628571428571428, 'f1-score': 0.5728395061728395, 'support': 175}, '1': {'precision': 0.8925143953934741, 'recall': 0.747588424437299, 'f1-score': 0.8136482939632544, 'support': 622}, '2': {'precision': 0.7727272727272727, 'recall': 0.8682795698924731, 'f1-score': 0.8177215189873417, 'support': 372}, 'accuracy': 0.7733105218135158, 'macro avg': {'precision': 0.7231964980692345, 'recall': 0.7595750457289716, 'f1-score': 0.7347364397078119, 'support': 1169}, 'weighted avg': {'precision': 0.7962868853331939, 'recall': 0.7733105218135158, 'f1-score': 0.7788952587584964, 'support': 1169}}"
2,0.2189,0.52751,0.816938,"{'0': {'precision': 0.7424242424242424, 'recall': 0.28, 'f1-score': 0.40663900414937765, 'support': 175}, '1': {'precision': 0.8035714285714286, 'recall': 0.9405144694533762, 'f1-score': 0.8666666666666667, 'support': 622}, '2': {'precision': 0.856, 'recall': 0.8629032258064516, 'f1-score': 0.859437751004016, 'support': 372}, 'accuracy': 0.8169375534644996, 'macro avg': {'precision': 0.8006652236652236, 'recall': 0.6944725650866093, 'f1-score': 0.71091447394002, 'support': 1169}, 'weighted avg': {'precision': 0.8111015149663567, 'recall': 0.8169375534644996, 'f1-score': 0.7954998595092402, 'support': 1169}}"
3,0.3169,0.621851,0.791275,"{'0': {'precision': 0.48223350253807107, 'recall': 0.5428571428571428, 'f1-score': 0.510752688172043, 'support': 175}, '1': {'precision': 0.8568994889267462, 'recall': 0.8086816720257235, 'f1-score': 0.8320926385442515, 'support': 622}, '2': {'precision': 0.8493506493506493, 'recall': 0.8790322580645161, 'f1-score': 0.8639365918097754, 'support': 372}, 'accuracy': 0.7912745936698032, 'macro avg': {'precision': 0.7294945469384887, 'recall': 0.7435236909824607, 'f1-score': 0.7355939728420232, 'support': 1169}, 'weighted avg': {'precision': 0.7984095693884005, 'recall': 0.7912745936698032, 'f1-score': 0.7941212606996308, 'support': 1169}}"


Evaluating model...



--- FinBERT Results ---
Accuracy: 0.8169375534644996
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.28      0.41       175
           1       0.80      0.94      0.87       622
           2       0.86      0.86      0.86       372

    accuracy                           0.82      1169
   macro avg       0.80      0.69      0.71      1169
weighted avg       0.81      0.82      0.80      1169



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load data
df = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv')
encoder = LabelEncoder()
df['Sentiment'] = encoder.fit_transform(df['Sentiment'])

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['Sentence', 'Sentiment']].rename(columns={'Sentence': 'text', 'Sentiment': 'label'}))
test_dataset = Dataset.from_pandas(test_df[['Sentence', 'Sentiment']].rename(columns={'Sentence': 'text', 'Sentiment': 'label'}))

# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Compute class weights
class_counts = np.bincount(train_df['Sentiment'])
total_samples = len(train_df)
class_weights = total_samples / (3 * class_counts)
class_weights = torch.tensor(class_weights, dtype=torch.float).to('cuda' if torch.cuda.is_available() else 'cpu')

# Custom Trainer with class weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get('labels')
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    report = classification_report(labels, predictions, output_dict=True)
    return {'accuracy': accuracy, 'report': report}

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=200,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='none'
)

# Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
print("Training model...")
trainer.train()

# Evaluate
print("Evaluating model...")
eval_results = trainer.evaluate()
accuracy = eval_results['eval_accuracy']
print(f"\n--- RoBERTa Results ---")
print(f"Accuracy: {accuracy}")

# Predict and get detailed report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=-1)
y_test = test_dataset['label']
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

Training model...


Epoch,Training Loss,Validation Loss,Accuracy,Report
1,0.37,0.532511,0.792985,"{'0': {'precision': 0.5303030303030303, 'recall': 0.8, 'f1-score': 0.6378132118451025, 'support': 175}, '1': {'precision': 0.9212121212121213, 'recall': 0.7331189710610932, 'f1-score': 0.8164726947179947, 'support': 622}, '2': {'precision': 0.8073170731707318, 'recall': 0.8897849462365591, 'f1-score': 0.8465473145780051, 'support': 372}, 'accuracy': 0.7929854576561164, 'macro avg': {'precision': 0.7529440748952944, 'recall': 0.8076346390992174, 'f1-score': 0.766944407047034, 'support': 1169}, 'weighted avg': {'precision': 0.8264490341458356, 'recall': 0.7929854576561164, 'f1-score': 0.7992976297780183, 'support': 1169}}"
2,0.3822,0.546409,0.818648,"{'0': {'precision': 0.5504201680672269, 'recall': 0.7485714285714286, 'f1-score': 0.6343825665859565, 'support': 175}, '1': {'precision': 0.903107861060329, 'recall': 0.7942122186495176, 'f1-score': 0.8451668092386654, 'support': 622}, '2': {'precision': 0.8645833333333334, 'recall': 0.8924731182795699, 'f1-score': 0.8783068783068784, 'support': 372}, 'accuracy': 0.8186484174508126, 'macro avg': {'precision': 0.7727037874869631, 'recall': 0.8117522551668387, 'f1-score': 0.7859520847105, 'support': 1169}, 'weighted avg': {'precision': 0.8380509999925486, 'recall': 0.8186484174508126, 'f1-score': 0.8241581379205739, 'support': 1169}}"
3,0.4678,0.464003,0.828058,"{'0': {'precision': 0.5457875457875457, 'recall': 0.8514285714285714, 'f1-score': 0.6651785714285714, 'support': 175}, '1': {'precision': 0.9425742574257425, 'recall': 0.7652733118971061, 'f1-score': 0.84472049689441, 'support': 622}, '2': {'precision': 0.8772378516624041, 'recall': 0.9220430107526881, 'f1-score': 0.8990825688073394, 'support': 372}, 'accuracy': 0.8280581693755347, 'macro avg': {'precision': 0.7885332182918975, 'recall': 0.8462482980261218, 'f1-score': 0.8029938790434402, 'support': 1169}, 'weighted avg': {'precision': 0.862383652224163, 'recall': 0.8280581693755347, 'f1-score': 0.8351420997986768, 'support': 1169}}"
4,0.1915,0.506073,0.821215,"{'0': {'precision': 0.5480427046263345, 'recall': 0.88, 'f1-score': 0.6754385964912281, 'support': 175}, '1': {'precision': 0.948559670781893, 'recall': 0.7411575562700965, 'f1-score': 0.8321299638989169, 'support': 622}, '2': {'precision': 0.8582089552238806, 'recall': 0.9274193548387096, 'f1-score': 0.8914728682170542, 'support': 372}, 'accuracy': 0.8212147134302823, 'macro avg': {'precision': 0.7849371102107027, 'recall': 0.8495256370362686, 'f1-score': 0.7996804762023997, 'support': 1169}, 'weighted avg': {'precision': 0.8598505730361246, 'recall': 0.8212147134302823, 'f1-score': 0.8275573130092689, 'support': 1169}}"
5,0.1459,0.588706,0.826347,"{'0': {'precision': 0.5538461538461539, 'recall': 0.8228571428571428, 'f1-score': 0.6620689655172415, 'support': 175}, '1': {'precision': 0.9300970873786408, 'recall': 0.770096463022508, 'f1-score': 0.8425681618293756, 'support': 622}, '2': {'precision': 0.8705583756345178, 'recall': 0.9220430107526881, 'f1-score': 0.8955613577023498, 'support': 372}, 'accuracy': 0.8263473053892215, 'macro avg': {'precision': 0.7848338722864375, 'recall': 0.838332205544113, 'f1-score': 0.8000661616829889, 'support': 1169}, 'weighted avg': {'precision': 0.854825646713971, 'recall': 0.8263473053892215, 'f1-score': 0.8324108560210974, 'support': 1169}}"


Evaluating model...



--- RoBERTa Results ---
Accuracy: 0.8280581693755347
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.85      0.67       175
           1       0.94      0.77      0.84       622
           2       0.88      0.92      0.90       372

    accuracy                           0.83      1169
   macro avg       0.79      0.85      0.80      1169
weighted avg       0.86      0.83      0.84      1169



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load data
df = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv')

# Remove Class 1 (neutral sentiment)
df = df[df['Sentiment'] != 'neutral']
# Encode labels (negative=0, positive=1)
encoder = LabelEncoder()
df['Sentiment'] = encoder.fit_transform(df['Sentiment'])

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['Sentence', 'Sentiment']].rename(columns={'Sentence': 'text', 'Sentiment': 'label'}))
test_dataset = Dataset.from_pandas(test_df[['Sentence', 'Sentiment']].rename(columns={'Sentence': 'text', 'Sentiment': 'label'}))

# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)  # 2 classes: negative, positive

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Compute class weights (for 2 classes)
class_counts = np.bincount(train_df['Sentiment'])
total_samples = len(train_df)
class_weights = total_samples / (2 * class_counts)
class_weights = torch.tensor(class_weights, dtype=torch.float).to('cuda' if torch.cuda.is_available() else 'cpu')

# Custom Trainer with class weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get('labels')
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    report = classification_report(labels, predictions, output_dict=True)
    return {'accuracy': accuracy, 'report': report}

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=200,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='none'
)

# Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
print("Training model...")
trainer.train()

# Evaluate
print("Evaluating model...")
eval_results = trainer.evaluate()
accuracy = eval_results['eval_accuracy']
print(f"\n--- RoBERTa Results (Class 1 Removed) ---")
print(f"Accuracy: {accuracy}")

# Predict and get detailed report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=-1)
y_test = test_dataset['label']
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2169 [00:00<?, ? examples/s]

Map:   0%|          | 0/543 [00:00<?, ? examples/s]

Training model...


Epoch,Training Loss,Validation Loss,Accuracy,Report
1,0.4033,0.242926,0.928177,"{'0': {'precision': 0.8397790055248618, 'recall': 0.9382716049382716, 'f1-score': 0.8862973760932945, 'support': 162}, '1': {'precision': 0.9723756906077348, 'recall': 0.9238845144356955, 'f1-score': 0.9475100942126514, 'support': 381}, 'accuracy': 0.9281767955801105, 'macro avg': {'precision': 0.9060773480662982, 'recall': 0.9310780596869835, 'f1-score': 0.916903735152973, 'support': 543}, 'weighted avg': {'precision': 0.93281645859406, 'recall': 0.9281767955801105, 'f1-score': 0.9292477363206886, 'support': 543}}"
2,0.3062,0.316286,0.944751,"{'0': {'precision': 0.9342105263157895, 'recall': 0.8765432098765432, 'f1-score': 0.9044585987261147, 'support': 162}, '1': {'precision': 0.948849104859335, 'recall': 0.973753280839895, 'f1-score': 0.961139896373057, 'support': 381}, 'accuracy': 0.9447513812154696, 'macro avg': {'precision': 0.9415298155875622, 'recall': 0.9251482453582192, 'f1-score': 0.9327992475495859, 'support': 543}, 'weighted avg': {'precision': 0.9444817941336364, 'recall': 0.9447513812154696, 'f1-score': 0.9442294539811515, 'support': 543}}"
3,0.0777,0.259561,0.952118,"{'0': {'precision': 0.9146341463414634, 'recall': 0.9259259259259259, 'f1-score': 0.9202453987730062, 'support': 162}, '1': {'precision': 0.9683377308707124, 'recall': 0.963254593175853, 'f1-score': 0.9657894736842105, 'support': 381}, 'accuracy': 0.9521178637200737, 'macro avg': {'precision': 0.9414859386060879, 'recall': 0.9445902595508895, 'f1-score': 0.9430174362286083, 'support': 543}, 'weighted avg': {'precision': 0.9523156669780082, 'recall': 0.9521178637200737, 'f1-score': 0.952201738627829, 'support': 543}}"
4,0.0959,0.324353,0.948435,"{'0': {'precision': 0.935064935064935, 'recall': 0.8888888888888888, 'f1-score': 0.9113924050632912, 'support': 162}, '1': {'precision': 0.9537275064267352, 'recall': 0.973753280839895, 'f1-score': 0.9636363636363636, 'support': 381}, 'accuracy': 0.9484346224677717, 'macro avg': {'precision': 0.9443962207458352, 'recall': 0.9313210848643919, 'f1-score': 0.9375143843498275, 'support': 543}, 'weighted avg': {'precision': 0.9481596674569164, 'recall': 0.9484346224677717, 'f1-score': 0.948049768260972, 'support': 543}}"
5,0.0247,0.261257,0.953959,"{'0': {'precision': 0.9151515151515152, 'recall': 0.9320987654320988, 'f1-score': 0.9235474006116208, 'support': 162}, '1': {'precision': 0.9708994708994709, 'recall': 0.963254593175853, 'f1-score': 0.9670619235836627, 'support': 381}, 'accuracy': 0.9539594843462247, 'macro avg': {'precision': 0.9430254930254931, 'recall': 0.9476766793039759, 'f1-score': 0.9453046620976417, 'support': 543}, 'weighted avg': {'precision': 0.9542674841017382, 'recall': 0.9539594843462247, 'f1-score': 0.9540796902107884, 'support': 543}}"


Evaluating model...



--- RoBERTa Results (Class 1 Removed) ---
Accuracy: 0.9539594843462247
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       162
           1       0.97      0.96      0.97       381

    accuracy                           0.95       543
   macro avg       0.94      0.95      0.95       543
weighted avg       0.95      0.95      0.95       543

