In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("RomanicBanglaSentiment.csv")
df.info()

X = df['text']
y = df['sentiment']

# Encode the labels (2 labels for binary classification)
le = LabelEncoder()
y_en = le.fit_transform(y)

# Prepare dataset for training
data = Dataset.from_pandas(pd.DataFrame({'text': X, 'label': y_en}))
data = data.train_test_split(test_size=0.2, seed=42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'FacebookAI/xlm-roberta-base'

# Determine number of labels
num_labels = len(le.classes_)  # Should be 2 for binary classification

# Load configuration with the correct number of labels
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)

# Load tokenizer and model using the updated configuration
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True  # This ignores mismatches for the classifier head weights
)
model.to(device)
model.eval()

# Tokenizer function
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)

# Apply tokenization
data = data.map(tokenize_function, batched=True)
data = data.remove_columns(['text'])
data = data.rename_column('label', 'labels')
data.set_format('torch')

# DataLoader for training and evaluation
train_dataloader = DataLoader(data['train'], batch_size=8, shuffle=True)  # Reduced batch size to help with memory
eval_dataloader = DataLoader(data['test'], batch_size=8, shuffle=False)

# Training arguments (with fp16 enabled for reduced memory usage)
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='no',
    load_best_model_at_end=False,
    report_to=[],
    fp16=True  # Enable mixed precision training to reduce memory usage
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    tokenizer=tokenizer,
    compute_metrics=lambda eval_pred: {
        "accuracy": accuracy_score(eval_pred.label_ids, np.argmax(eval_pred.predictions, axis=-1)),
        "f1": f1_score(eval_pred.label_ids, np.argmax(eval_pred.predictions, axis=-1), average="binary")
    }
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Make predictions on the test set
predictions_output = trainer.predict(data['test'])
predictions = np.argmax(predictions_output.predictions, axis=1)
labels = predictions_output.label_ids

# Print classification report
from sklearn.metrics import classification_report
report = classification_report(labels, predictions, target_names=le.classes_)
print("Classification Report:\n", report)

# Plot confusion matrix
conf_matrix = confusion_matrix(labels, predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


  from .autonotebook import tqdm as notebook_tqdm


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  4999 non-null   int64  
 1   text        4999 non-null   object 
 2   sentiment   4998 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 117.3+ KB


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3999/3999 [00:00<00:00, 6917.31 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 8322.66 examples

Epoch,Training Loss,Validation Loss


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("RomanicBanglaSentiment.csv")
X = df['text']
y = df['sentiment']

# Encode the labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
num_labels = len(le.classes_)

# Prepare Hugging Face dataset
dataset = Dataset.from_pandas(pd.DataFrame({'text': X, 'label': y_encoded}))
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Set device and model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'FacebookAI/xlm-roberta-base'

config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True).to(device)

# Tokenization
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)

dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.remove_columns(['text'])
dataset = dataset.rename_column('label', 'labels')
dataset.set_format('torch')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='no',
    load_best_model_at_end=False,
    report_to=[],
    fp16=True
)

# Custom metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")  # changed from 'binary'
    }

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Training
trainer.train()

# Evaluation
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Predictions
pred_output = trainer.predict(dataset['test'])
preds = np.argmax(pred_output.predictions, axis=1)
true_labels = pred_output.label_ids

# Classification report
report = classification_report(true_labels, preds, target_names=le.classes_)
print("Classification Report:\n", report)

# Confusion Matrix
conf_matrix = confusion_matrix(true_labels, preds)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3999/3999 [00:00<00:00, 5471.46 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 5518.36 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5074,0.581532,0.737,0.742589
2,0.5066,0.504626,0.787,0.784746
3,0.4685,0.497547,0.818,0.799652
4,0.3277,0.553647,0.81,0.797022
5,0.2981,0.581533,0.818,0.804491


Evaluation results: {'eval_loss': 0.5815327167510986, 'eval_accuracy': 0.818, 'eval_f1': 0.8044912280701754, 'eval_runtime': 6.7456, 'eval_samples_per_second': 148.245, 'eval_steps_per_second': 9.339, 'epoch': 5.0}


ValueError: Number of classes, 3, does not match size of target_names, 5. Try specifying the labels parameter

In [2]:
trainer.save_model('RoBERTa')