In [1]:
import os
os.makedirs('/content/eng', exist_ok=True)

In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Correct path to the newly uploaded file
file_path = '/content/eng.csv'

# Reload the dataset to avoid file path issues
data = pd.read_csv(file_path)

# Re-define the preprocessing function
def preprocess_text(text):
    # Remove special characters and numbers, convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.lower()  # Convert to lowercase
    return text

# Apply preprocessing to the text column
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Perform random train-test split without stratification due to class imbalance
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Save processed datasets
train_data_path = '/content/eng/processed_train_reloaded.csv'
test_data_path = '/content/eng/processed_test_reloaded.csv'

train_data.to_csv(train_data_path, index=False)
test_data.to_csv(test_data_path, index=False)

train_data_path, test_data_path

('/content/eng/processed_train_reloaded.csv',
 '/content/eng/processed_test_reloaded.csv')

In [3]:
import os
print(os.listdir('/content/eng'))

['processed_test_reloaded.csv', 'processed_train_reloaded.csv']


In [4]:
pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m553.0/897.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [5]:
# Import necessary libraries
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
import re

# Load MarianMT model for English to Spanish translation
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Function to translate text in batches
def translate_text(texts, tokenizer, model, batch_size=16):
    translations = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(**inputs)
        translations.extend([tokenizer.decode(t, skip_special_tokens=True) for t in outputs])
    return translations

# Load and preprocess the dataset
file_path = '/content/eng.csv'  # Ensure this is the correct path
train_data = pd.read_csv(file_path)

# Preprocess text if not already done
train_data['cleaned_text'] = train_data['text'].str.replace(r'[^\w\s]', '', regex=True).str.lower()

# Translate a subset of the 'cleaned_text' column
train_texts = train_data['cleaned_text'].tolist()[:100]  # Limit to the first 100 entries for this example
train_translations = translate_text(train_texts, tokenizer, model)

# Create a subset of the dataset with translations
train_data_subset = train_data.iloc[:100].copy()
train_data_subset['translated_text'] = train_translations

# Save the translated training dataset
translated_train_path = '/content/eng/translated_train_subset.csv'
train_data_subset.to_csv(translated_train_path, index=False)
print(f"Translated subset saved to {translated_train_path}")

# Translate the entire dataset (if needed, remove the [:100] limit)
# This step may take longer depending on the dataset size
# train_texts_full = train_data['cleaned_text'].tolist()
# train_translations_full = translate_text(train_texts_full, tokenizer, model)
# train_data['translated_text'] = train_translations_full
# translated_train_full_path = '/mnt/data/translated_train_full.csv'
# train_data.to_csv(translated_train_full_path, index=False)
# print(f"Translated full dataset saved to {translated_train_full_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translated subset saved to /content/eng/translated_train_subset.csv


In [6]:
import os

# Disable WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

# Load mBERT and tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Define Dataset Class
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(labels, dtype=torch.float),
        }

# Prepare the Data
emotion_columns = ["Anger", "Fear", "Joy", "Sadness", "Surprise"]
train_texts = train_data["cleaned_text"].tolist()
train_labels = train_data[emotion_columns].values.tolist()

test_texts = test_data["cleaned_text"].tolist()
test_labels = test_data[emotion_columns].values.tolist()

# Create Dataset Objects
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
test_dataset = EmotionDataset(test_texts, test_labels, tokenizer)

# Define Metrics
def compute_metrics(pred):
    logits, labels = pred
    predictions = (logits > 0).astype(int)
    precision = precision_score(labels, predictions, average="macro")
    recall = recall_score(labels, predictions, average="macro")
    f1 = f1_score(labels, predictions, average="macro")
    return {"precision": precision, "recall": recall, "f1": f1}

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-Tune the Model
trainer.train()

# Save the Model
model.save_pretrained("./fine_tuned_mbert")
tokenizer.save_pretrained("./fine_tuned_mbert")

# Evaluate the Model
metrics = trainer.evaluate()
print(metrics)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4952,0.466721,0.554156,0.406328,0.458779
2,0.4429,0.38691,0.813149,0.547741,0.587712
3,0.394,0.358461,0.819672,0.606682,0.655237


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3584614396095276, 'eval_precision': 0.8196724777319517, 'eval_recall': 0.6066816012466487, 'eval_f1': 0.6552369269331464, 'eval_runtime': 4.6301, 'eval_samples_per_second': 119.653, 'eval_steps_per_second': 7.559, 'epoch': 3.0}


In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import numpy as np

# Define a function to make predictions and calculate metrics
def evaluate_model(trainer, test_dataset, emotion_columns):
    # Make predictions
    predictions = trainer.predict(test_dataset)
    logits = predictions.predictions
    preds = (logits > 0).astype(int)  # Convert logits to binary predictions (multilabel format)
    true_labels = np.array([sample["labels"].numpy() for sample in test_dataset])  # Extract true labels

    # Calculate precision, recall, and F1 scores for each class
    precision = precision_score(true_labels, preds, average=None, zero_division=0)
    recall = recall_score(true_labels, preds, average=None, zero_division=0)
    f1 = f1_score(true_labels, preds, average=None, zero_division=0)


    # Macro-average F1 score
    macro_f1 = f1_score(true_labels, preds, average="macro")

    # Generate a detailed classification report
    report = classification_report(
        true_labels, preds, target_names=emotion_columns, digits=4
    )

    # Return metrics
    return {
        "precision_per_class": precision,
        "recall_per_class": recall,
        "f1_per_class": f1,
        "macro_f1": macro_f1,
        "classification_report": report,
    }

# Evaluate the model
metrics = evaluate_model(trainer, test_dataset, emotion_columns)

# Print metrics
print("Precision per class:", metrics["precision_per_class"])
print("Recall per class:", metrics["recall_per_class"])
print("F1 Score per class:", metrics["f1_per_class"])
print("Macro F1 Score:", metrics["macro_f1"])
print("Classification Report:\n", metrics["classification_report"])


Precision per class: [0.90909091 0.81976744 0.78181818 0.68604651 0.90163934]
Recall per class: [0.16393443 0.89808917 0.64179104 0.69005848 0.63953488]
F1 Score per class: [0.27777778 0.85714286 0.70491803 0.68804665 0.74829932]
Macro F1 Score: 0.6552369269331464
Classification Report:
               precision    recall  f1-score   support

       Anger     0.9091    0.1639    0.2778        61
        Fear     0.8198    0.8981    0.8571       314
         Joy     0.7818    0.6418    0.7049       134
     Sadness     0.6860    0.6901    0.6880       171
    Surprise     0.9016    0.6395    0.7483       172

   micro avg     0.7984    0.7113    0.7523       852
   macro avg     0.8197    0.6067    0.6552       852
weighted avg     0.8099    0.7113    0.7358       852
 samples avg     0.6879    0.6545    0.6477       852



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the model and tokenizer
model_path = "/content/fine_tuned_mbert"  # Replace with the correct path to your model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Define emotion labels (based on your training dataset)
labels = ["joy", "sadness", "fear", "anger", "surprise"]  # Adjust based on your model's configuration

# Function to predict the emotion of a custom input
def predict_emotion(sentence):
    # Tokenize input sentence
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128  # Adjust based on your model's max sequence length
    )

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted label
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    predicted_label = labels[predicted_class_id]

    return predicted_label

# Test with a custom sentence
custom_sentence = "Ich kann nicht glauben, wie nachlässig sie waren, alle Warnungen ignoriert und ein Chaos angerichtet haben, das jetzt alle anderen aufräumen müssen, was mich frustriert und absolut wütend über ihre Verantwortungslosigkeit macht"
predicted_emotion = predict_emotion(custom_sentence)
print(f"The predicted emotion is: {predicted_emotion}")

The predicted emotion is: sadness
