In [10]:
!pip install --upgrade transformers datasets scikit-learn



In [11]:
from google.colab import files
uploaded = files.upload()


Saving motivational_quotes_dataset.csv to motivational_quotes_dataset (3).csv


In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load CSV
df = pd.read_csv("motivational_quotes_dataset.csv")

# Rename columns for convenience
df = df.rename(columns={
    "Input Text (Emotion/Intent)": "text",
    "Output Text (Motivational Quote)": "quote"
})

# Encode quotes into numeric labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["quote"])


In [13]:
from datasets import Dataset
from transformers import BertTokenizer

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df[["text", "label"]])
dataset = dataset.train_test_split(test_size=0.3)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenization
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=64)

tokenized_dataset = dataset.map(tokenize)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [14]:
import os
import numpy as np
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score

# Disable Weights & Biases
os.environ["WANDB_DISABLED"] = "true"

# Load mBERT model with classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label_encoder.classes_)
)

# Accuracy metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# ✅ Fixed TrainingArguments (no evaluation_strategy)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    logging_dir="./logs"
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [15]:
trainer.train()


Step,Training Loss


TrainOutput(global_step=5, training_loss=1.6143516540527343, metrics={'train_runtime': 94.7621, 'train_samples_per_second': 0.211, 'train_steps_per_second': 0.053, 'total_flos': 657807168000.0, 'train_loss': 1.6143516540527343, 'epoch': 5.0})

In [18]:
import torch

def predict_quote(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]


user_input = input("💬 Enter how you're feeling: ")
result = predict_quote(user_input)
print(result)

💬 Enter how you're feeling: feeling excited 
Fall seven times, stand up eight. – Japanese Proverb
