In [4]:
pip install transformers datasets torch scikit-learn

Collecting transformers
  Using cached transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Using cached huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.5.2-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-19.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.3-py3-none-any.whl.met

ERROR: Could not install packages due to an OSError: [WinError 2] The system cannot find the file specified: 'c:\\Python311\\Scripts\\tqdm.exe' -> 'c:\\Python311\\Scripts\\tqdm.exe.deleteme'



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import pandas as pd
from transformers import BertTokenizer



# Load the dataset. The path is defined once and actually used for the read
# (previously `file_path` pointed at an unused absolute location while
# read_csv hard-coded a different, relative path).
file_path = "textClassification.csv"
df = pd.read_csv(file_path)

# Drop rows that contain NaN values
df = df.dropna()

# Encode emotion labels into integer ids
label_encoder = LabelEncoder()
df["emotion_label"] = label_encoder.fit_transform(df["emotion"])

# Split into train and test sets (80% train, 20% test); fixed seed for reproducibility
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["emotion_label"], test_size=0.2, random_state=42
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text data: truncate to at most 128 tokens and pad each split to a common length
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

# Convert labels to tensors
train_labels = torch.tensor(train_labels.values)
test_labels = torch.tensor(test_labels.values)

# Check label mapping (integer id -> original emotion name); displayed as the cell output
label_mapping = dict(enumerate(label_encoder.classes_))
label_mapping


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset

# Load dataset
file_path = "textClassification.csv"
df = pd.read_csv(file_path)

# Drop rows that contain NaN values, matching the earlier preparation cell;
# missing text/emotion entries would otherwise reach the label encoder and tokenizer.
df = df.dropna()

# Encode emotion labels into integer ids
label_encoder = LabelEncoder()
df["emotion_label"] = label_encoder.fit_transform(df["emotion"])

# Split into train and test sets (80% / 20%); fixed seed for reproducibility
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["emotion_label"], test_size=0.2, random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text data: truncate to at most 128 tokens and pad each split to a common length
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

# Convert labels to tensors
train_labels = torch.tensor(train_labels.values)
test_labels = torch.tensor(test_labels.values)

# Create PyTorch dataset
class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to an indexable collection of
            per-example values (e.g. the output of a tokenizer call).
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of feature tensors plus a ``"labels"`` entry."""
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            # Each feature row is converted to a fresh tensor on access.
            sample[feature_name] = torch.tensor(feature_values[idx])
        # The label is passed through exactly as stored.
        sample["labels"] = self.labels[idx]
        return sample

# Wrap the tokenized splits in Dataset objects the Trainer can index
train_dataset = EmotionDataset(train_encodings, train_labels)
test_dataset = EmotionDataset(test_encodings, test_labels)

# Load pre-trained BERT with a classification head sized to the number of emotion classes
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./emotion_model",
    # `evaluation_strategy` is the deprecated spelling; recent transformers
    # releases (the install log above resolves 4.48.0) name it `eval_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Initialize Trainer; the held-out split is evaluated at the end of every epoch
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()


ModuleNotFoundError: No module named 'transformers'

In [None]:
import joblib

# Save the trained model.
# NOTE: joblib pickles the whole module object, so the file is tied to the
# library versions used here and must only be loaded from a trusted source.
# The pickle is kept because the inference cell below reads it.
joblib.dump(model, "emotion_classifier.pkl")

# Also write a native Hugging Face checkpoint (weights + config, plus the
# tokenizer files) that can be reloaded with `from_pretrained` without pickle.
model.save_pretrained("./emotion_model/final")
tokenizer.save_pretrained("./emotion_model/final")

# Save the label encoder for later use
joblib.dump(label_encoder, "label_encoder.pkl")

print("Model saved as emotion_classifier.pkl")


In [None]:
# Load the model and the label encoder.
# SECURITY: joblib.load unpickles arbitrary objects and can execute code while
# doing so -- only load files that were produced by this notebook.
model = joblib.load("emotion_classifier.pkl")
label_encoder = joblib.load("label_encoder.pkl")

# A pickled module keeps the train/eval flag it had when it was saved; switch to
# eval mode so dropout is disabled and the prediction is deterministic.
model.eval()

# Tokenize a new sentence
text = "Thank you for failing me."
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

# Put the inputs on the same device as the model's weights to avoid a device mismatch
device = next(model.parameters()).device
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Get prediction
with torch.no_grad():
    outputs = model(**inputs)
    # Take the argmax over the class dimension rather than over the flattened logits
    predicted_label = torch.argmax(outputs.logits, dim=-1).item()

# Convert back to emotion label
predicted_emotion = label_encoder.inverse_transform([predicted_label])[0]

print(f"Predicted Emotion: {predicted_emotion}")
