In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Request the emotion dataset through kagglehub and keep whatever
# dataset_download returns under the generated variable name.
KAGGLE_DATASET_HANDLE = 'tejaswiniramoju/datasetlargeone'
tejaswiniramoju_datasetlargeone_path = kagglehub.dataset_download(KAGGLE_DATASET_HANDLE)

# Confirmation printed once the download call has returned.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers pandas torch
!pip install datasets

In [None]:
import pandas as pd

# Location of the emotion CSV inside the Kaggle input directory.
EMOTION_CSV_PATH = "/kaggle/input/datasetlargeone/EMOTION_LARGE.csv"

# Read the CSV into a DataFrame and show its first rows as the cell output.
dataset = pd.read_csv(EMOTION_CSV_PATH)
dataset.head()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Emotion name -> integer class id.
# Fixed: class 3 was keyed as 'angry' here, while the other mappings in this
# notebook (label encoding and inference) use 'anger'; a lookup with the
# dataset's 'anger' label against this dict would have raised KeyError.
label_mapping = {
    'sadness': 0,
    'joy': 1,
    'love': 2,
    'anger': 3,
    'fear': 4,
    'surprise': 5
}

# First, split the dataset into training (70%) and temporary (30%) sets.
# random_state is fixed so the split is reproducible.
train_df, temp_df = train_test_split(dataset, test_size=0.3, random_state=42)  # 70% train, 30% temp

# Then split the temporary 30% into validation and test sets:
# about one third of it goes to test, the remaining two thirds to validation.
val_df, test_df = train_test_split(temp_df, test_size=0.3333, random_state=42)  # 1/3 for test, 2/3 for validation

# Resulting splits (approximate shares of the original dataset):
# - train_df: 70%
# - val_df:   20%
# - test_df:  10%

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Wrap each pandas split (train, validation, test) in a Hugging Face Dataset.
hf_train_dataset, hf_val_dataset, hf_test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, val_df, test_df)
)

# Emotion name -> integer class id, numbered in the order listed.
label_mapping = {
    emotion: class_id
    for class_id, emotion in enumerate(
        ('sadness', 'joy', 'love', 'anger', 'fear', 'surprise')
    )
}


def _encode_label(example):
    """Return the integer ``label`` for one row, looked up from its string ``labels`` field."""
    return {'label': label_mapping[example['labels']]}


# Add the integer `label` column to every split.
hf_train_dataset = hf_train_dataset.map(_encode_label)
hf_val_dataset = hf_val_dataset.map(_encode_label)
hf_test_dataset = hf_test_dataset.map(_encode_label)

# Tokenizer for the cased DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch, truncating and padding with ``max_length``."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True, padding="max_length")

# Apply tokenize_function to each split in batched mode.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (hf_train_dataset, hf_val_dataset, hf_test_dataset)
)

In [None]:
# Shuffle every tokenized split with the same fixed seed.
# Note: the shuffled *test* split is stored under the name `eval_dataset`.
SHUFFLE_SEED = 42
train_dataset = tokenized_train_dataset.shuffle(seed=SHUFFLE_SEED)
val_dataset = tokenized_val_dataset.shuffle(seed=SHUFFLE_SEED)
eval_dataset = tokenized_test_dataset.shuffle(seed=SHUFFLE_SEED)

import pandas as pd

# Materialise the shuffled splits as pandas DataFrames.
# `train_df` and `val_df` are rebound here to the tokenized, label-encoded versions.
train_df, val_df, eval_df = (
    pd.DataFrame(shuffled_split)
    for shuffled_split in (train_dataset, val_dataset, eval_dataset)
)

# Write each DataFrame to a CSV file in the working directory, without the index column.
for frame, csv_name in (
    (train_df, 'train_dataset.csv'),
    (val_df, 'val_dataset.csv'),
    (eval_df, 'eval_dataset.csv'),
):
    frame.to_csv(csv_name, index=False)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

# Sequence-classification model from the cased DistilBERT checkpoint,
# with one output label per entry in label_mapping.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=len(label_mapping),
)

In [None]:
!pip install evaluate



In [None]:

from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
from transformers import Trainer, TrainingArguments

# Derive "balanced" class weights from the integer labels of the training frame.
labels = train_df['label'].values  # Adjust if 'label' column has a different name
observed_classes = np.unique(labels)
balanced_weights = compute_class_weight("balanced", classes=observed_classes, y=labels)

# Keep the weights as a float32 tensor on the GPU when one is available, else on the CPU.
weights_device = "cuda" if torch.cuda.is_available() else "cpu"
class_weights = torch.tensor(balanced_weights, dtype=torch.float32).to(weights_device)

# Define a custom Trainer to include class weights in the loss function
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Mapping of batch inputs; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of only the loss.
            num_items_in_batch: Accepted because newer ``Trainer`` versions pass this
                keyword to ``compute_loss``; without it the override raises ``TypeError``.
                It is not used by the weighted loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")

        # Forward pass without the labels: the loss is computed below, so the model
        # does not need to compute its own (unweighted) loss as well. A filtered copy
        # is used so the caller's `inputs` mapping is left untouched.
        forward_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**forward_inputs)
        logits = outputs.get("logits")

        # Use weighted cross-entropy loss; move the weights to wherever the logits
        # live so the loss does not depend on the device chosen when the weights
        # were created.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

import inspect

# Set up training arguments.
# The per-epoch evaluation option is named `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. Pick whichever keyword
# the installed TrainingArguments accepts so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)

# Build the class-weighted trainer on the tokenized train and validation splits.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)
trainer = WeightedTrainer(**trainer_kwargs)

# Run fine-tuning; the returned value is shown as the cell output.
trainer.train()

In [None]:
# Run the trainer's evaluation (no dataset passed explicitly) and display the result.
validation_metrics = trainer.evaluate()
validation_metrics

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Run the trainer on `eval_dataset` (the shuffled test split)
test_results = trainer.predict(eval_dataset)
labels = test_results.label_ids                      # true class ids
preds = np.argmax(test_results.predictions, axis=1)  # highest-scoring class per row

# Step 2: Accuracy plus support-weighted precision / recall / F1
accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    labels,
    preds,
    average='weighted',
)

# Report every metric with four decimal places
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

# Step 3: Generate Confusion Matrix
# Pass the class ids explicitly so the matrix always has one row/column per class in
# label_mapping, even when a class does not occur in this split. Otherwise the matrix
# shrinks and the tick labels below no longer line up with its rows/columns.
class_names = sorted(label_mapping, key=label_mapping.get)  # names ordered by class id
class_ids = [label_mapping[name] for name in class_names]
conf_matrix = confusion_matrix(labels, preds, labels=class_ids)

# Plot Confusion Matrix (rows: true class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,  # plain lists instead of dict views
    yticklabels=class_names,
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Generate the classification report.
# Class ids and names are passed together, ordered by id. With only `target_names`,
# the report raises ValueError whenever a class is absent from both `labels` and
# `preds`, because the number of names then differs from the number of classes found.
report_class_names = sorted(label_mapping, key=label_mapping.get)
report_class_ids = [label_mapping[name] for name in report_class_names]
report = classification_report(
    labels,
    preds,
    labels=report_class_ids,
    target_names=report_class_names,
)

# Print the classification report
print("Classification Report:\n")
print(report)

In [None]:
import torch

# Relies on `model` (and `tokenizer`, used below) having been created earlier.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Place the model on that device and switch it to evaluation mode.
model.to(device)
model.eval()

# Single-text inference helper
def predict_sentiment(text):
    """Return the emotion name the model predicts for ``text``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The input is
    tokenized with truncation, run through the model without gradient tracking,
    and the index of the largest logit is translated to an emotion name.
    """
    # Class index -> emotion name (adjust to match the training label mapping)
    id_to_emotion = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

    # Tokenize and place every input tensor on the same device as the model
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only, so no gradients are needed
    with torch.no_grad():
        logits = model(**model_inputs).logits

    predicted_id = logits.argmax(dim=-1).item()
    return id_to_emotion[predicted_id]

# Smoke-test sentences, one per emotion class (intended class in the trailing comment)
test_sentences = [
    "I am so sad and alone.",             # sadness
    "This is the best day ever!",         # joy
    "I love this place!",                 # love
    "I am angry at the situation.",       # anger
    "I fear that we will lose.",          # fear
    "What a surprising turn of events!",  # surprise
]

# `map` is lazy, so each sentence is classified right before its line is printed.
for text, predicted_sentiment in zip(test_sentences, map(predict_sentiment, test_sentences)):
    print(f"Input: '{text}' -> Predicted Sentiment: {predicted_sentiment}")