<a href="https://colab.research.google.com/github/RajaniBoddupally/Assignment2-RM/blob/main/Code_RM_A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers -q
!pip install -U scikit-learn -q
!pip install wordcloud -q
# Import all required libraries for data processing, visualization, modeling, and evaluation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm
from wordcloud import WordCloud
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Seed the global NumPy and PyTorch random number generators so that everything
# drawing from them later in the notebook (weight initialisation, dropout,
# shuffled batching, random sample picks) is repeatable after Restart & Run All.
# The value is the same one passed as random_state to the train/test split.
# NOTE: some GPU kernels may still be non-deterministic even with a fixed seed.
SEED = 123456
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set the device to GPU if available, else fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


In [None]:
# Location of the raw emotion dataset (CSV hosted in the GitHub repository)
Emotefile = 'https://raw.githubusercontent.com/RajaniBoddupally/Assignment2-RM/refs/heads/main/Emotion_classify_Data.csv'

# Download the CSV and parse it into a DataFrame
Emotefile_Data = pd.read_csv(filepath_or_buffer=Emotefile)

# Preview the first 5 rows as plain text
print(Emotefile_Data.head(n=5))


In [None]:
# Convert all column names in the DataFrame to lowercase
Emotefile_Data.columns = [col.lower() for col in Emotefile_Data.columns]

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() echoed a stray "None" after the report.
print('\n Dataset Info:\n')
Emotefile_Data.info()

# Show descriptive statistics as computed by DataFrame.describe()
print('\n Descriptive Statistics:\n', Emotefile_Data.describe(), '\n\n')

# Check and print the number of duplicate rows in the dataset
print('\nNumber of Duplicated Rows:', Emotefile_Data.duplicated().sum(), '\n\n')


In [None]:
# Build the missing-value mask once and reuse it for both the summary and the row filter
Emo_null_mask = Emotefile_Data.isnull()

# Per-column count of missing values
print("Null values in each column:\n")
print(Emo_null_mask.sum())

# Rows that contain at least one missing value
Emo_rows_with_null = Emo_null_mask.any(axis=1)

# Show the affected rows when there are any, otherwise confirm the data is complete
if Emo_rows_with_null.any():
    print("\nRows with null values:\n")
    display(Emotefile_Data[Emo_rows_with_null])
else:
    print("\n✅ No missing values found in the dataset.")


In [None]:
# Bar chart of how many comments carry each emotion label.
# hue is set to the x variable (with the legend hidden) so the palette can be
# applied without triggering seaborn's future warning.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=Emotefile_Data,
    x='emotion',
    hue='emotion',
    palette='Set2',
    legend=False,
    ax=ax,
)
ax.set_title('Distribution of Emotions')
ax.set_xlabel('Emotion')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Count the comments per emotion, then split the result into labels and sizes
emotion_counts = Emotefile_Data['emotion'].value_counts()
emotion_labels, emotion_sizes = emotion_counts.index, emotion_counts.values

# Pie chart of the share of each emotion in the dataset
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    emotion_sizes,
    labels=emotion_labels,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.Set3.colors,
)

# Title the chart and force an equal aspect ratio so the pie is drawn as a circle
ax.set_title('Percentage Distribution of Emotions')
ax.axis('equal')
plt.show()


In [None]:
# Re-type the 'emotion' column as categorical and derive an integer code per row
Emo_emotion_cat = Emotefile_Data['emotion'].astype('category')
Emotefile_Data['emotion'] = Emo_emotion_cat
Emotefile_Data['label'] = Emo_emotion_cat.cat.codes

# Category names, positioned so that list index == numeric code
Emo_Lable_nms = Emo_emotion_cat.cat.categories.tolist()

# Show which numeric label stands for which emotion
Emo_label_mapping = {code: name for code, name in enumerate(Emo_Lable_nms)}
print("Label mapping:", Emo_label_mapping)


In [None]:
# Add a new column that stores the character length of each comment.
# The vectorised .str accessor replaces .apply(len): it yields NaN for a missing
# comment instead of raising TypeError, and avoids a Python-level call per row.
Emotefile_Data['EM_Comment_Len'] = Emotefile_Data['comment'].str.len()

# Display summary statistics of the comment lengths
print("\nText Length Stats:\n", Emotefile_Data['EM_Comment_Len'].describe())

# Plot a histogram (with a KDE overlay) showing the distribution of comment lengths
plt.figure(figsize=(8, 5))
sns.histplot(Emotefile_Data['EM_Comment_Len'], bins=30, kde=True)
plt.title("Distribution of Comment Lengths")
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Create a boxplot to compare comment lengths across different emotion categories.
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so hue is set to
# the x variable and the redundant legend is hidden, the same pattern the emotion
# count plot in this notebook already uses.
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=Emotefile_Data,
    x='emotion',
    y='EM_Comment_Len',
    hue='emotion',
    palette='Pastel2',
    legend=False,
)
plt.title("Comment Length by Emotion")
plt.xlabel("Emotion")
plt.ylabel("Text Length")
plt.show()


In [None]:
# One word cloud per emotion, laid out side by side in a single row
emotions = Emotefile_Data['emotion'].unique()
Emo_cloud_panels = len(emotions)
plt.figure(figsize=(15, 8))

for Emo_panel_no, emotion in enumerate(emotions, start=1):
    # Join every comment tagged with this emotion into one long string
    Emo_is_current = Emotefile_Data['emotion'] == emotion
    text = " ".join(Emotefile_Data.loc[Emo_is_current, 'comment'])

    # Build the word cloud image from that string
    Emo_cloud_builder = WordCloud(
        width=600,
        height=400,
        background_color='white',
        colormap='viridis',
    )
    wordcloud = Emo_cloud_builder.generate(text)

    # Draw it in this emotion's panel, without axes, titled with the emotion name
    plt.subplot(1, Emo_cloud_panels, Emo_panel_no)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{emotion.capitalize()}")

# Tidy the spacing between panels and render the figure
plt.tight_layout()
plt.show()


In [None]:
# Fetch the NLTK corpora used by the preprocessing step
for Emo_nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(Emo_nltk_resource)

# English stopword set and WordNet lemmatizer shared by the cleaning function
Emo_Stop_Words = set(stopwords.words('english'))
lem_emo_data = WordNetLemmatizer()

# Define a function to clean and preprocess emotion comment text
def Emote_Change_PP(Emo_Comment):
    """
    Clean and preprocess a given emotion comment.

    This function performs the following steps:
    1. Converts text to lowercase
    2. Removes every character that is not a lowercase ASCII letter or
       whitespace (punctuation, digits, accented letters, ...)
    3. Splits the text into tokens (words) on whitespace
    4. Removes English stopwords
    5. Applies WordNet lemmatization to each remaining word

    Parameters:
    Emo_Comment (str): The raw comment text to be cleaned. Any non-string
        value (e.g. None, or NaN from a missing cell) is treated as empty text.

    Returns:
    str: The cleaned and lemmatized text, tokens joined by single spaces
         (an empty string when nothing survives the cleaning)
    """
    # A missing cell arrives as None/NaN, which has no .lower(); treat it as
    # empty text instead of aborting the whole column-wide apply.
    if not isinstance(Emo_Comment, str):
        return ''

    Emo_lower = Emo_Comment.lower()
    # Characters are deleted, not replaced by a space, so "don't" becomes "dont"
    Emo_Clean = re.sub(r'[^a-z\s]', '', Emo_lower)
    Emo_Token = Emo_Clean.split()
    Emo_lem_token = [lem_emo_data.lemmatize(word) for word in Emo_Token if word not in Emo_Stop_Words]
    return ' '.join(Emo_lem_token)


# Run every raw comment through the cleaning function and keep the result in a new column
Emotefile_Data['Emote_cleaned'] = Emotefile_Data['comment'].map(Emote_Change_PP)
# Show raw and cleaned text side by side for the first few rows
Emotefile_Data.loc[:, ['comment', 'Emote_cleaned']].head(n=5)


In [None]:
# Preview the first 5 rows of the dataset, including the columns added so far
Emotefile_Data.head(n=5)


In [None]:
# Stratified 80/20 split of the cleaned comments and their numeric labels
Emo_split_parts = train_test_split(
    Emotefile_Data['Emote_cleaned'],
    Emotefile_Data['label'],
    test_size=0.2,
    random_state=123456,
    stratify=Emotefile_Data['label'],
)
Emo_Comment_Train, Emo_Comment_Test, Emo_Lbl_Train, Emo_Lbl_Test = Emo_split_parts

# Re-type both label Series as categorical
Emo_Lbl_Train, Emo_Lbl_Test = (
    Emo_lbl_part.astype('category') for Emo_lbl_part in (Emo_Lbl_Train, Emo_Lbl_Test)
)


In [None]:
# Load the pre-trained BERT tokenizer
Emo_tokenize = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize and encode the training and testing comment text
# (each set is padded to its own longest sequence and truncated at the model limit)
Emo_encode_Train = Emo_tokenize(Emo_Comment_Train.tolist(), truncation=True, padding=True, return_tensors='pt')
Emo_encode_Test = Emo_tokenize(Emo_Comment_Test.tolist(), truncation=True, padding=True, return_tensors='pt')

# Convert training and testing labels to PyTorch tensors.
# The label VALUES are used, not `.cat.codes`: codes are re-enumerated from the
# categories present in each split, so a split missing a class would get shifted
# codes that no longer match Emo_Lable_nms or the other split. astype('int64')
# works for both a categorical and a plain integer Series and gives the int64
# dtype that CrossEntropyLoss expects for targets.
Emo_Trn_lbls = torch.tensor(Emo_Lbl_Train.astype('int64').to_numpy())
Emo_Test_lbls = torch.tensor(Emo_Lbl_Test.astype('int64').to_numpy())

# Create TensorDatasets for training and testing: (input_ids, attention_mask, label)
Emo_Train_DS = TensorDataset(Emo_encode_Train['input_ids'], Emo_encode_Train['attention_mask'], Emo_Trn_lbls)
Emo_Test_DS = TensorDataset(Emo_encode_Test['input_ids'], Emo_encode_Test['attention_mask'], Emo_Test_lbls)


In [None]:
# Pre-trained BERT with a classification head sized to the number of emotions,
# moved onto the selected device
Emo_Model_Bert = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(Emo_Lable_nms),
)
Emo_Model_Bert = Emo_Model_Bert.to(device)

# Cross-entropy loss for the multi-class target
Emo_Loss_Fn = nn.CrossEntropyLoss()

# AdamW optimiser over all model parameters
Emo_Optz = AdamW(params=Emo_Model_Bert.parameters(), lr=1e-5)


In [None]:
# Batch the tensor datasets; only the training batches are shuffled
Emo_Train_Ldr = DataLoader(Emo_Train_DS, batch_size=8, shuffle=True)
Emo_Test_Ldr = DataLoader(Emo_Test_DS, batch_size=8)

# Per-epoch history: training accuracy, test accuracy, mean training loss
Emo_Train_Acc, Emo_Test_Acc, Emo_epoch_loss = [], [], []

# One pass over the training set, then one pass over the test set, per epoch
for epoch in range(5):  # Adjust the number of epochs as needed
    # ---------------- training pass ----------------
    Emo_Model_Bert.train()
    Ep_total_loss, Emo_Lbl_crct, Emo_all_ttl = 0, 0, 0

    for batch in tqdm(Emo_Train_Ldr, desc=f"Epoch {epoch+1} - Training"):
        # Each batch is (input_ids, attention_mask, labels); move all to the device
        Emo_Ip_Ids, Emo_At_Masks, Emo_ep_lbls = (b.to(device) for b in batch)
        Emo_ep_lbls = Emo_ep_lbls.long()

        # Clear old gradients, run the model, and score its logits against the labels
        Emo_Optz.zero_grad()
        Emo_ops = Emo_Model_Bert(input_ids=Emo_Ip_Ids, attention_mask=Emo_At_Masks)
        Emo_ep_loss = Emo_Loss_Fn(Emo_ops.logits, Emo_ep_lbls)

        # Back-propagate and update the weights
        Emo_ep_loss.backward()
        Emo_Optz.step()

        # Running totals: summed batch loss, correct predictions, samples seen
        Ep_total_loss += Emo_ep_loss.item()
        Emo_bert_pred = Emo_ops.logits.argmax(dim=1)
        Emo_Lbl_crct += Emo_bert_pred.eq(Emo_ep_lbls).sum().item()
        Emo_all_ttl += len(Emo_ep_lbls)

    # Record this epoch's training accuracy and mean batch loss
    Eval_Trn_acc = Emo_Lbl_crct / Emo_all_ttl
    Emo_Train_Acc.append(Eval_Trn_acc)
    Emo_epoch_loss.append(Ep_total_loss / len(Emo_Train_Ldr))

    # ---------------- evaluation pass ----------------
    Emo_Model_Bert.eval()
    Emo_Test_Crct, Emo_Eval_ttl = 0, 0

    # Gradients are not needed while scoring the test set
    with torch.no_grad():
        for batch in Emo_Test_Ldr:
            Eval_ips, Eval_att_masks, Eval_lbls = (b.to(device) for b in batch)
            Eval_lbls = Eval_lbls.long()
            totch_op = Emo_Model_Bert(input_ids=Eval_ips, attention_mask=Eval_att_masks)
            Eval_preds = totch_op.logits.argmax(dim=1)
            Emo_Test_Crct += Eval_preds.eq(Eval_lbls).sum().item()
            Emo_Eval_ttl += len(Eval_lbls)

    # Record this epoch's test accuracy
    Eval_Test_acc = Emo_Test_Crct / Emo_Eval_ttl
    Emo_Test_Acc.append(Eval_Test_acc)

    # Report training accuracy, test accuracy and mean training loss for the epoch
    print(f"Epoch {epoch+1}: Training Accuracy = {Eval_Trn_acc:.4f} \n , Testing Accuracy = {Eval_Test_acc:.4f}, Loss = {Ep_total_loss / len(Emo_Train_Ldr):.4f}")


In [None]:
# Line chart of training vs testing accuracy, one point per epoch
Num_epochs = list(range(1, len(Emo_Train_Acc) + 1))
fig, ax = plt.subplots(figsize=(10, 6))

# Draw both accuracy histories against the 1-based epoch numbers
for Emo_acc_history, Emo_acc_label in (
    (Emo_Train_Acc, 'Training Accuracy'),
    (Emo_Test_Acc, 'Testing Accuracy'),
):
    ax.plot(Num_epochs, Emo_acc_history, marker='o', label=Emo_acc_label)

ax.set_title("Training vs Testing Accuracy Over Epochs")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_xticks(Num_epochs)  # one tick per epoch
ax.legend()
plt.show()


In [None]:
# Switch the model to evaluation mode
Emo_Model_Bert.eval()

# Unshuffled loader over the test dataset
Emo_Test_Ldr = DataLoader(Emo_Test_DS, batch_size=8)

# Collected predictions and the matching true labels, in loader order
Emo_Berts_Preds = []
Emo_org_lbls = []

# Run the model over every test batch with gradient tracking disabled
with torch.no_grad():
    for batch in Emo_Test_Ldr:
        Emo_test_ips, Emo_test_Att, EMo_test_lbls = (b.to(device) for b in batch)
        Emo_test_op = Emo_Model_Bert(input_ids=Emo_test_ips, attention_mask=Emo_test_Att)
        Emo_test_pred = Emo_test_op.logits.argmax(dim=1)
        # Bring results back to the CPU as NumPy values before storing them
        Emo_Berts_Preds.extend(Emo_test_pred.cpu().numpy())
        Emo_org_lbls.extend(EMo_test_lbls.cpu().numpy())

# Overall accuracy plus the per-class precision/recall/F1 report
EVal_Test_acc_all = accuracy_score(Emo_org_lbls, Emo_Berts_Preds)
Emo_cls_report = classification_report(Emo_org_lbls, Emo_Berts_Preds, target_names=Emo_Lable_nms)
print(f"Test Accuracy on Unseen Emotion Data: {EVal_Test_acc_all:.2f}")
print("\nClassification Report on Unseen Emotion Data:\n", Emo_cls_report)


In [None]:
# Confusion matrix of true vs predicted test labels, drawn with emotion names on the axes
Emo_Eval_Confmat = confusion_matrix(y_true=Emo_org_lbls, y_pred=Emo_Berts_Preds)
EMo_Cm_Disp = ConfusionMatrixDisplay(
    confusion_matrix=Emo_Eval_Confmat,
    display_labels=Emo_Lable_nms,
)
EMo_Cm_Disp.plot(cmap='Blues')
plt.title("Confusion Matrix for Emotion Classification")
plt.show()

In [None]:
# Pick 5 distinct random positions in the test set and compare prediction with truth
Emo_Test_Samples = np.random.choice(len(Emo_Comment_Test), 5, replace=False)

for Ets in Emo_Test_Samples:
    # Cleaned comment text and the name of its true emotion
    Test_Sample_Emo = Emo_Comment_Test.iloc[Ets]
    Emo_orgl_Label = Emo_Lable_nms[Emo_Lbl_Test.iloc[Ets]]

    # Encode the single comment, move it to the device, and run it through the model
    Ips_token = Emo_tokenize(Test_Sample_Emo, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        Emo_Op_TOken = Emo_Model_Bert(**Ips_token)

    # Highest-scoring class index, translated to its emotion name
    Emo_pred_idx = Emo_Op_TOken.logits.argmax(dim=1).item()
    Emo_Pred_Samp_Label = Emo_Lable_nms[Emo_pred_idx]

    # Display text, actual label, and predicted label
    print("\n Sample Comment from Test Dataset:", Test_Sample_Emo)
    print("Actual Emotion   :", Emo_orgl_Label)
    print("Predicted Emotion:", Emo_Pred_Samp_Label)
