<a href="https://colab.research.google.com/github/Sudesh8/Sentiment-Analysis/blob/main/CopySentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
Emotion Classification and Chatbot Training
This notebook trains an emotion classification model using BERT and then creates a chatbot that responds based on emotions.

```



# Install necessary libraries

In [1]:
!pip install pandas torch transformers scikit-learn nltk




# Import libraries

In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize




# Download necessary nltk datasets for text processing

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Load the dataset from a CSV file
# Note: `on_bad_lines='skip'` skips any problematic lines in the dataset
data = pd.read_csv('/content/emotion_data.csv', on_bad_lines='skip')


In [5]:
# Show the dataset
data


Unnamed: 0,statement,status,statement_len,statement_clean,status_encoded,tokens
0,oh my gosh,Anxiety,3,oh gosh,0,"['oh', 'gosh']"
1,"trouble sleeping, confused mind, restless hear...",Anxiety,10,troubl sleep confus mind restless heart tune,0,"['troubl', 'sleep', 'confus', 'mind', 'restles..."
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,14,wrong back dear forward doubt stay restless re...,0,"['wrong', 'back', 'dear', 'forward', 'doubt', ..."
3,I've shifted my focus to something else but I'...,Anxiety,11,ive shift focus someth els im still worri,0,"['ive', 'shift', 'focus', 'someth', 'els', 'im..."
4,"I'm restless and restless, it's been a month n...",Anxiety,14,im restless restless month boy mean,0,"['im', 'restless', 'restless', 'month', 'boy',..."
...,...,...,...,...,...,...
52676,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety,322,nobodi take serious i'v dealt depressionanxiet...,0,"['nobodi', 'take', 'serious', 'i', ""'"", 'v', '..."
52677,"selfishness ""I don't feel very good, it's lik...",Anxiety,199,selfish dont feel good like dont belong world ...,0,"['selfish', 'dont', 'feel', 'good', 'like', 'd..."
52678,Is there any way to sleep better? I can't slee...,Anxiety,17,way sleep better cant sleep night med didnt help,0,"['way', 'sleep', 'better', 'cant', 'sleep', 'n..."
52679,"Public speaking tips? Hi, all. I have to give ...",Anxiety,74,public speak tip hi give present work next wee...,0,"['public', 'speak', 'tip', 'hi', 'give', 'pres..."


In [6]:
# Basic preprocessing: remove NaN, punctuation, and stopwords
stop_words = set(stopwords.words('english'))

In [7]:
def preprocess_text(statement):
    if pd.isna(statement): # If the text is NaN, return an empty string
        return ""
    text = statement  # Assign the input 'statement' to the variable 'text'
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove punctuation and non-alphanumeric characters
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(filtered_tokens)

In [8]:
# Apply preprocessing to the text column
data['cleaned_text'] = data['statement'].apply(preprocess_text)


In [9]:
data.sample(2) # Show two random rows of the cleaned data

Unnamed: 0,statement,status,statement_len,statement_clean,status_encoded,tokens,cleaned_text
26061,"Past few months, I have been feeling like real...",Depression,158,past month feel like realli shit know good job...,2,"['past', 'month', 'feel', 'like', 'realli', 's...",past months feeling like really shit know good...
34582,YOU ARE NOT ALONE... Don’t Google! We can conq...,Anxiety,118,alon don't googl conquer hi struggl health anx...,0,"['alon', 'do', ""n't"", 'googl', 'conquer', 'hi'...",alone dont google conquer hi struggling health...


In [10]:
# Drop unnecessary columns from the dataset
data = data.drop(['statement_len', 'status_encoded','tokens'], axis=1)

In [11]:
data.head(2) # Show two random rows of the cleaned data

Unnamed: 0,statement,status,statement_clean,cleaned_text
0,oh my gosh,Anxiety,oh gosh,oh gosh
1,"trouble sleeping, confused mind, restless hear...",Anxiety,troubl sleep confus mind restless heart tune,trouble sleeping confused mind restless heart ...


In [12]:
# Rename the 'status' column to 'emotion' for clarity
data.rename(columns={'status': 'emotion'}, inplace=True)


In [13]:
# Split the data into training and validation sets (80% train, 20% validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['cleaned_text'], data['emotion'], test_size=0.2, stratify=data['emotion']
)

In [14]:
# Print the size of the train and validation sets
print(train_texts.shape)
print(val_texts)

(42144,)
41235    uncertainty loss isolation covid 9 period cont...
29286    shame spiral validation asking feedback peers ...
31040                             need save enough payment
15465    think beautiful thing ever laid eyes beautiful...
29459    live alone make sure dog gets exercise needs k...
                               ...                        
936                            need cv fastzonauang zonaba
21611    done life get degree shooting take anymore lif...
49741    struggling keep friendships social received bi...
7159     could write essay tip icebergalso probably wan...
14691    someone asks want life respond qualitative qua...
Name: cleaned_text, Length: 10537, dtype: object


In [15]:
# Load a pre-trained tokenizer for BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
# Tokenize the training and validation text data

In [16]:
# Tokenize the data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)

In [17]:
# Convert labels to numeric values based on the unique emotions in the dataset
label_map = {label: i for i, label in enumerate(data['emotion'].unique())}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

In [18]:
# Define a custom Dataset class to handle tokenized inputs and labels

class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings # Tokenized inputs
        self.labels = labels # Emotion labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]) # Add label as tensor
        return item

    def __len__(self):
        return len(self.labels)

In [19]:

# Create train and validation datasets using the custom dataset class
train_dataset = EmotionDataset(train_encodings, train_labels)
val_dataset = EmotionDataset(val_encodings, val_labels)

In [19]:
# Load the pre-trained BERT model for sequence classification

In [20]:
# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_map))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Set up training arguments (learning rate, batch size, etc.)
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for model checkpoints
    evaluation_strategy="epoch",  # Evaluate after each epoch
    report_to="none",  # Disable logging integrations
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=1,  # Number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
)




In [22]:

# Define Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [23]:


# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.5151,0.527096


TrainOutput(global_step=5268, training_loss=0.6412127877874816, metrics={'train_runtime': 4607.3806, 'train_samples_per_second': 9.147, 'train_steps_per_second': 1.143, 'total_flos': 1.10890501152768e+16, 'train_loss': 0.6412127877874816, 'epoch': 1.0})

In [24]:
# # Specify the path where you want to save the model
# SAVE_PATH = "./trained_model"

# # Save the trained model and tokenizer
# model.save_pretrained(SAVE_PATH)
# tokenizer.save_pretrained(SAVE_PATH)
# print(f"Model and tokenizer saved to {SAVE_PATH}")


In [25]:
# import shutil

# shutil.make_archive("trained_model", 'zip', SAVE_PATH)
# print("Model directory zipped as trained_model.zip")


In [26]:
# from google.colab import files

# files.download("trained_model.zip")


In [27]:
# Evaluate model on validation set
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)

In [28]:

# Map numeric predictions back to labels
reverse_label_map = {v: k for k, v in label_map.items()}
val_labels = [reverse_label_map[label] for label in val_labels]
preds = [reverse_label_map[pred] for pred in preds]

In [29]:

# Classification report
print(classification_report(val_labels, preds))


                      precision    recall  f1-score   support

             Anxiety       0.84      0.79      0.82       768
             Bipolar       0.82      0.77      0.79       556
          Depression       0.76      0.74      0.75      3081
              Normal       0.93      0.94      0.93      3269
Personality disorder       0.72      0.60      0.65       215
              Stress       0.65      0.65      0.65       517
            Suicidal       0.69      0.74      0.71      2131

            accuracy                           0.80     10537
           macro avg       0.77      0.75      0.76     10537
        weighted avg       0.80      0.80      0.80     10537



In [None]:
# # Create a pipeline for sentiment analysis with the trained model
# emotion_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [30]:
# Create a pipeline for sentiment analysis with the trained model
emotion_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Function to generate responses based on the predicted emotion
def generate_response(text, emotion_pipeline):
    emotion = emotion_pipeline(text)[0]['label']
    responses = {
        'sadness': "I'm here for you. Let me know if you'd like to talk about it.",
        'joy': "That's wonderful to hear! Keep up the positivity.",
        'anxiety': "Take a deep breath. Everything will be okay.",
        'stress': "Remember to take breaks and prioritize self-care."
    }
    return responses.get(emotion, "I'm here to help with anything you're feeling.")




Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Chatbot: I'm here to help with anything you're feeling.


In [33]:
# Test response generation
test_text = "I feel so overwhelmed with everything happening."
response = generate_response(test_text, emotion_pipeline)
print(f"Chatbot: {response}")

Chatbot: I'm here to help with anything you're feeling.


In [34]:
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [36]:
from google.colab import files
import shutil

# Create a zip file of the saved model directory
shutil.make_archive('/content/saved_model', 'zip', '/content', './trained_model') # Changed the last argument

# Download the zip file
files.download('/content/saved_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
label_map # Show the label map (for reference)

{'Anxiety': 0,
 'Normal': 1,
 'Depression': 2,
 'Suicidal': 3,
 'Stress': 4,
 'Bipolar': 5,
 'Personality disorder': 6}