In [48]:
# ========================================
# Cell 1: Install & Import Libraries
# ========================================

!pip install -q gradio
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import gradio as gr
import datetime
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# ================================
#  Load & Combine CSVs
# ================================

# Upload your files in Colab manually or mount Google Drive

# Example: load files (adjust filenames if needed)
df_train = pd.read_csv("/content/train.txt", sep=';', names=['text', 'emotion'])
df_val = pd.read_csv("/content/val.txt", sep=';', names=['text', 'emotion'])
df_test = pd.read_csv("/content/test.txt", sep=';', names=['text', 'emotion'])

print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)

print("Sample rows:")
print(df_train.head())

Train shape: (16000, 2)
Validation shape: (2000, 2)
Test shape: (2000, 2)
Sample rows:
                                                text  emotion
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger


In [37]:
# ========================================
# Combine, check unique emotions, find & remove common words
# ========================================

# Combine
df_all = pd.concat([df_train, df_val, df_test], ignore_index=True)
print("Combined shape:", df_all.shape)

Combined shape: (20000, 2)


In [38]:
#check missing values
df_all.isnull().sum()

Unnamed: 0,0
text,0
emotion,0


In [39]:
# Show unique emotions
unique_emotions = df_all['emotion'].unique()
print("Unique emotion labels:", unique_emotions)

Unique emotion labels: ['sadness' 'anger' 'love' 'surprise' 'fear' 'joy']


In [40]:
df_all['emotion'].value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
joy,6761
sadness,5797
anger,2709
fear,2373
love,1641
surprise,719


In [42]:
# Load normal stopwords
stop_words = set(stopwords.words('english'))

# Quick token frequency count to see top words
from collections import Counter

all_words = []
for text in df_all['text']:
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    all_words.extend(text.split())

word_freq = Counter(all_words)
print("Most common words:")
print(word_freq.most_common(20))

Most common words:
[('i', 32242), ('feel', 13938), ('and', 11996), ('to', 11208), ('the', 10462), ('a', 7748), ('feeling', 6431), ('that', 6314), ('of', 6182), ('my', 5326), ('in', 4239), ('it', 3922), ('like', 3616), ('so', 3127), ('im', 3055), ('for', 3021), ('me', 2899), ('was', 2828), ('have', 2803), ('but', 2790)]


In [43]:
# NLTK stopwords
stop_words = set(stopwords.words('english'))

#  custom stopwords to catch leftover generic words
custom_stopwords = {
    'feel', 'feeling', 'really', 'very', 'just', 'always',
    'today', 'now', 'im', 'ive', 'ill', 'cant', 'dont',
    'get', 'got', 'much', 'lot',
    'one', 'thing', 'know', 'like', 'people', 'time'
}

all_stopwords = stop_words.union(custom_stopwords)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    words = text.split()
    words = [w for w in words if w not in all_stopwords]
    return ' '.join(words)

df_all['clean_text'] = df_all['text'].apply(clean_text)

print("\nSample cleaned rows:")
print(df_all[['text', 'clean_text', 'emotion']].head())


Sample cleaned rows:
                                                text  \
0                            i didnt feel humiliated   
1  i can go from feeling so hopeless to so damned...   
2   im grabbing a minute to post i feel greedy wrong   
3  i am ever feeling nostalgic about the fireplac...   
4                               i am feeling grouchy   

                                          clean_text  emotion  
0                                   didnt humiliated  sadness  
1  go hopeless damned hopeful around someone care...  sadness  
2                  grabbing minute post greedy wrong    anger  
3            ever nostalgic fireplace still property     love  
4                                            grouchy    anger  


In [45]:
# ========================================
# split
# ========================================

from sklearn.model_selection import train_test_split

# Input and target
X = df_all['clean_text'].values
y = df_all['emotion'].values

# First split: 70% Train, 30% Temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Second split: Temp → Validation and Test (50% each → 15% each overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print("Training samples:", len(X_train))
print("Validation samples:", len(X_val))
print("Test samples:", len(X_test))


Training samples: 14000
Validation samples: 3000
Test samples: 3000


In [46]:
# ========================================
# Cell 5: TF-IDF Vectorization
# ========================================

# Create TF-IDF vectorizer with a good limit on features
tfidf = TfidfVectorizer(max_features=5000)

# Fit only on training text
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF vectorization complete.")
print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Validation TF-IDF shape:", X_val_tfidf.shape)
print("Test TF-IDF shape:", X_test_tfidf.shape)


TF-IDF vectorization complete.
Train TF-IDF shape: (14000, 5000)
Validation TF-IDF shape: (3000, 5000)
Test TF-IDF shape: (3000, 5000)


In [47]:
# ========================================
# Cell 6: Train Logistic Regression
# ========================================


# Initialize model
lr_model = LogisticRegression(max_iter=200)

# Train model
lr_model.fit(X_train_tfidf, y_train)

print("Logistic Regression training complete.")


Logistic Regression training complete.


In [49]:
# ========================================
# Cell 7: Evaluate Logistic Regression (with metrics)
# ========================================

# Predict on test data
y_pred_lr = lr_model.predict(X_test_tfidf)

# Accuracy
accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy:.4f}")

# Detailed report: precision, recall, F1 for each class and averages
report = classification_report(y_test, y_pred_lr, digits=4)
print("\nClassification Report:")
print(report)

# Get average precision, recall, F1 (macro)
precision = precision_score(y_test, y_pred_lr, average='macro')
recall = recall_score(y_test, y_pred_lr, average='macro')
f1 = f1_score(y_test, y_pred_lr, average='macro')

print(f"\nMacro Average Precision: {precision:.4f}")
print(f"Macro Average Recall:    {recall:.4f}")
print(f"Macro Average F1 Score:  {f1:.4f}")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_lr)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.8760

Classification Report:
              precision    recall  f1-score   support

       anger     0.9145    0.7887    0.8470       407
        fear     0.8903    0.7978    0.8415       356
         joy     0.8440    0.9655    0.9006      1014
        love     0.8883    0.6789    0.7696       246
     sadness     0.8949    0.9402    0.9169       869
    surprise     0.8696    0.5556    0.6780       108

    accuracy                         0.8760      3000
   macro avg     0.8836    0.7878    0.8256      3000
weighted avg     0.8783    0.8760    0.8723      3000


Macro Average Precision: 0.8836
Macro Average Recall:    0.7878
Macro Average F1 Score:  0.8256

Confusion Matrix:
[[321   9  38   0  39   0]
 [  9 284  25   2  30   6]
 [  3   4 979  15  12   1]
 [  3   1  66 167   8   1]
 [ 12   4  31   4 817   1]
 [  3  17  21   0   7  60]]


In [59]:
# ========================================
# Gradio Chatbot for Logistic Regression
# ========================================

import gradio as gr

def predict_emotion_lr(text):
    # Clean text same as training
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    words = text.split()
    words = [w for w in words if w not in all_stopwords]
    cleaned = ' '.join(words)

    # Transform with TF-IDF
    vector = tfidf.transform([cleaned])

    # Predict with Logistic Regression
    pred = lr_model.predict(vector)

    return f"Predicted Emotion: {pred[0]}"

# Build Gradio interface
iface = gr.Interface(
    fn=predict_emotion_lr,
    inputs=gr.Textbox(lines=2, placeholder="Type your text here..."),
    outputs="text",
    title="Emotion Predictor Chatbot (Logistic Regression)",
    description="Enter any text and this chatbot will predict the emotion category using Logistic Regression."
)

iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f7548f1b3c1c34b97e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [51]:
# Set maximum number of words & max sequence length
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 100

# Create and fit the tokenizer on training data only
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to the same length
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print("Tokenizer and padding complete.")
print("Example sequence:", X_train_seq[0])
print("Padded shape:", X_train_pad.shape)

Tokenizer and padding complete.
Example sequence: [45, 13, 1002]
Padded shape: (14000, 100)


In [52]:
# ========================================
# Cell 9: Encode Emotion Labels
# ========================================

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Encode labels to integers
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

y_train_enc = label_encoder.transform(y_train)
y_val_enc = label_encoder.transform(y_val)
y_test_enc = label_encoder.transform(y_test)

# Convert to categorical for Keras
y_train_cat = to_categorical(y_train_enc)
y_val_cat = to_categorical(y_val_enc)
y_test_cat = to_categorical(y_test_enc)

print("Label encoding complete. Classes:")
print(label_encoder.classes_)


Label encoding complete. Classes:
['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


### RNN MODEL

In [53]:
# ========================================
# Cell 10: Load GloVe Embeddings
# ========================================

import numpy as np

# Download GloVe 100D if not present
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

# Load 100D GloVe
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

print(f"Loaded {len(embedding_index)} word vectors from GloVe.")


Loaded 400000 word vectors from GloVe.


In [54]:
# ========================================
# Cell 11: Create Embedding Matrix
# ========================================

EMBEDDING_DIM = 100

word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print("Embedding matrix shape:", embedding_matrix.shape)


Embedding matrix shape: (10000, 100)


In [55]:
# ========================================
# Cell 12: Build Bi-LSTM Model
# ========================================

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=num_words,
                    output_dim=EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()




In [56]:
# ========================================
# Cell 13: Train Bi-LSTM
# ========================================

from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train_cat,
    validation_data=(X_val_pad, y_val_cat),
    epochs=10,
    batch_size=64,
    callbacks=[early_stop]
)


Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 262ms/step - accuracy: 0.4223 - loss: 1.5003 - val_accuracy: 0.6250 - val_loss: 1.0706
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 192ms/step - accuracy: 0.6248 - loss: 1.0307 - val_accuracy: 0.7200 - val_loss: 0.7990
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 188ms/step - accuracy: 0.6995 - loss: 0.8266 - val_accuracy: 0.7723 - val_loss: 0.6586
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 181ms/step - accuracy: 0.7527 - loss: 0.6799 - val_accuracy: 0.8020 - val_loss: 0.5759
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 179ms/step - accuracy: 0.7886 - loss: 0.5869 - val_accuracy: 0.8227 - val_loss: 0.4958
Epoch 6/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 185ms/step - accuracy: 0.8196 - loss: 0.5184 - val_accuracy: 0.8357 - val_loss: 0.4430
Epoch 7/10

In [57]:
# ========================================
# Cell 14: Evaluate Bi-LSTM
# ========================================

# Predict
y_pred_rnn_prob = model.predict(X_test_pad)
y_pred_rnn = np.argmax(y_pred_rnn_prob, axis=1)

# True labels
y_true_rnn = np.argmax(y_test_cat, axis=1)

# Metrics
accuracy = accuracy_score(y_true_rnn, y_pred_rnn)
precision = precision_score(y_true_rnn, y_pred_rnn, average='macro')
recall = recall_score(y_true_rnn, y_pred_rnn, average='macro')
f1 = f1_score(y_true_rnn, y_pred_rnn, average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Macro Precision: {precision:.4f}")
print(f"Macro Recall: {recall:.4f}")
print(f"Macro F1 Score: {f1:.4f}")

# Confusion matrix
cm = confusion_matrix(y_true_rnn, y_pred_rnn)
print("\nConfusion Matrix:")
print(cm)


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step
Accuracy: 0.8707
Macro Precision: 0.8349
Macro Recall: 0.8321
Macro F1 Score: 0.8334

Confusion Matrix:
[[368   8  15   1  14   1]
 [ 11 291   6   2  26  20]
 [ 16  10 906  41  38   3]
 [  5   1  41 191   8   0]
 [ 26  21  31   8 780   3]
 [  1  22   8   0   1  76]]


In [58]:
# ========================================
# Cell 15: Gradio Emotion Chatbot
# ========================================

import gradio as gr

# Define a function that takes user input → cleans → predicts
def predict_emotion(text):
    # Clean text same as training
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    words = text.split()
    words = [w for w in words if w not in all_stopwords]
    cleaned = ' '.join(words)

    # Tokenize and pad
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # Predict
    pred = model.predict(pad)
    label = label_encoder.inverse_transform([np.argmax(pred)])

    return f"Predicted Emotion: {label[0]}"

# Build Gradio interface
iface = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Textbox(lines=2, placeholder="Type your text here..."),
    outputs="text",
    title="Emotion Predictor Chatbot",
    description="Enter any text and this chatbot will predict the emotion category."
)

# Launch
iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ba2d977ed0a44020aa.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [61]:
from google.colab import files
files.download("/content/train.txt")
files.download("/content/val.txt")
files.download("/content/test.txt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>