In [5]:
# Importing the libraries
import pandas as pd
import numpy as np
import re
import string
import warnings
warnings.filterwarnings('ignore')


In [1]:
# text preprocessing
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import joblib


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samanwita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Samanwita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Samanwita\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
# Read the labelled emotion corpus; later cells use its `text` and `emotion` columns.
df = pd.read_csv("emotion_dataset.csv")

# Structural overview: dimensions, a preview, dtypes and null counts.
print(f"Dataset Shape: {df.shape}")
print("\nFirst 5 rows:", df.head(), sep="\n")

print("\nDataset Info:")
df.info()

print("\nMissing Values:", df.isnull().sum(), sep="\n")

# Class balance, first as absolute counts and then as percentages of all rows.
print("\nEmotion Distribution:", df['emotion'].value_counts(), sep="\n")
print("\nEmotion Percentages:", df['emotion'].value_counts(normalize=True) * 100, sep="\n")


Dataset Shape: (15999, 2)

First 5 rows:
                                                text  emotion
0  i can go from feeling so hopeless to so damned...  sadness
1   im grabbing a minute to post i feel greedy wrong    anger
2  i am ever feeling nostalgic about the fireplac...     love
3                               i am feeling grouchy    anger
4  ive been feeling a little burdened lately wasn...  sadness

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     15999 non-null  object
 1   emotion  15999 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB

Missing Values:
text       0
emotion    0
dtype: int64

Emotion Distribution:
emotion
joy         5362
sadness     4665
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

Emotion Percentages:
emotion
joy         33.514595
sadness

In [8]:
# WordNet lemmatizer shared by every clean_text call.
lemmatizer = WordNetLemmatizer()

# Words that must survive stopword filtering because they carry emotional or
# negating meaning. Only entries that actually occur in NLTK's English
# stopword list change the result of the subtraction below.
emotion_words = {
    'not', 'no', 'never', 'missing', 'miss', 'sad', 'happy',
    'angry', 'scared', 'fear', 'love', 'hate', 'joy', 'worry',
}

# English stopwords minus the protected words above.
stop_words = set(stopwords.words('english')).difference(emotion_words)

# Built once instead of on every call: deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# URL-like tokens. The leading \b stops the pattern from firing in the middle
# of an ordinary word (e.g. the "www" inside "awwww"), which the unanchored
# pattern used to mangle.
_URL_RE = re.compile(r"\b(?:http|www)\S+")


def clean_text(text):
    """Normalize a raw sentence into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; drop URL-like tokens (anything starting with
    "http" or "www" at a word boundary); drop digits; drop ASCII punctuation;
    split on whitespace; discard tokens found in ``stop_words``; lemmatize
    each remaining token with ``lemmatizer``.

    Args:
        text: The sentence to clean. ``None`` and float NaN (e.g. a missing
            cell in a DataFrame) are treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when nothing
        survives.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    text = _URL_RE.sub("", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(_PUNCT_TABLE)

    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing blanks, so no separate normalization is needed.
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return ' '.join(words)

# Store a cleaned copy of every document next to the raw text.
df['clean_text'] = df['text'].map(clean_text)

# Show the first row before and after cleaning as a sanity check.
print("Text Cleaning Complete!")
print("\nExample:")
print(f"Original: {df['text'].iloc[0]}")
print(f"Cleaned: {df['clean_text'].iloc[0]}")


Text Cleaning Complete!

Example:
Original: i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake
Cleaned: go feeling hopeless damned hopeful around someone care awake


In [9]:
# Model input is the cleaned text; the target is the emotion name.
X = df['clean_text']
y = df['emotion']

# Map emotion names to integer class ids; le.classes_ holds the names,
# indexed by id.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Emotion Labels:", le.classes_)
print("Encoded Labels:", np.unique(y_encoded))

# Hold out 20% of the rows for testing. Stratifying on the encoded labels
# keeps the class proportions the same in both splits; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Every row must land in exactly one of the two splits.
assert len(X_train) + len(X_test) == len(X)

print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# Per-class counts in the training split.
print("\nTraining set emotion distribution:")
unique, counts = np.unique(y_train, return_counts=True)
# Decode the ids that are actually present instead of zipping against
# le.classes_ positionally: if a class were ever missing from y_train, a
# positional zip would attach the counts to the wrong emotion names.
for emotion, count in zip(le.inverse_transform(unique), counts):
    print(f"  {emotion}: {count}")


Emotion Labels: ['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']
Encoded Labels: [0 1 2 3 4 5]

Training samples: 12799
Testing samples: 3200

Training set emotion distribution:
  anger: 1727
  fear: 1550
  joy: 4289
  love: 1043
  sadness: 3732
  surprise: 458


In [10]:
# TF-IDF over unigrams and bigrams, capped at 5000 features. Terms seen in
# fewer than 2 documents or in more than 95% of documents are discarded, and
# term frequencies are scaled sublinearly.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    sublinear_tf=True,
    min_df=2,
    max_df=0.95,
)

# Fit the vocabulary on the training split only, then reuse it unchanged for
# the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF Vectorization Complete!")
print(f"Training features shape: {X_train_tfidf.shape}")
print(f"Testing features shape: {X_test_tfidf.shape}")
# NOTE: despite the label, these are simply the first ten names returned by
# get_feature_names_out(), not a ranking by weight.
first_ten = tfidf.get_feature_names_out()[:10]
print(f"\nTop 10 features: {first_ten}")


TF-IDF Vectorization Complete!
Training features shape: (12799, 5000)
Testing features shape: (3200, 5000)

Top 10 features: ['abandoned' 'abc' 'ability' 'able' 'able feel' 'able find' 'able get'
 'able help' 'able move' 'able share']


In [11]:
# Logistic regression on the TF-IDF features. class_weight='balanced'
# compensates for the uneven emotion frequencies; the fixed random_state keeps
# runs repeatable.
model = LogisticRegression(
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

print("Training model...")
model.fit(X_train_tfidf, y_train)
print("✓ Model training completed!")


Training model...
✓ Model training completed!


In [12]:
# Predict the held-out split and report overall accuracy.
y_pred = model.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {acc:.4f} ({acc*100:.2f}%)")

# Per-class precision / recall / F1, labelled with the emotion names.
print("\n" + "=" * 60, "CLASSIFICATION REPORT", "=" * 60, sep="\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix wrapped in a DataFrame so rows and columns carry the
# emotion names (rows: true class, columns: predicted class).
print("\n" + "=" * 60, "CONFUSION MATRIX", "=" * 60, sep="\n")
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
print(cm_df)


Model Accuracy: 0.8962 (89.62%)

CLASSIFICATION REPORT
              precision    recall  f1-score   support

       anger       0.89      0.89      0.89       432
        fear       0.89      0.85      0.87       387
         joy       0.94      0.88      0.91      1073
        love       0.73      0.93      0.81       261
     sadness       0.95      0.92      0.94       933
    surprise       0.67      0.92      0.78       114

    accuracy                           0.90      3200
   macro avg       0.85      0.90      0.87      3200
weighted avg       0.90      0.90      0.90      3200


CONFUSION MATRIX
          anger  fear  joy  love  sadness  surprise
anger       383     9   15     5       19         1
fear         10   329    8     1        9        30
joy          12     3  949    81       17        11
love          3     2   13   242        1         0
sadness      23    17   20     4      860         9
surprise      0     8    1     0        0       105


In [13]:
def predict_emotion(text):
    """Classify a raw sentence and report how confident the model is.

    The sentence is passed through ``clean_text``, vectorized with the fitted
    ``tfidf`` vectorizer and scored by ``model``.

    Returns:
        A ``(label, confidence)`` pair: the emotion name decoded through
        ``le``, and the largest class probability from ``predict_proba``
        expressed as a percentage.
    """
    features = tfidf.transform([clean_text(text)])

    # Largest class probability, scaled to 0-100.
    class_probabilities = model.predict_proba(features)[0]
    confidence = max(class_probabilities) * 100

    # Predicted class id, decoded back to its emotion name.
    class_id = model.predict(features)[0]
    label = le.inverse_transform([class_id])[0]

    return label, confidence

# Smoke-test predict_emotion on a handful of hand-written sentences.
test_examples = [
    "I am feeling very happy today!",
    "I am missing you",
    "This makes me so angry",
    "I'm scared about the exam",
    "I love spending time with you",
]

print("Testing Predictions:")
print("=" * 60)
for text in test_examples:
    emotion, confidence = predict_emotion(text)
    # One three-line record per sentence: input, prediction, separator.
    print(
        f"Text: {text}",
        f"Predicted Emotion: {emotion} (Confidence: {confidence:.2f}%)",
        "-" * 60,
        sep="\n",
    )


Testing Predictions:
Text: I am feeling very happy today!
Predicted Emotion: joy (Confidence: 70.17%)
------------------------------------------------------------
Text: I am missing you
Predicted Emotion: sadness (Confidence: 30.28%)
------------------------------------------------------------
Text: This makes me so angry
Predicted Emotion: anger (Confidence: 90.29%)
------------------------------------------------------------
Text: I'm scared about the exam
Predicted Emotion: fear (Confidence: 89.05%)
------------------------------------------------------------
Text: I love spending time with you
Predicted Emotion: love (Confidence: 31.62%)
------------------------------------------------------------


In [14]:
# Persist the fitted classifier, vectorizer and label encoder so that
# inference can run without retraining.
for artifact, filename in (
    (model, "emotion_model.joblib"),
    (tfidf, "tfidf_vectorizer.joblib"),
    (le, "label_encoder.joblib"),
):
    joblib.dump(artifact, filename)

print("✓ All models saved successfully!")
print(
    "\nSaved files:",
    "  - emotion_model.joblib",
    "  - tfidf_vectorizer.joblib",
    "  - label_encoder.joblib",
    sep="\n",
)


✓ All models saved successfully!

Saved files:
  - emotion_model.joblib
  - tfidf_vectorizer.joblib
  - label_encoder.joblib


In [16]:
# Reload the three artifacts written by the save cell. joblib.load unpickles
# its input, so only point it at files you trust.
loaded_model, loaded_tfidf, loaded_le = (
    joblib.load(path)
    for path in (
        "emotion_model.joblib",
        "tfidf_vectorizer.joblib",
        "label_encoder.joblib",
    )
)

print("✓ Models loaded successfully!")

# Testing is conducted with user examples
def predict_with_loaded_model(text):
    """Classify a raw sentence using the artifacts reloaded from disk.

    The sentence is passed through ``clean_text``, vectorized with
    ``loaded_tfidf`` and scored by ``loaded_model``.

    Returns:
        A ``(label, confidence)`` pair: the emotion name decoded through
        ``loaded_le``, and the largest class probability from
        ``predict_proba`` expressed as a percentage.
    """
    features = loaded_tfidf.transform([clean_text(text)])

    # Largest class probability, scaled to 0-100.
    class_probabilities = loaded_model.predict_proba(features)[0]
    confidence = max(class_probabilities) * 100

    # Predicted class id, decoded back to its emotion name.
    class_id = loaded_model.predict(features)[0]
    label = loaded_le.inverse_transform([class_id])[0]

    return label, confidence



✓ Models loaded successfully!


In [18]:

# Score one ad-hoc sentence through the reloaded artifacts.
test_sentence = "I am with my family"
emotion, conf = predict_with_loaded_model(test_sentence)
print(
    f"\nTest: '{test_sentence}'",
    f"Predicted Emotion: {emotion}",
    f"Confidence: {conf:.2f}%",
    sep="\n",
)



Test: 'I am with my family'
Predicted Emotion: joy
Confidence: 30.58%
