In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

import warnings
# Blanket filter: every warning raised after this point is suppressed.
# NOTE(review): this also hides library notices that may matter (e.g. solver
# or deprecation warnings); consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
#Loading dataset
import os

# Folder that holds train.txt and test.txt. Set the EMOTION_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset (or empty) the original location is used, so existing setups keep
# working unchanged.
DATA_DIR = os.environ.get("EMOTION_DATA_DIR") or "C:/Users/raghu/Desktop/practice/data"

# Trailing separators are stripped so the joined paths never contain "//".
train_path = DATA_DIR.rstrip("/\\") + "/train.txt"
test_path = DATA_DIR.rstrip("/\\") + "/test.txt"


In [4]:
def load_data(path):
    """Read a header-less, ``;``-separated file into a DataFrame.

    Args:
        path: Location of the file to read (anything ``pd.read_csv`` accepts).

    Returns:
        A DataFrame with two columns, ``text`` and ``emotion``, in that order.
    """
    column_names = ["text", "emotion"]
    return pd.read_csv(path, sep=';', header=None, names=column_names)
    
# Read both splits through the shared loader (train first, then test).
train_df, test_df = (load_data(split_path) for split_path in (train_path, test_path))

# Peek at the first few training rows.
train_df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [6]:
# Distinct emotion labels present in the training split.
print(train_df['emotion'].unique())

['sadness' 'anger' 'love' 'surprise' 'fear' 'joy']


In [7]:
# Per-column count of missing entries in the training split.
print("Missing values:\n", train_df.isnull().sum())


Missing values:
 text       0
emotion    0
dtype: int64


In [8]:
# Report the (rows, columns) shape of each split.
print(f"Number of rows and columns in train data: {train_df.shape}")
# This line prints test_df.shape, so the message names the test split.
print(f"Number of rows and columns in test data: {test_df.shape}")


Number of rows and columns in train data: (16000, 2)
Number of rows and columns in train data: (2000, 2)


In [9]:
# Per-column summary of the training frame: count, number of unique values,
# most frequent value and how often it occurs.
train_df.describe()

Unnamed: 0,text,emotion
count,16000,16000
unique,15969,6
top,i feel on the verge of tears from weariness i ...,joy
freq,2,5362


In [None]:
pip install spacy

In [None]:
# Load spaCy model
import re

import spacy

# Pipeline is created once here and reused by every clean_text() call.
nlp = spacy.load("en_core_web_sm")
# Stop-word collection from the pipeline's language defaults; clean_text()
# tests token text for membership in it.
stop_words = nlp.Defaults.stop_words

In [None]:
# Text cleaning function using spaCy
def clean_text(text):
    """Normalise one sentence before it is vectorised.

    Steps: lowercase the input, drop every character that is not ``a``-``z``
    or whitespace, run the module-level spaCy pipeline ``nlp``, then keep the
    lemma of each token that is neither a stop word nor pure whitespace.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces (may be an empty string).
    """
    # Lowercasing text
    text = text.lower()

    # Removing special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)

    # Processing text through spaCy
    doc = nlp(text)

    # Lemmatization and stop-word removal. Deleting characters above can leave
    # runs of spaces, which spaCy emits as whitespace tokens; skipping them
    # keeps stray blanks out of the joined result.
    lemmas = [
        token.lemma_
        for token in doc
        if not token.is_space and token.text not in stop_words
    ]

    return " ".join(lemmas)

# Run the same cleaning over both splits, overwriting the raw 'text' column
for split_df in (train_df, test_df):
    split_df['text'] = split_df['text'].apply(clean_text)

# Spot-check the first rows of each split after cleaning
for split_df in (train_df, test_df):
    print(split_df.head())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Inputs are the cleaned sentences; targets are the emotion labels.
X_train = train_df['text']
y_train = train_df['emotion']
X_test = test_df['text']
y_test = test_df['emotion']

# TF-IDF features with the vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
# Fit a logistic-regression classifier (library defaults) on the TF-IDF features
model = LogisticRegression().fit(X_train_vec, y_train)

# Label predictions for the held-out split
y_pred = model.predict(X_test_vec)

# Headline accuracy, then the per-class precision/recall/F1 table
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
)


In [None]:
#Evaluation of the model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Other metrics (averaged across classes, weighted by class support)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

# Confusion Matrix
# The matrix rows/columns are pinned to model.classes_ so they always line up
# with the tick labels below, even if some class never occurs in y_test or
# y_pred (the default ordering is derived from the labels actually seen).
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
#Test with our input
def predict_emotion(text):
    """Predict the emotion label for one raw sentence.

    The sentence is passed through clean_text() first so it receives the same
    preprocessing as the text the vectorizer and model were fitted on;
    vectorising the raw string would leave inflected words unmatched against
    the lemmatised training vocabulary.

    Args:
        text: Raw, uncleaned input string.

    Returns:
        The predicted label (first element of ``model.predict``'s output).
    """
    cleaned = clean_text(text)
    vec = vectorizer.transform([cleaned])
    prediction = model.predict(vec)
    return prediction[0]

predict_emotion("I feel so happy and happy today")


In [None]:
import joblib

# Both artifacts are written to the current working directory (relative
# file names). They belong together: predict_emotion() needs the fitted
# vectorizer to build the features the model consumes.

# Save the model
joblib.dump(model, 'emotion_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')
