### 1. Importing Libraries

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pickle

### Emotion Analysis on Tweets Dataset for Diary Entries

### 2. Preprocessing

In [None]:
# Make sure the NLTK corpora used below are installed. On a fresh environment
# stopwords.words() and the WordNet lemmatizer raise LookupError without them;
# a package that is already installed is not downloaded again.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# English stop words, kept in a set for fast membership tests in clean_text
stop_words = set(stopwords.words('english'))
# One lemmatizer instance shared by every clean_text call
lemmatizer = WordNetLemmatizer()

def clean_text(text):
  """Normalise raw text into space-separated, lemmatized tokens.

  Steps, in order: lowercase, run contractions.fix, delete ASCII punctuation,
  delete digit characters, drop English stop words, lemmatize what is left.

  Args:
    text: The raw string to clean.

  Returns:
    The surviving tokens joined by single spaces; an empty string when no
    token survives.
  """
  text = contractions.fix(text.lower())
  # Punctuation and digits are deleted, not replaced by spaces, so
  # "well-being" becomes "wellbeing" and "a1b" becomes "ab".
  text = text.translate(str.maketrans('', '', string.punctuation))
  text = ''.join(ch for ch in text if not ch.isdigit())
  # Tokenise once: the stop-word test is applied to the token as written
  # (before lemmatization), and joining the tokens with a single space
  # already yields normalised whitespace.
  tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
  return ' '.join(tokens)

Loading the Dataset:

The data comes from Kaggle: https://www.kaggle.com/datasets/aadyasingh55/twitter-emotion-classification-dataset/data

In [25]:
# Load the dataset
# The CSV path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv('./export_1731170924611.csv')

In [26]:
# Preview the first rows of the freshly loaded frame
df.head()

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0
2,ive probably mentioned this before but i reall...,1
3,i was feeling a little low few days back,0
4,i beleive that i am much more sensitive to oth...,2


In [27]:
# Row count, column dtypes and non-null counts before any cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416809 non-null  object
 1   label   416809 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ MB


Handling Nulls & Duplicates

In [28]:
# Drop rows with missing values, then rows that are exact duplicates.
# Both calls mutate `df` in place, so this cell changes what every later
# reference to `df` sees; the index is not reset afterwards.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

Data after removing nulls and duplicates

In [29]:
# Same summary again, to see how many rows the drop removed
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 416123 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416123 non-null  object
 1   label   416123 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 9.5+ MB


### 3. Splitting the Data

In [30]:
# Store a normalised version of every text next to the raw column
df['cleaned_text'] = df['text'].apply(clean_text)

# Model input is the cleaned text; the target is the integer emotion label
X, y = df['cleaned_text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 4. Encoding

#### 4.1 Encoding : CountVectorizer

In [31]:
# Convert text data to numerical data using CountVectorizer
# The vocabulary is learned from the training split only; the test split is
# transformed with that same vocabulary.
# NOTE: the name `vectorizer` is rebound to a TfidfVectorizer in the TF-IDF
# encoding cell, so re-running this cell afterwards changes what later cells use.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

In [32]:
# Number of distinct terms in the vocabulary the vectorizer has just learned
len(vectorizer.vocabulary_)

60514

#### 4.2 Encoding : TfidfVectorizer

In [33]:
# Convert text data to numerical data using TF-IDF
# Fitted on the training split only; the test split reuses the fitted vocabulary.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer assigned
# earlier. The object bound here is the one later pickled and used by
# predict_emotion.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [34]:
# Number of distinct terms in the vocabulary the vectorizer has just learned
len(vectorizer.vocabulary_)

60514

### 5. Model Training

#### 5.1 Multinomial Naive Bayes

In [35]:
# Train a Multinomial Naive Bayes classifier on the term-count features and
# evaluate it on the held-out split.
nb_clf = MultinomialNB()
nb_clf.fit(X_train_count, y_train)
nb_y_pred = nb_clf.predict(X_test_count)
nb_report = classification_report(y_test, nb_y_pred)
# The report is a multi-line table of per-class precision/recall/F1, not a
# single accuracy figure; printing it on its own line keeps the header aligned.
print("Naive Bayes classification report:", nb_report, sep="\n")

Naive Bayes Accuracy:               precision    recall  f1-score   support

           0       0.88      0.94      0.91     24360
           1       0.86      0.93      0.90     28049
           2       0.82      0.63      0.71      6943
           3       0.90      0.85      0.88     11429
           4       0.84      0.82      0.83      9438
           5       0.89      0.39      0.55      3006

    accuracy                           0.87     83225
   macro avg       0.87      0.76      0.80     83225
weighted avg       0.87      0.87      0.86     83225



#### 5.2 Logistic Regression

In [None]:
# Train a Logistic Regression classifier on the TF-IDF features and evaluate
# it on the held-out split. max_iter is raised above scikit-learn's default
# of 100 to give the solver more room to converge.
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train_tfidf, y_train)
lr_y_pred = lr_clf.predict(X_test_tfidf)
lr_report = classification_report(y_test, lr_y_pred)
# The report is a multi-line table of per-class precision/recall/F1, not a
# single accuracy figure; printing it on its own line keeps the header aligned.
print("Logistic Regression classification report:", lr_report, sep="\n")

### 6. Model Selection

In [None]:
print("The best-model is logistic regression")
# Persist the fitted vectorizer so new text can be encoded with the same
# vocabulary at prediction time (the classifier itself is saved separately).
with open('tfidf_vectorizer_emotion.pkl', 'wb') as vectorizer_file:
  pickle.dump(vectorizer, vectorizer_file)

### 7. Packing the Model

In [None]:
# Persist the trained logistic-regression classifier to disk
with open('best_emotion_model.pkl', 'wb') as model_file:
  pickle.dump(lr_clf, model_file)

def predict_emotion(text):
  """Predict the emotion expressed by a single piece of raw text.

  The text is passed through clean_text, encoded with the module-level
  `vectorizer`, and classified by the module-level `lr_clf`.

  Args:
    text: The raw string to classify.

  Returns:
    The emotion name mapped to the predicted integer label.
  """
  label_names = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
  # The vectorizer expects an iterable of documents, hence the one-element list
  features = vectorizer.transform([clean_text(text)])
  predicted_label = lr_clf.predict(features)[0]
  return label_names[predicted_label]

### 8. Model Usage

In [40]:
# Example usage: read one line of text from the user and print its predicted emotion.
# NOTE: input() waits for typed text, so this cell needs an interactive session.
text = input("Enter text to predict emotion: ")
# Sample input: "I like to play." (presumably what produced the recorded output)
emotion = predict_emotion(text)
print(f'The predicted emotion is: {emotion}')

The predicted emotion is: joy
