# Data Preprocessing and model definition

In [9]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import joblib

# Load data
train = pd.read_table('train.txt', delimiter=";", header=None)
test = pd.read_table('test.txt', delimiter=";", header=None)
val = pd.read_table('val.txt', delimiter=";", header=None)

# Combine data
data = pd.concat([train, val, test])
data.columns = ["text", "label"]

# Check for missing values
data.isna().any(axis=1).sum()

# Preprocess function
porter = PorterStemmer()
def preprocess(line):
    review = re.sub("[^a-zA-z]", " ", line)
    review = review.lower()
    review = review.split()
    review = [porter.stem(word) for word in review if not word in stopwords.words("english")]
    return " ".join(review)

# Apply preprocessing
data["text"] = data["text"].apply(lambda x: preprocess(x))

# Encode labels
label_encoder = preprocessing.LabelEncoder()
data['emotion'] = label_encoder.fit_transform(data["label"])

# Vectorize text
cv = CountVectorizer(max_features=5000, ngram_range=(1, 3))
data_cv = cv.fit_transform(data["text"]).toarray()

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(data_cv, data['emotion'], test_size=0.3, random_state=42)

# Save the preprocessed data and label encoder for later use
joblib.dump(cv, 'count_vectorizer.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')


['label_encoder.joblib']

# Building the model

In [10]:
# Build the model
model = Sequential()
model.add(Dense(12, input_shape=(x_train.shape[1],), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(6, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["accuracy"])

# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=10)

# Save the trained model using joblib
joblib.dump(model, 'emotion_detection_model.joblib')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.4717 - loss: 1.3468
Epoch 2/10
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9085 - loss: 0.3321
Epoch 3/10
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9561 - loss: 0.1423
Epoch 4/10
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9744 - loss: 0.0851
Epoch 5/10
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9863 - loss: 0.0527
Epoch 6/10
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9900 - loss: 0.0371
Epoch 7/10
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9918 - loss: 0.0287
Epoch 8/10
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9915 - loss: 0.0251
Epoch 9/10
[1m1400/1400

['emotion_detection_model.joblib']

# Loading the model and testing with input

In [None]:
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import load_model

# Load the saved model, label encoder, and CountVectorizer
model = joblib.load('emotion_detection_model.joblib')
cv = joblib.load('count_vectorizer.joblib')
label_encoder = joblib.load('label_encoder.joblib')

# Define preprocessing and prediction function
porter = PorterStemmer()
def preprocess(line):
    review = re.sub("[^a-zA-z]", " ", line)
    review = review.lower()
    review = review.split()
    review = [porter.stem(word) for word in review if not word in stopwords.words("english")]
    return " ".join(review)

def predict_emotion(text):
    text = preprocess(text)
    array = cv.transform([text]).toarray()
    pred = model.predict(array)
    emotion = label_encoder.inverse_transform(range(pred.shape[1]))
    emotion_percentages = {emotion[i]: round(pred[0][i]*100, 2) for i in range(len(emotion))}
    return emotion_percentages

# Example usage
text = "I am feeling great today!"
emotion_percentages = predict_emotion(text)
print(f"Emotion percentages: {emotion_percentages}")
