<a href="https://colab.research.google.com/github/RofidaTamer/RafidaTamer/blob/main/Emotion_ClassificationDEPI_Task_Week12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Name : Rofida tamer abdelmoneam esmail**

# **Import necessary libraries**

In [16]:

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# **Upload The Data**

In [17]:
# Read the raw dataset; later cells use its 'text' and 'label' columns.
DATA_PATH = '/text.csv'
df = pd.read_csv(DATA_PATH)

# **Preprocessing**

In [18]:
# Make sure the NLTK stop-word corpus is available, then keep the English list
# as a set so membership checks during cleaning are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
STOPWORDS = set(english_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **clean text**

In [19]:
def clean_text(text, stop_words=None):
    """Normalise a raw text string for the downstream vectorisers.

    Steps, in order: drop ``http...`` links, replace every non-letter with a
    space, lowercase, split on whitespace, and remove stop words.

    Args:
        text: Raw input. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for a missing cell) is treated as empty
            instead of raising ``TypeError`` inside ``re.sub``.
        stop_words: Optional collection of lowercase words to drop. Defaults
            to the module-level ``STOPWORDS`` set.

    Returns:
        The remaining words joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = STOPWORDS

    without_links = re.sub(r"http\S+", "", text)
    letters_only = re.sub(r"[^a-zA-Z]", " ", without_links)
    words = letters_only.lower().split()
    return " ".join(word for word in words if word not in stop_words)

In [20]:
# Store the normalised version of every row's text in a new column.
df['cleaned_text'] = df['text'].map(clean_text)

In [6]:
# `clean_text` is already defined and applied to df['text'] in the two cells
# above. This cell used to repeat both steps verbatim (note its out-of-order
# execution count), which silently re-bound the function and redid the work
# over the whole column. Preview the result instead of duplicating the code.
df[['text', 'cleaned_text']].head()

# **Split dataset**

In [21]:

# Model inputs are the cleaned texts; targets are the values in the 'label' column.
X, y = df['cleaned_text'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# **Feature extraction**

In [22]:
# TF-IDF features with the vocabulary capped at 5000 terms. The vectoriser is
# fitted on the training split only and then reused, unchanged, on the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf.transform(raw_documents=X_test)


# **Model Tuning - Logistic Regression with GridSearchCV**

In [23]:
# Tune the inverse regularisation strength C of a logistic regression with
# 5-fold cross-validation on the training TF-IDF features.
param_grid = {'C': [0.1, 1, 10, 100]}
logreg = LogisticRegression(max_iter=1000)
grid = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)
grid.fit(X_train_tfidf, y_train)

# Keep the estimator that the search selected as best.
best_logreg = grid.best_estimator_

# **Evaluate the tuned model**

In [24]:

# Predicted labels for the held-out test split from the tuned logistic regression.
y_pred_logreg = best_logreg.predict(X=X_test_tfidf)

# **accuracy**

In [25]:
# Accuracy of the tuned logistic regression on the test split.
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Accuracy after tuning:", logreg_accuracy)


Logistic Regression Accuracy after tuning: 0.9006261845925002


# **classification report**

In [26]:
# Per-class precision / recall / F1 on the test split. The names are listed in
# label-id order (0..5), the same order as the id-to-emotion mapping used when
# predicting on new text.
emotion_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
print(classification_report(y_test, y_pred_logreg, target_names=emotion_names))


              precision    recall  f1-score   support

     sadness       0.94      0.95      0.94     24201
         joy       0.92      0.93      0.92     28164
        love       0.81      0.77      0.79      6929
       anger       0.91      0.91      0.91     11441
        fear       0.85      0.85      0.85      9594
    surprise       0.78      0.71      0.74      3033

    accuracy                           0.90     83362
   macro avg       0.87      0.85      0.86     83362
weighted avg       0.90      0.90      0.90     83362



# **Deep Learning Model - LSTM**

In [27]:


# Build the word index from the training texts only, with num_words=5000 as the
# vocabulary cap used when texts are converted to id sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert each text of both splits into its list of integer word ids.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [28]:
# Bring every id sequence of both splits to exactly max_seq_len entries so they
# can be stacked into fixed-width arrays for the LSTM.
max_seq_len = 100
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_seq_len) for seqs in (X_train_seq, X_test_seq)
)

In [29]:
# LSTM Model Building
# Embedding -> spatial dropout -> single LSTM layer -> softmax over the 6 emotions.
embedding_dim = 128
lstm_model = Sequential()
# Declare the input shape with an explicit Input layer: the Embedding
# `input_length` argument is deprecated in Keras 3 (it only triggers a warning),
# while an Input layer is accepted by both Keras 2 and Keras 3.
lstm_model.add(tf.keras.Input(shape=(max_seq_len,), dtype='int32'))
# input_dim=5000 matches the num_words=5000 cap given to the tokenizer.
lstm_model.add(Embedding(input_dim=5000, output_dim=embedding_dim))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(6, activation='softmax'))  # 6 classes for emotions

# Sparse loss: the targets are class ids, not one-hot vectors.
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




# **Train the LSTM model**

In [31]:
# Fit for 5 epochs over the padded training sequences in mini-batches of 64.
# The test split is passed as validation data, so the val_* metrics in the log
# are computed on it after every epoch; verbose=2 logs one line per epoch.
epochs = 5
batch_size = 64
lstm_model.fit(
    X_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_pad, y_test),
    verbose=2,
)


Epoch 1/5
5211/5211 - 1216s - 233ms/step - accuracy: 0.9408 - loss: 0.0899 - val_accuracy: 0.9392 - val_loss: 0.0919
Epoch 2/5
5211/5211 - 1216s - 233ms/step - accuracy: 0.9422 - loss: 0.0876 - val_accuracy: 0.9392 - val_loss: 0.0937
Epoch 3/5
5211/5211 - 1231s - 236ms/step - accuracy: 0.9424 - loss: 0.0857 - val_accuracy: 0.9395 - val_loss: 0.0922
Epoch 4/5
5211/5211 - 1152s - 221ms/step - accuracy: 0.9435 - loss: 0.0843 - val_accuracy: 0.9367 - val_loss: 0.0932
Epoch 5/5
5211/5211 - 1152s - 221ms/step - accuracy: 0.9434 - loss: 0.0834 - val_accuracy: 0.9340 - val_loss: 0.0940


<keras.src.callbacks.history.History at 0x7b6be8e80a60>

# **Evaluate the LSTM model**

In [32]:

# Score every test sequence, then take the index of the highest-scoring class
# along the last axis as the predicted label.
lstm_scores = lstm_model.predict(X_test_pad)
y_pred_lstm = np.argmax(lstm_scores, axis=-1)


[1m2606/2606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 43ms/step


# **accuracy**

In [33]:
# Accuracy of the LSTM on the test split.
lstm_accuracy = accuracy_score(y_test, y_pred_lstm)
print("LSTM Model Accuracy:", lstm_accuracy)


LSTM Model Accuracy: 0.9339627168254121


# **classification report for LSTM**

In [34]:

# Per-class precision / recall / F1 for the LSTM; names are in label-id order (0..5).
emotion_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
lstm_report = classification_report(y_test, y_pred_lstm, target_names=emotion_names)
print(lstm_report)


              precision    recall  f1-score   support

     sadness       0.96      0.98      0.97     24201
         joy       0.94      0.96      0.95     28164
        love       0.87      0.79      0.83      6929
       anger       0.94      0.95      0.94     11441
        fear       0.89      0.90      0.89      9594
    surprise       0.92      0.64      0.75      3033

    accuracy                           0.93     83362
   macro avg       0.92      0.87      0.89     83362
weighted avg       0.93      0.93      0.93     83362



# **Confusion matrix**

In [35]:

# Rows are true labels and columns are predicted labels.
print("Confusion Matrix - LSTM:")
lstm_confusion = confusion_matrix(y_test, y_pred_lstm)
print(lstm_confusion)


Confusion Matrix - LSTM:
[[23828    23     1   168   166    15]
 [   23 27174   836    45    17    69]
 [    1  1472  5455     1     0     0]
 [  490    50     0 10830    71     0]
 [  373    27     0   478  8642    74]
 [   16   259     0     0   830  1928]]


# **Predict with the best model**

In [36]:
# Step 9: Predict on new data using the best model (LSTM or Logistic Regression)
def predict_emotion(text, model='logreg'):
    """Predict the emotion name for one raw text.

    Args:
        text: Raw, uncleaned text; it is passed through ``clean_text`` first.
        model: ``'logreg'`` for the tuned logistic regression on TF-IDF
            features, or ``'lstm'`` for the LSTM on padded token sequences.

    Returns:
        One of 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'.

    Raises:
        ValueError: If ``model`` is neither ``'logreg'`` nor ``'lstm'``.
            Previously any other value (e.g. a typo) silently used the LSTM.
    """
    # Validate first so a bad model name fails fast, before cleaning or inference.
    if model not in ('logreg', 'lstm'):
        raise ValueError(f"Unknown model {model!r}; expected 'logreg' or 'lstm'")

    cleaned_text = clean_text(text)
    if model == 'logreg':
        text_tfidf = tfidf.transform([cleaned_text])
        prediction = best_logreg.predict(text_tfidf)[0]
    else:
        text_seq = tokenizer.texts_to_sequences([cleaned_text])
        text_pad = pad_sequences(text_seq, maxlen=max_seq_len)
        # Index of the highest-scoring class for the single input row.
        prediction = np.argmax(lstm_model.predict(text_pad), axis=-1)[0]

    # Label id -> emotion name, in the same order as the classification reports.
    emotions = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    return emotions[prediction]


# **Example prediction with Models**

In [37]:
# Example prediction with Logistic Regression
new_tweet = "I am so excited and happy about this!"
logreg_emotion = predict_emotion(new_tweet, model='logreg')
print("Predicted emotion (Logistic Regression):", logreg_emotion)


Predicted emotion (Logistic Regression): joy


In [38]:
# Example prediction with LSTM (reuses new_tweet from the previous cell)
lstm_emotion = predict_emotion(new_tweet, model='lstm')
print("Predicted emotion (LSTM):", lstm_emotion)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step
Predicted emotion (LSTM): joy
