In [15]:
import pandas as pd
import numpy as np
import pickle
import nltk
import re
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras_preprocessing.text import one_hot
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
train_data = pd.read_csv("/kaggle/input/datass/train.txt", header = None, sep=";", names=["Text", "Emotion"], encoding="utf-8")
test_data = pd.read_csv("/kaggle/input/datass/test.txt", header = None, sep=";", names=["Text", "Emotion"], encoding="utf-8")
val_data = pd.read_csv("/kaggle/input/datass/val.txt", header = None, sep=";", names=["Text", "Emotion"], encoding="utf-8")



In [8]:
data = pd.concat([train_data, test_data, val_data], axis=0, ignore_index=True)

In [9]:
data

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
19995,im having ssa examination tomorrow in the morn...,sadness
19996,i constantly worry about their fight against n...,joy
19997,i feel its important to share this info for th...,joy
19998,i truly feel that if you are passionate enough...,joy


In [98]:
data.Emotion.nunique()

6

In [12]:
lb = LabelEncoder()
data['Emotion'] = lb.fit_transform(data['Emotion'])

In [13]:
data

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,4
1,i can go from feeling so hopeless to so damned...,4
2,im grabbing a minute to post i feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplac...,3
4,i am feeling grouchy,0
...,...,...
19995,im having ssa examination tomorrow in the morn...,4
19996,i constantly worry about their fight against n...,2
19997,i feel its important to share this info for th...,2
19998,i truly feel that if you are passionate enough...,2


In [16]:
def clean_data(text):
    stemmer = PorterStemmer()
    text = re.sub("[^a-zA-Z]", " ",text)
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stopwords]
    return " ".join(text)

data['cleaned_text']  = data['Text'].apply(clean_data)

In [17]:
data

Unnamed: 0,Text,Emotion,cleaned_text
0,i didnt feel humiliated,4,didnt feel humili
1,i can go from feeling so hopeless to so damned...,4,go feel hopeless damn hope around someon care ...
2,im grabbing a minute to post i feel greedy wrong,0,im grab minut post feel greedi wrong
3,i am ever feeling nostalgic about the fireplac...,3,ever feel nostalg fireplac know still properti
4,i am feeling grouchy,0,feel grouchi
...,...,...,...
19995,im having ssa examination tomorrow in the morn...,4,im ssa examin tomorrow morn im quit well prepa...
19996,i constantly worry about their fight against n...,2,constantli worri fight natur push limit inner ...
19997,i feel its important to share this info for th...,2,feel import share info experi thing
19998,i truly feel that if you are passionate enough...,2,truli feel passion enough someth stay true suc...


In [19]:
data=data[['cleaned_text','Emotion']]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['Emotion'], test_size = 0.3, random_state=42,stratify=data['Emotion'])


In [25]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [29]:
classifier={
    'MultinomialNB':MultinomialNB(),
    'LogisticRegression':LogisticRegression(),
    'Random Forest':RandomForestClassifier(),
    'Support Vector Machine': SVC(),
}

for name,clf in classifier.items():
    print(f"\n============{name}============")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Train Accuracy : {clf.score(X_train,y_train)}")
    print(f"Test Accuracy : {accuracy}")
    print("Classification Report")
    print(classification_report(y_test, y_pred))
    


Train Accuracy : 0.7365714285714285
Test Accuracy : 0.6688333333333333
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.32      0.47       813
           1       0.94      0.27      0.42       712
           2       0.62      0.97      0.75      2028
           3       0.93      0.06      0.11       492
           4       0.68      0.90      0.78      1739
           5       0.00      0.00      0.00       216

    accuracy                           0.67      6000
   macro avg       0.68      0.42      0.42      6000
weighted avg       0.72      0.67      0.60      6000




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy : 0.9191428571428572
Test Accuracy : 0.835
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.78      0.82       813
           1       0.87      0.72      0.79       712
           2       0.80      0.94      0.87      2028
           3       0.79      0.54      0.64       492
           4       0.87      0.91      0.89      1739
           5       0.84      0.47      0.60       216

    accuracy                           0.83      6000
   macro avg       0.84      0.73      0.77      6000
weighted avg       0.84      0.83      0.83      6000


Train Accuracy : 0.9972142857142857
Test Accuracy : 0.8488333333333333
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.85      0.85       813
           1       0.82      0.83      0.82       712
           2       0.86      0.88      0.87      2028
           3       0.73      0.69      0.71       492
           4

In [33]:
rf_params = {
    'n_estimators': randint(50, 300),  
    'max_depth': randint(3, 20),  
    'min_samples_split': randint(2, 10),  
    'min_samples_leaf': randint(1, 10),  
    'max_features': ['sqrt', 'log2', None],  
    'bootstrap': [True, False],  
}

rf = RandomForestClassifier()
rf_search = RandomizedSearchCV(rf, rf_params, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)

In [34]:
log_params = {
    'C': uniform(0.01, 10),  
    'penalty': ['l1', 'l2', 'elasticnet'],  
    'solver': ['liblinear', 'saga'],  
}

log_reg = LogisticRegression()
log_search = RandomizedSearchCV(log_reg, log_params, n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
log_search.fit(X_train, y_train)

25 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 64, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

------------------------------------

In [35]:
nb_params = {
    'alpha': uniform(0.01, 2),  # Smoothing parameter
    'fit_prior': [True, False],  
}

nb = MultinomialNB()
nb_search = RandomizedSearchCV(nb, nb_params, n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
nb_search.fit(X_train, y_train)


In [36]:
print("Best SVM Parameters: ", svm_search.best_params_)
print("Best Random Forest Parameters: ", rf_search.best_params_)
print("Best Logistic Regression Parameters: ", log_search.best_params_)
print("Best Naive Bayes Parameters: ", nb_search.best_params_)

Best SVM Parameters:  {'C': 3.4488521115218393, 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid'}
Best Random Forest Parameters:  {'bootstrap': False, 'max_depth': 14, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 108}
Best Logistic Regression Parameters:  {'C': 1.2303823484477883, 'penalty': 'l1', 'solver': 'saga'}
Best Naive Bayes Parameters:  {'alpha': 0.3768695797323276, 'fit_prior': False}


In [39]:
svm=SVC(C=3.4488521115218393,degree=3,gamma='scale')
svm.fit(X_train,y_train)

In [41]:
print(svm.score(X_train,y_train))
print(svm.score(X_test,y_test))

0.997
0.8331666666666667


In [48]:
rf=RandomForestClassifier(max_depth=15,min_samples_leaf=10,min_samples_split=20,n_estimators=100)

rf.fit(X_train,y_train)

print(rf.score(X_train,y_train))
print(rf.score(X_test,y_test))

0.4142142857142857
0.4076666666666667


In [50]:
# Best Logistic Regression Parameters:  {'C': 1.2303823484477883, 'penalty': 'l1', 'solver': 'saga'}
lr=LogisticRegression(C=1.2,penalty='l1',solver='saga')

lr.fit(X_train,y_train)

print(lr.score(X_train,y_train))
print(lr.score(X_test,y_test))

0.8965714285714286
0.875




In [52]:
# Best Naive Bayes Parameters:  {'alpha': 0.3768695797323276, 'fit_prior': False}

mnb=MultinomialNB(alpha=0.4,fit_prior=False)

mnb.fit(X_train,y_train)

print(mnb.score(X_train,y_train))
print(mnb.score(X_test,y_test))

0.9215
0.7851666666666667


In [57]:

def predict_emotion(input_text):
    cleaned_text = clean_data(input_text)
    input_vectorizer = vectorizer.transform([cleaned_text])
    
    predicted_label = lr.predict(input_vectorizer)[0]
    predicted_emotion = lb.inverse_transform([predicted_label])[0]
    label = np.max(lr.predict(input_vectorizer))
    
    return predicted_emotion

In [59]:
predict_emotion("leave me alone")

'sadness'

In [60]:
predict_emotion("hurrrah!! RCB won the match")

'joy'

In [64]:
predict_emotion("i love you")

'love'

In [66]:
predict_emotion("she breakup with me")

'joy'

In [103]:
import pickle
pickle.dump(lr, open("/kaggle/input/datass/logistic_regression.pkl","wb"))
pickle.dump(lb, open("/kaggle/input/datass/label_encoder.pkl","wb"))
pickle.dump(vectorizer, open("/kaggle/input/datass/tfidfvectorizer.pkl","wb"))

OSError: [Errno 30] Read-only file system: '/kaggle/input/datass/logistic_regression.pkl'

In [72]:
train_data = pd.read_csv("/kaggle/input/datass/train.txt", header = None, sep=";", names=["Text", "Emotion"], encoding="utf-8")
test_data = pd.read_csv("/kaggle/input/datass/test.txt", header = None, sep=";", names=["Text", "Emotion"], encoding="utf-8")
val_data = pd.read_csv("/kaggle/input/datass/val.txt", header = None, sep=";", names=["Text", "Emotion"], encoding="utf-8")

data = pd.concat([train_data, test_data, val_data], axis=0, ignore_index=True)

In [80]:
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

nltk.download('stopwords')

# Load stopwords
stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [83]:
def text_cleaning(df, column, vocab_size, max_len):
    stemmer = PorterStemmer()
    corpus = []
    
    for text in df[column]:
        text = re.sub(r"[^a-zA-Z]", " ", text)  # Remove non-alphabetic characters
        text = text.lower()
        text = text.split()
        text = [stemmer.stem(word) for word in text if word not in stopwords]
        text = " ".join(text)  # Convert list back to a sentence
        corpus.append(text)

    # Convert text into one-hot encoded sequences
    one_hot_word = [one_hot(sentence, vocab_size) for sentence in corpus]
    
    # Pad sequences to ensure consistent input size
    pad = pad_sequences(sequences=one_hot_word, maxlen=max_len, padding="pre")
    
    return pad

In [84]:
x_train = text_cleaning(data, "Text", vocab_size=11000, max_len=300)

# Convert categorical labels (strings) into numerical labels
label_encoder = LabelEncoder()
data["Emotion"] = label_encoder.fit_transform(data["Emotion"])  # Convert text labels to integers

# Now, apply one-hot encoding
y_train = to_categorical(data["Emotion"])

In [95]:
print(x_train.shape)  # Should be (num_samples, 300)
import numpy as np
x_train = np.array(x_train)

(20000, 300)


In [96]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential()

model.add(Embedding(input_dim=11000, output_dim=200, input_length=300))
model.add(Dropout(0.3))  # Increased dropout to reduce overfitting

# First LSTM Layer (Bidirectional for better learning)
model.add(Bidirectional(LSTM(256, return_sequences=True)))  
model.add(Dropout(0.3))  
model.add(BatchNormalization())

# Second LSTM Layer (More complexity)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))  
model.add(BatchNormalization())

# Third LSTM Layer (Final feature extraction)
model.add(LSTM(128))  
model.add(Dropout(0.3))  
model.add(BatchNormalization())

# Fully Connected Dense Layers
model.add(Dense(128, activation='relu'))  
model.add(Dropout(0.3))  
model.add(Dense(64, activation='relu'))  
model.add(Dropout(0.3))  
model.add(Dense(6, activation='softmax'))  # Output Layer (6 classes)


In [97]:
model.build(input_shape=(None, 300))
model.summary()


In [99]:
# Compile the model
model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=['accuracy'])

# Early Stopping to prevent overfitting
callback = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)



In [100]:
model.summary()

In [102]:
model.fit(x_train, y_train, epochs=30, batch_size=64, verbose=1, validation_split=0.3)


Epoch 1/30
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 166ms/step - accuracy: 0.3332 - loss: 1.5896 - val_accuracy: 0.3368 - val_loss: 1.5843
Epoch 2/30
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 166ms/step - accuracy: 0.3372 - loss: 1.5891 - val_accuracy: 0.3368 - val_loss: 1.6044
Epoch 3/30
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 163ms/step - accuracy: 0.3305 - loss: 1.5956 - val_accuracy: 0.3368 - val_loss: 1.5903
Epoch 4/30
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 164ms/step - accuracy: 0.3372 - loss: 1.5829 - val_accuracy: 0.3368 - val_loss: 1.5946
Epoch 5/30
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 164ms/step - accuracy: 0.3436 - loss: 1.5770 - val_accuracy: 0.3368 - val_loss: 1.5839
Epoch 6/30
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 164ms/step - accuracy: 0.3296 - loss: 1.5816 - val_accuracy: 0.3368 - val_loss: 1.6003
Epoch 7/30

KeyboardInterrupt: 