In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# All three splits share the same header-less, semicolon-separated two-column layout.
read_opts = dict(sep=';', header=None, names=['Text', 'Emotion'])
train_df = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/train.txt", **read_opts)
test_df = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/test.txt", **read_opts)
val_df = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/val.txt", **read_opts)

In [None]:
train_df.info()

# **Data Preprocessing**

In [None]:
train_df.isnull().sum()

In [None]:
train_df.duplicated().sum()

In [None]:
duplicate_rows = train_df[train_df.duplicated(keep=False)]

# Displaying the duplicate rows
print(duplicate_rows)

In [None]:
# Rebind rather than mutate in place: the cell stays idempotent on re-run and
# the statement can be chained with further transformations.
train_df = train_df.drop_duplicates()

Count of each emotion in the training dataset

In [None]:
train_df['Emotion'].value_counts()

In [None]:
# Keep every row whose label is neither "surprise" nor "love"; the same
# predicate is applied to the training and the validation split.
keep_rows = 'Emotion != "surprise" and Emotion != "love"'
train_df = train_df.query(keep_rows)
val_df = val_df.query(keep_rows)


In [None]:
# Apply the same exclusion of the "surprise" and "love" labels to the test split.
test_df = test_df.query('Emotion != "surprise" and Emotion != "love"')

In [None]:
print(train_df.shape)
print(val_df.shape)

Concatenating the training and validation datasets

In [None]:
# Stack the training rows first and the validation rows second under a fresh
# 0..n-1 index. Later cells slice the derived feature/label arrays by position
# to recover the two splits, so this order matters.
training_dataset = pd.concat([train_df, val_df], ignore_index=True)
training_dataset

In [None]:
training_dataset['Emotion'].value_counts()

In [None]:
training_dataset.isnull().sum()

In [None]:
training_dataset.duplicated().sum()

In [None]:
training_dataset.shape

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

In [None]:
from nltk.stem import SnowballStemmer
snowballstemmer=SnowballStemmer('english')

In [None]:
# Build one cleaned, stemmed string per row of training_dataset (same order).
# A set gives constant-time stop-word checks; the plain list would be scanned
# linearly for every token.
stop_word_set = set(stop_words)

corpus = []
# Iterate the Text column directly: iterrows() builds a Series per row and is
# needlessly slow when only one column is read.
for raw_text in training_dataset['Text']:
    # Letters only -> lower case -> whitespace tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    stems = [snowballstemmer.stem(token) for token in tokens if token not in stop_word_set]
    corpus.append(' '.join(stems))

In [None]:
# Preview a few cleaned documents instead of dumping the whole corpus into the
# cell output.
corpus[:5]

# **Word2Vec**

In [None]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [None]:
# One token list per document, in the same order as `corpus`, so that row i of
# the averaged feature matrix still lines up with label i.
# Splitting each document into sentences first is unsafe here: a document that
# is empty after cleaning yields no sentence at all (its row would vanish) and
# a multi-sentence document would add extra rows, shifting every later row
# against its label. This also matches how `transform_text` tokenises test data.
words = [simple_preprocess(document) for document in corpus]
    

In [None]:
# Preview the first few token lists rather than printing every document.
words[:5]

In [None]:
# Fit Word2Vec on the tokenised corpus: 150-dimensional vectors, a 15-token
# context window and 35 training epochs.
model = Word2Vec(
    sentences=words,
    vector_size=150,
    window=15,
    epochs=35,
)

Viewing the full vocabulary

In [None]:
model.wv.index_to_key

In [None]:
model.corpus_count

In [None]:
model.wv.similar_by_word('good')

In [None]:
model.wv['good'].shape

In [None]:
def avg_word2vec(words):
    """Return the mean Word2Vec embedding of the tokens in ``words``.

    Tokens that are not in the vocabulary of the global ``model`` are skipped.
    If no token is known (or ``words`` is empty) a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known = []
    for token in words:
        if token in model.wv:
            known.append(model.wv[token])
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)


In [None]:
# One averaged embedding per tokenised document, in document order.
X = [avg_word2vec(tokens) for tokens in words]

In [None]:
X = np.array(X)

In [None]:
# training_dataset was built as [train_df, val_df], so the first len(train_df)
# feature rows are the training split. Deriving the boundary avoids a
# hard-coded row count that silently goes stale when the data changes.
n_train = len(train_df)
X_train = X[:n_train]

In [None]:
X_train.shape

In [None]:
X_train

In [None]:
# Everything after the len(train_df) training rows is the validation split
# (training_dataset was concatenated as [train_df, val_df]).
n_train = len(train_df)
X_val = X[n_train:]
X_val.shape

In [None]:
X_val

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the emotion -> integer mapping, then encode the combined
# train+validation label column with it.
emotion_labels = training_dataset['Emotion']
encoder = LabelEncoder()
encoder.fit(emotion_labels)
y = encoder.transform(emotion_labels)
y

In [None]:
# Labels follow the [train_df, val_df] row order of training_dataset; derive
# the split boundary instead of hard-coding the row count.
n_train = len(train_df)
y_train = y[:n_train]
y_train.shape

In [None]:
# Validation labels are the rows that follow the len(train_df) training rows.
n_train = len(train_df)
y_val = y[n_train:]
y_val.shape

# **ML Model Training**

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

LightGBM

In [None]:
# Baseline LightGBM with default hyper-parameters, scored by 5-fold
# cross-validated accuracy on the training split.
lgbm = LGBMClassifier(device='gpu', verbosity=-1)
cv_scores = cross_val_score(
    estimator=lgbm,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {cv_scores.mean()}')

XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
from xgboost import XGBClassifier
# Baseline XGBoost with default hyper-parameters, scored by 5-fold
# cross-validated accuracy on the training split.
# NOTE(review): 'gpu_hist' is the GPU tree method of older XGBoost releases —
# confirm it is still accepted by the installed version.
xgb = XGBClassifier(tree_method='gpu_hist')
cv_scores = cross_val_score(
    estimator=xgb,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {cv_scores.mean()}')

In [None]:
# LabelEncoder.fit_transform already returns a 1-D array, so the 2-D index
# `y[:, 0]` raises IndexError. ravel() yields the flat label vector whether y
# is shaped (n,) or (n, 1).
y = np.ravel(y)
y.shape

In [None]:
y

# **Hyperparameter Tuning**

In [None]:
import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from xgboost import XGBClassifier
def objective(trial: Trial, X, y) -> float:
    """Optuna objective: mean 5-fold CV accuracy of a LightGBM classifier.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature matrix handed to ``cross_val_score``.
        y: Target labels handed to ``cross_val_score``.

    Returns:
        Mean accuracy over the 5 cross-validation folds (to be maximised).
    """
    lgb_params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1.0, log=True),
        "verbosity": -1,
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 20),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM only subsamples rows when the bagging frequency is non-zero;
        # without this the sampled `subsample` value would have no effect.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-7, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-7, 10.0, log=True),
        'device': 'gpu'  # Use GPU
    }

    # Local name only; it does not touch the notebook-level `model`.
    model = LGBMClassifier(**lgb_params)

    return cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

In [None]:
# Maximise the objective's CV accuracy over 100 TPE-sampled trials.
tpe_sampler = TPESampler()
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')
study.optimize(lambda trial: objective(trial, X, y), n_trials=100)


In [None]:
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [None]:
import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from xgboost import XGBClassifier
def objective(trial: Trial, X, y) -> float:
    """Optuna objective: mean 5-fold CV accuracy of an XGBoost classifier.

    This definition replaces the LightGBM objective of the same name defined
    earlier in the notebook.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature matrix handed to ``cross_val_score``.
        y: Integer class labels handed to ``cross_val_score``.

    Returns:
        Mean accuracy over the 5 cross-validation folds (to be maximised).
    """
    param = {
        "verbosity": 0,
        # The target has more than two emotion classes, so declare the
        # multi-class objective rather than a binary one.
        "objective": "multi:softprob",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-6, 20.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-6, 20.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # Tree-specific knobs are only sampled for the tree-based boosters.
    if param["booster"] in ["gbtree", "dart"]:
        param["max_depth"] = trial.suggest_int("max_depth", 2, 12, step=1)
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        # eta is a shrinkage factor documented on [0, 1]; do not sample above 1.
        param["eta"] = trial.suggest_float("eta", 1e-4, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-6, 10.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    # Dropout-related knobs exist only for the DART booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    param["tree_method"] = "gpu_hist"
    model = XGBClassifier(**param)

    return cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()


In [None]:
# Maximise the objective's CV accuracy over 100 TPE-sampled trials.
tpe_sampler = TPESampler()
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')
study.optimize(lambda trial: objective(trial, X, y), n_trials=100)


In [None]:
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

Preprocessing the test dataset to check accuracy

In [None]:
def preprocess_text(text):
    """Clean one raw sentence with the same steps used for the training corpus.

    Non-letters become spaces, the text is lower-cased and split on whitespace,
    stop words are dropped, and the remaining tokens are stemmed and re-joined
    with single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    kept_stems = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        kept_stems.append(snowballstemmer.stem(token))
    return ' '.join(kept_stems)

# Applying preprocessing to test data.
# test_df is the result of an earlier .query() filter, so adding a column by
# item assignment can raise pandas' SettingWithCopyWarning; assign() returns a
# new frame with the extra column instead.
test_df = test_df.assign(Processed_Text=test_df['Text'].apply(preprocess_text))

X_test = test_df['Processed_Text']
y_test = test_df['Emotion']

In [None]:
def transform_text(text, model):
    """Embed one cleaned sentence as the mean of its known word vectors.

    Args:
        text: Sentence to tokenise with ``simple_preprocess``.
        model: Object exposing ``wv`` (token -> vector lookup) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or a
        zero vector of length ``model.vector_size`` when none is known.
    """
    tokens = simple_preprocess(text)
    known = [model.wv[token] for token in tokens if token in model.wv]
    return np.mean(known, axis=0) if known else np.zeros(model.vector_size)

# Convert text to vectors
X_test_vectors = np.array([transform_text(text, model) for text in X_test])

In [None]:
# Fixed LightGBM configuration (presumably the best Optuna trial — verify
# against the study output), trained on the training split.
params = dict(
    learning_rate=0.17457696342022835,
    n_estimators=453,
    num_leaves=42,
    verbosity=-1,
    min_child_samples=14,
    max_depth=6,
    subsample=0.5989313145956421,
    colsample_bytree=0.9815244345748503,
    reg_alpha=0.015501257178977755,
    reg_lambda=1.5344414752285148e-07,
    objective='multiclass',
    metric='multi_logloss',
    device='gpu',
)

lgbm = LGBMClassifier(**params)
lgbm.fit(X_train, y_train)


In [None]:
y_pred = lgbm.predict(X_test_vectors)

In [None]:
# Encode the string test labels with the fitted encoder, then score the
# current predictions against them.
y_test_encoded = encoder.transform(y_test)
accuracy = accuracy_score(y_true=y_test_encoded, y_pred=y_pred)
print(f"Test set accuracy: {accuracy}")

In [None]:
# Fixed XGBoost configuration (presumably the best Optuna trial — verify
# against the study output), grouped by what each setting controls.
booster_cfg = {
    'booster': 'gbtree',
    'max_depth': 8,
    'min_child_weight': 5,
    'eta': 0.1912552566038664,
    'grow_policy': 'depthwise',
    'tree_method': 'gpu_hist',
}
regularisation_cfg = {
    'lambda': 3.9793910587322068,
    'alpha': 0.0004840911953306867,
    'gamma': 0.0006355355712833726,
}
sampling_cfg = {
    'subsample': 0.8907507700341791,
    'colsample_bytree': 0.8214397498178605,
}
params = {**booster_cfg, **regularisation_cfg, **sampling_cfg}

xgb = XGBClassifier(**params)
xgb.fit(X_train, y_train)


In [None]:
y_pred = xgb.predict(X_test_vectors)

In [None]:
# Encode the string test labels with the fitted encoder, then score the
# current predictions against them.
y_test_encoded = encoder.transform(y_test)
accuracy = accuracy_score(y_true=y_test_encoded, y_pred=y_pred)
print(f"Test set accuracy: {accuracy}")

# Model Training with RNN

In [None]:
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()


In [None]:
# Build the word index from the cleaned train+validation corpus only; the test
# split is not part of `corpus`.
tokenizer.fit_on_texts(corpus)

In [None]:
# Replace every word of every document by its integer index.
sequences = tokenizer.texts_to_sequences(corpus)
# Preview a few encoded documents instead of printing all of them.
sequences[:5]

In [None]:
# Length of the longest encoded document; used as the padding target.
sequence_lengths = list(map(len, sequences))
maxlen = np.max(sequence_lengths)
maxlen

In [None]:
# Pad every encoded document to the length of the longest one.
# NOTE: this rebinds X, which previously held the averaged Word2Vec features
# used by the tree models.
X = pad_sequences(sequences, maxlen=maxlen)

In [None]:
# Rows follow the [train_df, val_df] order of training_dataset; derive the
# boundary from the training frame instead of hard-coding its row count.
n_train = len(train_df)
X_train = X[:n_train]
X_val = X[n_train:]

In [None]:
X_train.shape

In [None]:
from sklearn.preprocessing import LabelEncoder
# Re-create and re-fit the label encoder on the same column that was encoded in
# the Word2Vec section; this rebinds both `encoder` and `y`.
encoder = LabelEncoder()
y=encoder.fit_transform(training_dataset['Emotion'])
y

In [None]:
# Labels follow the [train_df, val_df] order of training_dataset; derive the
# boundary from the training frame instead of hard-coding its row count.
n_train = len(train_df)
y_train = y[:n_train]
y_val = y[n_train:]

In [None]:
from collections import Counter

# Flatten the padded sequences into a single list of token ids
all_tokens = []
for sequence in X:
    all_tokens.extend(sequence)

# Finding unique tokens
unique_tokens = set(all_tokens)
print(f"Number of unique tokens: {len(unique_tokens)}")


In [None]:
from tensorflow.keras.optimizers import Adam
# Size the network from the fitted preprocessing objects instead of hard-coded
# numbers, so it stays valid if the corpus, vocabulary or label set changes.
vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1 (0 is the padding value)
num_classes = len(encoder.classes_)

# NOTE: this rebinds `model`, which until now referred to the Word2Vec model.
model = Sequential()
# The inputs were padded to `maxlen`, so the embedding must expect that length.
model.add(Embedding(input_dim=vocab_size, output_dim=150, input_length=int(maxlen)))
model.add(SimpleRNN(32, return_sequences=False))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

In [None]:
# Stop once validation accuracy has not improved for 5 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    restore_best_weights=True
)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# The callback must be passed to fit(); merely creating it has no effect.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

In [None]:
# Training vs. validation accuracy per epoch; labelled so the figure can be
# read on its own.
plt.plot(history.history['accuracy'],color='red',label='train')
plt.plot(history.history['val_accuracy'],color='blue',label='validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('RNN accuracy per epoch')
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch; labelled so the figure can be read
# on its own.
plt.plot(history.history['loss'],color='red',label='train')
plt.plot(history.history['val_loss'],color='blue',label='validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('RNN loss per epoch')
plt.legend()
plt.show()

Preprocessing the test dataset to check accuracy

In [None]:
def preprocess_text(text):
    """Return ``text`` reduced to lower-cased, stop-word-free, stemmed tokens.

    Every non-letter is replaced by a space before splitting; the surviving
    stems are joined with single spaces. This repeats the definition given
    earlier in the notebook.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(snowballstemmer.stem(token) for token in tokens if token not in stop_words)

# Applying preprocessing to test data.
# test_df is the result of an earlier .query() filter, so adding a column by
# item assignment can raise pandas' SettingWithCopyWarning; assign() returns a
# new frame with the extra column instead.
test_df = test_df.assign(Processed_Text=test_df['Text'].apply(preprocess_text))

X_test = test_df['Processed_Text']
y_test = test_df['Emotion']

In [None]:
# Encode the cleaned test sentences with the tokenizer fitted on the training
# corpus and pad them to the same length as the training inputs.
processed_test_text = test_df['Processed_Text']
test_sequences = tokenizer.texts_to_sequences(processed_test_text)
X_test = pad_sequences(sequences=test_sequences, maxlen=maxlen)

# Class probabilities, one row per test sentence.
y_pred = model.predict(x=X_test)

In [None]:
# Take the most probable class per row and compare it with the encoded labels.
y_pred_labels = y_pred.argmax(axis=1)
y_test_encoded = encoder.transform(test_df['Emotion'])
accuracy = accuracy_score(y_true=y_test_encoded, y_pred=y_pred_labels)
print(f"Test set accuracy: {accuracy}")

# **Saving the Trained Model and Label Encoder**

In [None]:
import pickle
# Persist the trained network, then pickle the two fitted preprocessing
# objects needed to reproduce its inputs and decode its outputs.
model.save('emotion_classification_rnn.h5')
for artefact, target_path in ((tokenizer, 'tokenizer.pkl'), (encoder, 'label_encoder.pkl')):
    with open(target_path, 'wb') as handle:
        pickle.dump(artefact, handle)