In [None]:
import pandas as pd
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset

# Make sure the NLTK data used further down is installed: word_tokenize relies
# on the Punkt tokenizer models and WordNetLemmatizer on the WordNet corpus.
# Without them the first tokenize/lemmatize call raises LookupError on a fresh
# environment. "punkt_tab" is the name recent NLTK releases look up instead of
# "punkt"; "omw-1.4" is requested by some NLTK versions when WordNet loads.
for _resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(_resource, quiet=True)

# Loading Dataset
# Only the "train" split is loaded; it is re-split 80/20 with a fixed seed, and
# the held-out part (exposed under the key 'test') is used as validation data.
dataset = load_dataset("dair-ai/emotion", split="train").train_test_split(test_size=0.2, seed=42)
df_train = pd.DataFrame(dataset['train'])
df_val = pd.DataFrame(dataset['test'])

# Single lemmatizer instance shared by every clean_and_lemmatize call.
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize(text):
    """Normalize one raw text string and return it as lemmatized words.

    The text is lowercased, then URLs, @mentions, #hashtags and every
    character that is neither an ASCII letter nor whitespace are removed,
    and runs of whitespace are collapsed to a single space. The result is
    split with ``word_tokenize``, each token is passed to the module-level
    ``lemmatizer`` (no part-of-speech tag is supplied), and the lemmas are
    joined back together with single spaces.
    """
    # Applied strictly in this order: URLs and handles must be dropped while
    # their punctuation is still present, before the letters-only filter runs.
    substitutions = (
        (r"http\S+|www\S+", ''),
        (r'@\w+|#\w+', ''),
        (r"[^a-zA-Z\s]", ''),
        (r"\s+", ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    words = word_tokenize(cleaned.strip())
    return ' '.join(map(lemmatizer.lemmatize, words))

# Add a 'clean_text' column to both frames by running the same cleaning
# function over their raw 'text' column (training frame first, then validation).
df_train['clean_text'], df_val['clean_text'] = (
    frame['text'].apply(clean_and_lemmatize) for frame in (df_train, df_val)
)

# Label encoding
# The encoder learns its classes from the training labels only; the same
# fitted mapping is then applied to both the training and validation labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df_train['label'])
y_train = le.transform(df_train['label'])
y_val = le.transform(df_val['label'])

# Tokenization + Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Named once so the training and validation pipelines cannot drift apart and
# so later code can reuse the same values instead of repeating literals.
VOCAB_SIZE = 10000          # passed to Tokenizer as num_words
MAX_SEQUENCE_LENGTH = 100   # passed to pad_sequences as maxlen

# The vocabulary is fitted on the training text only; words outside it are
# represented by the '<OOV>' token when texts are converted to sequences.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(df_train['clean_text'])

# Convert each cleaned text to a list of word indices, then pad/truncate every
# list to the same length (pad_sequences defaults are left unchanged).
X_train = pad_sequences(
    tokenizer.texts_to_sequences(df_train['clean_text']),
    maxlen=MAX_SEQUENCE_LENGTH,
)
X_val = pad_sequences(
    tokenizer.texts_to_sequences(df_val['clean_text']),
    maxlen=MAX_SEQUENCE_LENGTH,
)