In [1]:
import pandas as pd
import re
import nltk
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora that the text-cleaning step relies on:
# the stop-word lists and the WordNet data used by the lemmatizer.
for _nltk_resource in ("stopwords", "wordnet"):
    nltk.download(_nltk_resource)

# Read the dataset from the current working directory. The rest of the
# script reads its 'text' and 'Emotion' columns.
df = pd.read_csv("emotion_sentimen_dataset.csv")

# Text cleaning function
_CLEANING_TOOLS = None  # lazily-built (stop-word set, lemmatizer) pair


def _get_cleaning_tools():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it once.

    The pair is built on first use rather than at definition time, so the
    NLTK corpora are only read when cleaning actually runs.
    """
    global _CLEANING_TOOLS
    if _CLEANING_TOOLS is None:
        _CLEANING_TOOLS = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _CLEANING_TOOLS


def clean_text(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps, in order: strip URLs, replace every non-letter character with a
    space, lowercase, split on whitespace, drop English stop words, and
    lemmatize the remaining tokens.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as an empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives cleaning or when *text* is not a string.
    """
    if not isinstance(text, str):
        # Missing values would otherwise make re.sub raise TypeError.
        return ''

    # Build the stop-word set and lemmatizer once, not once per token/call.
    stop_words, lemmatizer = _get_cleaning_tools()

    # NOTE: the URL pattern runs before lowercasing, so it only matches
    # URLs whose scheme is written in lowercase ("http...").
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters and numbers
    tokens = text.lower().split()  # Lowercase, then tokenize on whitespace

    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Run every raw document in the 'text' column through clean_text and keep
# the result alongside the original in a new column.
df['cleaned_text'] = df['text'].apply(clean_text)

# Convert the 'Emotion' labels into integer class ids; the fitted encoder
# is kept so the ids can be mapped back to label names later.
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['Emotion'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['encoded_label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features over unigrams and bigrams, capped at 5000 features.
# The vocabulary is learned from the training split only and then reused
# unchanged to transform the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a logistic-regression classifier (default settings) on the
# training features; fit() returns the estimator itself.
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Save model, vectorizer, and label encoder.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, instead of being left open until garbage
# collection (which can leave truncated or locked files behind).
for _artifact_path, _artifact in (
    ("vectorizer.pkl", vectorizer),
    ("model.pkl", lr_model),
    ("label_encoder.pkl", label_encoder),
):
    with open(_artifact_path, "wb") as _artifact_file:
        pickle.dump(_artifact, _artifact_file)

print("Model training complete and files saved!")



[nltk_data] Downloading package stopwords to C:\Users\Rhea
[nltk_data]     Dmello\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Rhea
[nltk_data]     Dmello\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Model training complete and files saved!
