In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
import re

class EmotionClassifier:
    """Stacked-LSTM text classifier that maps a sentence to an emotion label.

    The tokenizer, label encoder and Keras model are all created by
    :meth:`train`; until then they are ``None`` and :meth:`predict` refuses
    to run.
    """

    # Matches every character that is neither an ASCII letter nor whitespace.
    # Compiled once so it is not rebuilt for every row of the dataset.
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, max_words=10000, max_len=100):
        """Store the hyper-parameters; no model is built yet.

        Args:
            max_words: Passed to the tokenizer as ``num_words`` and used as
                the embedding layer's input dimension.
            max_len: Length every token sequence is padded/truncated to.
        """
        self.max_words = max_words
        self.max_len = max_len
        self.tokenizer = None
        self.model = None
        self.label_encoder = None

    def preprocess_text(self, text):
        """Normalise one raw text value into a cleaned, lowercase string.

        Missing values (anything ``pd.isna`` reports as NA) become ``""``.
        Otherwise the value is stringified, lowercased, stripped of every
        character that is not an ASCII letter or whitespace, and runs of
        whitespace are collapsed to single spaces.
        """
        if pd.isna(text):
            return ""
        lowered = str(text).lower()
        letters_only = self._NON_LETTER_RE.sub('', lowered)
        # split()/join collapses repeated whitespace and trims both ends.
        return ' '.join(letters_only.split())

    def create_model(self, num_classes):
        """Build and compile the network.

        Args:
            num_classes: Width of the final softmax layer.

        Returns:
            A compiled ``Sequential`` model (Adam optimizer, categorical
            cross-entropy loss, accuracy metric).
        """
        model = Sequential([
            # NOTE(review): newer Keras releases reportedly deprecate
            # `input_length`; confirm against the installed version before
            # removing it.
            Embedding(self.max_words, 128, input_length=self.max_len),
            SpatialDropout1D(0.2),
            LSTM(128, return_sequences=True),
            LSTM(64),
            Dense(64, activation='relu'),
            Dropout(0.5),
            Dense(num_classes, activation='softmax')
        ])
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    def train(self, data_path, epochs=10, batch_size=32, validation_split=0.2):
        """Train the classifier from a CSV file.

        The CSV must contain a ``sentence`` column (input text) and an
        ``emotion`` column (label).

        Args:
            data_path: Path handed to ``pd.read_csv``.
            epochs: Maximum number of training epochs (early stopping on
                ``val_loss`` with patience 3 may end training sooner).
            batch_size: Mini-batch size passed to ``model.fit``.
            validation_split: Fraction of rows held out for validation.

        Returns:
            The history object returned by ``model.fit``.

        Raises:
            KeyError: If a required column is missing from the CSV.
            Exception: Any other error is reported on stdout and re-raised.
        """
        try:
            # Load data
            print("Loading data...")
            df = pd.read_csv(data_path)

            # Fail early with a descriptive message rather than a bare
            # KeyError from the first column lookup.
            missing = sorted({'sentence', 'emotion'} - set(df.columns))
            if missing:
                raise KeyError(
                    f"Required column(s) missing from {data_path!r}: {missing}"
                )

            # Preprocess texts
            print("Preprocessing texts...")
            X = df['sentence'].apply(self.preprocess_text)

            # Encode labels as integers, then one-hot for the softmax output
            print("Encoding labels...")
            self.label_encoder = LabelEncoder()
            y = self.label_encoder.fit_transform(df['emotion'])
            y = tf.keras.utils.to_categorical(y)

            # Split data
            print("Splitting data...")
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=validation_split, random_state=42
            )

            # Fit the vocabulary on the training split only so validation
            # text does not leak into it.
            print("Tokenizing texts...")
            self.tokenizer = Tokenizer(num_words=self.max_words)
            self.tokenizer.fit_on_texts(X_train)

            # Convert texts to sequences
            X_train_seq = self.tokenizer.texts_to_sequences(X_train)
            X_val_seq = self.tokenizer.texts_to_sequences(X_val)

            # Pad sequences
            print("Padding sequences...")
            X_train_pad = pad_sequences(X_train_seq, maxlen=self.max_len)
            X_val_pad = pad_sequences(X_val_seq, maxlen=self.max_len)

            # Create and train model
            print("Creating model...")
            self.model = self.create_model(len(self.label_encoder.classes_))

            # Add early stopping
            early_stopping = tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=3,
                restore_best_weights=True
            )

            # Train model
            print("Training model...")
            history = self.model.fit(
                X_train_pad, y_train,
                validation_data=(X_val_pad, y_val),
                epochs=epochs,
                batch_size=batch_size,
                callbacks=[early_stopping]
            )

            # Evaluate model
            val_loss, val_accuracy = self.model.evaluate(X_val_pad, y_val)
            print(f"\nValidation accuracy: {val_accuracy*100:.2f}%")

            return history

        except Exception as e:
            print(f"An error occurred during training: {str(e)}")
            raise

    def predict(self, text):
        """Predict the emotion of a single piece of text.

        Args:
            text: Raw input text; it is cleaned with :meth:`preprocess_text`.

        Returns:
            A ``(label, confidence)`` tuple where ``label`` is the decoded
            class and ``confidence`` is the highest softmax output expressed
            as a percentage (builtin ``float``).

        Raises:
            RuntimeError: If called before :meth:`train` has populated the
                tokenizer, model and label encoder.
            Exception: Any prediction error is reported on stdout and
                re-raised.
        """
        # Guard against use-before-train; otherwise the failure surfaces as
        # an opaque AttributeError on None.
        if self.tokenizer is None or self.model is None or self.label_encoder is None:
            raise RuntimeError(
                "EmotionClassifier is not trained; call train() before predict()."
            )

        try:
            # Preprocess text
            processed_text = self.preprocess_text(text)

            # Convert to sequence
            sequence = self.tokenizer.texts_to_sequences([processed_text])

            # Pad sequence
            padded = pad_sequences(sequence, maxlen=self.max_len)

            # verbose=0 suppresses the per-call progress bar, which would
            # otherwise be printed for every single sentence.
            prediction = self.model.predict(padded, verbose=0)
            predicted_class = self.label_encoder.inverse_transform([np.argmax(prediction)])

            # Highest class probability as a percentage.
            probability = float(np.max(prediction) * 100)

            return predicted_class[0], probability

        except Exception as e:
            print(f"An error occurred during prediction: {str(e)}")
            raise

# Usage example
def main(data_path=r"C:\Users\soumy\Downloads\Sentimental Analysis\archive\combined_emotion.csv"):
    """Train an EmotionClassifier and print predictions for sample sentences.

    Args:
        data_path: CSV file handed to ``EmotionClassifier.train``. Defaults
            to the original hard-coded dataset location.

    Any exception raised while training or predicting is reported on stdout
    instead of propagating.
    """
    # Initialize classifier
    classifier = EmotionClassifier(max_words=10000, max_len=100)

    try:
        # Train model
        classifier.train(
            data_path=data_path,
            epochs=10,
            batch_size=32
        )

        # Example predictions. Every entry ends with a comma: a missing one
        # makes Python silently concatenate adjacent string literals.
        test_sentences = [
            "I am so happy today!",
            "I am anxious",
            "This is really frustrating",
            "This made me feel very sad",
            "I'm really angry about what happened",
            "I'm feeling quite scared right now",
            "I finally got my dream job!",
            "I told you not to touch my things!",
            "I think someone is following me.",
            "I can’t believe I won the lottery!",
        ]

        print("\nTesting predictions:")
        for sentence in test_sentences:
            emotion, confidence = classifier.predict(sentence)
            print(f"\nText: {sentence}")
            print(f"Predicted emotion: {emotion}")
            print(f"Confidence: {confidence:.2f}%")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Loading data...
Preprocessing texts...
Encoding labels...
Splitting data...
Tokenizing texts...
Padding sequences...
Creating model...
Training model...




Epoch 1/10
10569/10569 ━━━━━━━━━━━━━━━━━━━━ 1992s 187ms/step - accuracy: 0.8303 - loss: 0.4055 - val_accuracy: 0.9377 - val_loss: 0.1005
Epoch 2/10
10569/10569 ━━━━━━━━━━━━━━━━━━━━ 1548s 144ms/step - accuracy: 0.9394 - loss: 0.0979 - val_accuracy: 0.9412 - val_loss: 0.0920
Epoch 3/10
10569/10569 ━━━━━━━━━━━━━━━━━━━━ 2019s 191ms/step - accuracy: 0.9413 - loss: 0.0907 - val_accuracy: 0.9418 - val_loss: 0.0897
Epoch 4/10
10569/10569 ━━━━━━━━━━━━━━━━━━━━ 2505s 237ms/step - accuracy: 0.9424 - loss: 0.0867 - val_accuracy: 0.9417 - val_loss: 0.0895
Epoch 5/10
10569/10569 ━━━━━━━━━━━━━━━━━━━━ 1722s 163ms/step - accuracy: 0.9419 - loss: 0.0859 - val_accuracy: 0.9418 - val_loss: 0.0933
Epoch 6/10
[1m10569/10569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1713s[0m 158ms/step - accuracy: 0.9441 - loss: 0.0834 - val_accuracy: 