In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import warnings
warnings.filterwarnings('ignore')

# Download necessary NLTK resources.
# 'punkt_tab' is the tokenizer data word_tokenize looks up on NLTK >= 3.9;
# 'punkt' is kept so older NLTK installations keep working.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Text preprocessing function
_STOP_WORDS = None  # English stop-word set, built on first use and then reused


def preprocess_text(text):
    """Normalize one poem into a space-separated string of content tokens.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, the result is tokenized with NLTK's
    ``word_tokenize`` and English stop words are dropped.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents an empty
            CSV cell) yield an empty string; any other non-string value is
            converted with ``str``.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Loading the stop-word list once avoids re-reading it for every row
        # when this function is applied to a whole dataframe column.
        _STOP_WORDS = frozenset(stopwords.words('english'))

    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = [word for word in word_tokenize(text) if word not in _STOP_WORDS]
    return ' '.join(tokens)

# Load your dataset
def load_dataset(file_path):
    """Load the poem CSV, preprocess the text and encode the topic labels.

    Args:
        file_path: Path to a CSV file with 'poem' and 'topic' columns.

    Returns:
        tuple: ``(df, le)`` where ``df`` is the loaded dataframe with two added
        columns ('processed_poem' and integer 'topic_id') and ``le`` is the
        fitted ``LabelEncoder`` that maps 'topic' values to 'topic_id'.

    Raises:
        KeyError: If the 'poem' or 'topic' column is missing.
    """
    # Assuming your dataset is a CSV with 'poem' and 'topic' columns
    df = pd.read_csv(file_path)

    # Rows missing either the text or the label are unusable for supervised
    # training, so drop them up front instead of failing further down.
    df = df.dropna(subset=['poem', 'topic']).reset_index(drop=True)

    # Preprocess the text
    df['processed_poem'] = df['poem'].apply(preprocess_text)

    # Encode the labels
    le = LabelEncoder()
    df['topic_id'] = le.fit_transform(df['topic'])

    return df, le

# Split dataset
def split_data(df, test_size=0.2, val_size=0.1, random_state=42, stratify=False):
    """Split the dataframe into train / validation / test portions.

    Args:
        df: Dataframe with 'processed_poem' and 'topic_id' columns.
        test_size: Fraction of all rows used for the test split.
        val_size: Fraction of all rows used for the validation split.
        random_state: Seed passed to both ``train_test_split`` calls.
        stratify: When True, both splits are stratified on the label so each
            portion keeps the class proportions. Defaults to False.

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If the fractions are not in (0, 1) or leave no training data.
    """
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must be fractions in (0, 1) whose sum is "
            f"below 1; got test_size={test_size}, val_size={val_size}"
        )

    features, labels = df['processed_poem'], df['topic_id']
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state,
        stratify=labels if stratify else None,
    )

    # Further split train into train and validation. val_size is expressed
    # relative to the full dataset, so rescale it to the remaining rows.
    val_ratio = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_ratio, random_state=random_state,
        stratify=y_train_val if stratify else None,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

###### 1. TF-IDF + SVM ######
def train_tfidf_svm(X_train, y_train, X_test, y_test):
    """Fit a TF-IDF + linear SVM pipeline and score it on the test split.

    Args:
        X_train: Training texts.
        y_train: Training labels.
        X_test: Test texts.
        y_test: Test labels.

    Returns:
        tuple: ``(pipeline, accuracy, report)`` - the fitted pipeline, the test
        accuracy and the classification report for the test predictions.
    """
    # Vectorizer capped at 5000 terms feeding a probability-enabled linear SVM.
    steps = [
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('svm', SVC(kernel='linear', probability=True)),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)

    predicted = model.predict(X_test)
    return (
        model,
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted),
    )

###### 2. LDA (Topic Modeling) ######
def train_lda(X_train, y_train, X_test, y_test, n_topics=5):
    """Classify poems from their LDA topic mixtures with a linear SVM.

    Args:
        X_train: Training texts.
        y_train: Training labels.
        X_test: Test texts.
        y_test: Test labels.
        n_topics: Number of LDA components.

    Returns:
        tuple: ``((vectorizer, lda, svm), accuracy, report)`` - the three fitted
        estimators, the test accuracy and the classification report.
    """
    # Bag-of-words counts -> per-document topic distribution.
    bow = CountVectorizer(max_features=5000)
    topic_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    train_topics = topic_model.fit_transform(bow.fit_transform(X_train))
    test_topics = topic_model.transform(bow.transform(X_test))

    # The topic distributions are the only features the classifier sees.
    classifier = SVC(kernel='linear')
    classifier.fit(train_topics, y_train)

    predicted = classifier.predict(test_topics)
    test_accuracy = accuracy_score(y_test, predicted)
    test_report = classification_report(y_test, predicted)

    return (bow, topic_model, classifier), test_accuracy, test_report

###### 3. LSTM ######
class PoemDataset(Dataset):
    """Torch dataset that tokenizes one poem per item on access.

    ``texts`` and ``labels`` are read positionally through ``.iloc``, so both
    are expected to be pandas objects of equal length. ``tokenizer`` must
    provide ``encode_plus``; every item is padded/truncated to ``max_len``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' tensors."""
        poem = str(self.texts.iloc[idx])
        target = self.labels.iloc[idx]

        encoded = self.tokenizer.encode_plus(
            poem,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch dimension; flatten it
        # away so the DataLoader can stack items itself.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

class LSTMClassifier(nn.Module):
    """Embedding -> (optionally bidirectional) multi-layer LSTM -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output classes (logits).
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions. When True the
            final forward and backward states are concatenated before the
            linear head.
        dropout: Dropout probability used on the embeddings, between LSTM
            layers and on the final hidden state.
        pad_idx: Token id treated as padding by the embedding layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()

        self.bidirectional = bool(bidirectional)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=self.bidirectional,
                            # Inter-layer dropout has no effect with one layer
                            # and only triggers a warning there.
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim * (2 if self.bidirectional else 1), output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths=None):
        """Return class logits of shape [batch_size, output_dim].

        Args:
            text: Token ids of shape [batch_size, seq_len].
            text_lengths: Optional true (unpadded) length of every sequence.
                When given, the sequences are packed so the final hidden state
                is taken at the last real token instead of after the padding.
        """
        # embedded shape: [batch_size, seq_len, embedding_dim]
        embedded = self.dropout(self.embedding(text))

        if text_lengths is not None:
            # Packing needs CPU lengths >= 1; empty sequences are treated as
            # a single (padding) step.
            lengths = torch.as_tensor(text_lengths).clamp(min=1).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )

        _, (hidden, _) = self.lstm(embedded)

        # hidden is [n_layers * n_directions, batch_size, hidden_dim]; the last
        # entries belong to the top layer (forward, then backward).
        if self.bidirectional:
            last_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            last_hidden = hidden[-1]
        return self.fc(self.dropout(last_hidden))

def train_lstm(X_train, y_train, X_val, y_val, X_test, y_test, n_classes=5):
    """Train the word-level LSTM classifier and evaluate it on the test split.

    A whitespace-token vocabulary is built from ``X_train``; each text becomes
    a sequence of 100 token ids (truncated or padded with 0). The model is
    trained for 10 epochs with Adam and cross-entropy, the weights with the
    lowest validation loss are saved to 'lstm_model.pt' and reloaded for the
    test evaluation.

    Args:
        X_train, X_val, X_test: Iterables of preprocessed text strings.
        y_train, y_val, y_test: Integer labels; read through ``.values``.
        n_classes: Number of output classes.

    Returns:
        tuple: ``(model, test_acc, report)`` - the trained model, test accuracy
        as a fraction and the classification report for the test predictions.
    """
    pad_idx = 0
    checkpoint_path = 'lstm_model.pt'

    # Simple tokenizer for LSTM
    def tokenizer(text):
        return text.split()

    # Create vocabulary. Sorting makes the word -> index mapping reproducible:
    # a set of strings iterates in hash order, which differs between
    # interpreter runs, so an unsorted vocabulary would not line up with the
    # embedding rows stored in a checkpoint from another run.
    vocab = set()
    for text in X_train:
        vocab.update(tokenizer(text))
    word_to_idx = {word: i for i, word in enumerate(sorted(vocab), start=1)}
    word_to_idx['<PAD>'] = pad_idx

    # Convert texts to sequences; words outside the vocabulary share the pad id.
    def text_to_sequence(text, max_len=100):
        tokens = tokenizer(text)[:max_len]
        seq = [word_to_idx.get(word, pad_idx) for word in tokens]
        return seq + [pad_idx] * (max_len - len(seq))

    def to_dataset(texts, labels):
        ids = torch.tensor([text_to_sequence(text) for text in texts], dtype=torch.long)
        targets = torch.tensor(labels.values, dtype=torch.long)
        return torch.utils.data.TensorDataset(ids, targets)

    # Create data loaders
    train_data = to_dataset(X_train, y_train)
    val_data = to_dataset(X_val, y_val)
    test_data = to_dataset(X_test, y_test)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=64)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=64)

    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocab_size = len(word_to_idx)
    embedding_dim = 300
    hidden_dim = 512
    output_dim = n_classes
    n_layers = 5
    bidirectional = False
    dropout = 0.4

    model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                           bidirectional, dropout, pad_idx)
    model = model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    def evaluate(loader, n_samples):
        """Return (mean batch loss, accuracy, predicted labels) for ``loader``."""
        model.eval()
        total_loss, correct, predicted = 0.0, 0, []
        with torch.no_grad():
            for text, labels in loader:
                text, labels = text.to(device), labels.to(device)
                predictions = model(text, None)
                total_loss += criterion(predictions, labels).item()
                preds = predictions.argmax(dim=1)
                correct += (preds == labels).sum().item()
                predicted.extend(preds.cpu().numpy())
        return total_loss / len(loader), correct / n_samples, predicted

    # Training loop
    epochs = 10
    best_val_loss = float('inf')
    saved_checkpoint = False

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for text, labels in train_loader:
            text, labels = text.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(text, None)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation
        val_loss, val_acc, _ = evaluate(val_loader, len(val_data))

        print(f'Epoch: {epoch+1}')
        print(f'\tTrain Loss: {train_loss / len(train_loader):.3f}')
        print(f'\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}%')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            saved_checkpoint = True

    # Test with the best-validation weights. If no epoch ever improved (e.g.
    # the validation loss was NaN), keep the current weights rather than
    # loading a stale file left behind by a previous run.
    if saved_checkpoint:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_acc, y_pred = evaluate(test_loader, len(test_data))

    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
    report = classification_report(y_test.values, y_pred)

    return model, test_acc, report

###### 4. BiLSTM + Attention ######
class AttentionLayer(nn.Module):
    """Additive attention pooling over the time dimension.

    A single linear layer scores every time step; the softmax of the scores
    weights the sum of the LSTM outputs.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_output, mask=None):
        """Pool ``lstm_output`` into one context vector per sequence.

        Args:
            lstm_output: Tensor of shape [batch_size, seq_len, hidden_dim].
            mask: Optional [batch_size, seq_len] tensor, truthy at positions
                that may receive attention. Rows without any truthy position
                fall back to attending over every position.

        Returns:
            Tensor of shape [batch_size, hidden_dim].
        """
        # scores shape: [batch_size, seq_len, 1]
        scores = self.attention(lstm_output)
        if mask is not None:
            mask = mask.to(torch.bool)
            # A fully masked row would make the softmax NaN; unmask it instead.
            mask = mask | ~mask.any(dim=1, keepdim=True)
            scores = scores.masked_fill(~mask.unsqueeze(-1), float('-inf'))
        attention_weights = torch.softmax(scores, dim=1)
        # context_vector shape: [batch_size, hidden_dim]
        return torch.sum(attention_weights * lstm_output, dim=1)


class BiLSTMAttentionClassifier(nn.Module):
    """Embedding -> bidirectional LSTM -> attention pooling -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output classes (logits).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability on the embeddings and between LSTM layers.
        pad_idx: Token id treated as padding.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=True,
                            # Inter-layer dropout has no effect with one layer
                            # and only triggers a warning there.
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)
        self.attention = AttentionLayer(hidden_dim * 2)  # *2 for bidirectional
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, mask=None):
        """Return class logits of shape [batch_size, output_dim].

        Args:
            text: Token ids of shape [batch_size, seq_len].
            mask: Optional [batch_size, seq_len] tensor, truthy at real tokens.
                When omitted it is derived from the embedding's padding index
                so attention weight is not spent on padding.
        """
        if mask is None and self.embedding.padding_idx is not None:
            mask = text != self.embedding.padding_idx

        # embedded shape: [batch_size, seq_len, embedding_dim]
        embedded = self.dropout(self.embedding(text))

        # lstm_output shape: [batch_size, seq_len, hidden_dim*2]
        lstm_output, _ = self.lstm(embedded)

        # attention_output shape: [batch_size, hidden_dim*2]
        attention_output = self.attention(lstm_output, mask)

        return self.fc(attention_output)

def train_bilstm_attention(X_train, y_train, X_val, y_val, X_test, y_test, n_classes=5):
    """Train the BiLSTM + attention classifier and evaluate it on the test split.

    A whitespace-token vocabulary is built from ``X_train``; each text becomes
    a sequence of 100 token ids (truncated or padded with 0). The model is
    trained for 10 epochs with Adam and cross-entropy, the weights with the
    lowest validation loss are saved to 'bilstm_attention_model.pt' and
    reloaded for the test evaluation.

    Args:
        X_train, X_val, X_test: Iterables of preprocessed text strings.
        y_train, y_val, y_test: Integer labels; read through ``.values``.
        n_classes: Number of output classes.

    Returns:
        tuple: ``(model, test_acc, report)`` - the trained model, test accuracy
        as a fraction and the classification report for the test predictions.
    """
    pad_idx = 0
    checkpoint_path = 'bilstm_attention_model.pt'

    # Simple tokenizer for BiLSTM
    def tokenizer(text):
        return text.split()

    # Create vocabulary. Sorting makes the word -> index mapping reproducible:
    # a set of strings iterates in hash order, which differs between
    # interpreter runs, so an unsorted vocabulary would not line up with the
    # embedding rows stored in a checkpoint from another run.
    vocab = set()
    for text in X_train:
        vocab.update(tokenizer(text))
    word_to_idx = {word: i for i, word in enumerate(sorted(vocab), start=1)}
    word_to_idx['<PAD>'] = pad_idx

    # Convert texts to sequences; words outside the vocabulary share the pad id.
    def text_to_sequence(text, max_len=100):
        tokens = tokenizer(text)[:max_len]
        seq = [word_to_idx.get(word, pad_idx) for word in tokens]
        return seq + [pad_idx] * (max_len - len(seq))

    def to_dataset(texts, labels):
        ids = torch.tensor([text_to_sequence(text) for text in texts], dtype=torch.long)
        targets = torch.tensor(labels.values, dtype=torch.long)
        return torch.utils.data.TensorDataset(ids, targets)

    # Create data loaders
    train_data = to_dataset(X_train, y_train)
    val_data = to_dataset(X_val, y_val)
    test_data = to_dataset(X_test, y_test)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=64)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=64)

    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocab_size = len(word_to_idx)
    embedding_dim = 300
    hidden_dim = 512
    output_dim = n_classes
    n_layers = 7
    dropout = 0.4

    model = BiLSTMAttentionClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                                      dropout, pad_idx)
    model = model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    def evaluate(loader, n_samples):
        """Return (mean batch loss, accuracy, predicted labels) for ``loader``."""
        model.eval()
        total_loss, correct, predicted = 0.0, 0, []
        with torch.no_grad():
            for text, labels in loader:
                text, labels = text.to(device), labels.to(device)
                predictions = model(text)
                total_loss += criterion(predictions, labels).item()
                preds = predictions.argmax(dim=1)
                correct += (preds == labels).sum().item()
                predicted.extend(preds.cpu().numpy())
        return total_loss / len(loader), correct / n_samples, predicted

    # Training loop
    epochs = 10
    best_val_loss = float('inf')
    saved_checkpoint = False

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for text, labels in train_loader:
            text, labels = text.to(device), labels.to(device)

            optimizer.zero_grad()
            predictions = model(text)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation
        val_loss, val_acc, _ = evaluate(val_loader, len(val_data))

        print(f'Epoch: {epoch+1}')
        print(f'\tTrain Loss: {train_loss / len(train_loader):.3f}')
        print(f'\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}%')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            saved_checkpoint = True

    # Test with the best-validation weights. If no epoch ever improved (e.g.
    # the validation loss was NaN), keep the current weights rather than
    # loading a stale file left behind by a previous run.
    if saved_checkpoint:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_acc, y_pred = evaluate(test_loader, len(test_data))

    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
    report = classification_report(y_test.values, y_pred)

    return model, test_acc, report

###### 5. BERT Fine-Tuning ######
class BertClassifier(nn.Module):
    """Pretrained 'bert-base-uncased' encoder with a dropout + linear head.

    Args:
        n_classes: Number of output classes (logits).
    """

    def __init__(self, n_classes=5):
        super().__init__()
        encoder = BertModel.from_pretrained('bert-base-uncased')
        self.bert = encoder
        self.dropout = nn.Dropout(0.1)
        # The head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.pooler_output)
        # Output is [batch_size, n_classes].
        return self.fc(pooled)

def train_bert(X_train, y_train, X_val, y_val, X_test, y_test, n_classes=5):
    """Fine-tune ``BertClassifier`` and evaluate it on the test split.

    Texts are tokenized on the fly by ``PoemDataset`` (max length 128). The
    model is trained for 3 epochs with AdamW (lr 2e-5) and cross-entropy, the
    weights with the lowest validation loss are saved to 'bert_model.pt' and
    reloaded for the test evaluation.

    Args:
        X_train, X_val, X_test: Texts, indexed positionally by ``PoemDataset``.
        y_train, y_val, y_test: Integer labels; ``y_test`` is also read
            through ``.values``.
        n_classes: Number of output classes.

    Returns:
        tuple: ``(model, test_acc, report)`` - the trained model, test accuracy
        as a fraction and the classification report for the test predictions.
    """
    checkpoint_path = 'bert_model.pt'

    # Load tokenizer and create datasets
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    max_len = 128

    train_dataset = PoemDataset(X_train, y_train, tokenizer, max_len)
    val_dataset = PoemDataset(X_val, y_val, tokenizer, max_len)
    test_dataset = PoemDataset(X_test, y_test, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertClassifier(n_classes)
    model = model.to(device)

    # Define optimizer and loss function. torch.optim.AdamW is used instead of
    # the AdamW class from transformers, which that library deprecated in
    # favour of the PyTorch implementation. eps and weight_decay are set to
    # mirror the deprecated class's defaults so the update rule stays the same.
    optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
    criterion = nn.CrossEntropyLoss()

    def evaluate(loader, n_samples):
        """Return (mean batch loss, accuracy, predicted labels) for ``loader``."""
        model.eval()
        total_loss, correct, predicted = 0.0, 0, []
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                total_loss += criterion(outputs, labels).item()

                preds = outputs.argmax(dim=1)
                correct += (preds == labels).sum().item()
                predicted.extend(preds.cpu().numpy())
        return total_loss / len(loader), correct / n_samples, predicted

    # Training loop
    epochs = 3
    best_val_loss = float('inf')
    saved_checkpoint = False

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation
        val_loss, val_acc, _ = evaluate(val_loader, len(val_dataset))

        print(f'Epoch: {epoch+1}')
        print(f'\tTrain Loss: {train_loss / len(train_loader):.3f}')
        print(f'\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}%')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            saved_checkpoint = True

    # Test with the best-validation weights. If no epoch ever improved (e.g.
    # the validation loss was NaN), keep the current weights rather than
    # loading a stale file left behind by a previous run.
    if saved_checkpoint:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_acc, y_pred = evaluate(test_loader, len(test_dataset))

    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
    report = classification_report(y_test.values, y_pred)

    return model, test_acc, report

###### 6. Hybrid Model (BERT + LDA + BiLSTM + SVM) ######
def train_hybrid_model(X_train, y_train, X_val, y_val, X_test, y_test, n_classes=5):
    """Train a linear SVM on concatenated BERT, LDA, TF-IDF and placeholder features.

    Only the first ``min(1000, len(X_train))`` training rows are used. The
    feature blocks are: frozen 'bert-base-uncased' pooled outputs, LDA topic
    distributions (``n_classes`` components), the first 100 TF-IDF columns and
    256 random values standing in for BiLSTM features.

    Args:
        X_train, X_val, X_test: pandas Series of preprocessed texts
            (``X_train`` is sliced with ``.iloc``).
        y_train, y_val, y_test: Labels (``y_train`` is sliced with ``.iloc``).
        n_classes: Number of LDA components.

    Returns:
        tuple: ``((bert_model, lda, tfidf, svm), test_acc, report)``. Note that
        the fitted ``CountVectorizer`` feeding the LDA is not part of the
        returned tuple.
    """
    # 1. Get BERT embeddings
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    bert_model = bert_model.to(device)
    bert_model.eval()

    def get_bert_embeddings(texts, tokenizer, model, max_len=128, batch_size=32):
        """Return an [n_texts, hidden_size] array of pooled BERT outputs."""
        texts = [str(text) for text in texts]
        chunks = []
        # Encoding in batches instead of one text per forward pass; every text
        # is still padded to max_len, so the per-text inputs are unchanged.
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(texts[start:start + batch_size], return_tensors="pt",
                               max_length=max_len, padding='max_length', truncation=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            chunks.append(outputs.pooler_output.cpu().numpy())
        if not chunks:
            # Keep a 2-D shape so the result can still be stacked column-wise.
            return np.empty((0, model.config.hidden_size))
        return np.vstack(chunks)

    # This is computationally expensive, so we'll use a small subset for demonstration
    # In a real scenario, you'd process all the data
    sample_size = min(1000, len(X_train))
    X_train_sample = X_train.iloc[:sample_size]
    y_train_sample = y_train.iloc[:sample_size]

    # Get BERT embeddings for samples
    bert_train_embeddings = get_bert_embeddings(X_train_sample, tokenizer, bert_model)
    bert_val_embeddings = get_bert_embeddings(X_val, tokenizer, bert_model)
    bert_test_embeddings = get_bert_embeddings(X_test, tokenizer, bert_model)

    # 2. Get LDA features
    vectorizer = CountVectorizer(max_features=5000)
    X_train_bow = vectorizer.fit_transform(X_train_sample)
    X_val_bow = vectorizer.transform(X_val)
    X_test_bow = vectorizer.transform(X_test)

    lda = LatentDirichletAllocation(n_components=n_classes, random_state=42)
    lda_train_features = lda.fit_transform(X_train_bow)
    lda_val_features = lda.transform(X_val_bow)
    lda_test_features = lda.transform(X_test_bow)

    # 3. Get TF-IDF features. Only the first 100 columns are used, so slice
    # the sparse matrix before densifying instead of materializing all 5000.
    n_tfidf_columns = 100
    tfidf = TfidfVectorizer(max_features=5000)
    tfidf_train_features = tfidf.fit_transform(X_train_sample)[:, :n_tfidf_columns].toarray()
    tfidf_val_features = tfidf.transform(X_val)[:, :n_tfidf_columns].toarray()
    tfidf_test_features = tfidf.transform(X_test)[:, :n_tfidf_columns].toarray()

    # 4. Train BiLSTM model on the sample data
    # Since this is just for demonstration, we'll skip the actual training
    # and just create random BiLSTM-like features.
    # NOTE(review): these columns are pure noise and carry no signal; they are
    # seeded here so results are at least reproducible. Replace with real
    # BiLSTM features or drop them before drawing conclusions.
    rng = np.random.default_rng(42)
    bilstm_train_features = rng.random((len(X_train_sample), 256))
    bilstm_val_features = rng.random((len(X_val), 256))
    bilstm_test_features = rng.random((len(X_test), 256))

    # 5. Combine all features
    combined_train_features = np.hstack([
        bert_train_embeddings,
        lda_train_features,
        tfidf_train_features,
        bilstm_train_features,
    ])

    combined_val_features = np.hstack([
        bert_val_embeddings,
        lda_val_features,
        tfidf_val_features,
        bilstm_val_features,
    ])

    combined_test_features = np.hstack([
        bert_test_embeddings,
        lda_test_features,
        tfidf_test_features,
        bilstm_test_features,
    ])

    # 6. Train final SVM classifier
    svm = SVC(kernel='linear', probability=True)
    svm.fit(combined_train_features, y_train_sample)

    # Validate
    val_preds = svm.predict(combined_val_features)
    val_acc = accuracy_score(y_val, val_preds)
    print(f'Validation Accuracy: {val_acc*100:.2f}%')

    # Test
    test_preds = svm.predict(combined_test_features)
    test_acc = accuracy_score(y_test, test_preds)
    report = classification_report(y_test, test_preds)

    print(f'Test Accuracy: {test_acc*100:.2f}%')
    print(report)

    return (bert_model, lda, tfidf, svm), test_acc, report

###### Main Function ######
def main(dataset_path=r'C:\Users\Rahul\Desktop\Web_App\NLP1\data.csv'):
    """Train every model on the dataset, print the accuracies and plot them.

    Args:
        dataset_path: Path of the CSV passed to ``load_dataset``. The default
            is the path that used to be hard-coded; pass your own to run on a
            different machine.

    Side effects:
        Prints dataset statistics and results, writes 'model_comparison.png'
        and shows the bar chart.
    """
    # Load your dataset
    df, label_encoder = load_dataset(dataset_path)

    # Split the data
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df)
    n_classes = len(label_encoder.classes_)

    # Print dataset info
    print(f"Dataset size: {len(df)}")
    print(f"Number of classes: {n_classes}")
    print(f"Classes: {label_encoder.classes_}")
    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")
    print(f"Test samples: {len(X_test)}")

    # Dictionary to store results: model name -> (accuracy, report)
    results = {}

    # 1. Train TF-IDF + SVM
    print("\nTraining TF-IDF + SVM ")
    _, tfidf_svm_acc, tfidf_svm_report = train_tfidf_svm(X_train, y_train, X_test, y_test)
    results['TF-IDF + SVM'] = (tfidf_svm_acc, tfidf_svm_report)

    # 2. Train LDA
    print("\n Training LDA ")
    _, lda_acc, lda_report = train_lda(X_train, y_train, X_test, y_test, n_topics=n_classes)
    results['LDA'] = (lda_acc, lda_report)

    # 3. Train LSTM
    print("\n Training LSTM ")
    _, lstm_acc, lstm_report = train_lstm(
        X_train, y_train, X_val, y_val, X_test, y_test, n_classes=n_classes)
    results['LSTM'] = (lstm_acc, lstm_report)

    # 4. Train BiLSTM + Attention
    print("\n Training BiLSTM + Attention ")
    _, bilstm_att_acc, bilstm_att_report = train_bilstm_attention(
        X_train, y_train, X_val, y_val, X_test, y_test, n_classes=n_classes)
    results['BiLSTM + Attention'] = (bilstm_att_acc, bilstm_att_report)

    # 5. Train BERT
    print("\nTraining BERT")
    _, bert_acc, bert_report = train_bert(
        X_train, y_train, X_val, y_val, X_test, y_test, n_classes=n_classes)
    results['BERT'] = (bert_acc, bert_report)

    # 6. Train Hybrid Model
    print("\n Training Hybrid Model ")
    _, hybrid_acc, hybrid_report = train_hybrid_model(
        X_train, y_train, X_val, y_val, X_test, y_test, n_classes=n_classes)
    results['Hybrid'] = (hybrid_acc, hybrid_report)

    # Print summary of results
    print("\n Results Summary ")
    for model_name, (accuracy, _) in results.items():
        print(f"{model_name}: {accuracy*100:.2f}%")

    # Plot results
    model_names = list(results)
    accuracies = [acc * 100 for acc, _ in results.values()]

    plt.figure(figsize=(12, 6))
    plt.bar(model_names, accuracies)
    plt.xlabel('Models')
    plt.ylabel('Accuracy (%)')
    plt.title('Model Accuracy Comparison')
    plt.ylim(0, 100)
    # Label each bar with its accuracy just above the bar top.
    for i, v in enumerate(accuracies):
        plt.text(i, v + 1, f"{v:.1f}%", ha='center')
    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.show()


if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset size: 10064
Number of classes: 5
Classes: ['arts&sciences' 'love' 'nature' 'relationships' 'religion']
Training samples: 7044
Validation samples: 1007
Test samples: 2013

===== Training TF-IDF + SVM =====
