# Advanced Sentiment Analysis: Comprehensive NLP System with Multiple Architectures
# Deep Learning-Based Text Classification with LSTM, CNN, and Ensemble Methods

"""
Advanced Sentiment Analysis System

This notebook provides a comprehensive implementation of sentiment analysis using:
- LSTM with Attention Mechanisms
- CNN Text Classifiers  
- Ensemble Methods
- Production Deployment Pipeline

Authors: PyTorch Mastery Hub Team
Institution: Deep Learning Research Institute
Course: Advanced Natural Language Processing
Date: December 2024
"""

# ================================================================================
# 1. SETUP AND ENVIRONMENT CONFIGURATION
# ================================================================================

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import re
import string
import time
from collections import Counter, defaultdict
import pickle
import json
import os
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Configure plotting environment
# NOTE(review): the 'seaborn-v0_8' style name is presumably only available in
# newer matplotlib releases — confirm the minimum matplotlib version targeted.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Set device configuration: use the GPU when one is visible, otherwise the CPU.
# `device` is a module-level global consumed by the model/trainer code below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🎭 Advanced Sentiment Analysis System Initialized")
print(f"   Device: {device}")
print(f"   PyTorch Version: {torch.__version__}")
print(f"   CUDA Available: {torch.cuda.is_available()}")

# Set seeds for reproducibility
# Both the torch RNG and NumPy's global RNG are seeded; the synthetic review
# generator in this file draws from NumPy's global RNG.
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    # Ask cuDNN for deterministic kernels and disable its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print("✅ Environment configured with deterministic settings")

# Create results directory
# The path is relative to the current working directory; missing parents are
# created and an already-existing directory is not an error.
notebook_results_dir = Path('results/sentiment_analysis')
notebook_results_dir.mkdir(parents=True, exist_ok=True)
print(f"📁 Results will be saved to: {notebook_results_dir}")

# ================================================================================
# 2. TEXT PREPROCESSING PIPELINE
# ================================================================================

class TextPreprocessor:
    """
    Text preprocessing pipeline for sentiment analysis.

    Responsibilities:
      * ``clean_text``       - lowercase, strip HTML/URLs/e-mails/mentions,
                               expand contractions, normalise punctuation.
      * ``tokenize``         - split cleaned text into word / sentence-punctuation tokens.
      * ``build_vocab``      - build the word <-> index mappings from training texts.
      * ``text_to_sequence`` - map a raw text to a list of vocabulary indices.

    Attributes:
        word_to_idx: dict mapping token -> integer index.
        idx_to_word: dict mapping integer index -> token.
        vocab_size: number of entries in the vocabulary (special tokens included).
        contractions: dict mapping a contraction (or contraction suffix) to its
            expansion; applied in insertion order by ``clean_text``.
    """

    # Special tokens always occupy the first indices; <PAD> must stay at 0 and
    # <UNK> at 1 because the rest of the file pads with 0.
    _SPECIAL_TOKENS = ('<PAD>', '<UNK>', '<SOS>', '<EOS>')
    # Single-character tokens that survive tokenization.
    _SENTENCE_PUNCT = ('.', '!', '?')

    # Patterns are compiled once instead of on every clean_text/tokenize call.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
    _REPEATED_BANG_RE = re.compile(r'[!]{2,}')
    _REPEATED_QUESTION_RE = re.compile(r'[?]{2,}')
    _LONG_ELLIPSIS_RE = re.compile(r'[.]{3,}')
    _DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-Z0-9\s\.\!\?\,\;\:\-\(\)]')
    _TOKEN_RE = re.compile(r'\b\w+\b|[.!?]')

    def __init__(self, max_vocab_size=10000, min_freq=2, clean_html=True, normalize_contractions=True):
        """
        Args:
            max_vocab_size: upper bound on the vocabulary size, special tokens included.
            min_freq: minimum corpus frequency for a word to enter the vocabulary.
            clean_html: when True, ``clean_text`` strips ``<...>`` tags.
            normalize_contractions: when True, ``clean_text`` expands contractions.
        """
        self.max_vocab_size = max_vocab_size
        self.min_freq = min_freq
        self.clean_html = clean_html
        self.normalize_contractions = normalize_contractions

        # Initialize vocabularies with special tokens
        self._reset_vocab()

        # Common contractions for normalization.  Whole-word forms come first so
        # that e.g. "won't" is expanded before the generic "n't" suffix rule.
        self.contractions = {
            "won't": "will not", "can't": "cannot", "n't": " not",
            "'re": " are", "'ve": " have", "'ll": " will", "'d": " would",
            "'m": " am", "it's": "it is", "that's": "that is",
            "what's": "what is", "where's": "where is", "how's": "how is"
        }

        print(f"📚 TextPreprocessor initialized with max_vocab_size={max_vocab_size}")

    def _reset_vocab(self):
        """Reset the vocabulary so it contains only the special tokens."""
        self.word_to_idx = {token: index for index, token in enumerate(self._SPECIAL_TOKENS)}
        self.idx_to_word = {index: token for token, index in self.word_to_idx.items()}
        self.vocab_size = len(self.word_to_idx)

    def _expand_contractions(self, text):
        """
        Expand every entry of ``self.contractions`` found in ``text``.

        A plain ``str.replace`` also rewrites quoted words (``'really'`` would
        become `` areally'``), so each contraction must end at a word boundary
        and apostrophe-led suffixes ("'re", "'m", ...) must directly follow a
        word character.
        """
        for contraction, expansion in self.contractions.items():
            pattern = re.escape(contraction) + r'(?!\w)'
            if contraction.startswith("'"):
                pattern = r'(?<=\w)' + pattern
            # A callable replacement keeps backslashes in the expansion literal.
            text = re.sub(pattern, lambda _match, repl=expansion: repl, text)
        return text

    def clean_text(self, text):
        """
        Clean and normalise a raw text.

        Args:
            text: raw input; anything that is not a ``str`` yields ``""``.

        Returns:
            The lower-cased, cleaned text with single spaces between words.
        """
        if not isinstance(text, str):
            return ""

        text = text.lower().strip()

        # Remove HTML tags if specified
        if self.clean_html:
            text = self._HTML_TAG_RE.sub('', text)

        # Remove URLs and email addresses
        text = self._URL_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)

        # Remove user mentions and hashtags
        text = self._MENTION_HASHTAG_RE.sub('', text)

        # Normalize contractions (must run before apostrophes are stripped below)
        if self.normalize_contractions:
            text = self._expand_contractions(text)

        # Remove excessive punctuation but preserve sentence structure
        text = self._REPEATED_BANG_RE.sub('!', text)
        text = self._REPEATED_QUESTION_RE.sub('?', text)
        text = self._LONG_ELLIPSIS_RE.sub('...', text)

        # Remove special characters but keep basic punctuation
        text = self._DISALLOWED_CHARS_RE.sub('', text)

        # Normalize whitespace
        return ' '.join(text.split())

    def tokenize(self, text):
        """
        Split cleaned text into tokens.

        Keeps the sentence punctuation marks ``. ! ?`` and every word of at
        least two characters that is not made up solely of digits; all other
        single-character tokens are dropped.

        Args:
            text: text as returned by ``clean_text``.

        Returns:
            List of token strings.
        """
        return [
            token
            for token in self._TOKEN_RE.findall(text)
            if token in self._SENTENCE_PUNCT
            or (len(token) >= 2 and not token.isdigit())
        ]

    def build_vocab(self, texts):
        """
        Build the vocabulary from training texts.

        The vocabulary is reset to the special tokens first, so calling this
        method again rebuilds it from scratch instead of appending duplicate
        indices for words that are already known.

        Args:
            texts: sized iterable of raw training texts.

        Returns:
            dict with ``total_tokens``, ``unique_words``, ``vocabulary_size``
            and ``words_added``.
        """
        print(f"🔨 Building vocabulary from {len(texts)} texts...")

        self._reset_vocab()

        word_counts = Counter()
        total_tokens = 0

        for text in tqdm(texts, desc="Processing texts for vocabulary"):
            tokens = self.tokenize(self.clean_text(text))
            word_counts.update(tokens)
            total_tokens += len(tokens)

        print(f"   Total tokens processed: {total_tokens:,}")
        print(f"   Unique words found: {len(word_counts):,}")

        # Add words that meet frequency threshold.  most_common() is sorted by
        # descending count, so the first word below min_freq ends the scan.
        words_added = 0
        capacity = self.max_vocab_size - len(self._SPECIAL_TOKENS)
        for word, count in word_counts.most_common(capacity):
            if count < self.min_freq:
                break
            self.word_to_idx[word] = self.vocab_size
            self.idx_to_word[self.vocab_size] = word
            self.vocab_size += 1
            words_added += 1

        print(f"   Final vocabulary size: {self.vocab_size}")
        print(f"   Words added: {words_added}")
        print(f"   Most common words: {word_counts.most_common(10)}")

        return {
            'total_tokens': total_tokens,
            'unique_words': len(word_counts),
            'vocabulary_size': self.vocab_size,
            'words_added': words_added
        }

    def text_to_sequence(self, text, max_length=None):
        """
        Convert a raw text to a list of vocabulary indices.

        Args:
            text: raw input text (cleaned and tokenized internally).
            max_length: when truthy, the sequence is truncated to this many tokens.

        Returns:
            List of ints; out-of-vocabulary tokens map to the ``<UNK>`` index.
            The list is empty when no token survives cleaning/tokenization.
        """
        unk_index = self.word_to_idx['<UNK>']
        tokens = self.tokenize(self.clean_text(text))
        sequence = [self.word_to_idx.get(token, unk_index) for token in tokens]

        if max_length and len(sequence) > max_length:
            sequence = sequence[:max_length]

        return sequence

# ================================================================================
# 3. DATASET GENERATION AND PREPARATION
# ================================================================================

def generate_movie_reviews(n_samples=3000, seed=None):
    """
    Generate a synthetic, template-based movie review dataset.

    Each review is produced by drawing a binary label, a sentence template and
    sentiment/aspect words that match the label, then filling the template.

    Args:
        n_samples: number of reviews to generate.
        seed: optional integer seed.  When given, a private
            ``np.random.RandomState(seed)`` is used so the result is
            reproducible and NumPy's global RNG is left untouched.  When
            ``None`` (default) NumPy's global RNG is used, as before.

    Returns:
        Tuple ``(reviews, labels)`` of two equally long lists: review strings
        and their labels (0: negative, 1: positive).
    """
    print(f"🎬 Generating {n_samples} synthetic movie reviews...")

    rng = np.random if seed is None else np.random.RandomState(seed)

    sentiment_words = {
        'positive': ['amazing', 'outstanding', 'brilliant', 'fantastic', 'excellent',
                    'wonderful', 'great', 'good', 'enjoyable', 'entertaining'],
        'negative': ['terrible', 'awful', 'horrible', 'bad', 'disappointing',
                    'boring', 'dull', 'poor', 'mediocre', 'unimpressive']
    }

    movie_aspects = ['plot', 'acting', 'direction', 'cinematography', 'dialogue',
                    'characters', 'music', 'visual effects', 'pacing']

    templates = [
        "This movie was {sentiment}.",
        "I {verb} this film. The {aspect} was {sentiment}.",
        "The {aspect1} was {sentiment1} and the {aspect2} was {sentiment2}.",
        "{sentiment} {aspect}! {verb} the entire experience.",
        "What a {sentiment} movie. The {aspect} really {verb}."
    ]

    reviews = []
    labels = []

    for _ in range(n_samples):
        sentiment_label = rng.choice([0, 1])  # 0: negative, 1: positive
        sentiment_type = 'positive' if sentiment_label == 1 else 'negative'

        template = rng.choice(templates)
        sentiment_word = rng.choice(sentiment_words[sentiment_type])

        # Every placeholder any template can contain is supplied here, and
        # str.format ignores unused keyword arguments, so no KeyError fallback
        # is needed.  The draw order below is kept stable so that a fixed seed
        # keeps producing the same dataset.
        review = template.format(
            sentiment=sentiment_word,
            sentiment1=sentiment_word,
            sentiment2=rng.choice(sentiment_words[sentiment_type]),
            aspect=rng.choice(movie_aspects),
            aspect1=rng.choice(movie_aspects),
            aspect2=rng.choice(movie_aspects),
            verb='loved' if sentiment_type == 'positive' else 'hated'
        )

        reviews.append(review)
        labels.append(sentiment_label)

    print(f"   Generated {len(reviews)} reviews")
    print(f"   Positive: {sum(labels)}, Negative: {len(labels) - sum(labels)}")

    return reviews, labels

# Generate dataset
# `texts` and `labels` are parallel lists returned by generate_movie_reviews.
texts, labels = generate_movie_reviews(n_samples=4000)

# Split dataset
# First hold out 40% of the data, then halve the hold-out into validation and
# test sets (i.e. a 60/20/20 split).  Both splits are stratified on the labels
# and use a fixed random_state so the partition is repeatable.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.4, random_state=42, stratify=labels
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

print(f"\n📂 Dataset Split:")
print(f"   Training: {len(train_texts)} samples")
print(f"   Validation: {len(val_texts)} samples") 
print(f"   Test: {len(test_texts)} samples")

# Sample reviews
# Show the first three training examples with a human-readable label.
print(f"\n📝 Sample Reviews:")
for i, (text, label) in enumerate(zip(train_texts[:3], train_labels[:3])):
    sentiment = "Positive" if label == 1 else "Negative"
    print(f"   {i+1}. [{sentiment}] {text}")

# ================================================================================
# 4. TEXT PREPROCESSING AND VOCABULARY
# ================================================================================

# Create preprocessor and build vocabulary
# The vocabulary is built from the training split only, so validation/test
# words that were not seen in training map to <UNK>.
preprocessor = TextPreprocessor(max_vocab_size=15000, min_freq=2)
vocab_stats = preprocessor.build_vocab(train_texts)

# Test preprocessing
# Smoke test on a hand-written sentence: prints the cleaned text and the first
# ten vocabulary indices of its encoded sequence.
sample_text = "This movie was absolutely AMAZING!!! Best film ever!!!"
print(f"\n🧪 Preprocessing Test:")
print(f"   Original: '{sample_text}'")
print(f"   Cleaned: '{preprocessor.clean_text(sample_text)}'")
print(f"   Sequence: {preprocessor.text_to_sequence(sample_text)[:10]}")

# ================================================================================
# 5. DATASET AND DATALOADER IMPLEMENTATION
# ================================================================================

class SentimentDataset(Dataset):
    """
    Map-style dataset that pre-encodes every text into a list of token indices.

    Encoding happens once in ``__init__``; ``__getitem__`` only wraps the
    stored sequence and label in tensors.
    """

    def __init__(self, texts, labels, preprocessor, max_length=128):
        """
        Args:
            texts: sequence of raw texts.
            labels: sequence of integer labels, parallel to ``texts``.
            preprocessor: object exposing ``text_to_sequence(text, max_length)``.
            max_length: maximum number of tokens kept per text.
        """
        self.texts = texts
        self.labels = labels
        self.preprocessor = preprocessor
        self.max_length = max_length

        # Convert texts to sequences
        self.sequences = [
            preprocessor.text_to_sequence(text, max_length)
            for text in tqdm(texts, desc="Converting texts to sequences")
        ]
        self.sequence_lengths = [len(sequence) for sequence in self.sequences]

        # np.max raises on an empty list (and np.mean yields nan), so an empty
        # split reports zeros instead of crashing.
        if self.sequence_lengths:
            mean_length = np.mean(self.sequence_lengths)
            longest = np.max(self.sequence_lengths)
        else:
            mean_length, longest = 0.0, 0

        print(f"   Average sequence length: {mean_length:.1f}")
        print(f"   Max sequence length: {longest}")

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Return one example as a dict.

        Keys:
            sequence: 1-D ``torch.long`` tensor of token indices (unpadded).
            label: 0-D ``torch.long`` tensor.
            text: the original raw text.
            original_length: number of tokens in ``sequence``.
        """
        sequence = self.sequences[idx]
        return {
            'sequence': torch.tensor(sequence, dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
            'text': self.texts[idx],
            'original_length': len(sequence)
        }

def collate_fn(batch):
    """
    Collate dataset items into one right-padded batch.

    Args:
        batch: list of dicts with the keys ``sequence`` (1-D long tensor),
            ``label`` (0-D tensor), ``text`` and ``original_length``.

    Returns:
        dict with
            sequences: (batch, max_len) tensor, right-padded with 0.
            labels: (batch,) tensor of stacked labels.
            texts: list of the original texts.
            attention_mask: (batch, max_len) bool tensor, True on real tokens.
            lengths: (batch,) tensor of unpadded sequence lengths.
    """
    token_tensors, label_tensors, raw_texts, length_values = [], [], [], []
    for sample in batch:
        token_tensors.append(sample['sequence'])
        label_tensors.append(sample['label'])
        raw_texts.append(sample['text'])
        length_values.append(sample['original_length'])

    stacked_labels = torch.stack(label_tensors)
    lengths = torch.tensor(length_values)

    # Right-pad every sequence with index 0 up to the longest one in the batch.
    padded = pad_sequence(token_tensors, batch_first=True, padding_value=0)

    # Position j of row i is a real token exactly when j < length of row i.
    positions = torch.arange(padded.size(1), device=padded.device).unsqueeze(0)
    mask = positions < lengths.to(padded.device).unsqueeze(1)

    return {
        'sequences': padded,
        'labels': stacked_labels,
        'texts': raw_texts,
        'attention_mask': mask,
        'lengths': lengths
    }

# Create datasets
# Texts longer than this many tokens are truncated when they are encoded.
max_sequence_length = 128

# Each dataset encodes its split once, using the vocabulary built on the
# training split.
train_dataset = SentimentDataset(train_texts, train_labels, preprocessor, max_sequence_length)
val_dataset = SentimentDataset(val_texts, val_labels, preprocessor, max_sequence_length)
test_dataset = SentimentDataset(test_texts, test_labels, preprocessor, max_sequence_length)

# Create data loaders
batch_size = 32

# Only the training loader shuffles; all loaders pad per batch via collate_fn.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(f"\n🔄 Data Loaders Created:")
print(f"   Batch size: {batch_size}")
print(f"   Training batches: {len(train_loader)}")
print(f"   Validation batches: {len(val_loader)}")
print(f"   Test batches: {len(test_loader)}")

# ================================================================================
# 6. NEURAL ARCHITECTURE IMPLEMENTATIONS
# ================================================================================

class BahdanauAttention(nn.Module):
    """
    Additive (Bahdanau-style) attention pooling over LSTM outputs.

    Each timestep is scored as ``v^T tanh(W h_t)``; the scores are softmax
    normalised over the sequence and used to form a weighted sum of the LSTM
    outputs.  No separate query vector is involved.
    """

    def __init__(self, hidden_dim, attention_dim=None):
        """
        Args:
            hidden_dim: feature size of the LSTM outputs.
            attention_dim: size of the intermediate attention space;
                defaults to ``hidden_dim``.
        """
        super().__init__()

        if attention_dim is None:
            attention_dim = hidden_dim

        self.hidden_dim = hidden_dim
        self.attention_dim = attention_dim

        self.lstm_projection = nn.Linear(hidden_dim, attention_dim, bias=False)
        self.attention_vector = nn.Linear(attention_dim, 1, bias=False)

        # Initialize weights
        nn.init.xavier_uniform_(self.lstm_projection.weight)
        nn.init.xavier_uniform_(self.attention_vector.weight)

    def forward(self, lstm_outputs, attention_mask=None):
        """
        Compute attention weights and attended representation.

        Args:
            lstm_outputs: (batch_size, seq_len, hidden_dim)
            attention_mask: optional (batch_size, seq_len) mask; non-zero / True
                marks real tokens.  Bool and 0/1 numeric masks are accepted.

        Returns:
            attended_output: (batch_size, hidden_dim)
            attention_weights: (batch_size, seq_len)
        """
        # Project LSTM outputs to attention space
        projected = torch.tanh(self.lstm_projection(lstm_outputs))

        # Compute attention scores: (batch_size, seq_len)
        attention_scores = self.attention_vector(projected).squeeze(-1)

        # Apply attention mask if provided
        if attention_mask is not None:
            # `~` is a logical NOT only on bool tensors, so cast first.
            keep = attention_mask.to(dtype=torch.bool)
            # Use the dtype's own minimum rather than a fixed -1e9, which is
            # not representable in half precision.  A fully masked row still
            # softmaxes to a uniform distribution instead of NaN.
            fill_value = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.masked_fill(~keep, fill_value)

        # Compute attention weights
        attention_weights = F.softmax(attention_scores, dim=1)

        # Weighted sum of the LSTM outputs: (B, 1, T) x (B, T, H) -> (B, 1, H)
        attended_output = torch.bmm(attention_weights.unsqueeze(1), lstm_outputs)
        attended_output = attended_output.squeeze(1)

        return attended_output, attention_weights

class LSTMSentimentClassifier(nn.Module):
    """
    (Bi)LSTM sentiment classifier with optional attention pooling.

    Pipeline: embedding -> dropout -> LSTM -> sequence summary (attention
    pooling, or the last layer's final hidden state) -> dropout -> MLP head.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2,
                 num_classes=2, dropout=0.3, bidirectional=True, use_attention=True):
        """
        Args:
            vocab_size: number of rows in the embedding table (index 0 is padding).
            embedding_dim: embedding size.
            hidden_dim: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers.
            num_classes: number of output logits.
            dropout: dropout probability; the last head layer uses half of it.
            bidirectional: whether the LSTM is bidirectional.
            use_attention: pool LSTM outputs with ``BahdanauAttention`` when
                True, otherwise use the final hidden state.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.use_attention = use_attention

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # LSTM layer (inter-layer dropout only applies with more than one layer)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional
        )

        # Both summary strategies yield a vector of the LSTM output width.
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim

        # Attention mechanism
        if use_attention:
            self.attention = BahdanauAttention(lstm_output_dim)

        # Classification head
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential(
            nn.Linear(lstm_output_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            # True division: `dropout // 2` floors a probability such as 0.3
            # to 0.0 and silently disables this layer.
            nn.Dropout(dropout / 2),
            nn.Linear(hidden_dim // 2, num_classes)
        )

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Initialize the embedding table and the classifier head."""
        # Initialize embeddings
        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
        self.embedding.weight.data[0].fill_(0)  # Padding token

        # Initialize classifier
        for module in self.classifier:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

    def forward(self, x, attention_mask=None, return_attention=False):
        """
        Forward pass through the model.

        Args:
            x: (batch_size, seq_len) tensor of token indices.
            attention_mask: optional (batch_size, seq_len) mask forwarded to
                the attention layer; unused when ``use_attention`` is False.
            return_attention: kept for interface compatibility with the other
                classifiers; the attention weights are returned whenever
                attention is enabled, regardless of this flag.

        Returns:
            Tuple ``(logits, attention_weights)`` where ``logits`` is
            (batch_size, num_classes) and ``attention_weights`` is
            (batch_size, seq_len), or ``None`` when attention is disabled.
        """
        # Embedding
        embedded = self.dropout(self.embedding(x))

        # LSTM processing
        # NOTE(review): the padded batch is fed to the LSTM unpacked, so padded
        # timesteps presumably influence the recurrent states (in particular
        # the final hidden state used when attention is off) — consider
        # pack_padded_sequence if that matters.
        lstm_outputs, (hidden, _cell) = self.lstm(embedded)

        # Get final representation
        attention_weights = None
        if self.use_attention:
            final_representation, attention_weights = self.attention(lstm_outputs, attention_mask)
        elif self.bidirectional:
            # hidden is (num_layers * 2, batch, hidden); split out the direction
            # axis and concatenate both directions of the last layer.
            hidden = hidden.view(self.num_layers, 2, x.size(0), self.hidden_dim)
            final_representation = torch.cat([hidden[-1, 0], hidden[-1, 1]], dim=1)
        else:
            final_representation = hidden[-1]

        # Classification
        logits = self.classifier(self.dropout(final_representation))

        return logits, attention_weights

    def get_model_info(self):
        """Return a dict describing the architecture and its parameter count."""
        total_params = sum(p.numel() for p in self.parameters())
        return {
            'model_type': 'LSTM with Attention' if self.use_attention else 'LSTM',
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim,
            'hidden_dim': self.hidden_dim,
            'num_layers': self.num_layers,
            'bidirectional': self.bidirectional,
            'use_attention': self.use_attention,
            'total_parameters': total_params
        }

class CNNSentimentClassifier(nn.Module):
    """
    CNN text classifier with several parallel filter widths.

    Pipeline: embedding -> dropout -> one Conv1d per filter size -> ReLU ->
    max-over-time pooling -> concatenation -> dropout -> MLP head.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters=100,
                 filter_sizes=(2, 3, 4, 5), num_classes=2, dropout=0.3):
        """
        Args:
            vocab_size: number of rows in the embedding table (index 0 is padding).
            embedding_dim: embedding size.
            num_filters: output channels of each convolution.
            filter_sizes: iterable of convolution kernel widths (in tokens).
            num_classes: number of output logits.
            dropout: dropout probability; the last head layer uses half of it.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_filters = num_filters
        # Stored as a private list copy: the default is an immutable tuple (no
        # shared mutable default) and later changes to the caller's list
        # cannot desynchronise this attribute from the built layers.
        self.filter_sizes = list(filter_sizes)

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Convolutional layers
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, kernel_size=fs)
            for fs in self.filter_sizes
        ])

        # Classification head
        total_filters = len(self.filter_sizes) * num_filters
        self.dropout = nn.Dropout(dropout)

        self.classifier = nn.Sequential(
            nn.Linear(total_filters, total_filters // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(total_filters // 2, total_filters // 4),
            nn.ReLU(),
            # True division: `dropout // 2` floors a probability such as 0.3
            # to 0.0 and silently disables this layer.
            nn.Dropout(dropout / 2),
            nn.Linear(total_filters // 4, num_classes)
        )

        self._init_weights()

    def _init_weights(self):
        """Initialize the embedding table, convolutions and classifier head."""
        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
        self.embedding.weight.data[0].fill_(0)

        for conv in self.convs:
            nn.init.kaiming_normal_(conv.weight, mode='fan_out', nonlinearity='relu')
            nn.init.zeros_(conv.bias)

        for module in self.classifier:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

    def forward(self, x, attention_mask=None, return_attention=False):
        """
        Forward pass through CNN model.

        Args:
            x: (batch_size, seq_len) tensor of token indices.
            attention_mask: accepted for interface compatibility; unused.
            return_attention: accepted for interface compatibility; unused.

        Returns:
            Tuple ``(logits, None)`` with ``logits`` of shape
            (batch_size, num_classes); this model produces no attention weights.
        """
        # A convolution cannot run on an input shorter than its kernel, so very
        # short batches are right-padded with the padding index first.
        min_length = max(self.filter_sizes, default=0)
        if x.size(1) < min_length:
            x = F.pad(x, (0, min_length - x.size(1)), value=self.embedding.padding_idx)

        # Embedding
        embedded = self.dropout(self.embedding(x))

        # Transpose for conv1d: (batch_size, embedding_dim, seq_len)
        embedded = embedded.transpose(1, 2)

        # Apply convolutional filters, then max-pool each feature map over time
        conv_outputs = []
        for conv in self.convs:
            conv_out = F.relu(conv(embedded))
            pooled = F.max_pool1d(conv_out, kernel_size=conv_out.size(2))
            conv_outputs.append(pooled.squeeze(2))

        # Concatenate all conv outputs
        concatenated = torch.cat(conv_outputs, dim=1)

        # Classification
        logits = self.classifier(self.dropout(concatenated))

        return logits, None

    def get_model_info(self):
        """Return a dict describing the architecture and its parameter count."""
        total_params = sum(p.numel() for p in self.parameters())
        return {
            'model_type': 'CNN Text Classifier',
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim,
            'num_filters': self.num_filters,
            'filter_sizes': self.filter_sizes,
            'total_parameters': total_params
        }

class EnsembleClassifier(nn.Module):
    """
    Combine the logits of several classifiers into one prediction.

    With ``ensemble_method='weighted_average'`` the member logits are mixed
    with learnable, softmax-normalised weights; any other method name falls
    back to a plain mean of the member logits.
    """

    def __init__(self, models, ensemble_method='weighted_average', num_classes=2):
        """
        Args:
            models: mapping name -> ``nn.Module``; each member must return a
                ``(logits, attention)`` pair from its forward pass.
            ensemble_method: ``'weighted_average'`` or anything else for a mean.
            num_classes: accepted for interface compatibility; not used here.
        """
        super().__init__()

        self.models = nn.ModuleDict(models)
        self.ensemble_method = ensemble_method
        self.model_names = list(models.keys())

        if ensemble_method == 'weighted_average':
            # Start from equal weights; softmax is applied in forward().
            member_count = len(models)
            self.model_weights = nn.Parameter(torch.ones(member_count) / member_count)

        print(f"🔗 Ensemble created with models: {self.model_names}")

    def forward(self, x, attention_mask=None, return_attention=False):
        """
        Run every member model and merge their logits.

        Returns:
            Tuple ``(ensemble_logits, attentions)``.  ``attentions`` is a dict
            name -> attention tensor for the members that produced one, and is
            only returned when ``return_attention`` is True and at least one
            member produced attention; otherwise it is ``None``.
        """
        member_logits = []
        attention_by_model = {}

        for member_name, member in self.models.items():
            logits, attention = member(x, attention_mask, return_attention)
            member_logits.append(logits)
            if attention is not None:
                attention_by_model[member_name] = attention

        # Combine predictions
        if self.ensemble_method == 'weighted_average':
            mixing = F.softmax(self.model_weights, dim=0)
            scaled = [weight * logits for weight, logits in zip(mixing, member_logits)]
            combined = torch.stack(scaled, dim=0).sum(dim=0)
        else:
            combined = torch.stack(member_logits, dim=0).mean(dim=0)

        if return_attention and attention_by_model:
            return combined, attention_by_model
        return combined, None

    def get_model_info(self):
        """Return a dict with the ensemble type and parameter counts."""
        per_member = {}
        for member_name, member in self.models.items():
            per_member[member_name] = sum(p.numel() for p in member.parameters())

        return {
            'model_type': f'Ensemble ({self.ensemble_method})',
            'individual_models': self.model_names,
            'individual_parameters': per_member,
            'total_parameters': sum(p.numel() for p in self.parameters())
        }

# Create models
# Shared hyperparameters; the vocabulary size comes from the fitted preprocessor.
vocab_size = preprocessor.vocab_size
embedding_dim = 128
hidden_dim = 256
dropout = 0.3

print("\n🧠 Creating Models...")

# LSTM Model
lstm_model = LSTMSentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=2,
    dropout=dropout,
    bidirectional=True,
    use_attention=True
).to(device)

# CNN Model
cnn_model = CNNSentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_filters=128,
    filter_sizes=[2, 3, 4, 5],
    dropout=dropout
).to(device)

# Ensemble Model
# The ensemble wraps the very same lstm_model / cnn_model objects (no copies),
# so their parameters are shared with the standalone models above.
ensemble_models = {'lstm': lstm_model, 'cnn': cnn_model}
ensemble_model = EnsembleClassifier(
    models=ensemble_models,
    ensemble_method='weighted_average'
).to(device)

# Test models
# Smoke test: push the first four examples of one training batch through every
# model without tracking gradients.
# NOTE(review): the models are still in training mode here, so dropout is
# active during this forward pass; the outputs are not used afterwards.
sample_batch = next(iter(train_loader))
sample_input = sample_batch['sequences'][:4].to(device)
sample_mask = sample_batch['attention_mask'][:4].to(device)

with torch.no_grad():
    lstm_out, lstm_att = lstm_model(sample_input, sample_mask, return_attention=True)
    cnn_out, _ = cnn_model(sample_input, sample_mask)
    ensemble_out, _ = ensemble_model(sample_input, sample_mask)

# Print model information
# The ensemble's parameter count includes both member models plus its own
# mixing weights.
for name, model in [("LSTM", lstm_model), ("CNN", cnn_model), ("Ensemble", ensemble_model)]:
    info = model.get_model_info()
    print(f"\n✅ {name} Model:")
    print(f"   Type: {info['model_type']}")
    print(f"   Parameters: {info['total_parameters']:,}")

# ================================================================================
# 7. TRAINING FRAMEWORK
# ================================================================================

class ModelTrainer:
    """
    Training helper: epoch loops, metric tracking, LR scheduling on plateau,
    early stopping and restoration of the best (lowest validation loss) weights.

    The wrapped model's forward pass must accept ``(sequences, attention_mask)``
    and return a ``(logits, attention)`` pair.
    """

    def __init__(self, model, device, model_name="model", patience=5):
        """
        Args:
            model: the ``nn.Module`` to train (updated in place).
            device: device the batches are moved to.
            model_name: label used in progress bars and log lines.
            patience: number of consecutive epochs without validation-loss
                improvement tolerated before training stops early.
        """
        self.model = model
        self.device = device
        self.model_name = model_name
        self.patience = patience

        # One entry is appended to every list per completed epoch.
        self.history = {
            'train_loss': [], 'val_loss': [],
            'train_acc': [], 'val_acc': [],
            'learning_rates': []
        }

        self.best_val_loss = float('inf')
        self.best_model_state = None
        self.patience_counter = 0

    def _snapshot_state(self):
        """
        Return an independent copy of the model's current state dict.

        ``state_dict()`` hands out references to the live parameter tensors,
        and ``dict.copy()`` is shallow, so without cloning the "best" snapshot
        would keep changing as training continues.
        """
        return {
            key: value.detach().clone()
            for key, value in self.model.state_dict().items()
        }

    def calculate_metrics(self, predictions, labels):
        """
        Calculate evaluation metrics.

        Args:
            predictions: 1-D tensor of predicted class indices.
            labels: 1-D tensor of true class indices.

        Returns:
            dict with ``accuracy``, ``f1_macro`` and the full
            ``classification_report`` dict.
        """
        y_true = labels.cpu().numpy()
        y_pred = predictions.cpu().numpy()

        accuracy = accuracy_score(y_true, y_pred)
        report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

        return {
            'accuracy': accuracy,
            'f1_macro': report['macro avg']['f1-score'],
            'classification_report': report
        }

    def train_epoch(self, dataloader, criterion, optimizer):
        """
        Train for one epoch.

        Returns:
            Tuple ``(average batch loss, metrics dict)`` over the epoch.
        """
        self.model.train()
        total_loss = 0
        batch_predictions = []
        batch_labels = []

        for batch in tqdm(dataloader, desc=f"Training {self.model_name}"):
            sequences = batch['sequences'].to(self.device)
            labels = batch['labels'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)

            optimizer.zero_grad()

            logits, _ = self.model(sequences, attention_mask)
            loss = criterion(logits, labels)

            loss.backward()
            # Clip the global gradient norm to stabilise recurrent training.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            # Keep whole per-batch tensors and concatenate once at the end,
            # instead of building Python lists of 0-d tensors.
            batch_predictions.append(torch.argmax(logits, dim=1).detach().cpu())
            batch_labels.append(labels.detach().cpu())

        avg_loss = total_loss / len(dataloader)
        metrics = self.calculate_metrics(torch.cat(batch_predictions), torch.cat(batch_labels))

        return avg_loss, metrics

    def evaluate_epoch(self, dataloader, criterion):
        """
        Evaluate the model on a dataloader without updating it.

        Returns:
            Tuple ``(average batch loss, metrics dict)``.
        """
        self.model.eval()
        total_loss = 0
        batch_predictions = []
        batch_labels = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Evaluating {self.model_name}"):
                sequences = batch['sequences'].to(self.device)
                labels = batch['labels'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)

                logits, _ = self.model(sequences, attention_mask)
                loss = criterion(logits, labels)

                total_loss += loss.item()
                batch_predictions.append(torch.argmax(logits, dim=1).cpu())
                batch_labels.append(labels.cpu())

        avg_loss = total_loss / len(dataloader)
        metrics = self.calculate_metrics(torch.cat(batch_predictions), torch.cat(batch_labels))

        return avg_loss, metrics

    def train(self, train_loader, val_loader, num_epochs, lr=0.001, learning_rate=None):
        """
        Complete training loop with LR scheduling and early stopping.

        Args:
            train_loader: dataloader for the training split.
            val_loader: dataloader for the validation split.
            num_epochs: maximum number of epochs.
            lr: initial learning rate.
            learning_rate: alias for ``lr`` (takes precedence when given), so
                a config dict using the key ``learning_rate`` can be unpacked
                straight into this method.

        Returns:
            The best (lowest) validation loss observed.  On return the model
            holds the weights of that best epoch.
        """
        if learning_rate is not None:
            lr = learning_rate

        print(f"\n🚀 Training {self.model_name} for {num_epochs} epochs...")

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(self.model.parameters(), lr=lr, weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.7)

        for epoch in range(num_epochs):
            # Training
            train_loss, train_metrics = self.train_epoch(train_loader, criterion, optimizer)

            # Validation
            val_loss, val_metrics = self.evaluate_epoch(val_loader, criterion)

            # Update scheduler
            scheduler.step(val_loss)
            current_lr = optimizer.param_groups[0]['lr']

            # Store history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)
            self.history['train_acc'].append(train_metrics['accuracy'])
            self.history['val_acc'].append(val_metrics['accuracy'])
            self.history['learning_rates'].append(current_lr)

            # Early stopping
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.best_model_state = self._snapshot_state()
                self.patience_counter = 0
                improvement = "✅"
            else:
                self.patience_counter += 1
                improvement = "⏸️" if self.patience_counter >= self.patience else ""

            print(f"Epoch {epoch+1:2d}: Train Loss={train_loss:.4f}, Train Acc={train_metrics['accuracy']:.4f}, "
                  f"Val Loss={val_loss:.4f}, Val Acc={val_metrics['accuracy']:.4f}, "
                  f"F1={val_metrics['f1_macro']:.4f} {improvement}")

            if self.patience_counter >= self.patience:
                print(f"Early stopping after {epoch+1} epochs")
                break

        # Load best model
        if self.best_model_state is not None:
            self.model.load_state_dict(self.best_model_state)
            print(f"✅ Best model loaded (Val Loss: {self.best_val_loss:.4f})")

        return self.best_val_loss

# ================================================================================
# 8. MODEL TRAINING
# ================================================================================

print("\n" + "="*80)
print("🚀 TRAINING ALL MODELS")
print("="*80)

# Training configuration
training_config = {
    'num_epochs': 15,
    'learning_rate': 0.001
}

# ModelTrainer.train names its learning-rate parameter `lr`, so the config is
# passed explicitly rather than unpacked with ** (the key 'learning_rate' does
# not match the parameter name).

# Train LSTM
print("\n🧠 Training LSTM Model...")
lstm_trainer = ModelTrainer(lstm_model, device, "LSTM", patience=7)
lstm_best_loss = lstm_trainer.train(train_loader, val_loader,
                                    num_epochs=training_config['num_epochs'],
                                    lr=training_config['learning_rate'])

# Train CNN
print("\n🔬 Training CNN Model...")
cnn_trainer = ModelTrainer(cnn_model, device, "CNN", patience=7)
cnn_best_loss = cnn_trainer.train(train_loader, val_loader,
                                  num_epochs=training_config['num_epochs'],
                                  lr=training_config['learning_rate'])

# Train Ensemble (at half the base learning rate)
# NOTE(review): ensemble_model wraps the same lstm_model / cnn_model objects and
# the trainer optimises all of the ensemble's parameters, so this step also
# keeps updating the two member models in place.  Any later evaluation of
# lstm_model / cnn_model therefore sees the ensemble-fine-tuned weights —
# confirm this is intended, or evaluate the members before this step.
print("\n🔗 Training Ensemble Model...")
ensemble_trainer = ModelTrainer(ensemble_model, device, "Ensemble", patience=7)
ensemble_best_loss = ensemble_trainer.train(train_loader, val_loader,
                                          num_epochs=training_config['num_epochs'],
                                          lr=training_config['learning_rate'] * 0.5)

print("\n✅ All models trained successfully!")

# ================================================================================
# 9. MODEL EVALUATION
# ================================================================================

def comprehensive_evaluation(model, dataloader, device, model_name):
    """Evaluate a binary sentiment classifier over every batch of a loader.

    The model is switched to eval mode and run without gradient tracking.
    Arg-max predictions and softmax probabilities are collected for all
    samples and summarised with accuracy, a classification report and a
    confusion matrix; the headline (macro-averaged) metrics are printed.

    Args:
        model: Callable as ``model(sequences, attention_mask)`` returning a
            2-tuple whose first element is the logits tensor.
        dataloader: Iterable of dict batches providing ``'sequences'``,
            ``'labels'`` and ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.
        model_name: Label used in the progress bar and printed output.

    Returns:
        dict with keys ``'accuracy'``, ``'classification_report'`` (the
        ``output_dict=True`` form), ``'confusion_matrix'``, ``'predictions'``
        and ``'probabilities'`` (lists with one entry per evaluated sample).
    """
    print(f"\n📊 Evaluating {model_name}...")

    # Pin the class ids explicitly: without `labels`, scikit-learn infers the
    # classes from the data, so a split containing a single class would make
    # classification_report reject the two target names and would shrink the
    # confusion matrix to 1x1.
    class_ids = [0, 1]
    class_names = ['Negative', 'Positive']

    model.eval()
    all_predictions = []
    all_labels = []
    all_probabilities = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Evaluating {model_name}"):
            sequences = batch['sequences'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # The second element returned by the model is not needed here.
            logits, _ = model(sequences, attention_mask)
            probabilities = F.softmax(logits, dim=1)
            predictions = torch.argmax(logits, dim=1)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    report = classification_report(
        all_labels,
        all_predictions,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0,  # a class with no predictions scores 0 instead of warning
    )
    cm = confusion_matrix(all_labels, all_predictions, labels=class_ids)

    macro = report['macro avg']
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   F1 Score: {macro['f1-score']:.4f}")
    print(f"   Precision: {macro['precision']:.4f}")
    print(f"   Recall: {macro['recall']:.4f}")

    return {
        'accuracy': accuracy,
        'classification_report': report,
        'confusion_matrix': cm,
        'predictions': all_predictions,
        'probabilities': all_probabilities
    }

print(f"\n{'=' * 80}")
print("📊 COMPREHENSIVE MODEL EVALUATION")
print("=" * 80)

# Score each trained network on the test loader.
lstm_results = comprehensive_evaluation(lstm_model, test_loader, device, "LSTM")
cnn_results = comprehensive_evaluation(cnn_model, test_loader, device, "CNN")
ensemble_results = comprehensive_evaluation(ensemble_model, test_loader, device, "Ensemble")

# Headline metrics per model: accuracy plus the macro-averaged
# F1 / precision / recall taken from each classification report.
models_performance = {
    label: {
        'Accuracy': outcome['accuracy'],
        'F1 Score': outcome['classification_report']['macro avg']['f1-score'],
        'Precision': outcome['classification_report']['macro avg']['precision'],
        'Recall': outcome['classification_report']['macro avg']['recall'],
    }
    for label, outcome in (
        ('LSTM', lstm_results),
        ('CNN', cnn_results),
        ('Ensemble', ensemble_results),
    )
}

# Transpose so that each model is a row and each metric a column.
comparison_df = pd.DataFrame(models_performance).transpose()
print("\n📈 Model Comparison:")
print(comparison_df.round(4))

# The winner is the model with the highest macro F1.
best_model_name = comparison_df['F1 Score'].idxmax()
best_f1_score = comparison_df.at[best_model_name, 'F1 Score']
print(f"\n🏆 Best Model: {best_model_name} (F1: {best_f1_score:.4f})")

# ================================================================================
# 10. VISUALIZATION AND ANALYSIS
# ================================================================================

# Training curves: a 2x3 grid, losses on the top row and accuracies on the
# bottom row, one column per model.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))

trainers = [
    ('LSTM', lstm_trainer),
    ('CNN', cnn_trainer),
    ('Ensemble', ensemble_trainer),
]
colors = ['blue', 'green', 'orange']  # not consumed below; curves use fixed 'b-' / 'r-' styles

# Top row: train vs. validation loss per epoch.
for i, (name, trainer) in enumerate(trainers):
    history = trainer.history
    epochs = range(1, 1 + len(history['train_loss']))
    ax = axes[0, i]
    ax.plot(epochs, history['train_loss'], 'b-', label='Train Loss')
    ax.plot(epochs, history['val_loss'], 'r-', label='Val Loss')
    ax.set_title(f'{name} - Loss Curves')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Bottom row: train vs. validation accuracy per epoch.
for i, (name, trainer) in enumerate(trainers):
    history = trainer.history
    epochs = range(1, 1 + len(history['train_acc']))
    ax = axes[1, i]
    ax.plot(epochs, history['train_acc'], 'b-', label='Train Acc')
    ax.plot(epochs, history['val_acc'], 'r-', label='Val Acc')
    ax.set_title(f'{name} - Accuracy Curves')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(notebook_results_dir / 'training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

# Confusion matrices: one annotated heatmap per model, side by side.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
results = [
    ('LSTM', lstm_results),
    ('CNN', cnn_results),
    ('Ensemble', ensemble_results),
]

for i, (name, result) in enumerate(results):
    cm = result['confusion_matrix']
    ax = axes[i]
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Negative', 'Positive'],
        yticklabels=['Negative', 'Positive'],
        ax=ax,
    )
    ax.set_title(f'{name} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.savefig(notebook_results_dir / 'confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

# ================================================================================
# 11. ATTENTION VISUALIZATION
# ================================================================================

class AttentionVisualizer:
    """Plot and print per-token attention weights produced by a model.

    The wrapped model must expose an ``attention`` attribute and be callable
    as ``model(input_ids, attention_mask, return_attention=True)``, returning
    a ``(logits, attention_weights)`` pair.
    """

    def __init__(self, model, preprocessor, device):
        """Store the model, the text preprocessor and the inference device."""
        self.model = model
        self.preprocessor = preprocessor
        self.device = device

    def visualize_attention(self, text, max_length=64):
        """Show the attention distribution the model assigns to ``text``.

        Draws a heatmap and a bar chart of the weights (the three largest
        bars are coloured red) and prints the prediction together with the
        five most attended words.

        Args:
            text: Raw input string.
            max_length: Forwarded to ``preprocessor.text_to_sequence``.

        Returns:
            None. The method also returns early, after printing a short
            message, when the model has no ``attention`` attribute, when
            preprocessing leaves nothing to plot, or when the model returns
            no attention weights.
        """
        # Check if model has attention
        if not hasattr(self.model, 'attention'):
            print("Model does not have attention mechanism")
            return

        self.model.eval()

        # Preprocess text
        sequence = self.preprocessor.text_to_sequence(text, max_length)
        tokens = self.preprocessor.clean_text(text).split()[:len(sequence)]

        # An empty sequence cannot be fed to the model and an empty token
        # list leaves nothing to draw, so stop before the forward pass.
        if len(sequence) == 0 or not tokens:
            print("No tokens to visualize after preprocessing")
            return

        # Convert to tensor; every position is marked as a real token.
        # NOTE(review): if text_to_sequence pads its output, the padded
        # positions are attended too — confirm against the preprocessor.
        input_tensor = torch.tensor([sequence]).to(self.device)
        attention_mask = torch.ones(1, len(sequence), dtype=torch.bool).to(self.device)

        with torch.no_grad():
            logits, attention_weights = self.model(input_tensor, attention_mask, return_attention=True)

            if attention_weights is None:
                print("No attention weights returned")
                return

            probabilities = F.softmax(logits, dim=1)
            predicted_class = torch.argmax(logits, dim=1).item()
            confidence = probabilities[0, predicted_class].item()

        # Drop the batch dimension and keep only the weights that have a
        # matching display token.
        attention = attention_weights.squeeze(0).cpu().numpy()[:len(tokens)]
        # Tick labels and bar heights must have equal length; trim the labels
        # if the model returned fewer weights than there are tokens.
        tokens = tokens[:len(attention)]
        positions = range(len(tokens))
        predicted_label = "Positive" if predicted_class == 1 else "Negative"

        plt.figure(figsize=(12, 6))

        # Attention heatmap
        plt.subplot(2, 1, 1)
        plt.imshow(attention.reshape(1, -1), cmap='Reds', aspect='auto')
        plt.xticks(positions, tokens, rotation=45, ha='right')
        plt.yticks([])
        plt.title(f'Attention Weights\nPrediction: {predicted_label} '
                  f'(Confidence: {confidence:.3f})')
        plt.colorbar(label='Attention Weight')

        # Attention bar chart
        plt.subplot(2, 1, 2)
        bars = plt.bar(positions, attention, alpha=0.7)
        plt.xticks(positions, tokens, rotation=45, ha='right')
        plt.ylabel('Attention Weight')
        plt.title('Attention Weights by Token')

        # Highlight the (up to) three most attended words
        top_indices = np.argsort(attention)[-3:]
        for idx in top_indices:
            bars[idx].set_color('red')

        plt.tight_layout()
        plt.show()

        # Print top attended words, strongest first
        word_attention_pairs = sorted(
            zip(tokens, attention), key=lambda pair: pair[1], reverse=True
        )

        print(f"Text: '{text}'")
        print(f"Prediction: {predicted_label} (Confidence: {confidence:.3f})")
        print("Top 5 attended words:")
        for rank, (word, weight) in enumerate(word_attention_pairs[:5], start=1):
            print(f"  {rank}. '{word}': {weight:.4f}")

# Attention demo: only runs when the LSTM model exposes an `attention`
# attribute; each sample sentence is visualised in turn.
if hasattr(lstm_model, 'attention'):
    print("\n🔍 Attention Visualization:")
    visualizer = AttentionVisualizer(
        model=lstm_model,
        preprocessor=preprocessor,
        device=device,
    )

    sample_texts = [
        "This movie was absolutely fantastic and amazing!",
        "What a terrible and boring film this was.",
        "The movie was okay but the acting was great.",
    ]

    for text in sample_texts:
        visualizer.visualize_attention(text)
        # Separator between consecutive samples.
        print(f"{'-' * 60}")

# ================================================================================
# 12. PRODUCTION DEPLOYMENT
# ================================================================================

class ProductionSentimentAnalyzer:
    """Inference wrapper that turns raw text into a sentiment result dict.

    The wrapped model must be callable as
    ``model(input_ids, attention_mask, return_attention=True)`` and return a
    2-tuple whose first element is a ``(1, 2)`` logits tensor; class index 1
    is reported as positive, index 0 as negative.
    """

    def __init__(self, model, preprocessor, device, confidence_threshold=0.8,
                 max_length=128):
        """Store the collaborators and put the model into eval mode.

        Args:
            model: Trained classifier (see class docstring for the call shape).
            preprocessor: Object providing ``text_to_sequence(text, max_length=...)``.
            device: Device the input tensors are moved to.
            confidence_threshold: Predictions with confidence strictly above
                this value are flagged ``high_confidence``.
            max_length: Value forwarded to ``text_to_sequence`` as
                ``max_length``; defaults to the previously hard-coded 128.
        """
        self.model = model
        self.preprocessor = preprocessor
        self.device = device
        self.confidence_threshold = confidence_threshold
        self.max_length = max_length
        self.model.eval()

        print(f"🚀 Production analyzer initialized with confidence threshold: {confidence_threshold}")

    @staticmethod
    def _error_result(text, message):
        """Build the result returned when no prediction could be made."""
        return {
            'text': text,
            'sentiment': 'unknown',
            'confidence': 0.0,
            # Present on every result so consumers can read it unconditionally.
            'high_confidence': False,
            'error': message
        }

    @staticmethod
    def _explain(sentiment, confidence):
        """Return a human-readable confidence band for a prediction."""
        if confidence > 0.9:
            return f"Very high confidence {sentiment} sentiment"
        if confidence > 0.8:
            return f"High confidence {sentiment} sentiment"
        if confidence > 0.6:
            return f"Moderate confidence {sentiment} sentiment"
        return "Low confidence - manual review recommended"

    def predict(self, text, return_explanation=False):
        """Predict sentiment for a single text.

        Args:
            text: Raw input string.
            return_explanation: When true, add an ``'explanation'`` entry
                describing the confidence band.

        Returns:
            dict with ``'text'``, ``'sentiment'``, ``'confidence'`` and
            ``'high_confidence'``. Successful predictions also carry
            ``'probabilities'`` (and ``'explanation'`` if requested). Failures
            carry ``'error'`` with ``'sentiment'`` set to ``'unknown'``; this
            method does not raise for ordinary exceptions.
        """
        # Deliberate catch-all: this is the service boundary, so any failure
        # is reported in the result instead of propagating to the caller.
        try:
            # Preprocess
            sequence = self.preprocessor.text_to_sequence(text, max_length=self.max_length)

            if len(sequence) == 0:
                return self._error_result(text, 'Empty sequence after preprocessing')

            # Convert to a single-item batch; every position counts as a real token.
            input_tensor = torch.tensor([sequence]).to(self.device)
            attention_mask = torch.ones(1, len(sequence), dtype=torch.bool).to(self.device)

            with torch.no_grad():
                # The second return value (attention weights) is not used here.
                logits, _ = self.model(input_tensor, attention_mask, return_attention=True)
                probabilities = F.softmax(logits, dim=1)
                predicted_class = torch.argmax(logits, dim=1).item()
                confidence = probabilities[0, predicted_class].item()

            result = {
                'text': text,
                'sentiment': 'positive' if predicted_class == 1 else 'negative',
                'confidence': confidence,
                'high_confidence': confidence > self.confidence_threshold,
                'probabilities': {
                    'negative': probabilities[0, 0].item(),
                    'positive': probabilities[0, 1].item()
                }
            }

            # Add explanation if requested
            if return_explanation:
                result['explanation'] = self._explain(result['sentiment'], confidence)

            return result

        except Exception as e:
            return self._error_result(text, str(e))

    def predict_batch(self, texts, return_explanation=False):
        """Predict sentiment for multiple texts, one at a time.

        Args:
            texts: Iterable of raw input strings.
            return_explanation: Forwarded to :meth:`predict` for every text.

        Returns:
            list of result dicts in the same order as ``texts``.
        """
        return [
            self.predict(text, return_explanation=return_explanation)
            for text in tqdm(texts, desc="Processing batch")
        ]

# Build the production analyzer around the ensemble model.
# NOTE(review): the ensemble is hard-wired here; `best_model_name` computed
# during evaluation is not consulted when choosing the model.
production_analyzer = ProductionSentimentAnalyzer(
    ensemble_model,
    preprocessor,
    device,
    confidence_threshold=0.8,
)

# Smoke-test inputs.
test_texts = [
    "This movie is absolutely fantastic! Great acting and amazing plot.",
    "Terrible movie, waste of time and money. Completely boring.",
    "The film was okay, nothing special but not bad either.",
    "Brilliant cinematography and outstanding performances!",
    "I fell asleep during this confusing and dull movie.",
]

print("\n🧪 Testing Production System:")
print("=" * 60)

for text in test_texts:
    result = production_analyzer.predict(text, return_explanation=True)

    if 'error' in result:
        # Failure path: show the text preview and the reported error.
        print(f"Error processing: '{text[:50]}...'")
        print(f"Error: {result['error']}")
    else:
        # Success path: a check mark flags predictions above the threshold.
        confidence_indicator = "✅" if result['high_confidence'] else "⚠️"
        print(f"Text: '{text[:50]}...'")
        print(f"Sentiment: {result['sentiment'].upper()} {confidence_indicator}")
        print(f"Confidence: {result['confidence']:.3f}")
        if 'explanation' in result:
            print(f"Explanation: {result['explanation']}")
    print("-" * 60)

# ================================================================================
# 13. RESULTS SUMMARY
# ================================================================================

print(f"\n{'=' * 80}")
print("🎉 ADVANCED SENTIMENT ANALYSIS PROJECT COMPLETED")
print("=" * 80)

# Parameter counts are looked up first so the report template stays readable.
lstm_param_count = lstm_model.get_model_info()['total_parameters']
cnn_param_count = cnn_model.get_model_info()['total_parameters']

# Final report: metrics of the best model (by macro F1), model sizes, best
# validation losses and the output directory.
final_summary = f"""
📊 **PROJECT SUMMARY**

🎯 **PERFORMANCE ACHIEVEMENTS**
   • Best Model: {best_model_name}
   • Accuracy: {comparison_df.at[best_model_name, 'Accuracy']:.3f}
   • F1-Score: {comparison_df.at[best_model_name, 'F1 Score']:.3f}
   • Precision: {comparison_df.at[best_model_name, 'Precision']:.3f}
   • Recall: {comparison_df.at[best_model_name, 'Recall']:.3f}

🧠 **MODELS IMPLEMENTED**
   • LSTM with Attention: {lstm_param_count:,} parameters
   • CNN Text Classifier: {cnn_param_count:,} parameters  
   • Ensemble Method: Weighted combination

📈 **TRAINING RESULTS**
   • LSTM Best Validation Loss: {lstm_best_loss:.4f}
   • CNN Best Validation Loss: {cnn_best_loss:.4f}
   • Ensemble Best Validation Loss: {ensemble_best_loss:.4f}

🔍 **FEATURES IMPLEMENTED**
   ✅ Advanced text preprocessing pipeline
   ✅ Multiple neural architectures (LSTM + CNN + Ensemble)
   ✅ Attention mechanism with visualization
   ✅ Comprehensive training framework
   ✅ Model evaluation and comparison
   ✅ Production-ready deployment system

🚀 **PRODUCTION READINESS**
   ✅ Real-time inference pipeline
   ✅ Confidence scoring and thresholding
   ✅ Batch processing capabilities
   ✅ Error handling and validation
   ✅ Comprehensive evaluation metrics

📦 **DELIVERABLES**
   ✅ Trained models with state preservation
   ✅ Production inference system
   ✅ Attention visualization tools
   ✅ Comprehensive evaluation results
   ✅ Training curves and analysis plots

📁 **Results saved to: {notebook_results_dir}**
"""

print(final_summary)

# Persist everything in a single checkpoint: model weights, the preprocessor,
# model metadata, the training configuration and the evaluation results.
# NOTE(review): the preprocessor object is stored as-is (pickled by
# torch.save), so loading this file needs its class definition available.
checkpoint = {
    'lstm_model': lstm_model.state_dict(),
    'cnn_model': cnn_model.state_dict(),
    'ensemble_model': ensemble_model.state_dict(),
    'preprocessor': preprocessor,
    'model_configs': {
        'lstm': lstm_model.get_model_info(),
        'cnn': cnn_model.get_model_info(),
        'ensemble': ensemble_model.get_model_info(),
    },
    'training_config': training_config,
    'evaluation_results': {
        'lstm': lstm_results,
        'cnn': cnn_results,
        'ensemble': ensemble_results,
    },
}
checkpoint_path = notebook_results_dir / 'sentiment_analysis_complete.pth'
torch.save(checkpoint, checkpoint_path)

# The metric comparison table is written alongside as CSV.
comparison_csv_path = notebook_results_dir / 'model_comparison.csv'
comparison_df.to_csv(comparison_csv_path)

print("✅ All models and results saved successfully!")
print("🎊 Sentiment Analysis System Complete! 🎊")