In [3]:
import mlflow
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import pickle

In [4]:
mlflow.set_tracking_uri("http://ec2-13-204-157-189.ap-south-1.compute.amazonaws.com:5000/")

In [5]:
df = pd.read_csv('../preprocessing_eda/data/processed/Preprocessed_Data.csv')
df.head()

Unnamed: 0,clean_text,category
0,modi promised “minimum government maximum gove...,-1.0
1,talk nonsense continue drama vote modi,0.0
2,say vote modi welcome bjp told rahul main camp...,1.0
3,asking supporter prefix chowkidar name modi gr...,1.0
4,answer among powerful world leader today trump...,1.0


In [6]:
df = df.dropna(subset=['clean_text', 'category'])

In [7]:
mlflow.set_experiment("Pytorch LSTM")

MlflowException: Cannot set a deleted experiment 'Pytorch LSTM' as the active experiment. You can restore the experiment, or permanently delete the experiment to create a new one.

In [8]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels[idx]
        
        # Tokenize and pad/truncate
        tokens = self.tokenizer.texts_to_sequences([text])[0]
        
        # Pad or truncate to max_length
        if len(tokens) > self.max_length:
            tokens = tokens[:self.max_length]
        else:
            tokens = tokens + [0] * (self.max_length - len(tokens))
        
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)


In [9]:
# Simple Tokenizer class
class SimpleTokenizer:
    def __init__(self, max_features=10000):
        self.max_features = max_features
        self.word_to_index = {}
        self.index_to_word = {}
        
    def fit_on_texts(self, texts):
        # Count word frequencies
        word_counts = Counter()
        for text in texts:
            words = text.lower().split()
            word_counts.update(words)
        
        # Create vocabulary with most common words
        most_common = word_counts.most_common(self.max_features - 1)
        
        # Reserve index 0 for padding
        self.word_to_index = {'<PAD>': 0}
        self.index_to_word = {0: '<PAD>'}
        
        for i, (word, _) in enumerate(most_common, 1):
            self.word_to_index[word] = i
            self.index_to_word[i] = word
    
    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            words = text.lower().split()
            sequence = [self.word_to_index.get(word, 0) for word in words]
            sequences.append(sequence)
        return sequences

In [10]:
# PyTorch LSTM Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_classes, dropout_rate, bidirectional=True):
        super(SentimentLSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # LSTM layer
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_rate if num_layers > 1 else 0,
            bidirectional=bidirectional
        )
        
        # Calculate input dimension for fully connected layer
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        
        # Fully connected layers
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(lstm_output_dim, 64)
        self.fc2 = nn.Linear(64, num_classes)
        
    def forward(self, x):
        # Embedding
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        
        # LSTM
        lstm_out, (hidden, cell) = self.lstm(embedded)
        
        # Use the last hidden state (for classification)
        if self.bidirectional:
            # Concatenate forward and backward hidden states
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden = hidden[-1,:,:]
        
        # Fully connected layers
        output = self.dropout(hidden)
        output = F.relu(self.fc1(output))
        output = self.dropout(output)
        output = self.fc2(output)
        
        return output

In [11]:
def run_experiment_pytorch_lstm(hidden_units, dropout_rate, num_layers=1, bidirectional=True):
    """
    Run PyTorch LSTM experiment with varying architecture parameters
    
    Args:
        hidden_units (int): Number of LSTM hidden units
        dropout_rate (float): Dropout rate for regularization
        num_layers (int): Number of LSTM layers
        bidirectional (bool): Whether to use bidirectional LSTM
    """
    
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Tokenization parameters
    max_features = 10000  # Maximum vocabulary size
    maxlen = 100  # Maximum sequence length
    embedding_dim = 128
    
    # Training parameters
    batch_size = 32
    epochs = 50
    learning_rate = 0.001
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_text'], df['category'], 
        test_size=0.2, random_state=42, stratify=df['category']
    )
    
    # Further split training into train/validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )
    
    # Encode labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)
    y_test_encoded = label_encoder.transform(y_test)
    
    num_classes = len(label_encoder.classes_)
    
    # Tokenization
    tokenizer = SimpleTokenizer(max_features=max_features)
    tokenizer.fit_on_texts(X_train)
    
    # Create datasets
    train_dataset = TextDataset(X_train, y_train_encoded, tokenizer, maxlen)
    val_dataset = TextDataset(X_val, y_val_encoded, tokenizer, maxlen)
    test_dataset = TextDataset(X_test, y_test_encoded, tokenizer, maxlen)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    with mlflow.start_run() as run:
        # Set tags for the experiment and run
        lstm_type = "BiLSTM" if bidirectional else "LSTM"
        mlflow.set_tag("mlflow.runName", f"PyTorch_{lstm_type}_units_{hidden_units}_layers_{num_layers}_dropout_{dropout_rate}")
        mlflow.set_tag("experiment_type", "pytorch_neural_network")
        mlflow.set_tag("model_type", f"PyTorch{lstm_type}Classifier")
        
        # Add description
        mlflow.set_tag("description", 
                      f"PyTorch {lstm_type} with {hidden_units} units, {num_layers} layers, dropout={dropout_rate}")
        
        # Log preprocessing parameters
        mlflow.log_param("tokenizer_max_features", max_features)
        mlflow.log_param("sequence_maxlen", maxlen)
        mlflow.log_param("vocab_size", len(tokenizer.word_to_index))
        
        # Log model architecture parameters
        mlflow.log_param("embedding_dim", embedding_dim)
        mlflow.log_param("lstm_units", hidden_units)
        mlflow.log_param("num_layers", num_layers)
        mlflow.log_param("dropout_rate", dropout_rate)
        mlflow.log_param("bidirectional", bidirectional)
        mlflow.log_param("num_classes", num_classes)
        mlflow.log_param("device", str(device))
        
        # Log training parameters
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("optimizer", "Adam")
        
        # Initialize model
        model = SentimentLSTM(
            vocab_size=len(tokenizer.word_to_index),
            embedding_dim=embedding_dim,
            hidden_dim=hidden_units,
            num_layers=num_layers,
            num_classes=num_classes,
            dropout_rate=dropout_rate,
            bidirectional=bidirectional
        ).to(device)
        
        # Loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        
        # Training history
        train_losses = []
        train_accuracies = []
        val_losses = []
        val_accuracies = []
        
        best_val_accuracy = 0
        patience = 5
        patience_counter = 0
        
        # Training loop
        for epoch in range(epochs):
            # Training phase
            model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0
            
            for batch_texts, batch_labels in train_loader:
                batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                
                optimizer.zero_grad()
                outputs = model(batch_texts)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()
                
                train_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                train_total += batch_labels.size(0)
                train_correct += (predicted == batch_labels).sum().item()
            
            # Validation phase
            model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            
            with torch.no_grad():
                for batch_texts, batch_labels in val_loader:
                    batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                    outputs = model(batch_texts)
                    loss = criterion(outputs, batch_labels)
                    
                    val_loss += loss.item()
                    _, predicted = torch.max(outputs.data, 1)
                    val_total += batch_labels.size(0)
                    val_correct += (predicted == batch_labels).sum().item()
            
            # Calculate metrics
            train_loss_avg = train_loss / len(train_loader)
            train_acc = train_correct / train_total
            val_loss_avg = val_loss / len(val_loader)
            val_acc = val_correct / val_total
            
            train_losses.append(train_loss_avg)
            train_accuracies.append(train_acc)
            val_losses.append(val_loss_avg)
            val_accuracies.append(val_acc)
            
            # Early stopping
            if val_acc > best_val_accuracy:
                best_val_accuracy = val_acc
                patience_counter = 0
                # Save best model
                torch.save(model.state_dict(), 'best_model.pth')
            else:
                patience_counter += 1
                
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break
        
        # Load best model
        model.load_state_dict(torch.load('best_model.pth'))
        
        # Log training metrics
        mlflow.log_metric("actual_epochs", epoch + 1)
        mlflow.log_metric("final_train_loss", train_losses[-1])
        mlflow.log_metric("final_val_loss", val_losses[-1])
        mlflow.log_metric("final_train_accuracy", train_accuracies[-1])
        mlflow.log_metric("final_val_accuracy", val_accuracies[-1])
        mlflow.log_metric("best_val_accuracy", best_val_accuracy)
        
        # Plot training history
        plt.figure(figsize=(12, 4))
        
        plt.subplot(1, 2, 1)
        plt.plot(train_losses, label='Training Loss')
        plt.plot(val_losses, label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        
        plt.subplot(1, 2, 2)
        plt.plot(train_accuracies, label='Training Accuracy')
        plt.plot(val_accuracies, label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        
        plt.tight_layout()
        plt.savefig("training_history.png")
        mlflow.log_artifact("training_history.png")
        plt.close()
        
        # Test evaluation
        model.eval()
        all_predictions = []
        all_labels = []
        
        with torch.no_grad():
            for batch_texts, batch_labels in test_loader:
                batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                outputs = model(batch_texts)
                _, predicted = torch.max(outputs.data, 1)
                
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(batch_labels.cpu().numpy())
        
        # Calculate test accuracy
        test_accuracy = accuracy_score(all_labels, all_predictions)
        mlflow.log_metric("test_accuracy", test_accuracy)
        
        # Log detailed classification metrics
        classification_rep = classification_report(
            all_labels, all_predictions, 
            target_names=label_encoder.classes_, 
            output_dict=True
        )
        
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)
        
        # Create and log confusion matrix
        conf_matrix = confusion_matrix(all_labels, all_predictions)
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", 
                   xticklabels=label_encoder.classes_, 
                   yticklabels=label_encoder.classes_)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix: PyTorch {lstm_type}, units={hidden_units}, layers={num_layers}")
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()
        
        # Log the model
        mlflow.pytorch.log_model(
            model, 
            f"pytorch_lstm_model_units_{hidden_units}_layers_{num_layers}",
            registered_model_name=None
        )
        
        # Log tokenizer and label encoder for later use
        with open("tokenizer.pkl", "wb") as f:
            pickle.dump(tokenizer, f)
        with open("label_encoder.pkl", "wb") as f:
            pickle.dump(label_encoder, f)
            
        mlflow.log_artifact("tokenizer.pkl")
        mlflow.log_artifact("label_encoder.pkl")
        
        print(f"Completed: PyTorch {lstm_type} with {hidden_units} units, {num_layers} layers, "
              f"dropout={dropout_rate}, Test Accuracy: {test_accuracy:.4f}")



In [12]:
# Experiment configurations
pytorch_configs = [
    {"hidden_units": 64, "dropout_rate": 0.3, "num_layers": 1, "bidirectional": False},
    {"hidden_units": 128, "dropout_rate": 0.3, "num_layers": 1, "bidirectional": False},
    {"hidden_units": 64, "dropout_rate": 0.5, "num_layers": 1, "bidirectional": True},
]

# Run experiments
for config in pytorch_configs:
    run_experiment_pytorch_lstm(**config)

Using device: cpu
Early stopping at epoch 14




Completed: PyTorch LSTM with 64 units, 1 layers, dropout=0.3, Test Accuracy: 0.8931
🏃 View run PyTorch_LSTM_units_64_layers_1_dropout_0.3 at: http://ec2-13-204-157-189.ap-south-1.compute.amazonaws.com:5000/#/experiments/0/runs/80ab7f8140af4f6f8eae450df48f492d
🧪 View experiment at: http://ec2-13-204-157-189.ap-south-1.compute.amazonaws.com:5000/#/experiments/0
Using device: cpu
🏃 View run PyTorch_LSTM_units_128_layers_1_dropout_0.3 at: http://ec2-13-204-157-189.ap-south-1.compute.amazonaws.com:5000/#/experiments/0/runs/b0e2efc52533425cb9b853814779f95a
🧪 View experiment at: http://ec2-13-204-157-189.ap-south-1.compute.amazonaws.com:5000/#/experiments/0


KeyboardInterrupt: 

### 1 model is enough its taking way too long on my cpu

In [17]:
mlflow.set_experiment("Pytorch CPU optimized LSTM")

<Experiment: artifact_location='s3://mlflow-bucket-241103/375501789462796807', creation_time=1756507494887, experiment_id='375501789462796807', last_update_time=1756507494887, lifecycle_stage='active', name='Pytorch CPU optimized LSTM', tags={}>

In [20]:
def run_quick_experiment_pytorch_lstm(hidden_units, dropout_rate, bidirectional=False):
    """
    CPU-optimized version for quick experimentation
    """
    
    device = torch.device('cpu')  # Force CPU
    
    # REDUCED PARAMETERS FOR CPU
    max_features = 5000   # Reduced from 10,000
    maxlen = 50          # Reduced from 100
    embedding_dim = 64   # Reduced from 128
    batch_size = 64      # Increased for efficiency
    epochs = 10          # Reduced from 50
    learning_rate = 0.001
    
    print(f"Starting experiment: {hidden_units} units, dropout={dropout_rate}, bidirectional={bidirectional}")
    
    # Split the data (clean NaN values first)
    df_clean = df.dropna(subset=['clean_text', 'category'])
    
    X_train, X_test, y_train, y_test = train_test_split(
        df_clean['clean_text'], df_clean['category'], 
        test_size=0.2, random_state=42, stratify=df_clean['category']
    )
    
    # Further split training into train/validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )
    
    # Encode labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)
    y_test_encoded = label_encoder.transform(y_test)
    
    num_classes = len(label_encoder.classes_)
    
    # Tokenization
    tokenizer = SimpleTokenizer(max_features=max_features)
    tokenizer.fit_on_texts(X_train)
    
    # Create datasets
    train_dataset = TextDataset(X_train, y_train_encoded, tokenizer, maxlen)
    val_dataset = TextDataset(X_val, y_val_encoded, tokenizer, maxlen)
    test_dataset = TextDataset(X_test, y_test_encoded, tokenizer, maxlen)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    with mlflow.start_run() as run:
        lstm_type = "BiLSTM" if bidirectional else "LSTM"
        mlflow.set_tag("mlflow.runName", f"CPU_Quick_{lstm_type}_units_{hidden_units}_dropout_{dropout_rate}")
        mlflow.set_tag("experiment_type", "pytorch_cpu_quick")
        mlflow.set_tag("model_type", f"PyTorch{lstm_type}Classifier")
        
        # Log parameters
        mlflow.log_param("device", "CPU")
        mlflow.log_param("max_features", max_features)
        mlflow.log_param("maxlen", maxlen)
        mlflow.log_param("embedding_dim", embedding_dim)
        mlflow.log_param("lstm_units", hidden_units)
        mlflow.log_param("dropout_rate", dropout_rate)
        mlflow.log_param("bidirectional", bidirectional)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("epochs", epochs)
        
        # Initialize model
        model = SentimentLSTM(
            vocab_size=len(tokenizer.word_to_index),
            embedding_dim=embedding_dim,
            hidden_dim=hidden_units,
            num_layers=1,  # Single layer for CPU
            num_classes=num_classes,
            dropout_rate=dropout_rate,
            bidirectional=bidirectional
        ).to(device)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        
        # Training with progress
        train_losses = []
        val_accuracies = []
        
        for epoch in range(epochs):
            # Training
            model.train()
            total_loss = 0
            
            for batch_idx, (batch_texts, batch_labels) in enumerate(train_loader):
                batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                
                optimizer.zero_grad()
                outputs = model(batch_texts)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
                
                # Progress indicator
                if batch_idx % 50 == 0:
                    print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item():.4f}")
            
            # Validation
            model.eval()
            val_correct = 0
            val_total = 0
            
            with torch.no_grad():
                for batch_texts, batch_labels in val_loader:
                    batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                    outputs = model(batch_texts)
                    _, predicted = torch.max(outputs.data, 1)
                    val_total += batch_labels.size(0)
                    val_correct += (predicted == batch_labels).sum().item()
            
            val_acc = val_correct / val_total
            train_losses.append(total_loss / len(train_loader))
            val_accuracies.append(val_acc)
            
            print(f"Epoch {epoch+1}: Train Loss: {train_losses[-1]:.4f}, Val Acc: {val_acc:.4f}")
        
        # Test evaluation
        model.eval()
        test_correct = 0
        test_total = 0
                
        with torch.no_grad():
            for batch_idx, (batch_texts, batch_labels) in enumerate(test_loader):
                # Progress indicator
                if batch_idx % 20 == 0:
                    print(f"Batch {batch_idx + 1}/{len(test_loader)}")
                
                batch_texts = batch_texts.to(device, non_blocking=True)
                batch_labels = batch_labels.to(device, non_blocking=True)
                
                outputs = model(batch_texts)
                predicted = torch.argmax(outputs, dim=1)  # More efficient than torch.max
                
                test_total += batch_labels.size(0)
                test_correct += (predicted == batch_labels).sum().item()
                
                # Clear variables to prevent memory buildup
                del outputs, predicted
                
                # Periodic memory cleanup
                if batch_idx % 50 == 0:
                    torch.cuda.empty_cache() if torch.cuda.is_available() else None
        # Quick CPU experiments - should run in 30-45 minutes total
quick_configs = [
    {"hidden_units": 128, "dropout_rate": 0.3, "bidirectional": False},
    {"hidden_units": 64, "dropout_rate": 0.5, "bidirectional": True},
]

# Run quick experiments
for config in quick_configs:
    run_quick_experiment_pytorch_lstm(**config)

Starting experiment: 128 units, dropout=0.3, bidirectional=False
Epoch 1/10, Batch 0/1997, Loss: 1.1058
Epoch 1/10, Batch 50/1997, Loss: 1.0154
Epoch 1/10, Batch 100/1997, Loss: 1.0868
Epoch 1/10, Batch 150/1997, Loss: 1.1036
Epoch 1/10, Batch 200/1997, Loss: 1.0704
Epoch 1/10, Batch 250/1997, Loss: 1.0474
Epoch 1/10, Batch 300/1997, Loss: 1.0135
Epoch 1/10, Batch 350/1997, Loss: 1.1280
Epoch 1/10, Batch 400/1997, Loss: 1.0494
Epoch 1/10, Batch 450/1997, Loss: 1.0382
Epoch 1/10, Batch 500/1997, Loss: 1.0894
Epoch 1/10, Batch 550/1997, Loss: 1.0255
Epoch 1/10, Batch 600/1997, Loss: 1.0379
Epoch 1/10, Batch 650/1997, Loss: 0.9841
Epoch 1/10, Batch 700/1997, Loss: 0.9188
Epoch 1/10, Batch 750/1997, Loss: 0.9340
Epoch 1/10, Batch 800/1997, Loss: 0.9356
Epoch 1/10, Batch 850/1997, Loss: 1.0324
Epoch 1/10, Batch 900/1997, Loss: 0.8799
Epoch 1/10, Batch 950/1997, Loss: 0.7816
Epoch 1/10, Batch 1000/1997, Loss: 0.7957
Epoch 1/10, Batch 1050/1997, Loss: 0.7797
Epoch 1/10, Batch 1100/1997, Loss: