In [None]:
!pip install ray
!pip install stable-baselines3
!pip install shimmy
!pip install gymnasium



In [None]:
# Cell 0: Setup and Initialization
# Purpose: Set up environment, initialize Ray, load preprocessed data.

import torch
import ray
import psutil
import subprocess
from datetime import datetime
import logging
import json
import os
from google.colab import drive
import pandas as pd
import numpy as np
import gymnasium as gym

# --- Google Drive ------------------------------------------------------------
# Mount Drive (forcing a remount) and make sure the project folder exists
# before any later cell reads from or writes to it.
drive_path = '/content/drive/MyDrive/Sentiment_Project'
drive.mount('/content/drive', force_remount=True)
os.makedirs(drive_path, exist_ok=True)
print(f"Drive mounted at {drive_path}")

# --- Ray ---------------------------------------------------------------------
# Start a local Ray instance only when this kernel does not already have one.
if ray.is_initialized():
    print("Ray cluster already initialized, skipping reinitialization.")
else:
    ray.init(address='local', ignore_reinit_error=True, logging_level=logging.INFO)
    print("Ray cluster initialized in local mode.")

# --- Logging -----------------------------------------------------------------
# Root logger at INFO with a timestamped format; `logger` is the notebook-wide handle.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Define Coordinator Agent
@ray.remote(num_cpus=1, num_gpus=0 if not torch.cuda.is_available() else 0.1)
class CoordinatorAgent:
    """Ray actor that keeps a timestamped message log and samples host resource usage.

    Messages are held in memory (``self.logs``) and appended to
    ``coordinator_logs.txt`` inside ``drive_path``.
    """

    def __init__(self):
        self.logs = []
        self.start_time = datetime.now()
        self.log_file = os.path.join(drive_path, 'coordinator_logs.txt')
        logger.info("Coordinator Agent initialized.")
        self._check_log_file()

    def _check_log_file(self):
        """Trim the log file to its last 1000 lines once it exceeds 10 MiB."""
        if os.path.exists(self.log_file) and os.path.getsize(self.log_file) > 10 * 1024 * 1024:
            with open(self.log_file, 'r') as f:
                lines = f.readlines()
            with open(self.log_file, 'w') as f:
                f.writelines(lines[-1000:])
            logger.info("Log file rotated due to size limit.")

    def log_message(self, message):
        """Record ``message`` with a timestamp in memory, in the logger and in the log file.

        A failure to write the file is logged and otherwise ignored, so logging
        never interrupts the caller.

        Returns:
            The formatted ``"[timestamp] message"`` entry.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        self.logs.append(log_entry)
        logger.info(log_entry)
        try:
            with open(self.log_file, 'a') as f:
                f.write(log_entry + '\n')
        except Exception as e:
            logger.error(f"Failed to write to log file: {e}")
        return log_entry

    def _parse_gpu_query(self, raw_output):
        """Turn ``nvidia-smi --query-gpu=... --format=csv`` output into three floats.

        nvidia-smi prints one CSV row per GPU, so only the first non-empty row is
        parsed; splitting the whole output on commas would glue the last field of
        one row to the first field of the next. Fields that are not numeric
        (for example ``[N/A]``) count as 0.0 instead of aborting the sample.

        Returns:
            ``(utilization, memory_used, memory_total)`` with the same defaults
            the caller used before (0.0, 0.0, 1.0) for missing fields.
        """
        rows = [row for row in raw_output.strip().splitlines() if row.strip()]
        values = []
        if rows:
            for field in rows[0].split(','):
                field = field.strip()
                if not field:
                    continue
                try:
                    values.append(float(field))
                except ValueError:
                    values.append(0.0)
        utilization = values[0] if values else 0.0
        memory_used = values[1] if len(values) > 1 else 0.0
        memory_total = values[2] if len(values) > 2 else 1.0
        return utilization, memory_used, memory_total

    def get_resource_usage(self):
        """Sample CPU, RAM and (when CUDA is available) GPU usage.

        On multi-GPU hosts the GPU figures describe the first GPU reported by
        nvidia-smi.

        Returns:
            A dict of percentages plus a timestamp, or ``{"error": <message>}``
            if sampling raised.
        """
        try:
            # interval=1 blocks for one second to measure CPU utilisation over that window.
            cpu_usage = psutil.cpu_percent(interval=1)
            ram = psutil.virtual_memory()
            ram_usage = ram.percent
            gpu_usage = 0.0
            memory_usage = 0.0
            if torch.cuda.is_available():
                gpu_query = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total', '--format=csv,noheader,nounits'],
                                          stdout=subprocess.PIPE, text=True, timeout=5)
                gpu_usage, memory_used, memory_total = self._parse_gpu_query(gpu_query.stdout)
                memory_usage = (memory_used / memory_total * 100) if memory_total > 0 else 0.0
            resource_dict = {
                "cpu_usage (%)": cpu_usage,
                "ram_usage (%)": ram_usage,
                "gpu_usage (%)": gpu_usage,
                "gpu_memory_usage (%)": memory_usage,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            logger.info(f"Resource usage: {json.dumps(resource_dict)}")
            return resource_dict
        except Exception as e:
            logger.error(f"Resource monitoring error: {e}")
            return {"error": str(e)}

    def get_logs(self):
        """Return the in-memory list of formatted log entries."""
        return self.logs

# Instantiate Coordinator Agent
coordinator = CoordinatorAgent.remote()

# Load preprocessed data
preprocessed_path = os.path.join(drive_path, 'preprocessed_data.pt')
try:
    # SECURITY NOTE(review): weights_only=False unpickles arbitrary Python objects,
    # so this file must come from a trusted source (here: the project's own Drive folder).
    data = torch.load(preprocessed_path, weights_only=False)
    input_ids = data['inputs']['input_ids']
    attention_mask = data['inputs']['attention_mask']
    labels = data['labels']
    # Embeddings are optional and not used currently: a file without them must not
    # discard the real inputs/labels and fall through to the dummy data below.
    try:
        embeddings = data['embeddings']
    except KeyError:
        embeddings = None
        logger.info("Preprocessed file has no 'embeddings' entry; continuing without embeddings.")
    print(f"Loaded preprocessed data: {len(labels)} samples")
    print(f"Input tensor shape: {input_ids.shape}")
    if embeddings is not None:
        print(f"Embedding shape: {embeddings.shape}")
except Exception as e:
    # Best-effort fallback so the rest of the notebook can still execute end to end.
    logger.warning(f"Failed to load preprocessed data: {e}. Using dummy data.")
    input_ids = np.zeros((100, 128), dtype=np.int32)
    attention_mask = np.ones((100, 128), dtype=np.int32)
    labels = np.random.randint(0, 2, 100)
    embeddings = np.zeros((100, 768))  # Dummy embeddings

# Original dataset (metadata only). If it cannot be read, continue with an
# empty frame that still carries the expected column names.
dataset_path = os.path.join(drive_path, 'processed_dataset.csv')
try:
    df = pd.read_csv(dataset_path, encoding='utf-8')
except Exception as e:
    logger.warning(f"Failed to load dataset: {e}. Using empty DataFrame.")
    df = pd.DataFrame(columns=['review', 'sentiment'])
else:
    print(f"Loaded original dataset: {len(df)} rows")

# Target accuracy comes from data_analysis_report.json; a missing file or a
# missing key both fall back to 0.95.
report_path = os.path.join(drive_path, 'data_analysis_report.json')
try:
    with open(report_path, 'r') as f:
        analysis_report = json.load(f)
    target_accuracy = analysis_report['target_accuracy']
except Exception as e:
    logger.warning(f"Failed to load target accuracy: {e}. Using default 0.95.")
    target_accuracy = 0.95
else:
    print(f"Target accuracy: {target_accuracy}")

# Run-wide switches consumed by the model-selection cells.
gpu_available = torch.cuda.is_available()
intent = 'performance'  # Default, can be adjusted

Mounted at /content/drive
Drive mounted at /content/drive/MyDrive/Sentiment_Project


2025-05-28 23:41:00,212	INFO worker.py:1888 -- Started a local Ray instance.


Ray cluster initialized in local mode.
Loaded preprocessed data: 50000 samples
Input tensor shape: (50000, 128)
Embedding shape: (50000, 768)
Loaded original dataset: 50000 rows
Target accuracy: 0.92


In [None]:
# Cell 1: Imports and Setup
# Purpose: Import additional libraries and configure the environment.

from torch.utils.data import Dataset, DataLoader
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, BertForSequenceClassification, \
    RobertaForSequenceClassification
from torch.optim import AdamW # Import AdamW from torch.optim
import joblib
from tqdm import tqdm # Import tqdm for progress bars


In [None]:
# Cell 2: Data Preprocessing (Fallback)
# Purpose: Preprocess dataset if preprocessed data is unavailable, with dynamic column detection.

import numpy as np
from transformers import DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
import logging
import pandas as pd
import os

# Ensure logger is defined
# NOTE(review): Cell 0 already calls logging.basicConfig with the same arguments.
# Per the standard-library documentation a repeated call leaves an already
# configured root logger unchanged, so this line mainly matters when this cell
# is executed on its own.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Rebinds the notebook-level `logger` used by the rest of this cell.
logger = logging.getLogger(__name__)

# Define SentimentDataset
class SentimentDataset(Dataset):
    """Map-style dataset exposing ``input_ids``, ``attention_mask`` and ``labels`` as long tensors.

    Construction never raises: if conversion fails or the three inputs disagree
    in length, the error is logged and the dataset ends up empty.
    """

    def __init__(self, input_ids, attention_mask, labels):
        def to_long(values, make_placeholder):
            # None stands for "no data": use the empty placeholder instead of converting.
            if values is None:
                return make_placeholder()
            return torch.tensor(values, dtype=torch.long)

        try:
            self.input_ids = to_long(input_ids, lambda: torch.empty(0))
            self.attention_mask = to_long(attention_mask, lambda: torch.ones(0, 128))
            self.labels = to_long(labels, lambda: torch.empty(0))
            n_ids = len(self.input_ids)
            n_mask = len(self.attention_mask)
            n_labels = len(self.labels)
            if not (n_ids == n_mask and n_ids == n_labels):
                raise ValueError(f"Length mismatch: input_ids ({n_ids}), attention_mask ({n_mask}), labels ({n_labels})")
        except Exception as e:
            logger.error(f"Failed to initialize SentimentDataset: {e}. Using empty dataset.")
            self.input_ids = torch.empty(0)
            self.attention_mask = torch.ones(0, 128)
            self.labels = torch.empty(0)

    def __len__(self):
        """Number of samples, taken from the labels tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of three tensors."""
        sample = {}
        sample['input_ids'] = self.input_ids[idx]
        sample['attention_mask'] = self.attention_mask[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Fallback preprocessing: runs only when Cell 0 left any of input_ids /
# attention_mask / labels empty. Any failure ends in the dummy data at the bottom.
try:
    if len(input_ids) != 0 and len(attention_mask) != 0 and len(labels) != 0:
        logger.info("Preprocessed data already loaded; skipping fallback preprocessing.")
    else:
        # The raw DataFrame is the only source left, so it must exist and be non-empty.
        if 'df' not in globals() or df.empty:
            raise ValueError("DataFrame 'df' is empty or not loaded.")

        # Column names: taken from user_dataset_prompt.json when present, inferred otherwise.
        project_dir = '/content/drive/MyDrive/Sentiment_Project'
        prompt_file = os.path.join(project_dir, 'user_dataset_prompt.json')
        if not os.path.exists(prompt_file):
            # Text column: object dtype with mean string length above 10.
            text_cols = [
                col for col in df.columns
                if df[col].dtype == 'object' and df[col].str.len().mean() > 10
            ]
            # Label column: fewer distinct values than a tenth of the row count.
            label_cols = [
                col for col in df.columns
                if df[col].dtype in ['object', 'int', 'float'] and df[col].nunique() < len(df) / 10
            ]
            if not text_cols or not label_cols:
                raise ValueError("Could not infer text or label columns. Ensure dataset contains text and categorical label columns.")
            text_column, label_column = text_cols[0], label_cols[0]
            logger.info("Inferred columns - Text: %s, Label: %s", text_column, label_column)
        else:
            with open(prompt_file, 'r') as f:
                prompt_data = json.load(f)
            text_column = prompt_data.get('text_column', 'review')
            label_column = prompt_data.get('label_column', 'sentiment')
            logger.info("Loaded columns from prompt - Text: %s, Label: %s", text_column, label_column)

        # Both resolved columns have to be present in the frame.
        if text_column not in df.columns or label_column not in df.columns:
            raise ValueError(f"Missing inferred columns: {text_column} or {label_column}.")

        # Tokenizer: log the failure here, then let the outer handler switch to dummy data.
        try:
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        except Exception as e:
            logger.error(f"Failed to load tokenizer: {e}. Using dummy data.")
            raise

        # Tokenize every review (missing text becomes an empty string).
        reviews = df[text_column].fillna('').tolist()
        if len(reviews) == 0:
            raise ValueError("No reviews to tokenize after filling NaN values.")
        encodings = tokenizer(reviews, padding=True, truncation=True, max_length=128, return_tensors='pt')

        # Downstream code expects NumPy arrays.
        input_ids = encodings['input_ids'].numpy()
        attention_mask = encodings['attention_mask'].numpy()

        # Label values are numbered in order of first appearance; unmapped rows become 0.
        unique_labels = df[label_column].unique()
        label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
        labels = (
            df[label_column]
            .map(label_mapping)
            .fillna(0)
            .astype(int)
            .values
        )
        logger.info("Dynamic label mapping: %s", label_mapping)

        # Tokenized inputs and labels must line up one-to-one.
        if len(input_ids) != len(labels):
            raise ValueError(f"Shape mismatch: input_ids ({len(input_ids)}), labels ({len(labels)})")
except Exception as e:
    logger.warning(f"Preprocessing failed: {e}. Using dummy data.")
    input_ids = np.zeros((100, 128), dtype=np.int32)
    attention_mask = np.ones((100, 128), dtype=np.int32)
    labels = np.random.randint(0, 2, 100)

# Create dataset and loader
# SentimentDataset swallows its own construction errors and comes back empty, and
# shuffle=True makes DataLoader build a random sampler, which can reject a
# zero-length dataset (ValueError in PyTorch's RandomSampler). Shuffling is therefore
# enabled only when there is something to shuffle, and the fallback never shuffles,
# so the except branch cannot raise for the same reason.
try:
    dataset = SentimentDataset(input_ids, attention_mask, labels)
    if len(dataset) == 0:
        logger.warning("SentimentDataset is empty; DataLoader will yield no batches.")
    loader = DataLoader(dataset, batch_size=16, shuffle=len(dataset) > 0)
    logger.info(f"Dataset created with {len(dataset)} samples.")
except Exception as e:
    logger.error(f"Failed to create DataLoader: {e}. Using empty dataset.")
    dataset = SentimentDataset(None, None, None)
    loader = DataLoader(dataset, batch_size=16, shuffle=False)

In [None]:
# Cell 3: Model Selector Agent with PPO
# Purpose: Dynamically select the best sentiment analysis approach using reinforcement learning.

class ModelSelectionEnv(gym.Env):
    """Gymnasium environment in which an agent chooses a sentiment-model family.

    An action is 8 floats in [0, 1]:
    ``(architecture_type, computational_cost, learning_capacity, data_fit,
    pretraining_level, ensemble_weight, latency_sensitivity, regularization_strength)``.
    ``architecture_type`` (and, for transformers, ``pretraining_level``) selects a
    hand-written resource/accuracy profile; the reward scores that profile against
    the dataset size, intent, hardware and target accuracy given at construction.

    The observation is 8 floats describing the task and the host, clipped to the
    declared [0, 1] box. It is refreshed on ``reset`` only; ``step`` returns the
    same observation for the whole episode, which ends by truncation after
    ``max_steps`` steps.
    """

    # Floor for memory_usage / training_time inside the efficiency term. The
    # rule-based profile has training_time == 0.0; dividing by (0.0 + 1e-6) made
    # the efficiency term ~1e5, which pinned the clipped reward at +10 and let a
    # single profile dominate every other reward component.
    # NOTE(review): 0.1 is a tuning choice, not a measured quantity.
    _MIN_RESOURCE_COST = 0.1

    # Rows are (exclusive upper bound on the selector, profile), checked in order.
    # profile = (approach, memory_usage, inference_speed, training_time,
    #            base_accuracy, data_fit_coef, pretraining_coef,
    #            robustness, scalability, latency)
    # accuracy_potential = base_accuracy + data_fit * data_fit_coef
    #                      [+ pretraining_level * pretraining_coef when non-zero]
    _ARCHITECTURE_PROFILES = (
        (0.1, ("Rule-Based (Pattern/Syntactic)", 0.2, 0.1, 0.0, 0.70, 0.1, 0.0, 0.6, 0.8, 0.05)),
        (0.2, ("Traditional ML (Naive Bayes-like)", 0.3, 0.2, 0.5, 0.78, 0.15, 0.0, 0.7, 0.6, 0.1)),
        (0.3, ("Traditional ML (Logistic Regression-like)", 0.5, 0.3, 0.6, 0.80, 0.15, 0.0, 0.7, 0.6, 0.15)),
        (0.4, ("Traditional ML (SVM-like)", 0.7, 0.4, 1.0, 0.82, 0.15, 0.0, 0.7, 0.6, 0.2)),
        (0.5, ("Traditional ML (Gradient Boosting, e.g., XGBoost-like)", 0.8, 0.5, 1.2, 0.83, 0.15, 0.0, 0.7, 0.6, 0.25)),
        (0.6, ("Traditional ML (Random Forest-like)", 0.6, 0.4, 0.9, 0.81, 0.15, 0.0, 0.7, 0.6, 0.2)),
        (0.65, ("Traditional ML (Decision Tree-like)", 0.4, 0.3, 0.7, 0.75, 0.15, 0.0, 0.6, 0.7, 0.15)),
        (0.7, ("Traditional ML (K-Nearest Neighbors-like)", 0.5, 0.35, 0.8, 0.77, 0.15, 0.0, 0.65, 0.65, 0.18)),
        (0.75, ("Traditional ML (AdaBoost-like)", 0.6, 0.45, 1.0, 0.80, 0.15, 0.0, 0.7, 0.6, 0.22)),
        (0.8, ("Traditional ML (LightGBM-like)", 0.9, 0.55, 1.3, 0.84, 0.15, 0.0, 0.75, 0.55, 0.25)),
        (0.85, ("Traditional ML (CatBoost-like)", 1.0, 0.6, 1.4, 0.85, 0.15, 0.0, 0.75, 0.55, 0.28)),
        (0.875, ("Shallow Neural Network (MLP-like)", 1.0, 0.5, 1.5, 0.85, 0.1, 0.05, 0.8, 0.5, 0.3)),
        (0.9, ("Recurrent Neural Network (LSTM/GRU-like)", 2.0, 1.0, 2.0, 0.87, 0.1, 0.05, 0.8, 0.5, 0.5)),
        (0.925, ("Convolutional Neural Network (CNN-like)", 1.5, 0.8, 1.8, 0.86, 0.1, 0.05, 0.8, 0.5, 0.4)),
        (0.95, ("Bidirectional LSTM (BiLSTM-like)", 2.5, 1.2, 2.2, 0.88, 0.1, 0.05, 0.85, 0.45, 0.6)),
        (0.9625, ("Gated Recurrent Unit (GRU-like)", 2.0, 1.0, 2.0, 0.87, 0.1, 0.05, 0.8, 0.5, 0.5)),
        (0.975, ("Feedforward Neural Network (FNN-like)", 1.2, 0.6, 1.6, 0.84, 0.1, 0.05, 0.75, 0.55, 0.35)),
        (0.9875, ("Hybrid (CNN-RNN)", 4.0, 2.0, 3.0, 0.90, 0.08, 0.0, 0.85, 0.4, 0.8)),
    )

    # Used when architecture_type is not below any bound above; selected by pretraining_level.
    _TRANSFORMER_PROFILES = (
        (0.3, ("Deep Learning (Custom Transformer)", 5.0, 2.5, 5.0, 0.92, 0.06, 0.0, 0.9, 0.3, 1.0)),
        (0.5, ("Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)", 2.0, 1.5, 2.0, 0.90, 0.08, 0.05, 0.85, 0.4, 0.6)),
        (0.7, ("BERT (Bidirectional Encoder Representations from Transformers)", 3.0, 1.8, 2.5, 0.93, 0.06, 0.04, 0.9, 0.35, 0.7)),
        (0.8, ("RoBERTa (Robustly Optimized BERT Pretraining Approach)", 3.5, 2.0, 2.8, 0.94, 0.06, 0.04, 0.9, 0.35, 0.75)),
        (0.85, ("ALBERT (A Lite BERT)", 2.5, 1.6, 2.2, 0.91, 0.07, 0.04, 0.85, 0.4, 0.65)),
        (0.9, ("XLNet (Generalized Autoregressive Pretraining)", 4.0, 2.2, 3.0, 0.95, 0.05, 0.04, 0.9, 0.3, 0.9)),
        (0.925, ("T5 (Text-To-Text Transfer Transformer)", 5.5, 2.5, 3.5, 0.94, 0.06, 0.04, 0.9, 0.3, 1.0)),
        (0.95, ("DeBERTa (Decoding-enhanced BERT with Disentangled Attention)", 3.5, 2.0, 2.8, 0.95, 0.05, 0.04, 0.9, 0.35, 0.75)),
        (0.975, ("ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)", 3.0, 1.9, 2.6, 0.93, 0.06, 0.04, 0.9, 0.35, 0.7)),
        (0.9875, ("Longformer (for long documents)", 6.0, 2.8, 4.0, 0.96, 0.05, 0.03, 0.9, 0.25, 1.2)),
    )

    # Transformer profile used when pretraining_level is not below any bound above.
    _DEFAULT_TRANSFORMER_PROFILE = (
        "BigBird (sparse attention for long sequences)", 6.5, 3.0, 4.5, 0.97, 0.04, 0.03, 0.9, 0.25, 1.3
    )

    def __init__(self, dataset_size, intent, gpu_available, target_accuracy, available_memory, text_complexity):
        """Store the (clamped) task description and build the first observation.

        Args:
            dataset_size: number of samples; values below 1 are raised to 1.
            intent: ``'performance'`` or ``'efficiency'``; anything else means ``'performance'``.
            gpu_available: truthy when a GPU can be used.
            target_accuracy: clamped to [0, 1].
            available_memory: free memory in GB, clamped to [0.1, 12] and stored as a fraction of 12.
            text_complexity: clamped to [0, 1].
        """
        super().__init__()
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(8,), dtype=np.float32)
        self.dataset_size = max(1, dataset_size)
        self.intent = intent if intent in ['performance', 'efficiency'] else 'performance'
        self.gpu_available = bool(gpu_available)
        self.target_accuracy = min(1.0, max(0.0, target_accuracy))
        self.available_memory = min(12.0, max(0.1, available_memory)) / 12.0
        self.text_complexity = min(1.0, max(0.0, text_complexity))
        self.state = self._get_state()
        self.step_count = 0
        self.max_steps = 50

    def _get_state(self):
        """Build the observation vector, clipped to the declared [0, 1] box.

        Without clipping, datasets above 100000 rows or hosts using more than
        12 GB of RAM would produce observations outside ``observation_space``.
        """
        memory = psutil.virtual_memory()
        raw_state = np.array([
            self.dataset_size / 100000,
            1.0 if self.intent == 'performance' else 0.0,
            1.0 if self.gpu_available else 0.0,
            self.target_accuracy,
            self.available_memory,
            self.text_complexity,
            memory.used / (1024 ** 3) / 12.0,
            psutil.cpu_percent() / 100.0
        ], dtype=np.float32)
        return np.clip(raw_state, 0.0, 1.0)

    def reset(self, seed=None, **kwargs):
        """Start a new episode and return ``(observation, info)``.

        ``super().reset(seed=seed)`` seeds the environment's random generator as
        the Gymnasium API requires; extra keyword arguments (e.g. ``options``)
        are accepted and ignored.
        """
        super().reset(seed=seed)
        self.step_count = 0
        self.state = self._get_state()
        return self.state, {}

    def step(self, action):
        """Score ``action`` and return ``(obs, reward, terminated, truncated, info)``.

        The episode has no terminal state of its own; it only stops at the
        ``max_steps`` limit, so that is reported as truncation, not termination.
        ``info['model_config']`` carries the configuration the action decoded to.
        """
        self.step_count += 1
        model_config = self._action_to_model_config(action)
        reward = self._compute_reward(action, model_config)
        terminated = False
        truncated = self.step_count >= self.max_steps
        info = {'model_config': model_config}
        return self.state, reward, terminated, truncated, info

    def _select_profile(self, architecture_type, pretraining_level):
        """Return the profile tuple chosen by the two selector values.

        The first row whose bound exceeds the selector wins. A selector that is
        below no bound (including NaN) falls through to the next table or default.
        """
        for upper_bound, profile in self._ARCHITECTURE_PROFILES:
            if architecture_type < upper_bound:
                return profile
        for upper_bound, profile in self._TRANSFORMER_PROFILES:
            if pretraining_level < upper_bound:
                return profile
        return self._DEFAULT_TRANSFORMER_PROFILE

    def _action_to_model_config(self, action):
        """Decode an 8-element action into a model configuration dict.

        The dict echoes the seven non-architecture action components and adds the
        selected profile's ``approach``, ``memory_usage``, ``inference_speed``,
        ``training_time``, ``accuracy_potential``, ``robustness``, ``scalability``
        and ``latency``.

        The former "Lexicon" post-processing branch was removed: no profile name
        starts with "Lexicon", so it could never run.
        """
        architecture_type, computational_cost, learning_capacity, data_fit, pretraining_level, \
        ensemble_weight, latency_sensitivity, regularization_strength = action

        (approach, memory_usage, inference_speed, training_time, base_accuracy,
         data_fit_coef, pretraining_coef, robustness, scalability, latency) = \
            self._select_profile(architecture_type, pretraining_level)

        accuracy_potential = base_accuracy + data_fit * data_fit_coef
        if pretraining_coef:
            accuracy_potential = accuracy_potential + pretraining_level * pretraining_coef

        # Resource and performance adjustments: without a GPU, a model that does not
        # fit in memory is squeezed to 90% of what is available at a speed/accuracy cost.
        if memory_usage > self.available_memory * 12.0 and not self.gpu_available:
            memory_usage = self.available_memory * 12.0 * 0.9
            inference_speed *= 1.5
            training_time *= 1.2
            accuracy_potential *= 0.95

        return {
            "approach": approach, "computational_cost": computational_cost,
            "learning_capacity": learning_capacity, "data_fit": data_fit,
            "pretraining_level": pretraining_level, "ensemble_weight": ensemble_weight,
            "latency_sensitivity": latency_sensitivity, "regularization_strength": regularization_strength,
            "memory_usage": memory_usage, "inference_speed": inference_speed,
            "training_time": training_time, "accuracy_potential": accuracy_potential,
            "robustness": robustness, "scalability": scalability, "latency": latency
        }

    def _compute_reward(self, action, model_config):
        """Combine accuracy, efficiency, robustness, scalability and latency into one reward.

        The weights depend on ``self.intent``; fixed penalties apply for exceeding
        memory, slow inference without a GPU, long training under the efficiency
        intent, and a mismatch between text complexity and computational cost.
        The result is clipped to [-10, 10].
        """
        architecture_type, computational_cost, learning_capacity, data_fit, pretraining_level, \
        ensemble_weight, latency_sensitivity, regularization_strength = action
        memory_usage = model_config["memory_usage"]
        inference_speed = model_config["inference_speed"]
        training_time = model_config["training_time"]
        accuracy_potential = model_config["accuracy_potential"]
        robustness = model_config["robustness"]
        scalability = model_config["scalability"]
        latency = model_config["latency"]

        # Multi-objective reward function with adaptive weights
        weight_accuracy = 0.5 if self.intent == 'performance' else 0.3
        weight_efficiency = 0.3 if self.intent == 'performance' else 0.5
        weight_robustness = 0.1
        weight_scalability = 0.05
        weight_latency = 0.05

        # Base accuracy reward, adjusted for dataset size and complexity
        base_accuracy_reward = accuracy_potential * weight_accuracy
        if self.dataset_size < 100:
            base_accuracy_reward *= (1.0 + self.text_complexity * 0.2)
        elif self.dataset_size < 1000:
            base_accuracy_reward *= (1.0 + learning_capacity * 0.3 - computational_cost * 0.2)
        elif self.dataset_size < 10000:
            base_accuracy_reward *= (1.0 + pretraining_level * 0.2 + data_fit * 0.2)
        else:
            base_accuracy_reward *= (1.0 + pretraining_level * 0.3 + ensemble_weight * 0.1)

        # Efficiency reward (computational cost, memory, training time). Both
        # denominators are floored so a zero training time cannot blow the term up.
        bounded_memory = max(memory_usage, self._MIN_RESOURCE_COST)
        bounded_training = max(training_time, self._MIN_RESOURCE_COST)
        efficiency_reward = (1.0 - computational_cost) * 0.4 / bounded_memory * 0.3 / bounded_training * weight_efficiency

        # Robustness and scalability reward
        robustness_reward = robustness * weight_robustness
        scalability_reward = scalability * weight_scalability

        # Latency reward
        latency_reward = (1.0 - latency_sensitivity * latency) * weight_latency

        # Regularization penalty to prevent overfitting
        regularization_penalty = -regularization_strength * 0.1 if accuracy_potential > 0.95 else 0.0

        # Penalties
        memory_penalty = -5.0 if memory_usage > self.available_memory * 12.0 else 0.0
        runtime_penalty = -2.0 if inference_speed > 2.0 and not self.gpu_available else 0.0
        training_penalty = -2.0 if training_time > 3.0 and self.intent == 'efficiency' else 0.0
        complexity_mismatch = -3.0 if (self.text_complexity < 0.3 and computational_cost > 0.7) else 0.0
        simplicity_mismatch = -3.0 if (self.text_complexity > 0.7 and computational_cost < 0.3) else 0.0

        # Generalization potential bonus
        generalization_bonus = 0.1 * (1.0 - abs(self.target_accuracy - accuracy_potential)) if robustness > 0.7 else 0.0

        # Total reward
        reward = (base_accuracy_reward + efficiency_reward + robustness_reward +
                  scalability_reward + latency_reward + generalization_bonus +
                  regularization_penalty + memory_penalty + runtime_penalty +
                  training_penalty + complexity_mismatch + simplicity_mismatch)

        return np.clip(reward, -10.0, 10.0)

# Initialize environment and PPO agent
def _estimate_text_complexity(frame, text_column='review', default=0.5):
    """Heuristic text complexity: mean of (avg tokens per row / 50) and (vocabulary size / 10000).

    Returns ``default`` when the frame is empty or lacks ``text_column``; the
    original inline expression divided by ``len(df)`` and so failed on the empty
    fallback DataFrame. Missing text counts as an empty string instead of
    breaking ``str.split`` / ``str.join``.
    """
    if frame is None or len(frame) == 0 or text_column not in frame.columns:
        return default
    texts = frame[text_column].fillna('').astype(str).tolist()
    mean_tokens = sum(len(text.split()) for text in texts) / len(texts)
    vocabulary_size = len(set(" ".join(texts).split()))
    return (mean_tokens / 50 + vocabulary_size / 10000) / 2


_df_loaded = 'df' in globals()
env_dataset_size = len(df) if _df_loaded else 1000
# Prefer the text column resolved by the preprocessing cell when it ran; otherwise 'review'.
env_text_complexity = (
    _estimate_text_complexity(df, globals().get('text_column', 'review')) if _df_loaded else 0.5
)
env_available_memory_gb = psutil.virtual_memory().available / (1024 ** 3)

env = DummyVecEnv([lambda: ModelSelectionEnv(env_dataset_size,
                                            intent, gpu_available, target_accuracy,
                                            env_available_memory_gb,
                                            env_text_complexity)])
ppo_model = PPO('MlpPolicy', env, device='cpu', verbose=0, learning_rate=0.0001, batch_size=128, clip_range=0.2, ent_coef=0.01)
ppo_model.learn(total_timesteps=10000, progress_bar=False)

# Roll the trained policy forward for 20 deterministic steps and keep the
# configuration that earned the highest reward.
obs = env.reset()
best_reward, best_config = float('-inf'), None
for trial in tqdm(range(20), desc="Initial Model Selection Trials"):
    action, _ = ppo_model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    # The vectorised env returns one entry per sub-environment; there is exactly one here.
    if reward > best_reward:
        best_reward, best_config = reward, info[0]['model_config']
selected_config = best_config

# Record the choice with the coordinator actor, then echo it in the notebook.
selection_message = f"Initial Selected Model: {selected_config['approach']}"
ray.get(coordinator.log_message.remote(selection_message))
print(selection_message)

Initial Model Selection Trials: 100%|██████████| 20/20 [00:00<00:00, 357.93it/s]


Initial Selected Model: Rule-Based (Pattern/Syntactic)


In [None]:
# Cell 4: MAML Pre-Training
# Purpose: Pre-train MAML on benchmark datasets for deep learning models.

class BenchmarkDataset(Dataset):
    """Synthetic classification task built from random token ids.

    Every sample is a sequence of uniformly random token ids with an all-ones
    attention mask and a uniformly random class label.

    Args:
        num_samples: Number of synthetic samples to generate.
        num_labels: Number of classes. When None (the default) it is resolved
            at construction time from the notebook-level ``labels`` collection
            if one exists and is non-empty, otherwise 2.
        seq_len: Length of every generated token sequence.
        vocab_size: Exclusive upper bound for the random token ids.
    """

    def __init__(self, num_samples=1000, num_labels=None, seq_len=128, vocab_size=30000):
        if num_labels is None:
            # Resolve the default when the dataset is built, not when the class
            # is defined: a default expression in the signature is evaluated
            # once and would freeze whatever `labels` was (or was not) defined
            # at that moment.
            known_labels = globals().get('labels')
            if known_labels is not None and len(known_labels) > 0:
                num_labels = len(set(known_labels))
            else:
                num_labels = 2
        self.input_ids = torch.randint(0, vocab_size, (num_samples, seq_len))
        self.attention_mask = torch.ones(num_samples, seq_len, dtype=torch.long)
        self.labels = torch.randint(0, num_labels, (num_samples,))

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx],
        }

class MAML:
    """First-order MAML trainer for a classification model.

    The wrapped model must accept its inputs as keyword arguments together with
    a ``labels`` keyword and return an object exposing a ``.loss`` attribute.

    Args:
        model: The torch module to meta-train.
        lr_inner: Step size of the per-task (inner) gradient step.
        lr_outer: Learning rate of the Adam meta-optimizer.
    """

    def __init__(self, model, lr_inner=0.01, lr_outer=0.001):
        self.model = model
        self.lr_inner = lr_inner
        self.lr_outer = lr_outer
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr_outer)

    def inner_update(self, inputs, labels, params=None, create_graph=True):
        """Compute one plain gradient-descent step on a batch.

        The model's own parameters are not modified by the step itself; the
        adapted values are returned instead.

        Args:
            inputs: Keyword arguments forwarded to the model (without labels).
            labels: Target tensor passed to the model as ``labels``.
            params: Optional state dict loaded into the model before the step.
            create_graph: Forwarded to ``torch.autograd.grad``. Kept True by
                default for backward compatibility; pass False when the result
                is only copied into the model, to avoid building an unused
                second-order graph.

        Returns:
            Dict mapping every parameter name to ``param - lr_inner * grad``.
            Parameters that are frozen or did not contribute to the loss are
            returned as unchanged copies.
        """
        model = self.model
        if params: model.load_state_dict(params)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        named_params = list(model.named_parameters())
        trainable = [param for _, param in named_params if param.requires_grad]
        # allow_unused: parameters not reached by the loss yield None instead
        # of aborting the whole update.
        grads = (
            torch.autograd.grad(loss, trainable, create_graph=create_graph, allow_unused=True)
            if trainable else ()
        )
        grad_iter = iter(grads)
        updated = {}
        for name, param in named_params:
            grad = next(grad_iter) if param.requires_grad else None
            updated[name] = param.clone() if grad is None else param - self.lr_inner * grad
        return updated

    def meta_train(self, tasks, num_iterations=100):
        """Meta-train the model over ``tasks`` and return its state dict.

        For every task the model is adapted on a support batch, the query-batch
        gradient is taken at the adapted weights (first-order approximation),
        and the original weights are restored before the next task. The summed
        query gradients drive a single Adam step per iteration.

        Args:
            tasks: Iterable of datasets yielding dicts that contain ``labels``
                plus the model's keyword inputs.
            num_iterations: Number of meta-optimization steps.

        Returns:
            The model's ``state_dict()`` after training.
        """
        self.model.to(device)
        named_params = [(name, p) for name, p in self.model.named_parameters() if p.requires_grad]
        params = [p for _, p in named_params]
        for iteration in tqdm(range(num_iterations), desc="MAML Pre-Training"):
            # Snapshot of the meta-parameters, restored after each task.
            initial_values = [p.detach().clone() for p in params]
            meta_grads = [torch.zeros_like(p) for p in params]
            for task in tasks:
                loader = DataLoader(task, batch_size=16, shuffle=True)
                batches = iter(loader)
                support_batch = next(batches)
                # Draw the query batch from the same shuffled pass so it does
                # not overlap the support batch; reuse the support batch only
                # if the task holds a single batch.
                query_batch = next(batches, support_batch)
                support_inputs = {k: v.to(device) for k, v in support_batch.items() if k != 'labels'}
                support_labels = support_batch['labels'].to(device)
                query_inputs = {k: v.to(device) for k, v in query_batch.items() if k != 'labels'}
                query_labels = query_batch['labels'].to(device)
                fast_weights = self.inner_update(support_inputs, support_labels, create_graph=False)
                with torch.no_grad():
                    for name, p in named_params:
                        p.copy_(fast_weights[name])
                query_loss = self.model(**query_inputs, labels=query_labels).loss
                # Take the gradient immediately: the parameters are overwritten
                # in place again below, which would invalidate a graph that is
                # kept alive until after the task loop.
                query_grads = torch.autograd.grad(query_loss, params, allow_unused=True)
                for total, grad in zip(meta_grads, query_grads):
                    if grad is not None:
                        total.add_(grad)
                with torch.no_grad():
                    for p, original in zip(params, initial_values):
                        p.copy_(original)
            self.optimizer.zero_grad()
            for p, grad in zip(params, meta_grads):
                p.grad = grad
            self.optimizer.step()
        return self.model.state_dict()

maml_model = None
# MAML pre-training only applies when the selected approach names one of the
# neural families below; everything else is logged and skipped.
if any(s in (selected_config.get('approach', 'performance')) for s in ["Shallow Neural Network", "Recurrent Neural Network", "Convolutional Neural Network",
                                                "Bidirectional LSTM", "Gated Recurrent Unit", "Feedforward Neural Network",
                                                "Hybrid", "Deep Learning"]):
    from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification
    # Every selectable model name is served by one of three transformer classes.
    model_map = {
        "Shallow Neural Network (MLP-like)": DistilBertForSequenceClassification,
        "Recurrent Neural Network (LSTM/GRU-like)": DistilBertForSequenceClassification,
        "Convolutional Neural Network (CNN-like)": DistilBertForSequenceClassification,
        "Bidirectional LSTM (BiLSTM-like)": BertForSequenceClassification,
        "Gated Recurrent Unit (GRU-like)": BertForSequenceClassification,
        "Feedforward Neural Network (FNN-like)": DistilBertForSequenceClassification,
        "Hybrid (CNN-RNN)": BertForSequenceClassification,
        "Deep Learning (Custom Transformer)": BertForSequenceClassification,
        "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)": DistilBertForSequenceClassification,
        "BERT (Bidirectional Encoder Representations from Transformers)": BertForSequenceClassification,
        "RoBERTa (Robustly Optimized BERT Pretraining Approach)": RobertaForSequenceClassification,
        "ALBERT (A Lite BERT)": DistilBertForSequenceClassification,
        "XLNet (Generalized Autoregressive Pretraining)": BertForSequenceClassification,
        "T5 (Text-To-Text Transfer Transformer)": BertForSequenceClassification,
        "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)": BertForSequenceClassification,
        "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)": BertForSequenceClassification,
        "Longformer (for long documents)": BertForSequenceClassification,
        "BigBird (sparse attention for long sequences)": BertForSequenceClassification,
        "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)": RobertaForSequenceClassification
    }
    model_class = model_map.get(selected_config.get('model', 'Shallow Neural Network (MLP-like)'), DistilBertForSequenceClassification)
    # Load each architecture from a checkpoint of its own family. The model
    # classes expose no `pretrained_model_name_or_path` attribute, so probing
    # for it always fell back to the DistilBERT checkpoint, whose weights do
    # not match the BERT / RoBERTa classes.
    checkpoint_by_class = {
        DistilBertForSequenceClassification: 'distilbert-base-uncased',
        BertForSequenceClassification: 'bert-base-uncased',
        RobertaForSequenceClassification: 'roberta-base',
    }
    checkpoint_name = checkpoint_by_class.get(model_class, 'distilbert-base-uncased')
    # Dynamically set num_labels based on unique labels in the dataset
    num_labels = len(set(labels)) if 'labels' in globals() and labels.size > 0 else 2
    maml_model = model_class.from_pretrained(checkpoint_name, num_labels=num_labels)
    if torch.cuda.is_available(): maml_model.cuda()
    # Three synthetic tasks of random token ids serve as the meta-training set.
    tasks = [BenchmarkDataset() for _ in range(3)]
    maml = MAML(maml_model)
    meta_params = maml.meta_train(tasks)
    # NOTE(review): this rebinds the notebook-level drive_path that Cell 0 set
    # to the Sentiment_Project folder; kept because the fine-tuning cell reads
    # the parameters from this exact location.
    drive_path = '/content/drive/MyDrive'
    os.makedirs(drive_path, exist_ok=True)
    meta_params_path = os.path.join(drive_path, 'meta_learned_params.pt')
    torch.save(meta_params, meta_params_path)
    ray.get(coordinator.log_message.remote(f"Meta-learned parameters saved to {meta_params_path}"))
    logger.info(f"Meta-learned parameters saved to {meta_params_path}")
else:
    meta_params_path = None
    # Use a fallback model name if selected_config['model'] is missing
    default_model = selected_config.get('model', 'Unknown Model')
    ray.get(coordinator.log_message.remote(f"Skipping MAML pre-training for {default_model} (non-deep learning model)."))
    logger.info(f"Skipping MAML pre-training for {default_model} (non-deep learning model).")

In [None]:
# Cell 5: Fine-Tune with MAML
# Purpose: Fine-tune the selected model on the target dataset.

class SentimentDataset(Dataset):
    """Tensor-backed dataset of token ids, attention masks and labels.

    Each argument is converted to a ``torch.long`` tensor. An argument passed
    as None is replaced by an empty placeholder tensor instead, which yields a
    zero-length dataset when ``labels`` is None.
    """

    def __init__(self, input_ids, attention_mask, labels):
        if input_ids is None:
            self.input_ids = torch.empty(0)
        else:
            self.input_ids = torch.tensor(input_ids, dtype=torch.long)

        if attention_mask is None:
            self.attention_mask = torch.ones(0, 128)
        else:
            self.attention_mask = torch.tensor(attention_mask, dtype=torch.long)

        if labels is None:
            self.labels = torch.empty(0)
        else:
            self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of labels, which defines the dataset size."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        sample = {}
        sample['input_ids'] = self.input_ids[idx]
        sample['attention_mask'] = self.attention_mask[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Build the fine-tuning dataset from the notebook-level tokenized inputs; any
# failure (including the names not being defined) falls back to an empty one.
try:
    dataset = SentimentDataset(input_ids, attention_mask, labels)
except Exception:
    logger.warning("Using empty dataset due to missing inputs.")
    dataset = SentimentDataset(None, None, None)
# shuffle=True makes DataLoader build a RandomSampler, which raises ValueError
# for a zero-length dataset; only shuffle when there is something to shuffle so
# the empty fallback above stays usable.
loader = DataLoader(dataset, batch_size=16, shuffle=len(dataset) > 0)

maml_model = None
# Ensure selected_config has a default value if not defined
if 'selected_config' not in globals():
    # Stand-in configuration used when the model-selection cell has not run.
    selected_config = dict(
        approach='Deep Learning',
        model='Shallow Neural Network (MLP-like)',
        hyperparams=dict(learning_rate=2e-5, batch_size=16),
    )
    logger.warning("selected_config not found. Initialized with default values: %s", selected_config)

# Dispatch on the selected approach: transformer fine-tuning for the neural
# families, a TF-IDF + scikit-learn fit for "Traditional ML", otherwise skip.
if any(s in selected_config.get('approach', 'Deep Learning') for s in ["Shallow Neural Network", "Recurrent Neural Network", "Convolutional Neural Network",
                                                "Bidirectional LSTM", "Gated Recurrent Unit", "Feedforward Neural Network",
                                                "Hybrid", "Deep Learning"]):
    from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification
    # Every selectable model name is served by one of three transformer classes.
    model_map = {
        "Shallow Neural Network (MLP-like)": DistilBertForSequenceClassification,
        "Recurrent Neural Network (LSTM/GRU-like)": DistilBertForSequenceClassification,
        "Convolutional Neural Network (CNN-like)": DistilBertForSequenceClassification,
        "Bidirectional LSTM (BiLSTM-like)": BertForSequenceClassification,
        "Gated Recurrent Unit (GRU-like)": BertForSequenceClassification,
        "Feedforward Neural Network (FNN-like)": DistilBertForSequenceClassification,
        "Hybrid (CNN-RNN)": BertForSequenceClassification,
        "Deep Learning (Custom Transformer)": BertForSequenceClassification,
        "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)": DistilBertForSequenceClassification,
        "BERT (Bidirectional Encoder Representations from Transformers)": BertForSequenceClassification,
        "RoBERTa (Robustly Optimized BERT Pretraining Approach)": RobertaForSequenceClassification,
        "ALBERT (A Lite BERT)": DistilBertForSequenceClassification,
        "XLNet (Generalized Autoregressive Pretraining)": BertForSequenceClassification,
        "T5 (Text-To-Text Transfer Transformer)": BertForSequenceClassification,
        "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)": BertForSequenceClassification,
        "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)": BertForSequenceClassification,
        "Longformer (for long documents)": BertForSequenceClassification,
        "BigBird (sparse attention for long sequences)": BertForSequenceClassification,
        "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)": RobertaForSequenceClassification
    }
    model_class = model_map.get(selected_config.get('model', 'Shallow Neural Network (MLP-like)'), DistilBertForSequenceClassification)
    # Load each architecture from a checkpoint of its own family. The model
    # classes expose no `pretrained_model_name_or_path` attribute, so probing
    # for it always fell back to the DistilBERT checkpoint, whose weights do
    # not match the BERT / RoBERTa classes.
    checkpoint_by_class = {
        DistilBertForSequenceClassification: 'distilbert-base-uncased',
        BertForSequenceClassification: 'bert-base-uncased',
        RobertaForSequenceClassification: 'roberta-base',
    }
    checkpoint_name = checkpoint_by_class.get(model_class, 'distilbert-base-uncased')
    # Dynamically set num_labels based on unique labels in the dataset
    num_labels = len(set(labels)) if 'labels' in globals() and labels.size > 0 else 2
    maml_model = model_class.from_pretrained(checkpoint_name, num_labels=num_labels)
    maml_model.to(device)
    meta_params_path = '/content/drive/MyDrive/meta_learned_params.pt'
    if os.path.exists(meta_params_path):
        try:
            # map_location lets parameters saved on a GPU runtime load on CPU.
            maml_model.load_state_dict(torch.load(meta_params_path, map_location=device))
        except Exception as e:
            # Best effort: keep the pre-trained weights, but record why the
            # meta-learned parameters could not be used.
            logger.warning(f"Failed to load meta parameters ({e}). Using pre-trained weights.")
    if len(dataset) > 0:
        maml = MAML(maml_model)
        ray.get(coordinator.log_message.remote("Starting MAML fine-tuning..."))
        logger.info("Starting MAML fine-tuning...")
        optimizer = AdamW(maml_model.parameters(), lr=2e-5)
        maml_model.train()
        for epoch in tqdm(range(2), desc="Fine-Tuning Epochs"):
            for batch in loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                labels_batch = batch['labels'].to(device)
                # Each batch is used twice: first for an inner gradient step
                # whose result is loaded into the model, then for a regular
                # AdamW step computed at the adapted weights.
                updated_params = maml.inner_update(inputs, labels_batch)
                maml_model.load_state_dict(updated_params)
                outputs = maml_model(**inputs, labels=labels_batch)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
        drive_path = '/content/drive/MyDrive'
        os.makedirs(drive_path, exist_ok=True)
        fine_tuned_path = os.path.join(drive_path, f'fine_tuned_{selected_config.get("model", "default_model").replace(" ", "_").lower()}.pt')
        torch.save(maml_model.state_dict(), fine_tuned_path)
        ray.get(coordinator.log_message.remote(f"Fine-tuned model saved to {fine_tuned_path}"))
        logger.info(f"Fine-tuned model saved to {fine_tuned_path}")
    else:
        ray.get(coordinator.log_message.remote("No fine-tuning performed (empty dataset)."))
        logger.info("No fine-tuning performed (empty dataset).")
elif "Traditional ML" in selected_config.get('approach', ''):
    from sklearn.pipeline import make_pipeline
    # Entries mapped to None are not implemented and are skipped below.
    ml_model_map = {
        "Traditional ML (Naive Bayes-like)": None,  # Placeholder for MultinomialNB
        "Traditional ML (Logistic Regression-like)": LogisticRegression(max_iter=1000),
        "Traditional ML (SVM-like)": SVC(probability=True),
        "Traditional ML (Gradient Boosting, e.g., XGBoost-like)": GradientBoostingClassifier(),
        "Traditional ML (Random Forest-like)": RandomForestClassifier(),
        "Traditional ML (Decision Tree-like)": None,  # Placeholder for DecisionTreeClassifier
        "Traditional ML (K-Nearest Neighbors-like)": None,  # Placeholder for KNeighborsClassifier
        "Traditional ML (AdaBoost-like)": None,  # Placeholder for AdaBoostClassifier
        "Traditional ML (LightGBM-like)": None,  # Placeholder for LGBMClassifier
        "Traditional ML (CatBoost-like)": None  # Placeholder for CatBoostClassifier
    }
    ml_model = ml_model_map.get(selected_config.get('model', 'Traditional ML (Logistic Regression-like)'))
    if ml_model is not None:
        # Load column names dynamically
        project_dir = '/content/drive/MyDrive/Sentiment_Project'
        prompt_file = os.path.join(project_dir, 'user_dataset_prompt.json')
        if os.path.exists(prompt_file):
            with open(prompt_file, 'r') as f:
                prompt_data = json.load(f)
            text_column = prompt_data.get('text_column', 'review')
            label_column = prompt_data.get('label_column', 'sentiment')
        else:
            # Heuristic fallback: the first long-text object column is the text,
            # the first low-cardinality column is the label.
            text_cols = [col for col in df.columns if df[col].dtype == 'object' and df[col].str.len().mean() > 10]
            label_cols = [col for col in df.columns if df[col].dtype in ['object', 'int', 'float'] and df[col].nunique() < len(df) / 10]
            text_column, label_column = text_cols[0], label_cols[0]

        vectorizer = TfidfVectorizer(max_features=5000)
        X = vectorizer.fit_transform(df[text_column].fillna(''))
        # Dynamically map labels
        unique_labels = df[label_column].unique()
        label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
        y = df[label_column].map(label_mapping).fillna(0).astype(int)
        ml_model.fit(X, y)
        fine_tuned_path = os.path.join('/content/drive/MyDrive', f'fine_tuned_{selected_config.get("model", "logistic_regression").replace(" ", "_").lower()}.joblib')
        joblib.dump(ml_model, fine_tuned_path)
        ray.get(coordinator.log_message.remote(f"Fine-tuned ML model saved to {fine_tuned_path}"))
        logger.info(f"Fine-tuned ML model saved to {fine_tuned_path}")
    else:
        model_name = selected_config.get('model', 'Unknown Model')
        ray.get(coordinator.log_message.remote(f"Skipping fine-tuning for {model_name} (model not implemented)."))
        logger.info(f"Skipping fine-tuning for {model_name} (model not implemented).")
else:
    model_name = selected_config.get('model', 'Unknown Model')
    approach_name = selected_config.get('approach', 'Unknown Approach')
    ray.get(coordinator.log_message.remote(f"Skipping fine-tuning for unknown approach {approach_name} with model {model_name}"))
    logger.info(f"Skipping fine-tuning for unknown approach {approach_name} with model {model_name}")

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [None]:
# Cell 7: Model Switching and Iterative Training
# Purpose: Switch to alternative models if target accuracy is not met and retrain, with memory optimization.

import torch
import gc
from collections import deque
import joblib
from tqdm import tqdm
import os
import logging
import psutil
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, Subset
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure device is defined
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Function to monitor and log memory usage
def log_memory_usage():
    """Log current memory consumption and return the process RAM usage.

    Logs the resident set size of the current process in MB and, when CUDA is
    available, the GPU memory currently allocated by torch in MB.

    Returns:
        The resident set size of the current process in MB.
    """
    rss_mb = psutil.Process().memory_info().rss / 1024 ** 2
    logger.info(f"Current RAM usage: {rss_mb:.2f} MB")
    if not torch.cuda.is_available():
        return rss_mb
    allocated_mb = torch.cuda.memory_allocated() / 1024 ** 2
    logger.info(f"Current GPU memory usage: {allocated_mb:.2f} MB")
    return rss_mb

# SentimentDataset definition
class SentimentDataset(torch.utils.data.Dataset):
    """Tensor-backed dataset of token ids, attention masks and labels.

    Construction never raises: if any argument is None, cannot be converted to
    a ``torch.long`` tensor, or the three lengths disagree, the problem is
    logged and the instance becomes an empty (zero-length) dataset.
    """

    def __init__(self, input_ids, attention_mask, labels):
        try:
            if any(field is None for field in (input_ids, attention_mask, labels)):
                raise ValueError("Input data (input_ids, attention_mask, labels) cannot be None")
            self.input_ids = torch.tensor(input_ids, dtype=torch.long)
            self.attention_mask = torch.tensor(attention_mask, dtype=torch.long)
            self.labels = torch.tensor(labels, dtype=torch.long)
            same_length = len(self.input_ids) == len(self.attention_mask) == len(self.labels)
            if not same_length:
                raise ValueError(f"Length mismatch: input_ids ({len(self.input_ids)}), attention_mask ({len(self.attention_mask)}), labels ({len(self.labels)})")
            logger.info(f"SentimentDataset initialized with {len(self.labels)} samples")
        except Exception as e:
            logger.error(f"Failed to initialize SentimentDataset: {e}. Using empty dataset.")
            # Replace whatever was assigned so far with empty placeholders.
            self.input_ids = torch.empty(0)
            self.attention_mask = torch.ones(0, 128)
            self.labels = torch.empty(0)

    def __len__(self):
        """Return the number of labels, which defines the dataset size."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict; log and re-raise on failure."""
        try:
            sample = {
                'input_ids': self.input_ids[idx],
                'attention_mask': self.attention_mask[idx],
                'labels': self.labels[idx],
            }
        except Exception as e:
            logger.error(f"Error in __getitem__ at index {idx}: {e}")
            raise
        return sample

# Recreate loader if not defined
if 'loader' not in globals() or loader is None:
    try:
        logger.info("Loader not found. Recreating from input_ids, attention_mask, and labels.")
        if 'input_ids' not in globals() or 'attention_mask' not in globals() or 'labels' not in globals():
            raise NameError("Required data (input_ids, attention_mask, labels) not found in globals. Please run earlier cells (e.g., Cell 6).")
        dataset = SentimentDataset(input_ids, attention_mask, labels)
        if len(dataset) == 0:
            raise ValueError("Dataset is empty after initialization.")
        loader = DataLoader(dataset, batch_size=8, shuffle=True)
        logger.info(f"Recreated loader with {len(dataset)} samples.")
        # Pull one batch up front so a broken dataset fails here, not mid-training.
        first_batch = next(iter(loader), None)
        if first_batch is None:
            raise ValueError("Loader is empty; no batches available.")
        logger.info(f"First batch shapes: input_ids={first_batch['input_ids'].shape}, attention_mask={first_batch['attention_mask'].shape}, labels={first_batch['labels'].shape}")
    except Exception as e:
        logger.error(f"Failed to recreate loader: {e}. Using empty loader.")
        dataset = SentimentDataset(None, None, None)
        # shuffle=True makes DataLoader build a RandomSampler, which raises
        # ValueError for a zero-length dataset; shuffle only when data exists so
        # this fallback does not fail in turn.
        loader = DataLoader(dataset, batch_size=8, shuffle=len(dataset) > 0)

# Limit dataset size for memory efficiency
if len(loader.dataset) > 1000:
    # Keep a random subset of 1000 samples.
    subset_indices = torch.randperm(len(loader.dataset))[:1000]
    subset_dataset = Subset(loader.dataset, subset_indices)
    loader = DataLoader(subset_dataset, batch_size=8, shuffle=True)
    logger.info(f"Dataset size capped at 1000 samples to reduce memory usage.")
else:
    # The dataset may be empty here (fallback above or an earlier cell), so
    # shuffling is enabled only when there are samples.
    loader = DataLoader(loader.dataset, batch_size=8, shuffle=len(loader.dataset) > 0)
    logger.info(f"Using full dataset with {len(loader.dataset)} samples.")

# Define missing variables with fallbacks
if 'accuracy' not in globals():
    accuracy = 0.0
    logger.warning("accuracy not found. Defaulting to 0.0.")
if 'selected_config' not in globals():
    # The default approach must be one of the values accepted by the validation
    # further down ('Traditional ML', 'Deep Learning', 'Hybrid'); otherwise this
    # fallback is rejected with a ValueError as soon as it is created.
    selected_config = {
        'approach': 'Deep Learning',
        'model': 'Shallow Neural Network (MLP-like)',
        'hyperparams': {'learning_rate': (1e-5, 5e-5), 'batch_size': (8, 32)}
    }
    logger.warning("selected_config not found. Defaulting to Shallow Neural Network (MLP-like) with default hyperparameters.")
else:
    # A config carrying only an approach gets a representative model, and the
    # approach is normalized to its family name.
    if 'model' not in selected_config and 'approach' in selected_config:
        current_approach = selected_config['approach']
        if 'Traditional ML' in current_approach:
            selected_config['model'] = 'Traditional ML (Logistic Regression-like)'
            selected_config['approach'] = 'Traditional ML'
        elif 'Deep Learning' in current_approach:
            selected_config['model'] = 'Shallow Neural Network (MLP-like)'
            selected_config['approach'] = 'Deep Learning'
        else:
            selected_config['model'] = 'Hybrid (CNN-RNN)'
            selected_config['approach'] = 'Hybrid'
        logger.info(f"Inferred 'model' from existing 'approach'. Updated selected_config: {selected_config}")
if 'target_accuracy' not in globals():
    target_accuracy = 0.85
    logger.warning("target_accuracy not found. Defaulting to 0.85.")
if 'model_priority' not in globals():
    # Candidates are tried from the left of the queue.
    model_priority = deque([
        "Traditional ML (Logistic Regression-like)",
        "Traditional ML (Random Forest-like)",
        "Shallow Neural Network (MLP-like)",
        "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)",
        "BERT (Bidirectional Encoder Representations from Transformers)",
        "RoBERTa (Robustly Optimized BERT Pretraining Approach)",
        "ALBERT (A Lite BERT)",
        "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)",
        "Traditional ML (SVM-like)",
        "Traditional ML (Gradient Boosting, e.g., XGBoost-like)",
        "Recurrent Neural Network (LSTM/GRU-like)",
        "Convolutional Neural Network (CNN-like)",
        "Bidirectional LSTM (BiLSTM-like)",
        "Gated Recurrent Unit (GRU-like)",
        "Feedforward Neural Network (FNN-like)",
        "Hybrid (CNN-RNN)",
        "Deep Learning (Custom Transformer)",
        "XLNet (Generalized Autoregressive Pretraining)",
        "T5 (Text-To-Text Transfer Transformer)",
        "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)",
        "Longformer (for long documents)",
        "BigBird (sparse attention for long sequences)",
        "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)",
        "Traditional ML (Naive Bayes-like)",
        "Traditional ML (Decision Tree-like)",
        "Traditional ML (K-Nearest Neighbors-like)",
        "Traditional ML (AdaBoost-like)",
        "Traditional ML (LightGBM-like)",
        "Traditional ML (CatBoost-like)"
    ])
    logger.info("model_priority not found. Initialized with default model list.")

# Validate selected_config
logger.info(f"Initial selected_config: {selected_config}")
if not isinstance(selected_config, dict):
    logger.error(f"selected_config is not a dictionary: {selected_config}")
    raise ValueError("selected_config must be a dictionary with 'approach' and 'model' keys.")
required_keys = ['approach', 'model']
for key in required_keys:
    if key in selected_config:
        continue
    logger.error(f"selected_config missing required key: '{key}'. Current selected_config: {selected_config}")
    raise ValueError(f"selected_config must contain '{key}' key. Please set it in an earlier cell.")

# Validate 'approach' and 'model' values
valid_approaches = ['Traditional ML', 'Deep Learning', 'Hybrid']
valid_models = [
    "Shallow Neural Network (MLP-like)",
    "Recurrent Neural Network (LSTM/GRU-like)",
    "Convolutional Neural Network (CNN-like)",
    "Bidirectional LSTM (BiLSTM-like)",
    "Gated Recurrent Unit (GRU-like)",
    "Feedforward Neural Network (FNN-like)",
    "Hybrid (CNN-RNN)",
    "Deep Learning (Custom Transformer)",
    "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)",
    "BERT (Bidirectional Encoder Representations from Transformers)",
    "RoBERTa (Robustly Optimized BERT Pretraining Approach)",
    "ALBERT (A Lite BERT)",
    "XLNet (Generalized Autoregressive Pretraining)",
    "T5 (Text-To-Text Transfer Transformer)",
    "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)",
    "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)",
    "Longformer (for long documents)",
    "BigBird (sparse attention for long sequences)",
    "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)",
    "Traditional ML (Naive Bayes-like)",
    "Traditional ML (Logistic Regression-like)",
    "Traditional ML (SVM-like)",
    "Traditional ML (Gradient Boosting, e.g., XGBoost-like)",
    "Traditional ML (Random Forest-like)",
    "Traditional ML (Decision Tree-like)",
    "Traditional ML (K-Nearest Neighbors-like)",
    "Traditional ML (AdaBoost-like)",
    "Traditional ML (LightGBM-like)",
    "Traditional ML (CatBoost-like)",
]
# An unknown model is replaced by the representative model of its approach
# family; an unknown approach is a hard error.
if selected_config['model'] not in valid_models:
    logger.warning(f"Invalid 'model' in selected_config: {selected_config['model']}. Must be one of {valid_models}.")
    selected_config['model'] = (
        'Traditional ML (Logistic Regression-like)' if 'Traditional ML' in selected_config['approach']
        else 'Shallow Neural Network (MLP-like)' if 'Deep Learning' in selected_config['approach']
        else 'Hybrid (CNN-RNN)'
    )
    logger.info(f"Set 'model' to fallback: {selected_config['model']}")
if selected_config['approach'] not in valid_approaches:
    logger.error(f"Invalid 'approach' in selected_config: {selected_config['approach']}. Must be one of {valid_approaches}.")
    raise ValueError(f"selected_config['approach'] must be one of {valid_approaches}.")

# Add default hyperparameters if missing
if 'hyperparams' not in selected_config:
    selected_config['hyperparams'] = (
        {
            'n_estimators': (50, 200),
            'max_depth': (10, 30, None),
            'min_samples_split': (2, 10),
        }
        if 'Traditional ML' in selected_config['approach']
        else {  # Deep Learning or Hybrid
            'learning_rate': (1e-5, 5e-5),
            'batch_size': (8, 32),
        }
    )
    logger.info(f"Added default hyperparameters: {selected_config['hyperparams']}")

# Save selected_config to a checkpoint file
checkpoint_path = '/content/drive/MyDrive/selected_config_checkpoint.pkl'
try:
    import pickle
    with open(checkpoint_path, 'wb') as f:
        pickle.dump(selected_config, f)
    logger.info(f"Saved selected_config to {checkpoint_path}")
except Exception as e:
    # Best effort: a failed checkpoint write must not stop the notebook.
    logger.warning(f"Failed to save selected_config checkpoint: {e!s}")

# Bookkeeping for the model-switching trials.
best_accuracy, best_config = accuracy, selected_config.copy()
max_trials, trials = 3, 0
ram_threshold_mb = 10 * 1024

def log_message(msg):
    """Emit ``msg`` through the module-level ``logger`` at INFO level."""
    logger.log(logging.INFO, msg)

# Model-switching trials.
#
# Each iteration pops the next candidate from model_priority, trains it, re-evaluates it and
# keeps the best (accuracy, config) pair seen so far. The loop ends when the target accuracy
# is reached, max_trials counted trials have run, the candidates are exhausted, or RAM usage
# exceeds ram_threshold_mb. A candidate whose training fails is skipped without being counted.

# Substrings that mark a model name as a transformer-backed ("neural") candidate.
# NOTE(review): model_map keys such as "BERT (...)", "RoBERTa (...)" or "ALBERT (...)" contain
# none of these substrings, so those candidates take the final `else` path below and score 0.0.
# Confirm whether that is intended.
neural_model_markers = ("Shallow Neural Network", "Recurrent Neural Network", "Convolutional Neural Network",
                        "Bidirectional LSTM", "Gated Recurrent Unit", "Feedforward Neural Network",
                        "Hybrid", "Deep Learning")

while accuracy < target_accuracy and trials < max_trials and model_priority:
    # Stop searching before the kernel runs out of memory.
    current_ram = log_memory_usage()
    if current_ram > ram_threshold_mb:
        logger.warning(f"RAM usage ({current_ram:.2f} MB) exceeds threshold ({ram_threshold_mb} MB). Stopping trials.")
        break

    next_approach = model_priority.popleft()
    log_message(f"Switching to {next_approach} for trial {trials + 1}")
    selected_config['approach'] = 'Traditional ML' if 'Traditional ML' in next_approach else 'Deep Learning' if 'Deep Learning' in next_approach else 'Hybrid'
    selected_config['model'] = next_approach

    is_neural = any(marker in next_approach for marker in neural_model_markers)
    # "/" in a model name (e.g. "LSTM/GRU-like") would be read as a directory separator and
    # make the save fail, so it is replaced before the artifact file name is built.
    artifact_stem = next_approach.replace(" ", "_").replace("/", "-").lower()

    # Pre-bind every name the cleanup code touches so an early failure cannot surface as a
    # NameError that hides the real error.
    maml_model = None
    maml = None
    ml_model = None
    vectorizer = None

    # ---- Train ----------------------------------------------------------------
    if is_neural:
        from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification
        model_map = {
            "Shallow Neural Network (MLP-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
            "Recurrent Neural Network (LSTM/GRU-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
            "Convolutional Neural Network (CNN-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
            "Bidirectional LSTM (BiLSTM-like)": (BertForSequenceClassification, 'bert-base-uncased'),
            "Gated Recurrent Unit (GRU-like)": (BertForSequenceClassification, 'bert-base-uncased'),
            "Feedforward Neural Network (FNN-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
            "Hybrid (CNN-RNN)": (BertForSequenceClassification, 'bert-base-uncased'),
            "Deep Learning (Custom Transformer)": (BertForSequenceClassification, 'bert-base-uncased'),
            "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
            "BERT (Bidirectional Encoder Representations from Transformers)": (BertForSequenceClassification, 'bert-base-uncased'),
            "RoBERTa (Robustly Optimized BERT Pretraining Approach)": (RobertaForSequenceClassification, 'roberta-base'),
            "ALBERT (A Lite BERT)": (DistilBertForSequenceClassification, 'albert-base-v2'),
            "XLNet (Generalized Autoregressive Pretraining)": (BertForSequenceClassification, 'xlnet-base-cased'),
            "T5 (Text-To-Text Transfer Transformer)": (BertForSequenceClassification, 't5-small'),
            "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)": (BertForSequenceClassification, 'deberta-base'),
            "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)": (BertForSequenceClassification, 'electra-base-discriminator'),
            "Longformer (for long documents)": (BertForSequenceClassification, 'longformer-base-4096'),
            "BigBird (sparse attention for long sequences)": (BertForSequenceClassification, 'google/bigbird-roberta-base'),
            "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)": (RobertaForSequenceClassification, 'roberta-base')
        }
        model_class, pretrained_model = model_map.get(next_approach, (DistilBertForSequenceClassification, 'distilbert-base-uncased'))
        try:
            # Binary classification is assumed when no label array is available.
            num_labels = len(set(labels)) if 'labels' in globals() and labels.size > 0 else 2
            maml_model = model_class.from_pretrained(pretrained_model, num_labels=num_labels)
            maml_model.to(device)
            meta_params_path = '/content/drive/MyDrive/meta_learned_params.pt'
            if os.path.exists(meta_params_path):
                try:
                    # map_location lets weights saved on a GPU load on a CPU-only runtime.
                    maml_model.load_state_dict(torch.load(meta_params_path, map_location=device))
                except Exception as e:
                    logger.warning(f"Failed to load meta parameters: {e}. Using pre-trained weights.")
            maml = MAML(maml_model)
            maml_model.train()
            for epoch in tqdm(range(1), desc=f"Fine-Tuning {next_approach}"):
                for batch in loader:
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                    labels_batch = batch['labels'].to(device)
                    updated_params = maml.inner_update(inputs, labels_batch)
                    maml_model.load_state_dict(updated_params)
                    torch.cuda.empty_cache()
            fine_tuned_path = os.path.join('/content/drive/MyDrive', f'fine_tuned_{artifact_stem}.pt')
            torch.save(maml_model.state_dict(), fine_tuned_path)
            log_message(f"Fine-tuned model saved to {fine_tuned_path}")
        except Exception as e:
            logger.error(f"Failed to train {next_approach}: {e}. Skipping to next model.")
            continue
        finally:
            # Rebind instead of `del`: `del` raises NameError when the failure happened before
            # the name was bound, which would replace the handled error and abort the loop.
            maml_model = None
            maml = None
            torch.cuda.empty_cache()
            gc.collect()
    elif "Traditional ML" in next_approach:
        ml_model_map = {
            "Traditional ML (Naive Bayes-like)": None,
            "Traditional ML (Logistic Regression-like)": LogisticRegression(max_iter=1000),
            "Traditional ML (SVM-like)": SVC(probability=True),
            "Traditional ML (Gradient Boosting, e.g., XGBoost-like)": GradientBoostingClassifier(),
            "Traditional ML (Random Forest-like)": RandomForestClassifier(),
            "Traditional ML (Decision Tree-like)": None,
            "Traditional ML (K-Nearest Neighbors-like)": None,
            "Traditional ML (AdaBoost-like)": None,
            "Traditional ML (LightGBM-like)": None,
            "Traditional ML (CatBoost-like)": None
        }
        ml_model = ml_model_map.get(next_approach)
        if ml_model is not None:
            try:
                from sklearn.model_selection import train_test_split

                # Load column names dynamically
                project_dir = '/content/drive/MyDrive/Sentiment_Project'
                prompt_file = os.path.join(project_dir, 'user_dataset_prompt.json')
                if os.path.exists(prompt_file):
                    with open(prompt_file, 'r') as f:
                        prompt_data = json.load(f)
                    text_column = prompt_data.get('text_column', 'review')
                    label_column = prompt_data.get('label_column', 'sentiment')
                else:
                    text_cols = [col for col in df.columns if df[col].dtype == 'object' and df[col].str.len().mean() > 10]
                    label_cols = [col for col in df.columns if df[col].dtype in ['object', 'int', 'float'] and df[col].nunique() < len(df) / 10]
                    if not text_cols or not label_cols:
                        raise ValueError("Could not infer text or label columns. Ensure dataset contains text and categorical label columns.")
                    text_column, label_column = text_cols[0], label_cols[0]

                vectorizer = TfidfVectorizer(max_features=5000)
                X = vectorizer.fit_transform(df[text_column].fillna(''))
                # Dynamically map labels
                unique_labels = df[label_column].unique()
                label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
                y = df[label_column].map(label_mapping).fillna(0).astype(int)
                # Hold out 20% for scoring. Scoring on the rows the model was fitted on
                # reports a near-perfect accuracy that says nothing about generalization.
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                ml_model.fit(X_train, y_train)
                fine_tuned_path = os.path.join('/content/drive/MyDrive', f'fine_tuned_{artifact_stem}.joblib')
                joblib.dump(ml_model, fine_tuned_path)
                log_message(f"Fine-tuned ML model saved to {fine_tuned_path}")
            except Exception as e:
                logger.error(f"Failed to train {next_approach}: {e}. Skipping to next model.")
                continue
        else:
            log_message(f"Skipping fine-tuning for {next_approach} (model not implemented).")

    # ---- Re-evaluate ----------------------------------------------------------
    predictions = []
    true_labels = []
    if is_neural:
        # NOTE(review): evaluation iterates the same `loader` that was used for fine-tuning,
        # so this accuracy is measured on training batches.
        try:
            maml_model = model_class.from_pretrained(pretrained_model, num_labels=num_labels)
            maml_model.to(device)
            if os.path.exists(fine_tuned_path):
                maml_model.load_state_dict(torch.load(fine_tuned_path, map_location=device))
            maml_model.eval()
            with torch.no_grad():
                for batch in tqdm(loader, desc=f"Evaluating {next_approach}"):
                    if len(batch['input_ids']) == 0:
                        log_message("Empty batch in loader. Skipping evaluation.")
                        break
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                    labels_batch = batch['labels'].to(device)
                    outputs = maml_model(**inputs)
                    logits = outputs.logits
                    preds = torch.argmax(logits, dim=1)
                    predictions.extend(preds.cpu().numpy())
                    true_labels.extend(labels_batch.cpu().numpy())
                    torch.cuda.empty_cache()
            accuracy = accuracy_score(true_labels, predictions) if predictions else 0.0
            f1 = f1_score(true_labels, predictions, average='weighted') if predictions else 0.0
        except Exception as e:
            logger.error(f"Evaluation failed for {next_approach}: {e}. Setting accuracy to 0.")
            accuracy = 0.0
            f1 = 0.0
        finally:
            # Rebinding is safe even when from_pretrained failed before binding the model.
            maml_model = None
            torch.cuda.empty_cache()
            gc.collect()
    elif "Traditional ML" in next_approach and ml_model is not None:
        try:
            predictions = ml_model.predict(X_test)
            true_labels = y_test
            accuracy = accuracy_score(true_labels, predictions)
            f1 = f1_score(true_labels, predictions, average='weighted')
        except Exception as e:
            logger.error(f"Evaluation failed for {next_approach}: {e}. Setting accuracy to 0.")
            accuracy = 0.0
            f1 = 0.0
        finally:
            gc.collect()
    else:
        # Candidate is not implemented (or matched no branch): it cannot win.
        accuracy = 0.0
        f1 = 0.0

    # ---- Track the incumbent --------------------------------------------------
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_config = selected_config.copy()
        log_message(f"New best model: {next_approach} with accuracy {best_accuracy:.3f}")
    trials += 1

# Finalize selected_config
# The trial loop rewrites selected_config on every iteration, so at this point it describes
# the LAST model tried, which is not necessarily the best one. Comparing accuracies alone
# misses the tie case (e.g. every trial scored 0.0): the last model would then be kept and
# reported as the "initial" one. Restore the incumbent whenever the configs differ so the
# reported model and accuracy always belong together.
if selected_config != best_config:
    selected_config = best_config
    accuracy = best_accuracy
    log_message(f"Final selected model switched to {selected_config['model']} with best accuracy {best_accuracy:.3f}")
else:
    accuracy = best_accuracy
    log_message(f"Final selected model remains {selected_config['model']} (accuracy: {accuracy:.3f})")

# Overwrite the checkpoint with the final selected_config; failures only produce a warning.
try:
    with open(checkpoint_path, 'wb') as checkpoint_file:
        pickle.dump(selected_config, checkpoint_file)
    logger.info(f"Updated selected_config saved to {checkpoint_path}")
except Exception as update_error:
    logger.warning(f"Failed to update selected_config checkpoint: {update_error!s}")

# Release cached GPU memory and unreachable objects, then log the resulting usage.
torch.cuda.empty_cache()
gc.collect()
log_memory_usage()

# Report the chosen configuration and whether the target was reached.
summary_lines = [
    "\nFinal Model Configuration:",
    f"Approach: {selected_config['approach']}",
    f"Model: {selected_config['model']}",
    f"Best Accuracy Achieved: {best_accuracy:.3f}",
    f"Target Accuracy: {target_accuracy:.3f}",
    f"Target Achieved: {best_accuracy >= target_accuracy}",
]
print("\n".join(summary_lines))




Final Model Configuration:
Approach: Traditional ML
Model: Traditional ML (Random Forest-like)
Best Accuracy Achieved: 1.000
Target Accuracy: 0.920
Target Achieved: True


In [None]:
# Cell 7: Model Switching and Iterative Training with Hyperparameter Optimization
# Purpose: Switch to alternative models if target accuracy is not met, optimize hyperparameters using Bayesian Optimization, and retrain with memory optimization.

import torch
import gc
from collections import deque
import joblib
from tqdm import tqdm
import os
import logging
import psutil
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, Subset
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
import pandas as pd
import json

# Pick the compute device: CUDA when available, CPU otherwise.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Logging for this cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Project directory on the mounted drive; created when absent.
project_dir = '/content/drive/MyDrive/Sentiment_Project'
os.makedirs(project_dir, exist_ok=True)

# Memory monitoring helper
def log_memory_usage():
    """Log current memory consumption and return the process RAM usage.

    Logs the resident set size of the current process and, when CUDA is
    available, the GPU memory currently allocated by torch.

    Returns:
        float: resident set size of the current process in MB.
    """
    bytes_per_mb = 1024 ** 2
    ram_usage_mb = psutil.Process().memory_info().rss / bytes_per_mb
    logger.info(f"Current RAM usage: {ram_usage_mb:.2f} MB")
    if not torch.cuda.is_available():
        return ram_usage_mb
    gpu_mem = torch.cuda.memory_allocated() / bytes_per_mb
    logger.info(f"Current GPU memory usage: {gpu_mem:.2f} MB")
    return ram_usage_mb

# SentimentDataset definition
class SentimentDataset(torch.utils.data.Dataset):
    """Tensor-backed dataset yielding ``input_ids`` / ``attention_mask`` / ``labels`` dicts.

    Construction never raises: if any input is ``None``, cannot be converted to a tensor,
    or the three inputs disagree in length, the error is logged and the instance becomes
    an empty dataset (``len() == 0``). Callers that need a hard failure must check the
    length themselves.

    Args:
        input_ids: sequence/array/tensor convertible to a ``torch.long`` tensor.
        attention_mask: same leading length as ``input_ids``.
        labels: same leading length as ``input_ids``.
    """

    @staticmethod
    def _as_long_tensor(values):
        """Return an independent ``torch.long`` tensor holding ``values``."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct warning; clone explicitly instead.
            return values.clone().detach().to(torch.long)
        return torch.tensor(values, dtype=torch.long)

    def __init__(self, input_ids, attention_mask, labels):
        try:
            if input_ids is None or attention_mask is None or labels is None:
                raise ValueError("Input data (input_ids, attention_mask, labels) cannot be None")
            self.input_ids = self._as_long_tensor(input_ids)
            self.attention_mask = self._as_long_tensor(attention_mask)
            self.labels = self._as_long_tensor(labels)
            if len(self.input_ids) != len(self.attention_mask) or len(self.input_ids) != len(self.labels):
                raise ValueError(f"Length mismatch: input_ids ({len(self.input_ids)}), attention_mask ({len(self.attention_mask)}), labels ({len(self.labels)})")
            logger.info(f"SentimentDataset initialized with {len(self.labels)} samples")
        except Exception as e:
            logger.error(f"Failed to initialize SentimentDataset: {e}. Using empty dataset.")
            # Keep the fallback tensors in the same dtype as the normal path.
            self.input_ids = torch.empty(0, dtype=torch.long)
            self.attention_mask = torch.ones(0, 128, dtype=torch.long)
            self.labels = torch.empty(0, dtype=torch.long)

    def __len__(self):
        """Number of samples (0 for the empty fallback dataset)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors; indexing errors are logged and re-raised."""
        try:
            return {'input_ids': self.input_ids[idx], 'attention_mask': self.attention_mask[idx], 'labels': self.labels[idx]}
        except Exception as e:
            logger.error(f"Error in __getitem__ at index {idx}: {e}")
            raise

# Read the preprocessed dataset from the project directory; a read failure is logged and re-raised.
dataset_path = os.path.join(project_dir, 'processed_dataset.csv')
try:
    df = pd.read_csv(dataset_path)
except Exception as load_error:
    logger.error(f"Failed to load dataset from {dataset_path}: {load_error}")
    raise
else:
    logger.info(f"Loaded dataset from {dataset_path} with {len(df)} samples.")

# Recreate loader if not defined
# When an earlier cell did not leave a usable `loader` behind, rebuild it from the tokenized
# arrays. Any failure is logged and replaced by an empty loader so the cell keeps running.
if 'loader' not in globals() or loader is None:
    try:
        logger.info("Loader not found. Recreating from input_ids, attention_mask, and labels.")
        # All three arrays must exist. (The previous check tested `'labels' in globals()`,
        # which raised exactly when labels WAS available.)
        missing_names = [name for name in ('input_ids', 'attention_mask', 'labels') if name not in globals()]
        if missing_names:
            raise NameError("Required data (input_ids, attention_mask, labels) not found in globals. Please run earlier cells (e.g., Cell 2).")
        dataset = SentimentDataset(input_ids, attention_mask, labels)
        if len(dataset) == 0:
            raise ValueError("Dataset is empty after initialization.")
        loader = DataLoader(dataset, batch_size=8, shuffle=True)
        logger.info(f"Recreated loader with {len(dataset)} samples.")
        # Pull one batch to verify the loader actually yields data.
        first_batch = next(iter(loader), None)
        if first_batch is None:
            raise ValueError("Loader is empty; no batches available.")
        logger.info(f"First batch shapes: input_ids={first_batch['input_ids'].shape}, attention_mask={first_batch['attention_mask'].shape}, labels={first_batch['labels'].shape}")
    except Exception as e:
        logger.error(f"Failed to recreate loader: {e}. Using empty loader.")
        dataset = SentimentDataset(None, None, None)
        # shuffle=False: shuffling an empty dataset is meaningless, and the random sampler
        # used for shuffle=True can reject a zero-length dataset with a ValueError.
        loader = DataLoader(dataset, batch_size=8, shuffle=False)

# Limit dataset size for memory efficiency
# Datasets above the cap are replaced by a random subset; smaller ones are re-wrapped as-is.
max_loader_samples = 1000
full_dataset = loader.dataset
if len(full_dataset) > max_loader_samples:
    # Plain Python ints keep Subset indexing independent of tensor-index semantics.
    subset_indices = torch.randperm(len(full_dataset))[:max_loader_samples].tolist()
    subset_dataset = Subset(full_dataset, subset_indices)
    loader = DataLoader(subset_dataset, batch_size=8, shuffle=True)
    logger.info("Dataset size capped at 1000 samples to reduce memory usage.")
else:
    # Only shuffle when there is something to shuffle: the random sampler behind
    # shuffle=True can reject a zero-length dataset (the empty fallback loader case).
    loader = DataLoader(full_dataset, batch_size=8, shuffle=len(full_dataset) > 0)
    logger.info(f"Using full dataset with {len(loader.dataset)} samples.")

# Define all possible models
# Order matters: this cell builds `model_priority` as deque(ALL_MODELS) and consumes it from
# the left, one candidate per trial, so entries near the top are tried first. The same list
# is used as `valid_models` when validating selected_config['model'], and its strings are
# the keys of hyperparam_spaces, ml_model_map and model_map - keep them byte-identical.
ALL_MODELS = [
    "Traditional ML (Logistic Regression-like)",
    "Traditional ML (Random Forest-like)",
    "Shallow Neural Network (MLP-like)",
    "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)",
    "BERT (Bidirectional Encoder Representations from Transformers)",
    "RoBERTa (Robustly Optimized BERT Pretraining Approach)",
    "ALBERT (A Lite BERT)",
    "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)",
    "Traditional ML (SVM-like)",
    "Traditional ML (Gradient Boosting, e.g., XGBoost-like)",
    "Recurrent Neural Network (LSTM/GRU-like)",
    "Convolutional Neural Network (CNN-like)",
    "Bidirectional LSTM (BiLSTM-like)",
    "Gated Recurrent Unit (GRU-like)",
    "Feedforward Neural Network (FNN-like)",
    "Hybrid (CNN-RNN)",
    "Deep Learning (Custom Transformer)",
    "XLNet (Generalized Autoregressive Pretraining)",
    "T5 (Text-To-Text Transfer Transformer)",
    "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)",
    "Longformer (for long documents)",
    "BigBird (sparse attention for long sequences)",
    "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)",
    "Traditional ML (Naive Bayes-like)",
    "Traditional ML (Decision Tree-like)",
    "Traditional ML (K-Nearest Neighbors-like)",
    "Traditional ML (AdaBoost-like)",
    "Traditional ML (LightGBM-like)",
    "Traditional ML (CatBoost-like)"
]

# Provide placeholders for state normally produced by earlier cells.
# Each row: (global name, factory for the default, logger method, message to log).
_cell_fallbacks = (
    ('accuracy', lambda: 0.0, logger.warning, "accuracy not found. Defaulting to 0.0."),
    (
        'selected_config',
        lambda: {
            'approach': 'Deep Learning',
            'model': 'Shallow Neural Network (MLP-like)',
            'hyperparams': {}
        },
        logger.warning,
        "selected_config not found. Initialized with placeholder.",
    ),
    ('target_accuracy', lambda: 0.85, logger.warning, "target_accuracy not found. Defaulting to 0.85."),
    (
        'model_priority',
        lambda: deque(ALL_MODELS),
        logger.info,
        "model_priority not found. Initialized with default model list.",
    ),
)
for _var_name, _build_default, _announce, _notice in _cell_fallbacks:
    # Only names that are absent from the notebook namespace receive a default.
    if _var_name not in globals():
        globals()[_var_name] = _build_default()
        _announce(_notice)

# Check the shape of selected_config before any trial runs; every violation is logged and raised.
logger.info(f"Initial selected_config: {selected_config}")
config_is_dict = isinstance(selected_config, dict)
if not config_is_dict:
    logger.error(f"selected_config is not a dictionary: {selected_config}")
    raise ValueError("selected_config must be a dictionary with 'approach' and 'model' keys.")

# Both keys are mandatory; the first missing one aborts the cell.
required_keys = ['approach', 'model']
for key in required_keys:
    if key in selected_config:
        continue
    logger.error(f"selected_config missing required key: '{key}'. Current selected_config: {selected_config}")
    raise ValueError(f"selected_config must contain '{key}' key. Please set it in an earlier cell.")

# The values must come from the known approaches and from ALL_MODELS.
valid_approaches = ['Traditional ML', 'Deep Learning', 'Hybrid']
valid_models = ALL_MODELS
approach_ok = selected_config['approach'] in valid_approaches
if not approach_ok:
    logger.error(f"Invalid 'approach' in selected_config: {selected_config['approach']}. Must be one of {valid_approaches}.")
    raise ValueError(f"selected_config['approach'] must be one of {valid_approaches}.")
model_ok = selected_config['model'] in valid_models
if not model_ok:
    logger.error(f"Invalid 'model' in selected_config: {selected_config['model']}. Must be one of {valid_models}.")
    raise ValueError(f"selected_config['model'] must be one of {valid_models}.")

# Hyperparameter search spaces, keyed by model name (same strings as ALL_MODELS).
# An empty list means "no search space": the trial loop skips optimization for that model.

def _lr_dim():
    """Fresh log-uniform learning-rate dimension shared by all neural spaces."""
    return Real(1e-5, 1e-2, name='learning_rate', prior='log-uniform')


def _batch_dim():
    """Fresh batch-size dimension shared by all neural spaces."""
    return Integer(8, 64, name='batch_size')


def _neural_space(*extra_dims):
    """Learning rate + batch size, followed by any model-specific dimensions."""
    return [_lr_dim(), _batch_dim(), *extra_dims]


def _hidden_size_dim():
    """Fresh hidden-size dimension used by the recurrent-style spaces."""
    return Integer(50, 200, name='hidden_size')


def _filters_dim():
    """Fresh filter-count dimension used by the convolution-style spaces."""
    return Integer(16, 128, name='filters')


def _num_layers_dim():
    """Fresh layer-count dimension used by the MLP/FNN spaces."""
    return Integer(1, 5, name='num_layers')


hyperparam_spaces = {
    # Traditional ML Models
    "Traditional ML (Logistic Regression-like)": [
        Integer(1, 1000, name='max_iter'),
        Real(1e-4, 1e2, name='C', prior='log-uniform'),
    ],
    "Traditional ML (Random Forest-like)": [
        Integer(10, 200, name='n_estimators'),
        Integer(2, 50, name='max_depth'),
        Integer(2, 10, name='min_samples_split'),
    ],
    "Traditional ML (SVM-like)": [
        Real(1e-4, 1e2, name='C', prior='log-uniform'),
        Categorical(['linear', 'rbf'], name='kernel'),
    ],
    "Traditional ML (Gradient Boosting, e.g., XGBoost-like)": [
        Integer(10, 200, name='n_estimators'),
        Real(1e-3, 1.0, name='learning_rate', prior='log-uniform'),
        Integer(1, 20, name='max_depth'),
    ],
    "Traditional ML (Naive Bayes-like)": [],
    "Traditional ML (Decision Tree-like)": [],
    "Traditional ML (K-Nearest Neighbors-like)": [],
    "Traditional ML (AdaBoost-like)": [],
    "Traditional ML (LightGBM-like)": [],
    "Traditional ML (CatBoost-like)": [],
    # Deep Learning Models
    "Shallow Neural Network (MLP-like)": _neural_space(_num_layers_dim()),
    "Recurrent Neural Network (LSTM/GRU-like)": _neural_space(_hidden_size_dim()),
    "Convolutional Neural Network (CNN-like)": _neural_space(_filters_dim()),
    "Bidirectional LSTM (BiLSTM-like)": _neural_space(_hidden_size_dim()),
    "Gated Recurrent Unit (GRU-like)": _neural_space(_hidden_size_dim()),
    "Feedforward Neural Network (FNN-like)": _neural_space(_num_layers_dim()),
    "Deep Learning (Custom Transformer)": _neural_space(Integer(2, 12, name='num_attention_heads')),
    "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)": _neural_space(),
    "BERT (Bidirectional Encoder Representations from Transformers)": _neural_space(),
    "RoBERTa (Robustly Optimized BERT Pretraining Approach)": _neural_space(),
    "ALBERT (A Lite BERT)": _neural_space(),
    "XLNet (Generalized Autoregressive Pretraining)": _neural_space(),
    "T5 (Text-To-Text Transfer Transformer)": _neural_space(),
    "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)": _neural_space(),
    "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)": _neural_space(),
    "Longformer (for long documents)": _neural_space(),
    "BigBird (sparse attention for long sequences)": _neural_space(),
    "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)": _neural_space(),
    # Hybrid Models
    "Hybrid (CNN-RNN)": _neural_space(_filters_dim(), _hidden_size_dim()),
}

# Define model mapping
# Maps each Traditional ML model name to a constructor (an estimator class, or a small
# factory for SVC that forces probability=True and forwards the remaining keyword
# arguments). Unlike a mapping of ready-made instances, nothing is instantiated here.
# A value of None marks a model name that has no estimator wired up.
# NOTE(review): presumably the constructors are called with the sampled hyperparameters
# inside the optimization objective - verify against the trial loop.
ml_model_map = {
    "Traditional ML (Naive Bayes-like)": None,
    "Traditional ML (Logistic Regression-like)": LogisticRegression,
    "Traditional ML (SVM-like)": lambda **params: SVC(probability=True, **params),
    "Traditional ML (Gradient Boosting, e.g., XGBoost-like)": GradientBoostingClassifier,
    "Traditional ML (Random Forest-like)": RandomForestClassifier,
    "Traditional ML (Decision Tree-like)": None,
    "Traditional ML (K-Nearest Neighbors-like)": None,
    "Traditional ML (AdaBoost-like)": None,
    "Traditional ML (LightGBM-like)": None,
    "Traditional ML (CatBoost-like)": None
}

from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification
# Maps each neural model name to (model class, pretrained checkpoint id); the trial loop
# calls `model_class.from_pretrained(checkpoint, num_labels=...)` on the selected pair.
# Every entry resolves to one of three classes (DistilBERT, BERT or RoBERTa sequence
# classifiers), regardless of the architecture the name suggests.
# NOTE(review): several entries pair a class with a checkpoint of a different model family
# (e.g. DistilBertForSequenceClassification with 'albert-base-v2', BertForSequenceClassification
# with 'xlnet-base-cased' / 't5-small'); confirm these load meaningful weights.
# NOTE(review): 'deberta-base', 'electra-base-discriminator' and 'longformer-base-4096' carry
# no organization prefix, unlike 'google/bigbird-roberta-base' - confirm they resolve on the Hub.
# NOTE(review): any change of model family must stay compatible with the tokenizer that
# produced input_ids upstream - verify before editing entries.
model_map = {
    "Shallow Neural Network (MLP-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "Recurrent Neural Network (LSTM/GRU-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "Convolutional Neural Network (CNN-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "Bidirectional LSTM (BiLSTM-like)": (BertForSequenceClassification, 'bert-base-uncased'),
    "Gated Recurrent Unit (GRU-like)": (BertForSequenceClassification, 'bert-base-uncased'),
    "Feedforward Neural Network (FNN-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "Hybrid (CNN-RNN)": (BertForSequenceClassification, 'bert-base-uncased'),
    "Deep Learning (Custom Transformer)": (BertForSequenceClassification, 'bert-base-uncased'),
    "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "BERT (Bidirectional Encoder Representations from Transformers)": (BertForSequenceClassification, 'bert-base-uncased'),
    "RoBERTa (Robustly Optimized BERT Pretraining Approach)": (RobertaForSequenceClassification, 'roberta-base'),
    "ALBERT (A Lite BERT)": (DistilBertForSequenceClassification, 'albert-base-v2'),
    "XLNet (Generalized Autoregressive Pretraining)": (BertForSequenceClassification, 'xlnet-base-cased'),
    "T5 (Text-To-Text Transfer Transformer)": (BertForSequenceClassification, 't5-small'),
    "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)": (BertForSequenceClassification, 'deberta-base'),
    "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)": (BertForSequenceClassification, 'electra-base-discriminator'),
    "Longformer (for long documents)": (BertForSequenceClassification, 'longformer-base-4096'),
    "BigBird (sparse attention for long sequences)": (BertForSequenceClassification, 'google/bigbird-roberta-base'),
    "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)": (RobertaForSequenceClassification, 'roberta-base')
}

# Build the TF-IDF features and integer labels used by the Traditional ML models.
from sklearn.model_selection import train_test_split

# Column names come from the user's prompt file when present, otherwise they are inferred.
prompt_file = os.path.join(project_dir, 'user_dataset_prompt.json')
if not os.path.exists(prompt_file):
    # Infer columns dynamically: long string columns are text candidates,
    # low-cardinality columns are label candidates.
    text_cols = [name for name in df.columns
                 if df[name].dtype == 'object' and df[name].str.len().mean() > 10]
    label_cols = [name for name in df.columns
                  if df[name].dtype in ['object', 'int', 'float'] and df[name].nunique() < len(df) / 10]
    if not (text_cols and label_cols):
        raise ValueError("Could not infer text or label columns. Ensure dataset contains text and categorical label columns.")
    text_column = text_cols[0]
    label_column = label_cols[0]
    logger.info("Inferred columns - Text: %s, Label: %s", text_column, label_column)
else:
    with open(prompt_file, 'r') as f:
        prompt_data = json.load(f)
    text_column = prompt_data.get('text_column', 'review')
    label_column = prompt_data.get('label_column', 'sentiment')
    logger.info("Loaded columns from prompt - Text: %s, Label: %s", text_column, label_column)

# Both columns must exist in the loaded frame.
columns_present = text_column in df.columns and label_column in df.columns
if not columns_present:
    raise ValueError(f"Missing inferred columns: {text_column} or {label_column}.")

# TF-IDF features over the text column (missing text becomes an empty string).
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df[text_column].fillna(''))

# Map each distinct label value to an integer id in order of first appearance;
# labels that do not map are assigned id 0.
unique_labels = df[label_column].unique()
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
y = df[label_column].map(label_mapping).fillna(0).astype(int)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Trial bookkeeping: the incumbent starts out as the current config and its accuracy.
max_trials = 3                  # upper bound on counted trials
trials = 0                      # counted trials so far
ram_threshold_mb = 10 * 1024    # 10240 MB; the trial loop stops above this RSS
best_accuracy = accuracy
best_config = selected_config.copy()


def log_message(msg):
    """Emit ``msg`` through the module-level ``logger`` at INFO level."""
    logger.log(logging.INFO, msg)


# Every run of this cell starts from the full candidate list.
model_priority = deque(ALL_MODELS)
logger.info(f"model_priority reset for trials: {list(model_priority)}")

# Substrings that mark a model name as transformer-backed: such models are looked up in
# `model_map` and fine-tuned with MAML on `loader` instead of being fitted with scikit-learn.
NEURAL_KEYWORDS = ("Shallow Neural Network", "Recurrent Neural Network", "Convolutional Neural Network",
                   "Bidirectional LSTM", "Gated Recurrent Unit", "Feedforward Neural Network",
                   "Hybrid", "Deep Learning")

# Model search. Each iteration takes the next candidate from `model_priority`, tunes its
# hyperparameters with Bayesian optimization, retrains it with the best parameters, scores
# it, and records it in `best_config` when it beats `best_accuracy`. The loop stops when
# the target accuracy is reached, the trial budget is spent, the queue is empty, or memory
# usage crosses `ram_threshold_mb`.
#
# Traditional ML models are fitted on X_train/y_train and scored on the untouched
# X_test/y_test split, so the stopping criterion is a hold-out score rather than
# training accuracy. The split itself is never overwritten or deleted here because the
# optimization objective of every later trial depends on it.
#
# NOTE(review): the transformer path still trains and evaluates on the same `loader`;
# no separate validation loader is visible in this cell, so that score is optimistic.
while accuracy < target_accuracy and trials < max_trials and model_priority:
    current_ram = log_memory_usage()
    if current_ram > ram_threshold_mb:
        logger.warning(f"RAM usage ({current_ram:.2f} MB) exceeds threshold ({ram_threshold_mb} MB). Stopping trials.")
        break

    next_approach = model_priority.popleft()
    log_message(f"Switching to {next_approach} for trial {trials + 1}")
    selected_config['approach'] = 'Traditional ML' if 'Traditional ML' in next_approach else 'Deep Learning' if 'Deep Learning' in next_approach else 'Hybrid'
    selected_config['model'] = next_approach

    # The transformer check takes precedence over the Traditional ML check.
    is_neural = any(s in next_approach for s in NEURAL_KEYWORDS)
    is_traditional = (not is_neural) and "Traditional ML" in next_approach

    space = hyperparam_spaces.get(next_approach, [])
    if not space:
        logger.warning(f"No hyperparameter space defined for {next_approach}. Skipping optimization.")
        selected_config['hyperparams'] = {}
    else:
        @use_named_args(space)
        def objective(**params):
            """Score one hyperparameter sample.

            Returns the negated accuracy (gp_minimize minimizes), or 1.0 - worse than any
            real score - when the model is unavailable or the attempt fails.
            """
            try:
                if is_neural:
                    model_class, pretrained_model = model_map.get(next_approach, (DistilBertForSequenceClassification, 'distilbert-base-uncased'))
                    maml_model = model_class.from_pretrained(pretrained_model, num_labels=len(set(labels)) if 'labels' in globals() and labels.size > 0 else 2)
                    maml_model.to(device)
                    meta_params_path = os.path.join(project_dir, 'meta_learned_params.pt')
                    if os.path.exists(meta_params_path):
                        try:
                            # map_location lets a GPU-saved checkpoint load on a CPU-only runtime.
                            maml_model.load_state_dict(torch.load(meta_params_path, map_location=device))
                        except Exception as e:
                            logger.warning(f"Failed to load meta parameters: {e}. Using pre-trained weights.")
                    maml = MAML(maml_model)
                    maml_model.train()
                    learning_rate = params.get('learning_rate', 1e-5)
                    for epoch in range(1):
                        for batch in loader:
                            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                            labels_batch = batch['labels'].to(device)
                            updated_params = maml.inner_update(inputs, labels_batch, learning_rate=learning_rate)
                            maml_model.load_state_dict(updated_params)
                            torch.cuda.empty_cache()
                    fine_tuned_path = os.path.join(project_dir, f'fine_tuned_{next_approach.replace(" ", "_").lower()}.pt')
                    torch.save(maml_model.state_dict(), fine_tuned_path)
                    maml_model.eval()
                    predictions = []
                    true_labels = []
                    with torch.no_grad():
                        for batch in loader:
                            if len(batch['input_ids']) == 0:
                                logger.warning("Empty batch in loader. Skipping evaluation.")
                                return 1.0
                            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                            labels_batch = batch['labels'].to(device)
                            outputs = maml_model(**inputs)
                            logits = outputs.logits
                            preds = torch.argmax(logits, dim=1)
                            predictions.extend(preds.cpu().numpy())
                            true_labels.extend(labels_batch.cpu().numpy())
                            torch.cuda.empty_cache()
                    acc = accuracy_score(true_labels, predictions) if predictions else 0.0
                    del maml_model, maml
                    torch.cuda.empty_cache()
                    gc.collect()
                elif is_traditional:
                    ml_model_class = ml_model_map.get(next_approach)
                    if ml_model_class is None:
                        logger.warning(f"Model {next_approach} not implemented. Skipping.")
                        return 1.0
                    ml_model = ml_model_class(**params)
                    ml_model.fit(X_train, y_train)
                    predictions = ml_model.predict(X_test)
                    acc = accuracy_score(y_test, predictions)
                else:
                    return 1.0
                return -acc
            except Exception as e:
                logger.error(f"Objective function failed for {next_approach} with params {params}: {e}")
                return 1.0

        try:
            result = gp_minimize(
                objective,
                space,
                n_calls=10,
                n_random_starts=3,
                random_state=42,
                verbose=True
            )
            optimized_params = {dim.name: result.x[i] for i, dim in enumerate(space)}
            selected_config['hyperparams'] = optimized_params
            logger.info(f"Optimized hyperparameters for {next_approach}: {optimized_params}")
            # result.fun is 1.0 when every call failed; never report a negative accuracy.
            accuracy = max(0.0, -result.fun)
        except Exception as e:
            logger.error(f"Bayesian Optimization failed for {next_approach}: {e}")
            selected_config['hyperparams'] = {}
            accuracy = 0.0

    # Bind every handle up front so the cleanup code below can never hit an unbound name.
    maml_model = None
    maml = None
    ml_model = None

    # Retrain with optimized hyperparameters
    if is_neural:
        model_class, pretrained_model = model_map.get(next_approach, (DistilBertForSequenceClassification, 'distilbert-base-uncased'))
        try:
            maml_model = model_class.from_pretrained(pretrained_model, num_labels=len(set(labels)) if 'labels' in globals() and labels.size > 0 else 2)
            maml_model.to(device)
            meta_params_path = os.path.join(project_dir, 'meta_learned_params.pt')
            if os.path.exists(meta_params_path):
                try:
                    maml_model.load_state_dict(torch.load(meta_params_path, map_location=device))
                except Exception as e:
                    logger.warning(f"Failed to load meta parameters: {e}. Using pre-trained weights.")
            maml = MAML(maml_model)
            maml_model.train()
            learning_rate = selected_config['hyperparams'].get('learning_rate', 1e-5)
            for epoch in tqdm(range(1), desc=f"Fine-Tuning {next_approach}"):
                for batch in loader:
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                    labels_batch = batch['labels'].to(device)
                    updated_params = maml.inner_update(inputs, labels_batch, learning_rate=learning_rate)
                    maml_model.load_state_dict(updated_params)
                    torch.cuda.empty_cache()
            fine_tuned_path = os.path.join(project_dir, f'fine_tuned_{next_approach.replace(" ", "_").lower()}.pt')
            torch.save(maml_model.state_dict(), fine_tuned_path)
            log_message(f"Fine-tuned model saved to {fine_tuned_path}")
        except Exception as e:
            logger.error(f"Failed to train {next_approach}: {e}. Skipping to next model.")
            # A model that could not be trained must not satisfy the loop's stop condition.
            accuracy = 0.0
            trials += 1
            continue
        finally:
            # Drop the references instead of `del`: the names may never have been bound
            # to a model if loading failed, and `del` would then raise NameError.
            maml_model = None
            maml = None
            torch.cuda.empty_cache()
            gc.collect()
    elif is_traditional:
        ml_model_class = ml_model_map.get(next_approach)
        if ml_model_class is None:
            log_message(f"Skipping fine-tuning for {next_approach} (model not implemented).")
            trials += 1
            continue
        try:
            ml_model = ml_model_class(**selected_config['hyperparams'])
            # Fit on the training split only; X_test stays unseen for the evaluation below.
            ml_model.fit(X_train, y_train)
            fine_tuned_path = os.path.join(project_dir, f'fine_tuned_{next_approach.replace(" ", "_").lower()}.joblib')
            joblib.dump(ml_model, fine_tuned_path)
            log_message(f"Fine-tuned ML model saved to {fine_tuned_path}")
        except Exception as e:
            logger.error(f"Failed to train {next_approach}: {e}. Skipping to next model.")
            accuracy = 0.0
            trials += 1
            continue

    # Re-evaluate with optimized model
    predictions = []
    true_labels = []
    if is_neural:
        try:
            maml_model = model_class.from_pretrained(pretrained_model, num_labels=len(set(labels)) if 'labels' in globals() and labels.size > 0 else 2)
            maml_model.to(device)
            if os.path.exists(fine_tuned_path):
                maml_model.load_state_dict(torch.load(fine_tuned_path, map_location=device))
            maml_model.eval()
            with torch.no_grad():
                for batch in tqdm(loader, desc=f"Evaluating {next_approach}"):
                    if len(batch['input_ids']) == 0:
                        log_message("Empty batch in loader. Skipping evaluation.")
                        break
                    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                    labels_batch = batch['labels'].to(device)
                    outputs = maml_model(**inputs)
                    logits = outputs.logits
                    preds = torch.argmax(logits, dim=1)
                    predictions.extend(preds.cpu().numpy())
                    true_labels.extend(labels_batch.cpu().numpy())
                    torch.cuda.empty_cache()
            accuracy = accuracy_score(true_labels, predictions) if predictions else 0.0
            f1 = f1_score(true_labels, predictions, average='weighted') if predictions else 0.0
        except Exception as e:
            logger.error(f"Evaluation failed for {next_approach}: {e}. Setting accuracy to 0.")
            accuracy = 0.0
            f1 = 0.0
        finally:
            maml_model = None
            torch.cuda.empty_cache()
            gc.collect()
    elif is_traditional and ml_model is not None:
        try:
            # Score on the hold-out split the model has never seen.
            predictions = ml_model.predict(X_test)
            true_labels = y_test
            accuracy = accuracy_score(true_labels, predictions)
            f1 = f1_score(true_labels, predictions, average='weighted')
        except Exception as e:
            logger.error(f"Evaluation failed for {next_approach}: {e}. Setting accuracy to 0.")
            accuracy = 0.0
            f1 = 0.0
        finally:
            gc.collect()
    else:
        accuracy = 0.0
        f1 = 0.0

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_config = selected_config.copy()
        log_message(f"New best model: {next_approach} with accuracy {best_accuracy:.3f} and F1-score {f1:.3f}")
    trials += 1

# Finalize selected_config
# The search loop rewrites `selected_config` in place on every trial, so after the loop it
# describes the LAST model tried - possibly one whose training failed - not the one that
# was configured initially. `best_config` is the only reliable record of the
# highest-scoring configuration (it starts as a copy of the initial configuration and is
# replaced only by a trial that beats `best_accuracy`), so it is always the one adopted.
selected_config = best_config
accuracy = best_accuracy
log_message(f"Final selected model: {selected_config['model']} with best accuracy {best_accuracy:.3f}")

# Persist the chosen configuration so later cells can reload it. A failure here is
# reported as a warning only; the in-memory configuration remains usable.
checkpoint_path = os.path.join(project_dir, 'selected_config_checkpoint.pkl')
try:
    import pickle
    with open(checkpoint_path, 'wb') as checkpoint_file:
        pickle.dump(selected_config, checkpoint_file)
except Exception as e:
    logger.warning(f"Failed to update selected_config checkpoint: {str(e)}")
else:
    logger.info(f"Updated selected_config saved to {checkpoint_path}")

# Release cached GPU blocks and unreachable objects, then record the memory footprint.
torch.cuda.empty_cache()
gc.collect()
log_memory_usage()

# Human-readable summary of the outcome of the search.
summary_lines = [
    "\nFinal Model Configuration:",
    f"Approach: {selected_config['approach']}",
    f"Model: {selected_config['model']}",
    f"Hyperparameters: {selected_config['hyperparams']}",
    f"Best Accuracy Achieved: {best_accuracy:.3f}",
    f"Target Accuracy: {target_accuracy:.3f}",
    f"Target Achieved: {best_accuracy >= target_accuracy}",
]
print("\n".join(summary_lines))


Final Model Configuration:
Approach: Traditional ML
Model: Traditional ML (Random Forest-like)
Hyperparameters: {'learning_rate': (1e-05, 5e-05), 'batch_size': (8, 32)}
Best Accuracy Achieved: 1.000
Target Accuracy: 0.920
Target Achieved: True


In [None]:
# Cell 8: Final Model Evaluation and Deployment Preparation
# Purpose: Evaluate the final selected model on a test set (or full dataset if no test set), save the model for deployment, and log the results.

import torch
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm
import os
import logging
import json
from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Logging goes through the notebook-wide logger, formatted as "time - level - message".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# All artifacts for this project are read from and written to this Drive folder.
project_dir = '/content/drive/MyDrive/Sentiment_Project'
os.makedirs(project_dir, exist_ok=True)

# SentimentDataset definition
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized inputs.

    Holds `input_ids`, `attention_mask` and `labels` as int64 tensors. Construction never
    raises: if any argument is None, cannot be converted, or the three lengths disagree,
    the problem is logged and the instance becomes an empty dataset (length 0).
    """

    def __init__(self, input_ids, attention_mask, labels):
        try:
            if any(field is None for field in (input_ids, attention_mask, labels)):
                raise ValueError("Input data (input_ids, attention_mask, labels) cannot be None")
            self.input_ids = torch.tensor(input_ids, dtype=torch.long)
            self.attention_mask = torch.tensor(attention_mask, dtype=torch.long)
            self.labels = torch.tensor(labels, dtype=torch.long)
            n_ids = len(self.input_ids)
            n_mask = len(self.attention_mask)
            n_labels = len(self.labels)
            if not (n_ids == n_mask == n_labels):
                raise ValueError(f"Length mismatch: input_ids ({n_ids}), attention_mask ({n_mask}), labels ({n_labels})")
            logger.info(f"SentimentDataset initialized with {n_labels} samples")
        except Exception as e:
            # Best-effort fallback: keep the object usable as an empty dataset.
            logger.error(f"Failed to initialize SentimentDataset: {e}. Using empty dataset.")
            self.input_ids = torch.empty(0)
            self.attention_mask = torch.ones(0, 128)
            self.labels = torch.empty(0)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict with keys input_ids, attention_mask and labels."""
        try:
            sample = {
                'input_ids': self.input_ids[idx],
                'attention_mask': self.attention_mask[idx],
                'labels': self.labels[idx],
            }
        except Exception as e:
            logger.error(f"Error in __getitem__ at index {idx}: {e}")
            raise
        return sample

# Read the preprocessed dataset from Drive; without it nothing below can run, so any
# failure is logged and re-raised.
dataset_path = os.path.join(project_dir, 'processed_dataset.csv')
try:
    df = pd.read_csv(dataset_path)
except Exception as e:
    logger.error(f"Failed to load dataset from {dataset_path}: {e}")
    raise
else:
    logger.info(f"Loaded dataset from {dataset_path} with {len(df)} samples.")

# Resolve the text and label column names: taken from the user prompt file when present,
# otherwise guessed from the dataframe.
prompt_file = os.path.join(project_dir, 'user_dataset_prompt.json')
if not os.path.exists(prompt_file):
    # Infer columns dynamically
    def _looks_like_text(name):
        # Free text: object dtype whose average length exceeds 10 characters.
        return df[name].dtype == 'object' and df[name].str.len().mean() > 10

    def _looks_like_label(name):
        # Categorical: fewer distinct values than a tenth of the row count.
        return df[name].dtype in ['object', 'int', 'float'] and df[name].nunique() < len(df) / 10

    text_cols = [name for name in df.columns if _looks_like_text(name)]
    label_cols = [name for name in df.columns if _looks_like_label(name)]
    if not (text_cols and label_cols):
        raise ValueError("Could not infer text or label columns. Ensure dataset contains text and categorical label columns.")
    text_column = text_cols[0]
    label_column = label_cols[0]
    logger.info("Inferred columns - Text: %s, Label: %s", text_column, label_column)
else:
    with open(prompt_file, 'r') as f:
        prompt_data = json.load(f)
    text_column = prompt_data.get('text_column', 'review')
    label_column = prompt_data.get('label_column', 'sentiment')
    logger.info("Loaded columns from prompt - Text: %s, Label: %s", text_column, label_column)

# Validate columns exist
if not (text_column in df.columns and label_column in df.columns):
    raise ValueError(f"Missing inferred columns: {text_column} or {label_column}.")

# Dynamically map labels: ids follow the order of first appearance; the reverse mapping
# turns predicted ids back into the original label values.
unique_labels = df[label_column].unique()
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
reverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))
encoded_labels = df[label_column].map(label_mapping)
y = encoded_labels.fillna(0).astype(int)  # unmapped values fall back to class 0

# Reload the configuration chosen by the model-search cell. The checkpoint is a pickle
# written by this notebook; pickle must only ever be used on files you trust.
checkpoint_path = os.path.join(project_dir, 'selected_config_checkpoint.pkl')
try:
    import pickle
    with open(checkpoint_path, 'rb') as checkpoint_file:
        selected_config = pickle.load(checkpoint_file)
    logger.info(f"Loaded selected_config from {checkpoint_path}: {selected_config}")
except Exception as load_error:
    # Evaluation cannot proceed without a configuration: report and abort the cell.
    logger.error(f"Failed to load selected_config from {checkpoint_path}: {load_error}")
    raise

# Define model mapping with valid hyperparameters
# Maps each model display name to a (transformers model class, pretrained checkpoint id)
# pair; the class's `from_pretrained` is later called with the checkpoint id. Names that
# are missing from this dict fall back to DistilBERT at the lookup sites below.
#
# NOTE(review): several entries pair a checkpoint with a class of a different
# architecture (ALBERT -> DistilBert class, XLNet / T5 / DeBERTa / ELECTRA / Longformer /
# BigBird -> Bert class) - confirm these load as intended.
# NOTE(review): 'deberta-base', 'electra-base-discriminator' and 'longformer-base-4096'
# carry no organization prefix, unlike 'google/bigbird-roberta-base' - verify that these
# ids resolve on the model hub.
# NOTE(review): fine-tuned state dicts saved by the model-search cell can only be loaded
# if this mapping matches the one used there - keep the two in sync.
model_map = {
    "Shallow Neural Network (MLP-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "Recurrent Neural Network (LSTM/GRU-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "Convolutional Neural Network (CNN-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "Bidirectional LSTM (BiLSTM-like)": (BertForSequenceClassification, 'bert-base-uncased'),
    "Gated Recurrent Unit (GRU-like)": (BertForSequenceClassification, 'bert-base-uncased'),
    "Feedforward Neural Network (FNN-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "Hybrid (CNN-RNN)": (BertForSequenceClassification, 'bert-base-uncased'),
    "Deep Learning (Custom Transformer)": (BertForSequenceClassification, 'bert-base-uncased'),
    "Deep Learning (Lightweight Pretrained Transformer, e.g., DistilBERT-like)": (DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    "BERT (Bidirectional Encoder Representations from Transformers)": (BertForSequenceClassification, 'bert-base-uncased'),
    "RoBERTa (Robustly Optimized BERT Pretraining Approach)": (RobertaForSequenceClassification, 'roberta-base'),
    "ALBERT (A Lite BERT)": (DistilBertForSequenceClassification, 'albert-base-v2'),
    "XLNet (Generalized Autoregressive Pretraining)": (BertForSequenceClassification, 'xlnet-base-cased'),
    "T5 (Text-To-Text Transfer Transformer)": (BertForSequenceClassification, 't5-small'),
    "DeBERTa (Decoding-enhanced BERT with Disentangled Attention)": (BertForSequenceClassification, 'deberta-base'),
    "ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)": (BertForSequenceClassification, 'electra-base-discriminator'),
    "Longformer (for long documents)": (BertForSequenceClassification, 'longformer-base-4096'),
    "BigBird (sparse attention for long sequences)": (BertForSequenceClassification, 'google/bigbird-roberta-base'),
    "Deep Learning (Advanced Pretrained Transformer, e.g., PaLM-like)": (RobertaForSequenceClassification, 'roberta-base')
}

# This cell imports everything else it needs, but LogisticRegression and SVC were only
# available if an earlier cell had already imported them; import them here so the cell
# also runs on a fresh kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Maps each Traditional ML display name to a callable that builds the estimator from
# keyword hyperparameters. None marks a model that is not implemented.
ml_model_map = {
    "Traditional ML (Naive Bayes-like)": None,
    "Traditional ML (Logistic Regression-like)": LogisticRegression,
    # probability=True so the SVM can also produce class probabilities.
    "Traditional ML (SVM-like)": lambda **params: SVC(probability=True, **params),
    "Traditional ML (Gradient Boosting, e.g., XGBoost-like)": GradientBoostingClassifier,
    "Traditional ML (Random Forest-like)": RandomForestClassifier,
    "Traditional ML (Decision Tree-like)": None,
    "Traditional ML (K-Nearest Neighbors-like)": None,
    "Traditional ML (AdaBoost-like)": None,
    "Traditional ML (LightGBM-like)": None,
    "Traditional ML (CatBoost-like)": None
}

# Whitelist of constructor keyword arguments accepted for each Traditional ML model name.
# Implemented models list their tunable parameters explicitly...
valid_ml_params = dict([
    ("Traditional ML (Logistic Regression-like)", ['max_iter', 'C']),
    ("Traditional ML (SVM-like)", ['C', 'kernel']),
    ("Traditional ML (Gradient Boosting, e.g., XGBoost-like)", ['n_estimators', 'learning_rate', 'max_depth']),
    ("Traditional ML (Random Forest-like)", ['n_estimators', 'max_depth', 'min_samples_split']),
])
# ...while every other model name accepts none (each gets its own empty list).
valid_ml_params.update(
    (model_name, [])
    for model_name in (
        "Traditional ML (Naive Bayes-like)",
        "Traditional ML (Decision Tree-like)",
        "Traditional ML (K-Nearest Neighbors-like)",
        "Traditional ML (AdaBoost-like)",
        "Traditional ML (LightGBM-like)",
        "Traditional ML (CatBoost-like)",
    )
)

# Prepare the evaluation inputs: a DataLoader for transformer-backed approaches, TF-IDF
# features with a train/test split for everything else.
_needs_loader = any(s in selected_config['approach'] for s in ("Shallow Neural Network", "Recurrent Neural Network", "Convolutional Neural Network",
                                                               "Bidirectional LSTM", "Gated Recurrent Unit", "Feedforward Neural Network",
                                                               "Hybrid", "Deep Learning"))
if not _needs_loader:
    # Prepare data for Traditional ML
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(df[text_column].fillna(''))
    y = df[label_column].map(label_mapping).fillna(0).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    logger.info(f"Prepared data for Traditional ML: X_train shape {X_train.shape}, X_test shape {X_test.shape}")
elif 'loader' not in globals() or loader is None:
    # No usable loader left over from earlier cells: rebuild it from the tokenized arrays.
    try:
        logger.info("Loader not found. Recreating from input_ids, attention_mask, and labels.")
        _missing = [name for name in ('input_ids', 'attention_mask', 'labels') if name not in globals()]
        if _missing:
            raise NameError("Required data (input_ids, attention_mask, labels) not found in globals. Please run earlier cells (e.g., Cell 2).")
        dataset = SentimentDataset(input_ids, attention_mask, labels)
        if len(dataset) == 0:
            raise ValueError("Dataset is empty after initialization.")
        loader = DataLoader(dataset, batch_size=8, shuffle=False)
        logger.info(f"Recreated loader with {len(dataset)} samples.")
    except Exception as e:
        # Fall back to an empty loader so the cell keeps running; evaluation then
        # sees no batches.
        logger.error(f"Failed to recreate loader: {e}. Using empty loader.")
        dataset = SentimentDataset(None, None, None)
        loader = DataLoader(dataset, batch_size=8, shuffle=False)

# Load the final model with sanitized hyperparameters
# Transformer approaches: instantiate the pretrained class and overlay the fine-tuned
# weights when a checkpoint exists. Traditional ML: load the saved estimator, or train a
# fresh one on the training split when none was saved.
final_model = None
fine_tuned_path = os.path.join(project_dir, f'fine_tuned_{selected_config["model"].replace(" ", "_").lower()}.pt')
ml_fine_tuned_path = os.path.join(project_dir, f'fine_tuned_{selected_config["model"].replace(" ", "_").lower()}.joblib')

if any(s in selected_config['approach'] for s in ["Shallow Neural Network", "Recurrent Neural Network", "Convolutional Neural Network",
                                                "Bidirectional LSTM", "Gated Recurrent Unit", "Feedforward Neural Network",
                                                "Hybrid", "Deep Learning"]):
    model_class, pretrained_model = model_map.get(selected_config['model'], (DistilBertForSequenceClassification, 'distilbert-base-uncased'))
    try:
        final_model = model_class.from_pretrained(pretrained_model, num_labels=len(unique_labels))
        final_model.to(device)
        if os.path.exists(fine_tuned_path):
            # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
            final_model.load_state_dict(torch.load(fine_tuned_path, map_location=device))
            logger.info(f"Loaded fine-tuned model from {fine_tuned_path}")
        else:
            logger.warning(f"Fine-tuned model not found at {fine_tuned_path}. Using pretrained weights.")
    except Exception as e:
        logger.error(f"Failed to load model {selected_config['model']}: {e}")
        raise
elif "Traditional ML" in selected_config['approach']:
    ml_model_class = ml_model_map.get(selected_config['model'])
    if ml_model_class is not None:
        try:
            if os.path.exists(ml_fine_tuned_path):
                # NOTE(review): if the saved estimator was fitted on rows that are part
                # of X_test, the hold-out metrics computed later are optimistic.
                final_model = joblib.load(ml_fine_tuned_path)
                logger.info(f"Loaded fine-tuned ML model from {ml_fine_tuned_path}")
            else:
                logger.warning(f"Fine-tuned ML model not found at {ml_fine_tuned_path}. Training a new model.")
                # Sanitize hyperparameters: keep only keyword arguments this estimator accepts.
                valid_params = {k: v for k, v in selected_config.get('hyperparams', {}).items() if k in valid_ml_params.get(selected_config['model'], [])}
                if not valid_params:
                    # Nothing usable was configured: rely on the estimator's own defaults.
                    # Passing RandomForest-specific keyword arguments here would raise
                    # TypeError for LogisticRegression / SVC and remove the depth limit
                    # of GradientBoosting.
                    logger.info(f"No valid hyperparameters for {selected_config['model']}; using estimator defaults.")
                final_model = ml_model_class(**valid_params)
                final_model.fit(X_train, y_train)
                joblib.dump(final_model, ml_fine_tuned_path)
                logger.info(f"Trained and saved new ML model to {ml_fine_tuned_path}")
        except Exception as e:
            logger.error(f"Failed to load or train ML model {selected_config['model']}: {e}")
            raise
    else:
        logger.error(f"Model {selected_config['model']} not implemented.")
        raise ValueError(f"Model {selected_config['model']} not implemented.")

# Evaluate the final model
predictions = []
true_labels = []

# Defaults so the reporting and export code that follows always finds these names, even
# when the approach matches neither branch below.
final_accuracy = 0.0
final_f1 = 0.0
class_report = "Evaluation failed."


def _summarize_predictions(y_true, y_pred):
    """Return (accuracy, weighted F1, classification report text) for integer class ids.

    The report is rendered with the original label values via `reverse_label_mapping`.
    Raises ValueError when there are no predictions to score.
    """
    if len(y_pred) == 0:
        raise ValueError("no predictions were produced")
    acc = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    # Map numeric ids back to original labels
    predicted_labels = [reverse_label_mapping[pred] for pred in y_pred]
    true_labels_mapped = [reverse_label_mapping[label] for label in y_true]
    return acc, weighted_f1, classification_report(true_labels_mapped, predicted_labels)


if any(s in selected_config['approach'] for s in ["Shallow Neural Network", "Recurrent Neural Network", "Convolutional Neural Network",
                                                "Bidirectional LSTM", "Gated Recurrent Unit", "Feedforward Neural Network",
                                                "Hybrid", "Deep Learning"]):
    try:
        final_model.eval()
        with torch.no_grad():
            for batch in tqdm(loader, desc=f"Final Evaluation of {selected_config['model']}"):
                if len(batch['input_ids']) == 0:
                    logger.warning("Empty batch in loader. Skipping evaluation.")
                    break
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                labels_batch = batch['labels'].to(device)
                outputs = final_model(**inputs)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)
                predictions.extend(preds.cpu().numpy())
                true_labels.extend(labels_batch.cpu().numpy())
                torch.cuda.empty_cache()
        final_accuracy, final_f1, class_report = _summarize_predictions(true_labels, predictions)
    except Exception as e:
        logger.error(f"Final evaluation failed for {selected_config['model']}: {e}")
        final_accuracy = 0.0
        final_f1 = 0.0
        class_report = "Evaluation failed."
    finally:
        # Drop the reference (rather than `del`, which raises NameError if the name is
        # already gone) so GPU memory can be reclaimed; the export step reloads the model.
        final_model = None
        torch.cuda.empty_cache()
elif "Traditional ML" in selected_config['approach']:
    try:
        predictions = final_model.predict(X_test)
        true_labels = y_test
        final_accuracy, final_f1, class_report = _summarize_predictions(true_labels, predictions)
    except Exception as e:
        logger.error(f"Final evaluation failed for {selected_config['model']}: {e}")
        final_accuracy = 0.0
        final_f1 = 0.0
        class_report = "Evaluation failed."
else:
    logger.warning(f"Unrecognized approach {selected_config['approach']!r}; skipping evaluation.")
    class_report = "Evaluation skipped."

# Report the metrics twice: once through the logger, once on stdout.
_evaluated_model = selected_config['model']
_metric_lines = (
    f"Accuracy: {final_accuracy:.3f}",
    f"F1-Score (Weighted): {final_f1:.3f}",
    f"Classification Report:\n{class_report}",
)

logger.info(f"Final Evaluation Results for {_evaluated_model}:")
for _metric_line in _metric_lines:
    logger.info(_metric_line)

print("\nFinal Evaluation Results:")
print(f"Model: {_evaluated_model}")
for _metric_line in _metric_lines:
    print(_metric_line)

# Prepare for deployment
deployment_path = os.path.join(project_dir, 'deploy')
os.makedirs(deployment_path, exist_ok=True)

# Save the model in a deployment-ready format. Export failures are logged, not raised, so
# the evaluation results are still written afterwards.
if any(s in selected_config['approach'] for s in ["Shallow Neural Network", "Recurrent Neural Network", "Convolutional Neural Network",
                                                "Bidirectional LSTM", "Gated Recurrent Unit", "Feedforward Neural Network",
                                                "Hybrid", "Deep Learning"]):
    # Reload and save the model using transformers' save_pretrained for deployment
    try:
        model_class, pretrained_model = model_map.get(selected_config['model'], (DistilBertForSequenceClassification, 'distilbert-base-uncased'))
        final_model = model_class.from_pretrained(pretrained_model, num_labels=len(unique_labels))
        if os.path.exists(fine_tuned_path):
            # The model stays on the CPU here, so map the checkpoint there as well; without
            # map_location a checkpoint saved on a GPU cannot be loaded on a CPU-only runtime.
            final_model.load_state_dict(torch.load(fine_tuned_path, map_location='cpu'))
        else:
            logger.warning(f"Fine-tuned model not found at {fine_tuned_path}. Exporting pretrained weights.")
        export_dir = os.path.join(deployment_path, 'final_model')
        final_model.save_pretrained(export_dir)
        logger.info(f"Model saved for deployment at {export_dir}")
    except Exception as e:
        logger.error(f"Failed to save model for deployment: {e}")
elif "Traditional ML" in selected_config['approach']:
    # Save the ML model and vectorizer (inference needs both to turn text into predictions)
    try:
        joblib.dump(final_model, os.path.join(deployment_path, 'final_model.joblib'))
        joblib.dump(vectorizer, os.path.join(deployment_path, 'vectorizer.joblib'))
        logger.info(f"ML model and vectorizer saved for deployment at {deployment_path}")
    except Exception as e:
        logger.error(f"Failed to save ML model for deployment: {e}")

def _json_key(label):
    """Return `label` in a form json.dump accepts as an object key.

    numpy scalars (e.g. the int64 labels of a numeric label column) are unwrapped to
    their Python equivalent; any other non-native type is stringified.
    """
    if isinstance(label, np.generic):
        label = label.item()
    if label is None or isinstance(label, (str, int, float, bool)):
        return label
    return str(label)


def _json_default(value):
    """json.dump fallback for values it cannot encode natively (numpy scalars/arrays)."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


# Save label mapping for inference
# Keys are taken from a pandas/numpy array and may be numpy scalars, which json.dump
# rejects as keys; ids are forced to plain ints for the same reason.
label_mapping_path = os.path.join(deployment_path, 'label_mapping.json')
with open(label_mapping_path, 'w') as f:
    json.dump({_json_key(label): int(idx) for label, idx in label_mapping.items()}, f)
logger.info(f"Label mapping saved to {label_mapping_path}")

# Save evaluation results
eval_results = {
    'model': selected_config['model'],
    'approach': selected_config['approach'],
    'hyperparameters': selected_config['hyperparams'],
    'accuracy': final_accuracy,
    'f1_score': final_f1,
    'classification_report': class_report
}
eval_results_path = os.path.join(deployment_path, 'evaluation_results.json')
with open(eval_results_path, 'w') as f:
    # Tuned hyperparameters and metrics can be numpy scalars; `default` converts them.
    json.dump(eval_results, f, indent=4, default=_json_default)
logger.info(f"Evaluation results saved to {eval_results_path}")

# Final log message
logger.info("Model evaluation and deployment preparation completed successfully.")
print("Model evaluation and deployment preparation completed successfully.")




Final Evaluation Results:
Model: Traditional ML (Random Forest-like)
Accuracy: 0.850
F1-Score (Weighted): 0.850
Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.84      0.85      4961
    positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Model evaluation and deployment preparation completed successfully.


In [None]:
# Cell 9: Ensure the selected_config from Cell 7 is saved to the checkpoint file.

import os
import pickle
import logging

# Set up logging (if not already set)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Verify Google Drive is mounted BEFORE creating anything under it.
# os.makedirs() on the project directory would otherwise create
# /content/drive/MyDrive/... as plain local directories when Drive is not
# mounted, which makes this existence check always pass and lets the
# checkpoint be written somewhere other than Drive without any warning.
if not os.path.exists('/content/drive/MyDrive'):
    logger.error("Google Drive not mounted at /content/drive/MyDrive. Please mount Drive and rerun.")
    raise RuntimeError("Google Drive not mounted. Mount Drive using the Files tab in Colab.")

# Define project directory and checkpoint path (safe now that the mount is confirmed)
project_dir = '/content/drive/MyDrive/Sentiment_Project'
os.makedirs(project_dir, exist_ok=True)
checkpoint_path = os.path.join(project_dir, 'selected_config_checkpoint.pkl')

# Verify selected_config exists; it is expected to be left in the notebook
# namespace by an earlier cell, so fail with a clear message if it is missing.
if 'selected_config' not in globals():
    logger.error("selected_config not found in globals. Ensure Cell 7 ran successfully.")
    raise NameError("selected_config not found. Run Cell 7 to set selected_config.")

# Save selected_config to checkpoint; log and re-raise so a failed write is
# never mistaken for a successful checkpoint.
try:
    with open(checkpoint_path, 'wb') as f:
        pickle.dump(selected_config, f)
    logger.info(f"Successfully saved selected_config to {checkpoint_path}: {selected_config}")
except Exception as e:
    logger.error(f"Failed to save selected_config to {checkpoint_path}: {str(e)}")
    raise

# Verify the file exists on disk after the write
if os.path.exists(checkpoint_path):
    logger.info(f"Checkpoint file verified at {checkpoint_path}")
else:
    logger.error(f"Checkpoint file not found at {checkpoint_path} after saving.")
    raise FileNotFoundError(f"Failed to create checkpoint file at {checkpoint_path}")

In [None]:
# Shell escape: list the project folder on Drive so saved artifacts
# (e.g. selected_config_checkpoint.pkl) can be checked by eye.
!ls "/content/drive/MyDrive/Sentiment_Project"

 coordinator_logs.txt
 Data_Analysis_Quality_Preprocessing.ipynb
 data_analysis_report.json
 dataset_data.json
 deploy
 explainability_output
 explainability_outputs
'fine_tuned_traditional_ml_(random_forest-like).joblib'
 hyperparams.json
 Notebook_5_CodeGen_Explainability.ipynb
 Part_1_Environment_Setup.ipynb
 Part_3_Model_Training_and_Evaluation.ipynb
 part3_output.json
 part4_output.json
 preprocessed_data.pt
 processed_dataset.csv
 quality_check_report.json
 selected_config_checkpoint.pkl
 Sentiment_Analysis_Model_Optimization.ipynb
 trained_traditional_ml_random_forest-like_encoder.joblib
 trained_traditional_ml_random_forest-like.joblib
 trained_traditional_ml_random_forest-like_vectorizer.joblib
'train_traditional_ml_(random_forest-like).py'
 train_traditional_ml_random_forest-like.py
 user_dataset_prompt.json
 user_feedback.csv
