In [None]:
import shutil
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Mount Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Define source and destination paths
source_path1 = '/content/drive/MyDrive/Colab Notebooks/Part_1_Environment_Setup.ipynb'
source_path2 = '/content/drive/MyDrive/Colab Notebooks/Data_Analysis_Quality_Preprocessing.ipynb'
dest_dir = '/content/drive/MyDrive/Sentiment_Project'

# Create destination directory if it doesn't exist
os.makedirs(dest_dir, exist_ok=True)
logger.info("Destination directory ensured: %s", dest_dir)

# Move notebooks
for source in [source_path1, source_path2]:
    if os.path.exists(source):
        shutil.move(source, os.path.join(dest_dir, os.path.basename(source)))
        logger.info("Moved %s to %s", source, dest_dir)
    else:
        logger.warning("%s not found.", source)

# Verify moved files
!ls "/content/drive/MyDrive/Sentiment_Project"

Mounted at /content/drive




 coordinator_logs.txt
 Data_Analysis_Quality_Preprocessing.ipynb
 data_analysis_report.json
 dataset_data.json
 deploy
 explainability_output
 explainability_outputs
'fine_tuned_traditional_ml_(random_forest-like).joblib'
 hyperparams.json
 model_details.json
 model_integration.json
 Notebook_5_CodeGen_Explainability.ipynb
 Part_1_Environment_Setup.ipynb
 Part_3_Model_Training_and_Evaluation.ipynb
 part3_output.json
 part4_output.json
 preprocessed_data.pt
 processed_dataset.csv
 quality_check_report.json
 selected_config_checkpoint.pkl
 Sentiment_Analysis_Model_Optimization.ipynb
 trained_traditional_ml_random_forest-like_encoder.joblib
 trained_traditional_ml_random_forest-like.joblib
 trained_traditional_ml_random_forest-like_vectorizer.joblib
'train_traditional_ml_(random_forest-like).py'
 train_traditional_ml_random_forest-like.py
 user_dataset_prompt.json
 user_feedback.csv


In [None]:
# Cell 1: Initial Setup and Library Installation
# Purpose: Install required libraries with specific versions for compatibility.
# Note: Pins scipy 1.14.1 and numpy 1.26.4.
# NOTE(review): nothing in this cell restarts the runtime; if modules were already
# imported before these installs, a manual runtime restart is presumably needed — confirm.
# Strategy: Clean install, isolate conflicts, and verify integrity (the version
# assertions live in the next cell).

# Update pip
!pip install --upgrade pip

# Clean install numpy and scipy: remove whatever is present, then install the pins
# one at a time (numpy first, then scipy).
!pip uninstall -y numpy scipy
!pip install numpy==1.26.4
!pip install scipy==1.14.1

# Uninstall conflicting pre-installed packages
# NOTE(review): pip's output for this cell reports that librosa requires numba, so
# librosa is left with an unmet dependency after this step.
!pip uninstall -y tensorflow numba

# Install project-specific libraries (each pinned to an exact version)
!pip install ray==2.46.0
!pip install langchain==0.3.25
!pip install transformers==4.51.3
!pip install nlpaug==1.1.11
!pip install psutil==7.0.0

# Install thinc with compatible version
!pip install thinc==8.2.3

# Install GPyOpt, GPy, and paramz.
# GPyOpt and GPy are installed with --no-deps, i.e. without letting pip resolve their
# dependencies; pip's output for this cell reports "gpy 1.13.2 requires
# scipy<=1.12.0", which conflicts with the scipy 1.14.1 pin above.
# paramz is installed normally (with dependency resolution).
!pip install GPyOpt==1.2.6 --no-deps
!pip install GPy==1.13.2 --no-deps
!pip install paramz==0.9.6

# Install the CUDA 11.8 build of torch 2.1.0 from the PyTorch wheel index
!pip install torch==2.1.0+cu118 --index-url https://download.pytorch.org/whl/cu118

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scipy 1.14.1
Uninstalling scipy-1.14.1:
  Successfully uninstalled scipy-1.14.1
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gpy 1.13.2 requires scipy<=1.12.0,>=1.3.0, which is not installed.
paramz 0.9.6 requires scipy, which is not installed.
gpyopt 1.2.6 requires scipy>=0.16, which is not installed.
imbalanced-learn 0.13.0 requires scipy<2,>=1.10.1, which is not installed.
librosa 0.11.0 requires numba>=0.51.0, which is not installed.
librosa 0.11.0 requires 

In [None]:
# Verify installations
import numpy
import scipy
import ray
import langchain
import transformers
import nlpaug
import psutil
import GPyOpt
import GPy
import paramz
import thinc
import torch

# One row per pinned library:
#   (display label, imported module, required version, exact match required?)
# Torch is matched by prefix because its version string carries a build suffix.
version_checks = [
    ("Numpy", numpy, "1.26.4", True),
    ("Scipy", scipy, "1.14.1", True),
    ("Ray", ray, "2.46.0", True),
    ("LangChain", langchain, "0.3.25", True),
    ("Transformers", transformers, "4.51.3", True),
    ("Thinc", thinc, "8.2.3", True),
    ("NLP Augmentation", nlpaug, "1.1.11", True),
    ("Psutil", psutil, "7.0.0", True),
    ("GPyOpt", GPyOpt, "1.2.6", True),
    ("GPy", GPy, "1.13.2", True),
    ("Paramz", paramz, "0.9.6", True),
    ("Torch", torch, "2.1.0", False),
]

# Report every installed version first ...
for label, module, _expected, _exact in version_checks:
    print(f"{label} version: {module.__version__}")

# ... then check versions, stopping at the first mismatch
for label, module, expected, exact in version_checks:
    installed = module.__version__
    version_ok = (installed == expected) if exact else installed.startswith(expected)
    assert version_ok, f"{label} version mismatch"
print("All libraries installed and verified successfully.")

Numpy version: 1.26.4
Scipy version: 1.14.1
Ray version: 2.46.0
LangChain version: 0.3.25
Transformers version: 4.51.3
Thinc version: 8.2.3
NLP Augmentation version: 1.1.11
Psutil version: 7.0.0
GPyOpt version: 1.2.6
GPy version: 1.13.2
Paramz version: 0.9.6
Torch version: 2.1.0+cu118
All libraries installed and verified successfully.


In [None]:
# Cell 2: Google Drive Integration
# Purpose: Mount Google Drive for persistent storage of models, weights, feedback, and MAML parameters.

from google.colab import drive
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Project folder on Drive. Defined before the mount attempts so the name is bound
# regardless of where inside the try block a failure happens; later cells read it.
project_dir = '/content/drive/MyDrive/Sentiment_Project'

# Attempt to mount Google Drive with retry logic
max_attempts = 2
for attempt in range(max_attempts):
    try:
        drive.mount('/content/drive', force_remount=True)
        os.makedirs(project_dir, exist_ok=True)
        logger.info("Google Drive mounted successfully at %s", project_dir)
        break
    except Exception as e:
        logger.error("Attempt %d failed: %s", attempt + 1, e)
        if attempt == max_attempts - 1:
            # Chain the original error so the real cause stays visible in the traceback
            raise SystemExit("Drive mounting failed after maximum attempts. Please check authentication.") from e

# Verify Drive access with write test
try:
    # Round-trip a small file to prove the project folder is writable
    test_file = os.path.join(project_dir, 'test_write.txt')
    with open(test_file, 'w') as f:
        f.write('Test successful')
    os.remove(test_file)
    logger.info("Drive contents verified with write access.")
    # List only the project folder: listing the whole of MyDrive would copy the
    # names of unrelated personal files into the saved notebook output.
    for entry in sorted(os.listdir(project_dir)):
        print(entry)
except Exception as e:
    logger.error("Error verifying Drive access: %s", e)
    raise SystemExit("Drive verification failed. Check permissions or connection.") from e

Mounted at /content/drive
 1543920364764.mp4
 58a5b6c7-392a-4679-9bee-4a247fa83aae.mov
 airtel-ringtone_original.mp3
 Bbq
 Character
 Classroom
'Colab Notebooks'
 CS-041.pdf
 Dance
'DLD 1.pdf'
'Edited - jpg2pdf.pdf'
'fine_tuned_traditional_ml_(logistic_regression-like).joblib'
'fine_tuned_traditional_ml_(random_forest-like).joblib'
'https:  www.fac.txt'
'images (9).jpeg'
 IMG_0664.jpeg
 IMG_20151003_130302.jpg
 IMG_20171219_164852-1.jpg
 IMG_20180127_150527.jpg
 IMG_20180127_150818.jpg
 IMG_20180128_143759.jpg
 IMG-20180324-WA0000.jpg
 IMG-20180324-WA0001.jpg
 IMG-20180324-WA0002.jpg
 IMG-20180324-WA0003.jpg
 IMG-20180324-WA0004.jpg
 IMG-20180324-WA0025.jpg
 IMG-20180324-WA0026.jpg
 IMG-20180324-WA0027.jpg
 IMG_20180616_111511.jpg
 IMG_20180616_111526.jpg
 IMG_20180616_111552.jpg
 IMG_20180616_111603.jpg
 IMG_20180616_111643.jpg
 IMG_20180616_111708.jpg
 IMG_20181019_154232.jpg
 IMG_20181019_160956.jpg
 IMG_20181019_171839.jpg
 IMG_20181021_150156.jpg
 IMG_20181021_151340.jpg
 IMG_4542

In [None]:
# Cell 3: Coordinator Agent Implementation with Ray
# Purpose: Implement a sophisticated Coordinator Agent for agent communication, logging, and resource monitoring.

import ray
import psutil
import subprocess
from datetime import datetime
import logging
import json
import os
import torch  # Added for GPU check

# Configure logging first: the Ray start-up messages below are emitted through
# `logger`, so it must be defined in this cell before it is used.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Check if Ray is already initialized; if not, initialize in local mode
if not ray.is_initialized():
    ray.init(address='local', ignore_reinit_error=True, logging_level=logging.INFO)
    logger.info("Ray cluster initialized in local mode.")
else:
    logger.info("Ray cluster already initialized, skipping reinitialization.")

# Define Coordinator Agent as a Ray actor
@ray.remote(num_cpus=1, num_gpus=0 if not torch.cuda.is_available() else 0.1)
class CoordinatorAgent:
    """Ray actor that records log messages and reports host resource usage.

    Messages are kept in memory (``self.logs``) and appended to
    ``coordinator_logs.txt`` inside ``project_dir``. Resource usage is sampled
    with psutil and, when CUDA is available, with ``nvidia-smi``.
    """

    def __init__(self):
        """Set up the in-memory log, the log file path, and trim an oversized log file."""
        self.logs = []
        self.start_time = datetime.now()
        self.log_file = os.path.join(project_dir, 'coordinator_logs.txt')
        logger.info("Coordinator Agent initialized.")
        self._check_log_file()

    def _check_log_file(self):
        """Ensure log file exists and manage size.

        If the file exists and is larger than 10 MB it is rewritten with only
        its last 1000 lines. A missing file is left alone (it is created on the
        first append in ``log_message``).
        """
        if os.path.exists(self.log_file) and os.path.getsize(self.log_file) > 10 * 1024 * 1024:  # 10MB limit
            with open(self.log_file, 'r') as f:
                lines = f.readlines()
            with open(self.log_file, 'w') as f:
                f.writelines(lines[-1000:])  # Keep last 1000 lines
            logger.info("Log file rotated due to size limit.")

    def log_message(self, message):
        """Timestamp ``message``, store it, log it, and append it to the log file.

        A failure to write the file is logged and otherwise ignored, so the
        in-memory record is kept either way.

        Returns:
            The formatted entry, ``"[YYYY-MM-DD HH:MM:SS] message"``.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        self.logs.append(log_entry)
        logger.info(log_entry)
        try:
            with open(self.log_file, 'a') as f:
                f.write(log_entry + '\n')
        except Exception as e:
            logger.error("Failed to write to log file: %s", e)
        return log_entry

    def _parse_gpu_query(self, output):
        """Turn ``nvidia-smi`` CSV output into (gpu_usage %, gpu_memory_usage %).

        ``output`` is expected to hold one ``utilization, memory.used,
        memory.total`` row per GPU. Rows are parsed individually so that hosts
        with several GPUs work: utilization is averaged over the rows and the
        memory percentage is total used over total capacity. With a single row
        the result equals that row's own values. Empty output yields (0.0, 0.0).

        Raises:
            ValueError: if a field is not numeric.
        """
        utilizations = []
        used_sum = 0.0
        total_sum = 0.0
        for row in output.strip().splitlines():
            fields = [float(x.strip()) for x in row.split(',') if x.strip()]
            if not fields:
                continue
            utilizations.append(fields[0])
            used_sum += fields[1] if len(fields) > 1 else 0.0
            total_sum += fields[2] if len(fields) > 2 else 1.0  # Avoid division by zero
        gpu_usage = sum(utilizations) / len(utilizations) if utilizations else 0.0
        memory_usage = (used_sum / total_sum * 100) if total_sum > 0 else 0.0
        return gpu_usage, memory_usage

    def get_resource_usage(self):
        """Sample CPU, RAM and (if CUDA is available) GPU usage.

        Returns:
            A dict with the keys "cpu_usage (%)", "ram_usage (%)",
            "gpu_usage (%)", "gpu_memory_usage (%)" and "timestamp"; or
            ``{"error": <message>}`` if sampling raised an exception.
        """
        try:
            cpu_usage = psutil.cpu_percent(interval=1)  # blocks for the 1 s sampling interval
            ram = psutil.virtual_memory()
            ram_usage = ram.percent
            gpu_usage = 0.0
            memory_usage = 0.0
            if torch.cuda.is_available():
                gpu_query = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total', '--format=csv,noheader,nounits'],
                                          stdout=subprocess.PIPE, text=True, timeout=5)
                gpu_usage, memory_usage = self._parse_gpu_query(gpu_query.stdout)
            resource_dict = {
                "cpu_usage (%)": cpu_usage,
                "ram_usage (%)": ram_usage,
                "gpu_usage (%)": gpu_usage,
                "gpu_memory_usage (%)": memory_usage,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            logger.info("Resource usage: %s", json.dumps(resource_dict))
            return resource_dict
        except Exception as e:
            logger.error("Resource monitoring error: %s", e)
            return {"error": str(e)}

    def get_logs(self):
        """Return the list of log entries recorded by this actor so far."""
        return self.logs

# Create the Coordinator actor; the handle is reused by later cells
coordinator = CoordinatorAgent.remote()

# Smoke-test the actor: write one log entry, sample resources, read the log back,
# then report both results
ray.get(coordinator.log_message.remote("System startup successful."))
resource_usage = ray.get(coordinator.get_resource_usage.remote())
logs = ray.get(coordinator.get_logs.remote())
for heading, payload in (("Current Resource Usage:", resource_usage), ("Coordinator Logs:", logs)):
    print(heading, payload)

2025-05-29 07:28:29,726	INFO worker.py:1888 -- Started a local Ray instance.


Current Resource Usage: {'cpu_usage (%)': 8.2, 'ram_usage (%)': 18.1, 'gpu_usage (%)': 0.0, 'gpu_memory_usage (%)': 0.0, 'timestamp': '2025-05-29 07:28:36'}
Coordinator Logs: ['[2025-05-29 07:28:34] System startup successful.']


In [None]:
# Cell 4: Data Input Handling
# Purpose: Handle dataset upload, validation, and storage with advanced error checking, dynamic column detection, and user prompt.

import pandas as pd
from google.colab import files
import os
import json
import logging
from datetime import datetime

# Configure logging: INFO level, "timestamp - level - message" record format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-named logger used by the dataset helpers defined in this cell.
logger = logging.getLogger(__name__)

# Function to infer text and label columns
def infer_columns(df):
    """Infer text and label columns based on data characteristics.

    Text candidates are object- or string-typed columns whose mean string
    length exceeds 10 characters. Label candidates are object-, string-, int-
    or float-typed columns with fewer than ``len(df) / 10`` distinct values.
    The chosen text column is never offered as a label candidate, so the two
    returned names are always different.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        Tuple ``(text_column, label_column)``: the first candidate of each kind
        in column order.

    Raises:
        ValueError: if no text column or no distinct label column can be found.
    """
    not_found = "Could not infer text or label columns. Ensure dataset contains text and categorical label columns."

    def _is_textual(series):
        # Plain object columns and pandas' dedicated string dtypes both count as text-capable
        return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)

    def _mean_text_length(series):
        # The .str accessor raises AttributeError on object columns that hold no
        # strings (e.g. booleans with missing values); such a column is simply
        # not text, so report NaN (which never compares > 10).
        try:
            return series.str.len().mean()
        except AttributeError:
            return float('nan')

    text_cols = [col for col in df.columns
                 if _is_textual(df[col]) and _mean_text_length(df[col]) > 10]
    if not text_cols:
        raise ValueError(not_found)
    text_column = text_cols[0]

    max_unique = len(df) / 10
    label_cols = [col for col in df.columns
                  if col != text_column  # a column cannot be both the text and its label
                  and (_is_textual(df[col]) or df[col].dtype in ['int', 'float'])
                  and df[col].nunique() < max_unique]
    if not label_cols:
        raise ValueError(not_found)
    return text_column, label_cols[0]

def validate_dataset(df, text_column, label_column):
    """Check that ``df`` is usable as a labelled text dataset.

    Args:
        df: object expected to be a pandas DataFrame.
        text_column: name of the column holding the text.
        label_column: name of the column holding the labels.

    Returns:
        True when every check passes.

    Raises:
        ValueError: if ``df`` is not a DataFrame, is empty, has a text or label
            column made up only of nulls, or has fewer than 2 distinct labels.

    Text entries shorter than 10 characters only produce a logged warning.
    """
    # Structural checks
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    if df.empty:
        raise ValueError("Dataset is empty.")

    # Content checks: neither column may be entirely null
    text_series = df[text_column]
    if text_series.isnull().all() or df[label_column].isnull().all():
        raise ValueError(f"{text_column} or {label_column} columns contain only null values.")

    # Short texts are tolerated but reported
    short_text_present = (text_series.str.len() < 10).any()
    if short_text_present:
        logger.warning("Some text entries in %s are shorter than 10 characters.", text_column)

    # A classifier needs at least two classes to separate
    distinct_labels = df[label_column].nunique()
    if distinct_labels < 2:
        raise ValueError(f"Label column {label_column} must have at least 2 unique categories.")
    return True

def process_dataset():
    """Interactively upload, validate, and persist a dataset.

    Steps:
      1. Ask the user to upload a file through the Colab upload widget.
      2. Load it as CSV or JSON (extension matched case-insensitively).
      3. Infer the text and label columns and validate the data.
      4. Save the DataFrame as ``processed_dataset.csv`` in ``project_dir``
         (up to two attempts).
      5. Print basic statistics and ask for optional free-text instructions,
         which are stored with the column names in
         ``user_dataset_prompt.json`` in ``project_dir``.

    Returns:
        Tuple ``(df, text_column, label_column)``.

    Raises:
        ValueError: if nothing was uploaded or the file type is unsupported,
            plus anything raised by loading, inference, validation or saving.
            Every exception is logged and then re-raised.
    """
    try:
        # Prompt user for dataset upload
        print("Please upload your dataset (e.g., dataset.csv):")
        uploaded = files.upload()
        if not uploaded:
            raise ValueError("No file uploaded.")
        filename = list(uploaded.keys())[0]
        if len(uploaded) > 1:
            # Only one dataset is processed; say so instead of ignoring the rest silently
            logger.warning("Multiple files uploaded; only %s will be processed.", filename)

        # Support CSV and JSON; compare the extension case-insensitively so that
        # names such as "DATA.CSV" are accepted too
        lowered_name = filename.lower()
        if lowered_name.endswith('.csv'):
            # on_bad_lines='skip' drops malformed rows instead of failing the load
            df = pd.read_csv(filename, encoding='utf-8', on_bad_lines='skip')
        elif lowered_name.endswith('.json'):
            df = pd.read_json(filename, encoding='utf-8')
        else:
            raise ValueError("Unsupported file format. Use CSV or JSON.")

        # Infer columns dynamically
        text_column, label_column = infer_columns(df)
        logger.info("Inferred text column: %s, label column: %s", text_column, label_column)

        # Validate dataset
        validate_dataset(df, text_column, label_column)
        logger.info("Dataset validation successful.")
        print("Dataset Preview (first 5 rows):")
        print(df[[text_column, label_column]].head())
        print(f"Null counts: \n{df.isnull().sum()}")

        # Save processed dataset (retry once; the last failure is re-raised)
        output_file = os.path.join(project_dir, 'processed_dataset.csv')
        max_attempts = 2
        for attempt in range(max_attempts):
            try:
                df.to_csv(output_file, index=False, encoding='utf-8')
                logger.info("Dataset saved to %s", output_file)
                break
            except Exception as e:
                logger.error("Attempt %d failed to save dataset: %s", attempt + 1, e)
                if attempt == max_attempts - 1:
                    raise

        # Additional validation and statistics
        duplicates = df.duplicated().sum()
        logger.info("Number of duplicate rows: %d", duplicates)
        print(f"Dataset shape: {df.shape}")
        print(f"Label distribution: \n{df[label_column].value_counts()}")

        # Prompt user for additional instructions/context
        print("Please enter any additional instructions or context for the dataset (e.g., 'Use for sentiment analysis with 3 classes') or press Enter to skip:")
        user_prompt = input().strip()
        prompt_data = {
            'dataset_prompt': user_prompt if user_prompt else "No additional instructions provided",
            'filename': filename,
            'text_column': text_column,
            'label_column': label_column,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        prompt_file = os.path.join(project_dir, 'user_dataset_prompt.json')
        # Explicit encoding, consistent with the UTF-8 used for the dataset files
        with open(prompt_file, 'w', encoding='utf-8') as f:
            json.dump(prompt_data, f)
        logger.info("User dataset prompt saved to %s", prompt_file)

        return df, text_column, label_column
    except Exception as e:
        logger.error("Error processing dataset: %s", e)
        raise

# Execute data processing: runs the interactive upload/validation flow and keeps the
# loaded DataFrame and the inferred column names as notebook-level variables
# (text_column and label_column are read again by the final validation cell).
dataset, text_column, label_column = process_dataset()

Please upload your dataset (e.g., dataset.csv):


Saving imdb.csv to imdb.csv
Dataset Preview (first 5 rows):
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Null counts: 
review       0
sentiment    0
dtype: int64
Dataset shape: (50000, 2)
Label distribution: 
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
Please enter any additional instructions or context for the dataset (e.g., 'Use for sentiment analysis with 3 classes') or press Enter to skip:


In [None]:
# Cell 5: Final Validation and Cleanup
# Purpose: Validate the entire setup and clean up temporary files.

import glob
import os

import json  # cell-local import: needed to read back the uploaded file name for cleanup

# Validate Coordinator, dataset, and user prompt
try:
    ray.get(coordinator.log_message.remote("Setup validation started."))
    assert os.path.exists(os.path.join(project_dir, 'processed_dataset.csv')), "Dataset not saved to Drive."
    assert os.path.exists(os.path.join(project_dir, 'user_dataset_prompt.json')), "User dataset prompt file not saved."
    resource_usage = ray.get(coordinator.get_resource_usage.remote())
    assert all(k in resource_usage for k in ["cpu_usage (%)", "ram_usage (%)", "gpu_usage (%)"]), "Resource usage data incomplete."
    assert all(v >= 0 for k, v in resource_usage.items() if k != "timestamp"), "Invalid resource usage values."
    logger.info("Inferred columns - Text: %s, Label: %s", text_column, label_column)
    print("Setup validation successful. System ready for Part 2.")
except AssertionError as e:
    # Record the failure both locally and in the Coordinator log, then propagate it
    error_msg = f"Validation failed: {e}"
    logger.error(error_msg)
    ray.get(coordinator.log_message.remote(error_msg))
    raise
finally:
    ray.get(coordinator.log_message.remote("Setup process completed."))
    # Automatic cleanup of the temporary uploaded file.
    # Only the file whose name was recorded in user_dataset_prompt.json is removed;
    # deleting every *.csv / *.json in the working directory could destroy unrelated
    # files that happen to live there.
    uploaded_name = None
    try:
        with open(os.path.join(project_dir, 'user_dataset_prompt.json'), 'r', encoding='utf-8') as f:
            uploaded_name = json.load(f).get('filename')
    except (OSError, ValueError, AttributeError) as e:
        logger.warning("Could not determine the uploaded file for cleanup: %s", e)
    if uploaded_name:
        # basename() keeps the removal inside the current working directory
        temp_file = os.path.basename(uploaded_name)
        if os.path.isfile(temp_file):
            try:
                os.remove(temp_file)
                logger.info("Cleaned up temporary file: %s", temp_file)
            except Exception as e:
                logger.error("Failed to clean up %s: %s", temp_file, e)
    logger.info("Cleanup completed.")

In [None]:
# Mount Google Drive at /content/drive again (force_remount=True is passed, as in the
# earlier mount cells) and print a confirmation once the call has returned.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
print("Drive mounted successfully.")

In [None]:
import os

def find_notebooks(root_dir, notebook_names):
    """Search ``root_dir`` recursively for files with the given names.

    Args:
        root_dir: directory at which the walk starts.
        notebook_names: iterable of file names to look for.

    Returns:
        List of ``(name, full_path)`` tuples in walk order; within a directory
        the order of ``notebook_names`` is kept. Empty if nothing matches or
        ``root_dir`` does not exist.
    """
    wanted = list(notebook_names)
    matches = []
    # The third element is deliberately not called `files`: that name belongs to the
    # google.colab upload helper used elsewhere in this notebook and must not be rebound.
    for current_dir, _subdirs, filenames in os.walk(root_dir):
        present = set(filenames)
        for wanted_name in wanted:
            if wanted_name in present:
                matches.append((wanted_name, os.path.join(current_dir, wanted_name)))
    return matches


# Define the root directory to start the search
root_dir = '/content/drive/MyDrive'

# Search for the notebooks
notebook1 = 'Part_1_Environment_Setup.ipynb'
notebook2 = 'Data_Analysis_Quality_Preprocessing.ipynb'

# Walk through the directory to find the files
located_notebooks = find_notebooks(root_dir, [notebook1, notebook2])
for nb_name, nb_path in located_notebooks:
    print(f"Found {nb_name} at: {nb_path}")

# Say so explicitly when a notebook is nowhere under root_dir
located_names = {nb_name for nb_name, _ in located_notebooks}
for nb_name in (notebook1, notebook2):
    if nb_name not in located_names:
        print(f"{nb_name} not found under {root_dir}")