In [None]:
!pip install langchain langchain-community langchain-google-genai google-generativeai shap pandas numpy scikit-learn matplotlib -q
print("Dependencies installed successfully.")

!pip install transformers torch -q
print("Dependencies installed successfully.")

Dependencies installed successfully.
Dependencies installed successfully.


In [None]:
from google.colab import drive
import logging

# Root logging: INFO level, timestamped records; `logger` is this notebook's logger
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Attach Google Drive; report the outcome and re-raise if mounting fails
try:
    drive.mount('/content/drive')
except Exception as e:
    logger.error(f"Failed to mount Google Drive: {str(e)}")
    raise
else:
    logger.info("Google Drive mounted successfully.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the contents of the project directory on the mounted Drive.
!ls "/content/drive/MyDrive/Sentiment_Project"

 coordinator_logs.txt
 Data_Analysis_Quality_Preprocessing.ipynb
 data_analysis_report.json
 dataset_data.json
 deploy
 explainability_output
 explainability_outputs
'fine_tuned_traditional_ml_(random_forest-like).joblib'
 hyperparams.json
 Notebook_5_CodeGen_Explainability.ipynb
 Part_1_Environment_Setup.ipynb
 Part_3_Model_Training_and_Evaluation.ipynb
 part3_output.json
 part4_output.json
 preprocessed_data.pt
 processed_dataset.csv
 quality_check_report.json
 selected_config_checkpoint.pkl
 Sentiment_Analysis_Model_Optimization.ipynb
 trained_traditional_ml_random_forest-like_encoder.joblib
 trained_traditional_ml_random_forest-like.joblib
 trained_traditional_ml_random_forest-like_vectorizer.joblib
'train_traditional_ml_(random_forest-like).py'
 train_traditional_ml_random_forest-like.py
 user_dataset_prompt.json
 user_feedback.csv


In [None]:
import os
import logging
import pandas as pd
from langchain_google_genai import GoogleGenerativeAI

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define global paths
project_dir = "/content/drive/MyDrive/Sentiment_Project"
dataset_path = os.path.join(project_dir, "processed_dataset.csv")
output_dir = os.path.join(project_dir, "explainability_outputs")
checkpoint_path = os.path.join(project_dir, "selected_config_checkpoint.pkl")
# Generic fallback; replaced below once selected_config has been retrieved and validated.
model_path = os.path.join(project_dir, "fine_tuned_model.pt")

# Retrieve selected_config (must be set by PPO optimization in Notebook 3, Cell 7).
# Order of preference: a `selected_config` already bound in this kernel, then the pickle checkpoint.
try:
    if 'selected_config' in globals():
        logger.info(f"Retrieved selected_config from globals: {selected_config}")
    elif os.path.exists(checkpoint_path):
        import pickle
        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only load checkpoints that were written by this project.
        with open(checkpoint_path, 'rb') as f:
            selected_config = pickle.load(f)
        logger.info(f"Loaded selected_config from checkpoint: {selected_config}")
    else:
        logger.error("selected_config not found in globals and checkpoint not available. Run Notebook 3, Cell 7 to set PPO-optimized configuration.")
        raise ValueError("selected_config must be defined by PPO optimization in Notebook 3, Cell 7.")

    # Validate selected_config BEFORE reading selected_config['model'], so a missing key
    # surfaces as the descriptive ValueError below rather than a bare KeyError.
    required_keys = ['approach', 'model']
    for key in required_keys:
        if key not in selected_config:
            logger.error(f"selected_config missing required key: '{key}'. Current selected_config: {selected_config}")
            raise ValueError(f"selected_config must contain '{key}' key.")
    if 'hyperparams' not in selected_config:
        selected_config['hyperparams'] = {}  # Default to empty dict if not present
        logger.info("Added empty 'hyperparams' to selected_config as it was missing.")

    # Model file name is derived from the model name: spaces -> underscores, lower-cased.
    model_path = os.path.join(project_dir, f"fine_tuned_{selected_config['model'].replace(' ', '_').lower()}.pt")

except Exception as e:
    logger.error(f"Failed to retrieve selected_config: {str(e)}")
    raise

# Create output directory
os.makedirs(output_dir, exist_ok=True)
logger.info("Setup completed. Global paths and selected_config retrieved.")

In [None]:
# Upgrade (-U) the core packages to their latest available releases. These are the same
# packages the first cell installs, except transformers and torch, which are not touched here.
# NOTE(review): nothing in this cell verifies that the resulting versions are mutually
# compatible, despite the wording of the log message below.
!pip install -U langchain langchain-community langchain-google-genai google-generativeai pandas numpy scikit-learn matplotlib shap -q
logger.info("Installed compatible versions of langchain, langchain-community, langchain-google-genai, google-generativeai, pandas, numpy, scikit-learn, matplotlib, and shap.")

In [None]:
# Cell 3b: Code Generator Agent
# Purpose: Generate a tailored Python script based on user prompt and dynamically selected operations using LangChain and Gemini.

from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence
from langchain_google_genai import GoogleGenerativeAI
import os
import re
import google.generativeai as genai
from langchain_core.runnables import RunnableConfig
from sklearn.preprocessing import LabelEncoder
import joblib
import logging
import json
import subprocess
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Configure logging
# NOTE: logging.basicConfig is a no-op when the root logger already has handlers (for example
# when an earlier cell already called basicConfig); in that case code_generator.log is not written.
logging.basicConfig(filename='code_generator.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Resolve the Gemini API key at runtime instead of storing it in the notebook.
# SECURITY: a key was previously hard-coded in this cell; treat that key as compromised and
# rotate it. Provide the replacement through the GOOGLE_API_KEY environment variable, or type
# it at the hidden prompt below.
api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if not api_key:
    from getpass import getpass  # local import: only needed for the interactive fallback
    api_key = getpass("Enter your Gemini API key: ").strip()
if not api_key:
    logger.error("No Gemini API key provided.")
    raise ValueError("A Gemini API key is required. Set GOOGLE_API_KEY or enter it when prompted.")
genai.configure(api_key=api_key)
logger.info("Gemini API key configured successfully.")

# List available models and select the appropriate one
try:
    # Keep only models that support text generation via generateContent.
    available_models = [model.name for model in genai.list_models() if 'generateContent' in model.supported_generation_methods]
    logger.info(f"Available Gemini models: {available_models}")
    selected_model = "models/gemini-1.5-flash-001"
    if selected_model not in available_models:
        raise ValueError(f"Selected model {selected_model} not found in available models: {available_models}")
    logger.info(f"Selected Gemini model: {selected_model}")
except Exception as e:
    logger.error(f"Failed to list Gemini models: {str(e)}")
    raise

# Initialize Gemini model with increased flexibility
try:
    gemini_llm = GoogleGenerativeAI(
        model=selected_model,
        google_api_key=api_key,
        temperature=0.5,
        max_output_tokens=2000,
        max_retries=3
    )
    logger.info("Gemini model initialized successfully.")
except Exception as e:
    logger.error(f"Failed to initialize Gemini model: {str(e)}")
    raise

# Function to merge user prompt with dynamic prompt
def create_combined_prompt(user_prompt, model, approach, hyperparams, dataset_path, output_path):
    """Merge the user's instructions with the fixed generation template into one LLM prompt.

    Args:
        user_prompt (str): Free-text instructions entered by the user.
        model (str): Name of the model the generated script should train.
        approach (str): Name of the training approach.
        hyperparams (dict): Non-empty mapping of hyperparameter name to value; rendered into
            the prompt as ``name=value`` pairs.
        dataset_path (str): Path of the CSV dataset referenced in the prompt.
        output_path (str): Default model output path. The vectorizer and encoder default
            paths are derived from it by replacing ``.joblib`` with ``_vectorizer.joblib``
            and ``_encoder.joblib``.

    Returns:
        str: The combined prompt (lead sentence + user prompt, newline, formatted template).

    Raises:
        ValueError: If ``hyperparams`` is not a non-empty dict.
    """
    if not isinstance(hyperparams, dict) or not hyperparams:
        raise ValueError("Hyperparameters must be a non-empty dictionary.")
    # Render the actual hyperparameters; the template previously hard-coded one fixed set of
    # values and silently ignored this argument.
    hyperparams_text = ", ".join(f"{name}={value!r}" for name, value in hyperparams.items())
    dynamic_template = """
    Use the dataset from {dataset_path} (a CSV with 'review' and 'sentiment' columns containing string labels 'negative' and 'positive').
    Preprocess the 'sentiment' column by encoding string labels ('negative' and 'positive') to integers (e.g., 0 and 1) using sklearn.preprocessing.LabelEncoder.
    Generate a complete Python script to train a {model} model using the {approach} approach with hyperparameters provided dynamically via command-line arguments (e.g., argparse).
    The script must accept command-line arguments: --data_path, --hyperparameters_path, --model_path, --vectorizer_path, and --encoder_path.
    If arguments are not provided, use defaults: --data_path={dataset_path}, --hyperparameters_path=/content/drive/MyDrive/Sentiment_Project/hyperparams.json,
    --model_path={output_path}, --vectorizer_path={vectorizer_path}, --encoder_path={encoder_path}.
    The script must log the dataset shape and label distribution for validation.
    Include evaluation metrics (accuracy, F1 score) with logging using the logging module for both training and test sets, ensuring pos_label is set to the integer encoded value for 'positive' (e.g., 1).
    Save the trained model to the path specified by --model_path using joblib.dump.
    Save the TfidfVectorizer to the path specified by --vectorizer_path using joblib.dump.
    Save the LabelEncoder to the path specified by --encoder_path using joblib.dump.
    Use functions for modularity and follow Python best practices with detailed comments.
    Strictly use the hyperparameters {hyperparams} from the provided --hyperparameters_path or as default values, and do not use any other values.
    Return only the Python script code, starting with 'import', 'def', '#', or 'class', without any Markdown formatting (e.g., no ```python wrappers).
    Ensure the script is complete and functional, avoiding truncation or partial code.
    """
    base_prompt = PromptTemplate(
        input_variables=["model", "approach", "hyperparams", "dataset_path", "output_path", "vectorizer_path", "encoder_path"],
        template=dynamic_template
    ).format(
        model=model,
        approach=approach,
        hyperparams=hyperparams_text,
        dataset_path=dataset_path,
        output_path=output_path,
        vectorizer_path=output_path.replace('.joblib', '_vectorizer.joblib'),
        encoder_path=output_path.replace('.joblib', '_encoder.joblib')
    )
    # The user prompt is appended after formatting, so braces typed by the user are never
    # interpreted as template placeholders.
    combined_prompt = f"Generate a complete, professional Python script to train a {model} model using the {approach} approach. {user_prompt}\n{base_prompt}"
    return combined_prompt

# Function to generate code using Gemini with custom config
def gemini_generate_code(prompt, gemini_llm):
    """Ask the Gemini model for a Python script and return the cleaned-up source text.

    Args:
        prompt (str): Full prompt to send to the model.
        gemini_llm: Object exposing ``invoke(prompt, config=...)``; the reply is either a
            string-like value or an object with a ``content`` attribute.

    Returns:
        str: The generated script with any surrounding Markdown code fence removed.

    Raises:
        ValueError: If the reply is empty or does not start like Python source.
        Exception: Any error raised by the model call is logged and re-raised.
    """
    try:
        config = RunnableConfig(max_retries=3)
        response = gemini_llm.invoke(prompt, config=config)
        response_text = response.content if hasattr(response, 'content') else str(response)
        if isinstance(response_text, list):
            # Content may arrive as a list of parts; coerce each to str so join cannot fail.
            response_text = " ".join(str(part) for part in response_text)
        # Strip first so a trailing newline after the closing fence does not defeat detection.
        response_text = response_text.strip()
        # Remove only a leading ```lang line and a trailing ``` line. Backticks elsewhere in
        # the script and the script's own blank lines are left exactly as generated.
        response_text = re.sub(r"^```[\w+-]*[ \t]*\r?\n", "", response_text)
        response_text = re.sub(r"\r?\n?[ \t]*```$", "", response_text).strip()
        logger.info(f"Raw generated code:\n{response_text}")
        # Cheap plausibility check. 'from' and a module docstring are accepted too, since
        # valid scripts commonly start with `from x import y` or a docstring.
        if not response_text.startswith(('import', 'from', 'def', '#', 'class', '"""')):
            logger.warning(f"Invalid or empty Python code detected. Raw output: {response_text}")
            raise ValueError("Generated output does not appear to be valid Python code. Check logs for details.")
        return response_text
    except Exception as e:
        logger.error(f"Failed to generate code with Gemini: {str(e)}")
        if "ResourceExhausted" in str(e):
            logger.warning("Quota exceeded. Please wait for reset or check https://aistudio.google.com/ for details.")
        raise

# Build the two-step chain: (1) assemble the prompt from the context dict,
# (2) send that prompt to Gemini and return the cleaned script text.
try:
    code_chain = RunnableSequence(
        lambda ctx: create_combined_prompt(
            user_prompt=ctx.get("user_prompt", ""),
            model=ctx["model"],
            approach=ctx["approach"],
            hyperparams=ctx["hyperparams"],
            dataset_path=ctx["dataset_path"],
            output_path=ctx["output_path"],
        ),
        lambda prompt_text: gemini_generate_code(prompt_text, gemini_llm),
    )
    logger.info("RunnableSequence created successfully.")
except Exception as e:
    logger.error(f"Failed to create RunnableSequence: {str(e)}")
    raise

# Ask the user for extra instructions; empty or placeholder answers are rejected
try:
    user_prompt = input("Please enter your prompt for generating the Python script (e.g., 'Add 5-fold cross-validation and save logs to a file'): ")
    # An empty stripped value is treated the same way as the listed placeholder answers.
    if user_prompt.strip() in ("", "..", ".", " ", "do it"):
        raise ValueError("User prompt must be meaningful and not just dots, spaces, or vague phrases like 'do it'.")
    logger.info(f"User provided prompt: {user_prompt}")
except Exception as e:
    logger.error(f"Failed to get user prompt: {str(e)}")
    raise

# Prepare context dynamically from selected_config, generate the training script, and run it
try:
    import sys  # local import: used only to locate the interpreter running this notebook

    # Load selected_config from part4_output.json
    with open("/content/drive/MyDrive/Sentiment_Project/part4_output.json", "r") as f:
        selected_config = json.load(f)["selected_config"]

    model = selected_config.get("model", "Traditional ML (Random Forest-like)")
    approach = selected_config.get("approach", "Traditional ML")
    hyperparams = selected_config.get("hyperparams", {"n_estimators": 300, "max_depth": 10, "min_samples_split": 10})

    # A key that is present but empty bypasses the .get() defaults above, so check explicitly.
    if not all([model, approach, hyperparams]):
        missing = [k for k, v in {"model": model, "approach": approach, "hyperparams": hyperparams}.items() if not v]
        raise ValueError(f"Missing required configuration parameters: {missing}")

    # File-name-safe form of the model name: spaces -> underscores, lower-case, no parentheses.
    model_slug = model.replace(' ', '_').lower().replace('(', '').replace(')', '')

    dataset_path = "/content/drive/MyDrive/Sentiment_Project/processed_dataset.csv"
    output_path = os.path.join("/content/drive/MyDrive/Sentiment_Project", f"trained_{model_slug}.joblib")
    # Same derivation as create_combined_prompt uses for the script's default paths.
    vectorizer_path = output_path.replace('.joblib', '_vectorizer.joblib')
    encoder_path = output_path.replace('.joblib', '_encoder.joblib')

    if not os.path.exists(dataset_path):
        raise FileNotFoundError(f"Dataset not found at {dataset_path}")

    # Save hyperparameters to a JSON file for the script to use
    hyperparams_path = os.path.join("/content/drive/MyDrive/Sentiment_Project", "hyperparams.json")
    with open(hyperparams_path, "w") as f:
        json.dump(hyperparams, f)
    logger.info(f"Hyperparameters saved to {hyperparams_path}")

    response = code_chain.invoke({
        "user_prompt": user_prompt,
        "model": model,
        "approach": approach,
        "hyperparams": hyperparams,
        "dataset_path": dataset_path,
        "output_path": output_path
    })

    script_path = os.path.join("/content/drive/MyDrive/Sentiment_Project", f"train_{model_slug}.py")
    # Explicit UTF-8 so non-ASCII characters in the generated code cannot fail on write.
    with open(script_path, "w", encoding="utf-8") as f:
        f.write(response)
    logger.info(f"Generated training script saved to {script_path}")

    # Execute the generated script as a subprocess with arguments.
    # SECURITY: this runs LLM-generated code without review; inspect script_path first when
    # working outside a disposable environment.
    logger.info(f"Running the generated script: {script_path}")
    cmd = [
        sys.executable,  # same interpreter (and installed packages) as this kernel
        script_path,
        f"--data_path={dataset_path}",
        f"--hyperparameters_path={hyperparams_path}",
        f"--model_path={output_path}",
        f"--vectorizer_path={vectorizer_path}",
        f"--encoder_path={encoder_path}"
    ]
    try:
        process = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # Non-zero exit status: keep the child's stderr in the log before propagating.
        logger.error(f"Error executing the script: {e.stderr}")
        raise
    logger.info(f"Script output: {process.stdout}")
    if process.stderr:
        # The logging module writes to stderr by default, so on a successful run this stream
        # usually holds the script's ordinary log records rather than failures.
        logger.warning(f"Script stderr: {process.stderr}")

    # Verify model file creation
    if os.path.exists(output_path):
        logger.info(f"Model successfully saved to {output_path}")
    else:
        logger.error(f"Model file not found at {output_path} after execution.")
        raise FileNotFoundError(f"Model file not created at {output_path}")
except Exception as e:
    logger.error(f"Error generating or running training script: {str(e)}")
    raise

Mounted at /content/drive


KeyboardInterrupt: Interrupted by user

In [None]:
# Cell 4: Custom Transformer for SHAP
# Purpose: Define a custom transformer to preprocess text data for SHAP compatibility.

from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
import torch
import logging

# Configure logging if not already done
if not logging.getLogger().handlers:
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Bind `logger` in this cell: the code below calls it, and without this line the cell only
# works when another cell has already defined the name.
logger = logging.getLogger(__name__)

class CustomTextTransformer:
    """Tokenizes text with a pre-trained tokenizer and encodes labels with a LabelEncoder.

    ``fit`` learns the label classes, ``transform`` tokenizes texts into PyTorch tensors,
    and ``inverse_transform`` maps encoded label indices back to the original labels.
    """

    def __init__(self, tokenizer_name="bert-base-uncased", max_length=128):
        """Initialize the transformer with a tokenizer and max length.
        Args:
            tokenizer_name (str): Name of the pre-trained tokenizer (default: "bert-base-uncased").
            max_length (int): Maximum sequence length for tokenization.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
        self.label_encoder = LabelEncoder()

    def fit(self, texts, labels):
        """Fit the transformer by encoding labels.
        Args:
            texts (list): List of text samples. Not used by fitting; accepted so the
                signature mirrors ``fit_transform``.
            labels (list): List of corresponding labels.
        Returns:
            self: The fitted transformer.
        """
        try:
            self.label_encoder.fit(labels)
            logger.info("Label encoder fitted successfully with classes: %s", self.label_encoder.classes_)
            return self
        except Exception as e:
            logger.error("Error fitting label encoder: %s", str(e))
            raise

    def transform(self, texts):
        """Transform texts into input_ids and attention_mask for the model.
        Args:
            texts (str | list | tuple | iterable): A single text, or a collection of texts.
                Iterables other than list/tuple (for example a pandas Series or a generator)
                are materialised into a list before tokenization.
        Returns:
            tuple: (input_ids, attention_mask) as PyTorch tensors.
        """
        try:
            # str, list and tuple are passed through unchanged. Any other iterable is
            # converted to a plain list first (presumably the tokenizer rejects such
            # containers — TODO confirm against the transformers version in use).
            if not isinstance(texts, (str, list, tuple)):
                texts = list(texts)
            encodings = self.tokenizer(
                texts,
                truncation=True,
                padding=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            return encodings['input_ids'], encodings['attention_mask']
        except Exception as e:
            logger.error("Error transforming texts: %s", str(e))
            raise

    def fit_transform(self, texts, labels):
        """Fit and transform in one step.
        Args:
            texts (list): List of text samples.
            labels (list): List of corresponding labels.
        Returns:
            tuple: (input_ids, attention_mask) as PyTorch tensors.
        """
        self.fit(texts, labels)
        return self.transform(texts)

    def inverse_transform(self, encoded_labels):
        """Inverse transform encoded labels back to original labels.
        Args:
            encoded_labels (array-like): Encoded label indices.
        Returns:
            list: Original labels.
        """
        try:
            return self.label_encoder.inverse_transform(encoded_labels)
        except Exception as e:
            logger.error("Error inverse transforming labels: %s", str(e))
            raise

# Emitted once the class statement above has executed; requires `logger` to be bound already.
logger.info("CustomTextTransformer class defined successfully.")

In [None]:
# Cell 5: TF-IDF Top Features Analysis
# Purpose: Extract, display, and visualize the top TF-IDF features (words) from the dataset to understand key terms influencing sentiment.

import pandas as pd
import os
import logging
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import drive

# Install a default root handler only when none exists, then grab this notebook's logger
if len(logging.getLogger().handlers) == 0:
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
logger = logging.getLogger(__name__)

def extract_top_tfidf_features(dataset_path, output_dir, top_n=20):
    """Extract, visualize, and save the top TF-IDF features from the dataset.

    Side effects: remounts Google Drive, creates ``output_dir`` if needed, prints the top
    features, and writes ``top_tfidf_features.csv`` and ``top_tfidf_features_graph.png``
    into ``output_dir``.

    Args:
        dataset_path (str): Path to the dataset CSV file; must contain a 'review' column.
        output_dir (str): Directory to save output files.
        top_n (int): Number of top features to display (default: 20).
    Returns:
        pandas.DataFrame: The ``top_n`` rows with columns 'feature' and 'score', sorted by
        descending mean TF-IDF score.
    Raises:
        ValueError: If the 'review' column holds no non-missing values.
        Exception: Any other failure is logged and re-raised.
    """
    try:
        # Mount Google Drive
        drive.mount('/content/drive', force_remount=True)

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Load dataset
        df = pd.read_csv(dataset_path)
        # Drop missing reviews (a NaN document makes the vectorizer fail) and replace HTML
        # line-break tags with a space: left in place they are tokenized as the word "br",
        # which the recorded output of this cell shows as the highest-scoring "feature".
        reviews = (
            df['review']
            .dropna()
            .astype(str)
            .str.replace(r'(?i)<br\s*/?>', ' ', regex=True)
        )
        texts = reviews.tolist()
        if not texts:
            raise ValueError(f"No non-missing reviews found in {dataset_path}")
        logger.info(f"Loaded dataset with {len(texts)} samples.")

        # Apply TF-IDF vectorization
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(texts)
        feature_names = vectorizer.get_feature_names_out()

        # Calculate average TF-IDF scores across all documents
        tfidf_scores = tfidf_matrix.mean(axis=0).A1  # Convert to 1D array
        tfidf_df = pd.DataFrame({'feature': feature_names, 'score': tfidf_scores})
        top_features = tfidf_df.sort_values(by='score', ascending=False).head(top_n)

        # Log and display top features
        logger.info(f"Top {top_n} TF-IDF features:\n{top_features}")
        print(f"Top {top_n} TF-IDF Features:")
        print(top_features)

        # Save top features to a CSV file
        csv_path = os.path.join(output_dir, "top_tfidf_features.csv")
        top_features.to_csv(csv_path, index=False)
        logger.info(f"Top TF-IDF features saved to {csv_path}")

        # Create a bar graph of the top TF-IDF features. The figure is closed in `finally`
        # so it is released even when drawing or saving fails.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.barh(top_features['feature'], top_features['score'], color='skyblue')
            ax.set_xlabel('Average TF-IDF Score')
            ax.set_ylabel('Feature (Word)')
            ax.set_title(f'Top {top_n} TF-IDF Features for Sentiment Analysis')
            ax.invert_yaxis()  # Invert y-axis to show highest scores at the top

            # Save the graph as a PNG file
            graph_path = os.path.join(output_dir, "top_tfidf_features_graph.png")
            fig.savefig(graph_path, bbox_inches='tight')
        finally:
            plt.close(fig)
        logger.info(f"TF-IDF features graph saved to {graph_path}")

        return top_features

    except Exception as e:
        logger.error(f"Error during TF-IDF feature extraction or visualization: {str(e)}")
        raise

# Kick off the TF-IDF analysis on the processed dataset (top 20 terms); any failure is
# logged here and then re-raised.
try:
    extract_top_tfidf_features(
        "/content/drive/MyDrive/Sentiment_Project/processed_dataset.csv",
        "/content/drive/MyDrive/Sentiment_Project/explainability_outputs",
        20,
    )
except Exception as e:
    logger.error(f"Failed to extract or visualize top TF-IDF features: {str(e)}")
    raise

Mounted at /content/drive
Top 20 TF-IDF Features:
        feature     score
96           br  0.158836
575       movie  0.081215
323        film  0.071504
502        like  0.039808
461        just  0.036872
374        good  0.034723
832       story  0.029514
881        time  0.029505
695      really  0.029123
62          bad  0.026889
378       great  0.026559
628      people  0.024301
235         don  0.024018
576      movies  0.023748
951       watch  0.022078
530        make  0.021380
876       think  0.020980
955         way  0.020966
755        seen  0.020959
132  characters  0.020842


In [None]:
import joblib
# Check that the saved model file can be deserialized with joblib.
# Note: this rebinds the notebook-level name `model`, which Cell 3b uses for the model-name string.
model = joblib.load("/content/drive/MyDrive/Sentiment_Project/trained_traditional_ml_random_forest-like.joblib")
print("Model loaded successfully")

Model loaded successfully


In [None]:
# List the contents of the project directory on the mounted Drive.
!ls "/content/drive/MyDrive/Sentiment_Project"

 coordinator_logs.txt
 Data_Analysis_Quality_Preprocessing.ipynb
 data_analysis_report.json
 dataset_data.json
 explainability_output
 explainability_outputs
 hyperparams.json
 Notebook_5_CodeGen_Explainability.ipynb
 Part_1_Environment_Setup.ipynb
 Part_3_Model_Training_and_Evaluation.ipynb
 part3_output.json
 part4_output.json
 preprocessed_data.pt
 processed_dataset.csv
 quality_check_report.json
 selected_config_checkpoint.pkl
 Sentiment_Analysis_Model_Optimization.ipynb
 trained_traditional_ml_random_forest-like_encoder.joblib
 trained_traditional_ml_random_forest-like.joblib
 trained_traditional_ml_random_forest-like_vectorizer.joblib
'train_traditional_ml_(random_forest-like).py'
 train_traditional_ml_random_forest-like.py
 user_feedback.csv


In [None]:
# Cell 7: Save Model Details and Generate Integration File
# Purpose: Save model details (e.g., accuracy, hyperparameters) to a file and generate a non-.py integration file for user software.

import json
import os
import logging
from google.colab import drive
import joblib
from sklearn.metrics import accuracy_score
import pandas as pd

# Install a default root handler only when none exists, then grab this notebook's logger
if len(logging.getLogger().handlers) == 0:
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
logger = logging.getLogger(__name__)

# Force a fresh mount of Google Drive before touching project files
drive.mount('/content/drive', force_remount=True)

def save_model_details(model_path, dataset_path, output_path,
                       config_path="/content/drive/MyDrive/Sentiment_Project/part4_output.json"):
    """Save model details including accuracy and hyperparameters to a JSON file.
    Args:
        model_path (str): Path to the saved model file. The vectorizer is expected next to
            it, at the same path with '.joblib' replaced by '_vectorizer.joblib'.
        dataset_path (str): Path to the dataset CSV file ('review' and 'sentiment' columns).
        output_path (str): Path to save the details file.
        config_path (str): JSON file whose "selected_config" entry supplies the model name
            and hyperparameters (defaults to the project's part4_output.json).
    Raises:
        FileNotFoundError: If the vectorizer file does not exist.
        ValueError: If 'sentiment' contains values other than 'positive'/'negative'.
        Exception: Any other failure is logged and re-raised.
    """
    try:
        # Load the trained model
        model = joblib.load(model_path)
        logger.info(f"Loaded model from {model_path}")

        # Load the vectorizer used during training
        vectorizer_path = model_path.replace('.joblib', '_vectorizer.joblib')
        if not os.path.exists(vectorizer_path):
            raise FileNotFoundError(f"Vectorizer not found at {vectorizer_path}. Ensure Cell 3b completed successfully.")
        vectorizer = joblib.load(vectorizer_path)
        logger.info(f"Loaded vectorizer from {vectorizer_path}")

        # Load dataset
        df = pd.read_csv(dataset_path)
        y = df['sentiment'].map({'positive': 1, 'negative': 0})
        # .map() turns any label outside the mapping into NaN; fail here with a clear
        # message instead of letting accuracy_score raise an obscure error later.
        unmapped = y.isna()
        if unmapped.any():
            unexpected = sorted(df.loc[unmapped, 'sentiment'].astype(str).unique())
            raise ValueError(f"Unexpected sentiment labels in {dataset_path}: {unexpected}")
        y = y.astype(int)

        X = vectorizer.transform(df['review'])  # Use the loaded vectorizer
        y_pred = model.predict(X)
        # NOTE(review): accuracy is computed over every row of dataset_path. If the model was
        # trained on part of this file, this is not a held-out estimate — confirm before
        # reporting it as such.
        accuracy = accuracy_score(y, y_pred)

        # Load selected_config for hyperparameters
        with open(config_path, "r") as f:
            selected_config = json.load(f)["selected_config"]
        hyperparams = selected_config.get("hyperparams", {})

        # Prepare details
        model_details = {
            "model_name": selected_config.get("model", "Traditional ML (Random Forest-like)"),
            "hyperparameters": hyperparams,
            "accuracy": float(accuracy),
            "timestamp": pd.Timestamp.now().isoformat(),
            "sample_size": len(df)
        }

        # Save to JSON
        with open(output_path, "w") as f:
            json.dump(model_details, f, indent=2)
        logger.info(f"Model details saved to {output_path}")
    except Exception as e:
        logger.error(f"Error saving model details: {str(e)}")
        raise

def generate_code_integration_file(model_path, output_path):
    """Generate a non-.py integration file (JSON) describing how to load and call the model.
    Args:
        model_path (str): Path to the saved model file.
        output_path (str): Path to save the integration file (e.g., .json).
    Raises:
        Exception: Any failure while loading the model or writing the file is logged and
            re-raised.
    """
    try:
        # Load model (also confirms the file is readable before it is advertised)
        model = joblib.load(model_path)

        # Companion artifacts, using the same naming rule as save_model_details:
        # '.joblib' -> '_vectorizer.joblib' / '_encoder.joblib'.
        vectorizer_path = model_path.replace('.joblib', '_vectorizer.joblib')
        encoder_path = model_path.replace('.joblib', '_encoder.joblib')

        # Generate integration details (e.g., API-like structure)
        integration_content = {
            # Report the class of the object actually loaded rather than a fixed name.
            "model_type": type(model).__name__,
            "predict_method": {
                # The model is fed the vectorizer's output (see save_model_details), not raw
                # strings, so the documented call has to include the transform step.
                "input": "feature matrix returned by the saved vectorizer's transform() for a list of strings (text reviews)",
                "output": "array of integers (0 for negative, 1 for positive)",
                "example": "model.predict(vectorizer.transform(['good review', 'bad review']))"
            },
            "load_path": model_path,
            "vectorizer_path": vectorizer_path,
            "encoder_path": encoder_path,
            "required_libraries": ["joblib", "sklearn.ensemble"],
            "version": "1.0"
        }

        # Save as JSON (can be adjusted to .pyi or other formats)
        with open(output_path, "w") as f:
            json.dump(integration_content, f, indent=2)
        logger.info(f"Integration file saved to {output_path}")
    except Exception as e:
        logger.error(f"Error generating integration file: {str(e)}")
        raise

# Fixed artifact locations on Drive
model_path = "/content/drive/MyDrive/Sentiment_Project/trained_traditional_ml_random_forest-like.joblib"
dataset_path = "/content/drive/MyDrive/Sentiment_Project/processed_dataset.csv"
details_path = os.path.join("/content/drive/MyDrive/Sentiment_Project", "model_details.json")
integration_path = os.path.join("/content/drive/MyDrive/Sentiment_Project", "model_integration.json")

# Write model_details.json first, then model_integration.json; log and re-raise on failure
try:
    save_model_details(model_path, dataset_path, details_path)
    generate_code_integration_file(model_path, integration_path)
except Exception as e:
    logger.error(f"Error in final processing: {str(e)}")
    raise

Mounted at /content/drive
