In [1]:
import glob
import importlib.util
import os
import re
import tempfile

import numpy as np
import openai
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
# If you want to use a library like LangChain for the LLM calls, uncomment:
# from langchain import OpenAI

# For environment variables
from dotenv import load_dotenv

In [3]:
# Runs before the API-key lookup below, whose error message names a .env file
# as one place the key may be defined.
load_dotenv()  # Load .env if present

True

In [4]:
# Configure the OpenAI (or other LLM) client from the environment and fail fast
# when no key is available, before any agent tries to call the API.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise ValueError(
        "No OpenAI API key found. Please set OPENAI_API_KEY in your environment or .env file."
    )

In [9]:
##################################################################
# Helper function to call LLM
##################################################################
def call_llm_system(prompt, model="gpt-3.5-turbo", temperature=0.7):
    """
    Send one user-role message to the OpenAI ChatCompletion API.

    Args:
        prompt: Text used as the content of the single user message.
        model: Name of the chat model to query.
        temperature: Sampling temperature forwarded unchanged to the API.

    Returns:
        The message content of the first choice in the response.
    """
    # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 SDK
    # interface -- confirm the installed openai version still provides it.
    chat_messages = [{"role": "user", "content": prompt}]
    completion = openai.ChatCompletion.create(
        model=model,
        messages=chat_messages,
        temperature=temperature,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

def extract_code_block(text):
    """
    Return the contents of the FIRST triple-backtick fenced block in `text`.

    A language tag on the opening fence line (for example `python`) is
    dropped so that the returned string is the code only. Without this the
    tag becomes the first line of the snippet and fails as soon as the
    snippet is executed.

    Args:
        text: Response text to search; may be None or empty.

    Returns:
        The code inside the first fence with surrounding whitespace stripped,
        or None when `text` is empty/None or holds no fenced block.
    """
    if not text:
        return None

    # The optional non-capturing group consumes "<spaces><tag><spaces><newline>"
    # right after the opening fence. It only applies when the tag is alone on
    # that line, so inline fences written on a single line keep their content.
    fence = re.search(
        r"```(?:[^\S\r\n]*[\w+\-.#]*[^\S\r\n]*\r?\n)?(.*?)```",
        text,
        flags=re.DOTALL,
    )
    if fence is None:
        return None
    return fence.group(1).strip()

def run_code_snippet(code_snippet, function_name, *args, **kwargs):
    """
    Execute a code snippet as a throwaway module and call one function in it.

    The snippet is written to a temporary ``.py`` file, loaded with
    ``importlib``, and the attribute named `function_name` is called with
    ``*args, **kwargs``. The temporary file is removed afterwards, whether
    or not writing, importing or the call succeeded.

    SECURITY: the snippet is executed with the full privileges of this
    process. Callers in this notebook pass LLM-generated code, which is
    untrusted input -- run only in an environment where that is acceptable.

    Args:
        code_snippet: Python source text that must define `function_name`.
        function_name: Name of the function to look up in the snippet.
        *args, **kwargs: Forwarded to the function.

    Returns:
        Whatever the snippet's function returns.

    Raises:
        ValueError: If the snippet does not define `function_name`.
        ImportError: If no import spec can be built for the temporary file.
        Exception: Anything raised while executing the snippet or its function.
    """
    # Python source is decoded as UTF-8 by default on import, so the file is
    # written as UTF-8 explicitly instead of relying on the locale encoding.
    tmp_file = tempfile.NamedTemporaryFile(
        suffix=".py", delete=False, mode="w", encoding="utf-8"
    )
    tmp_file_path = tmp_file.name
    try:
        # Writing happens inside the try so a failed write still gets cleaned up.
        with tmp_file:
            tmp_file.write(code_snippet)

        module_name = os.path.splitext(os.path.basename(tmp_file_path))[0]
        spec = importlib.util.spec_from_file_location(module_name, tmp_file_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"Could not create an import spec for '{tmp_file_path}'.")
        loaded_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(loaded_module)

        if not hasattr(loaded_module, function_name):
            raise ValueError(f"Function '{function_name}' not found in the snippet.")
        func = getattr(loaded_module, function_name)
        return func(*args, **kwargs)
    finally:
        # Best-effort cleanup: a file that is already gone must not mask the
        # real result or the real exception.
        try:
            os.remove(tmp_file_path)
        except FileNotFoundError:
            pass

In [10]:
##################################################################
# 1. DataAgent
##################################################################
class DataAgent:
    """
    - Scans a folder for CSV files.
    - Asks LLM how to interpret or merge them if needed.
    - Produces a "combined DataFrame" or multiple DataFrames.

    Whatever happens with the LLM-generated loader, `run()` only ever
    returns a dict whose values are pandas DataFrames (or None when the
    folder has no CSV files), because downstream agents read `.shape` and
    `.columns` on every value.
    """
    def __init__(self, data_dir="data"):
        self.data_dir = data_dir
        self.dataframes = {}  # key -> DataFrame

    def run(self):
        """
        Load the CSV files in `self.data_dir`, preferring LLM-written code.

        Returns:
            dict mapping a key to a DataFrame. The key is ``"merged"`` for a
            single merged frame, the LLM's own keys for a dict result, or the
            file stem when falling back to one frame per CSV. Returns None
            when no CSV file exists.
        """
        # Start clean so a repeated run() never mixes in keys from an earlier run.
        self.dataframes = {}

        # 1. Gather CSV files (sorted: glob order is filesystem-dependent, and
        #    the list goes into the prompt).
        csv_files = sorted(glob.glob(os.path.join(self.data_dir, "*.csv")))
        if not csv_files:
            print("[DataAgent] No CSV files found in data folder.")
            return None

        # 2. Summarize file names, ask LLM how to handle them
        file_list_str = "\n".join([os.path.basename(f) for f in csv_files])
        prompt = (
            "You are a Data Engineer. I have these CSV files:\n"
            f"{file_list_str}\n\n"
            "They contain NCAA basketball data (e.g., Teams, Seasons, Seeds, etc.) "
            "for March Madness predictions. Propose how to read them, interpret them, "
            "and whether we should merge them into a single DataFrame or keep multiple. "
            "Provide Python code (in a function named `load_and_merge_data(data_dir)`), "
            "using pandas. The function should return a single pandas DataFrame if possible, "
            "or a dictionary of DataFrames if you deem that better."
        )
        llm_response = call_llm_system(prompt)
        code_snippet = extract_code_block(llm_response)
        if not code_snippet:
            # fallback - no code block found
            print("[DataAgent] No code block found. Will attempt to just load all CSVs individually.")
            return self._load_csvs_individually(csv_files)

        # 3. Execute code snippet
        try:
            result = run_code_snippet(code_snippet, "load_and_merge_data", self.data_dir)
        except Exception as e:
            print("[DataAgent] Error executing LLM code:", e)
            print("[DataAgent] Falling back to simply loading each CSV individually.")
            return self._load_csvs_individually(csv_files)

        if isinstance(result, pd.DataFrame):
            self.dataframes["merged"] = result
            print(f"[DataAgent] LLM returned single DataFrame: {result.shape}")
            return self.dataframes

        is_frame_dict = (
            isinstance(result, dict)
            and len(result) > 0
            and all(isinstance(value, pd.DataFrame) for value in result.values())
        )
        if is_frame_dict:
            self.dataframes = result
            print(f"[DataAgent] LLM returned dict of DataFrames: {list(self.dataframes.keys())}")
            return self.dataframes

        # Anything else (None, empty dict, dict with non-DataFrame values, ...)
        # would crash the downstream agents, so load the raw CSVs instead.
        print("[DataAgent] Unexpected return type from LLM code. Falling back to loading each CSV individually.")
        return self._load_csvs_individually(csv_files)

    def _load_csvs_individually(self, csv_files):
        """Read every CSV into its own DataFrame, keyed by file name without extension."""
        for fpath in csv_files:
            key = os.path.splitext(os.path.basename(fpath))[0]
            self.dataframes[key] = pd.read_csv(fpath)
        return self.dataframes

In [8]:
##################################################################
# 2. ExplorationAgent
##################################################################
class ExplorationAgent:
    """
    Describes the loaded dataframes to the LLM and asks it for an EDA plan.
    Hands back the LLM's full answer together with any code block found in it.
    """
    def __init__(self):
        pass

    def run(self, dataframes):
        """
        Request an EDA proposal for `dataframes`.

        Args:
            dataframes: Mapping of name -> object exposing `.shape` and `.columns`.

        Returns:
            dict with "exploration_plan" (raw LLM response) and "code"
            (result of `extract_code_block` on that response).
        """
        # One line per frame: name, shape and column list
        overview = "\n".join(
            f"DataFrame '{name}': shape={frame.shape}, columns={list(frame.columns)}"
            for name, frame in dataframes.items()
        )

        # Assemble the request for an EDA approach plus an `eda_and_insights` function
        prompt_parts = [
            "You are a data scientist exploring NCAA basketball data. ",
            "Below are the dataframes loaded:\n",
            f"{overview}\n\n",
            "Propose a quick EDA approach, interesting insights, and potential merges or transformations. ",
            "Provide a rationale, but also supply Python code in a function named `eda_and_insights(dataframes)` ",
            "that uses pandas, matplotlib, or seaborn for EDA. The function can just print or display insights. ",
            "We won't necessarily display plots here, but we want to see how you'd do it.",
        ]
        prompt = "".join(prompt_parts)

        answer = call_llm_system(prompt)
        snippet = extract_code_block(answer)
        return {"exploration_plan": answer, "code": snippet}

In [11]:
##################################################################
# 3. FeatureEngineeringAgent
##################################################################
class FeatureEngineeringAgent:
    """
    Takes EDA suggestions + data. Asks LLM for feature engineering code.

    When the LLM gives no usable code (or its code raises), the agent falls
    back to a copy of the first dataframe with a random 0/1 'target' column
    added if none exists.
    """
    def __init__(self):
        pass

    def run(self, dataframes, eda_suggestions):
        """
        Build the feature table used for modelling.

        Args:
            dataframes: Mapping of name -> DataFrame (as produced by DataAgent).
            eda_suggestions: Text inserted verbatim into the LLM prompt.

        Returns:
            A pandas DataFrame: the LLM function's result (coerced with
            `pd.DataFrame(...)` if it is not already a DataFrame) or the
            fallback table described on `_fallback_features`.
        """
        # Summarize again, and ask for feature engineering approach
        summary_str = "\n".join(
            f"'{k}': shape={df.shape}, columns={list(df.columns)}"
            for k, df in dataframes.items()
        )

        prompt = (
            "You are an ML engineer. We have the following dataframes:\n"
            f"{summary_str}\n\n"
            "We want to engineer features for predicting NCAA basketball matchups. "
            "Here is the EDA's suggestions:\n"
            f"{eda_suggestions}\n\n"
            "Provide a function named `feature_engineering(dataframes)` in Python that:\n"
            "- Possibly merges or transforms them.\n"
            "- Creates new features.\n"
            "- Returns a single DataFrame with the features and a 'target' column (if available). "
            "If there's no explicit target, you can randomly create one or deduce it from the data. "
            "Only return the final DataFrame."
        )

        llm_response = call_llm_system(prompt)
        code_snippet = extract_code_block(llm_response)
        if not code_snippet:
            print("[FeatureEngineeringAgent] No code snippet found. Returning dataframes as is.")
            return self._fallback_features(dataframes)

        # Execute code snippet
        try:
            result = run_code_snippet(code_snippet, "feature_engineering", dataframes)
            if isinstance(result, pd.DataFrame):
                print(f"[FeatureEngineeringAgent] Feature engineering returned DataFrame {result.shape}")
                return result
            print("[FeatureEngineeringAgent] Unexpected return type from LLM code. Converting to DataFrame forcibly.")
            # A failed conversion raises and is handled by the fallback below.
            return pd.DataFrame(result)
        except Exception as e:
            print("[FeatureEngineeringAgent] Error executing LLM code snippet:", e)
            return self._fallback_features(dataframes)

    def _fallback_features(self, dataframes):
        """Return a copy of the first dataframe, adding a random 0/1 'target' if it has none."""
        if not dataframes:
            # Nothing to pick from; an empty frame lets the caller's emptiness check decide.
            print("[FeatureEngineeringAgent] No dataframes available for fallback. Returning empty DataFrame.")
            return pd.DataFrame()

        # fallback: pick first DF if we can't do better (copy so the input stays untouched)
        df = next(iter(dataframes.values())).copy()
        if "target" not in df.columns:
            # NOTE: a random target is a placeholder only; a model trained on it is meaningless.
            df["target"] = np.random.randint(0, 2, len(df))
        return df

In [12]:
##################################################################
# 4. ModelSelectionAgent
##################################################################
class ModelSelectionAgent:
    """
    Asks LLM to pick a model or architecture and produce code to train it.
    Runs the code, obtains the model object, returns it.
    """
    def __init__(self):
        pass

    def run(self, df):
        """
        Train a classifier for the 'target' column using LLM-written code.

        Args:
            df: Feature DataFrame. If it has no 'target' column a random 0/1
                target is generated on an internal copy; the caller's frame
                is never modified.

        Returns:
            Tuple ``(model, (X_val, y_val))`` where `model` is whatever the
            training function returned and the second element is the
            held-out 20% split.

        Raises:
            Exception: If the LLM snippet fails and the logistic-regression
                fallback fails as well (the fallback run is not guarded).
        """
        # Summarize
        shape = df.shape
        cols = list(df.columns)
        prompt = (
            f"You are an ML expert. We have a DataFrame shape={shape}, columns={cols}.\n"
            "We want to do classification for the 'target' column. Provide Python code in a function named "
            "`build_and_train_model(X_train, y_train, X_val, y_val)` that:\n"
            "1) Splits the data if not already split.\n"
            "2) Builds a recommended model (could be scikit-learn or a simple neural network).\n"
            "3) Trains it.\n"
            "4) Returns the trained model. "
            "We want to see your best guess. Also consider we might have limited data typical of Kaggle March Madness (tens of thousands of rows)."
        )

        llm_response = call_llm_system(prompt)
        code_snippet = extract_code_block(llm_response)
        if not code_snippet:
            print("[ModelSelectionAgent] No code snippet found. Generating fallback logistic regression.")
            code_snippet = self.fallback_code()

        # We do our own train/val split externally for better control and pass
        # it to the LLM's function. If that function splits again internally,
        # that is fine.
        if "target" not in df.columns:
            # fallback: `assign` returns a new frame, so the caller's DataFrame
            # does not silently gain a random 'target' column.
            df = df.assign(target=np.random.randint(0, 2, len(df)))

        X = df.drop(columns=["target"])
        y = df["target"]

        # Make train/val
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        try:
            model = run_code_snippet(code_snippet, "build_and_train_model", X_train, y_train, X_val, y_val)
            return model, (X_val, y_val)
        except Exception as e:
            print("[ModelSelectionAgent] Error running LLM-provided code snippet:", e)
            # fallback code
            code_snippet = self.fallback_code()
            model = run_code_snippet(code_snippet, "build_and_train_model", X_train, y_train, X_val, y_val)
            return model, (X_val, y_val)

    def fallback_code(self):
        """
        Just in case the LLM code fails, use a minimal logistic regression snippet.

        Returns:
            Source text defining `build_and_train_model(X_train, y_train, X_val, y_val)`.
        """
        code = """\
import pandas as pd
from sklearn.linear_model import LogisticRegression

def build_and_train_model(X_train, y_train, X_val, y_val):
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    return model
"""
        return code

In [13]:
##################################################################
# 5. EvaluationAgent
##################################################################
class EvaluationAgent:
    """
    Evaluates the trained model, can feed results back to LLM for analysis or next-step suggestions.
    """
    def __init__(self):
        pass

    def run(self, model, X_val, y_val):
        """
        Compute log loss and accuracy of `model` on the validation split.

        Args:
            model: Fitted estimator exposing `predict` and, optionally,
                `predict_proba` / `classes_`.
            X_val: Validation features passed to the model as-is.
            y_val: True labels for `X_val`.

        Returns:
            dict with keys "log_loss" and "accuracy".
        """
        # Predict once; the hard predictions feed both metrics when no
        # probabilities are available.
        preds = np.asarray(model.predict(X_val))

        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(X_val)
            # Tell log_loss which class each probability column belongs to.
            # Without this it infers classes from y_val and fails when the
            # validation split happens to contain a single class.
            labels = getattr(model, "classes_", None)
        else:
            # naive approach to approximate proba: assumes binary labels
            # encoded as 0/1, so column 0 is P(class 0) and column 1 is P(class 1).
            proba = np.column_stack([1 - preds, preds])
            labels = [0, 1]

        ll = log_loss(y_val, proba, labels=labels)
        acc = accuracy_score(y_val, preds)

        print(f"[EvaluationAgent] log_loss={ll:.4f}, accuracy={acc:.4f}")
        return {"log_loss": ll, "accuracy": acc}

In [14]:
##################################################################
# 6. Coordinator
##################################################################
class FullAutomationCoordinator:
    """
    Drives the agents in sequence, stopping early when a stage yields nothing:
    1. DataAgent -> load or merge data
    2. ExplorationAgent -> EDA suggestions
    3. FeatureEngineeringAgent -> create final features + target
    4. ModelSelectionAgent -> build and train model
    5. EvaluationAgent -> measure performance
    6. (Optional) Feedback loop or multiple iterations
    """
    def __init__(self, data_dir="data"):
        self.data_dir = data_dir
        # One instance of every agent; only the data agent needs the folder.
        self.data_agent = DataAgent(data_dir=data_dir)
        self.exploration_agent = ExplorationAgent()
        self.feature_engineering_agent = FeatureEngineeringAgent()
        self.model_selection_agent = ModelSelectionAgent()
        self.evaluation_agent = EvaluationAgent()

    def run_pipeline(self):
        """
        Execute all stages once, printing progress and the final metrics.
        Always returns None; an early exit is reported with a printed message.
        """
        # Stage 1: data loading
        loaded = self.data_agent.run()
        if not loaded:
            print("[Coordinator] No data loaded. Pipeline ends.")
            return

        # Stage 2: exploration. The suggested EDA code is run for demonstration
        # only, so a failure there is reported and otherwise ignored.
        exploration = self.exploration_agent.run(loaded)
        if exploration["code"]:
            try:
                run_code_snippet(exploration["code"], "eda_and_insights", loaded)
            except Exception as err:
                print("[Coordinator] EDA code snippet failed:", err)

        # Stage 3: feature engineering
        features = self.feature_engineering_agent.run(loaded, exploration["exploration_plan"])
        if features is None or len(features) == 0:
            print("[Coordinator] FeatureEngineeringAgent returned empty. Exiting.")
            return

        # Stage 4: model selection / training
        model, holdout = self.model_selection_agent.run(features)
        if model is None:
            print("[Coordinator] No model returned. Exiting.")
            return

        # Stage 5: evaluation on the held-out split
        X_val, y_val = holdout
        scores = self.evaluation_agent.run(model, X_val, y_val)
        print("[Coordinator] Final metrics:", scores)

        # Stage 6 (not implemented): a refinement loop could feed the metrics
        # back to the LLM and ask for better features or models.
        print("[Coordinator] Pipeline complete!")


In [None]:
# Build the coordinator and its five agents; "data" is the folder the DataAgent
# scans for *.csv files.
coordinator = FullAutomationCoordinator(data_dir="data")

In [None]:
# Run every stage end to end; progress and the final metrics are printed, and
# the call itself returns None.
coordinator.run_pipeline()