In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filepath in (os.path.join(dirname, filename) for filename in filenames):
        print(filepath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Installing the Google Generative AI Library**


In [None]:
!pip install google-generativeai


# SmartBiz AI Agents 🧠📊
### Enterprise Data Analytics Multi-Agent System

This notebook implements a multi-agent system for enterprise data analytics 
using data cleaning, EDA, regression & non-linear models, and a Gemini-powered 
Insight Agent for business recommendations.


# SmartBiz AI Agents 🧠📊
### Enterprise Multi-Agent System for Business Data Analysis

This project implements a multi-agent system for enterprise data analysis,
including data cleaning, EDA, regression, non-linear models, and a Gemini-powered 
Insight Agent to generate business recommendations.


**Installing the Google Generative AI package for enabling Gemini functionality**


In [None]:
!pip install google-generativeai


**Importing all required libraries and configuring the notebook environment**


In [None]:
# Importing essential data manipulation and analysis libraries
import pandas as pd      # Handling and analyzing structured data
import numpy as np       # Performing numerical computations

# Importing visualization library for plotting data insights
import matplotlib.pyplot as plt

# Importing machine learning tools for model training and testing
from sklearn.model_selection import train_test_split        # Splitting data into train/test sets
from sklearn.linear_model import LinearRegression           # Building a linear regression model
from sklearn.ensemble import RandomForestRegressor          # Building a random forest model
from sklearn.metrics import mean_absolute_error, r2_score   # Evaluating model performance

# Importing Google's Generative AI client for extended insight generation
import google.generativeai as genai

# Importing date-time utilities and text-wrapping helper
import datetime as dt     # Handling date and time operations
import textwrap           # Formatting long text outputs

# Configuring default plot size for cleaner visualization.
# The tuple is (width, height); it is set through matplotlib's global rcParams,
# so it becomes the default for figures created after this cell runs.
plt.rcParams["figure.figsize"] = (8, 4)



**Searching through Kaggle directories to locate the exact file path of the uploaded dataset**


In [None]:
# Importing the OS module to interact with the file system
import os

# Scan everything under /kaggle and print the full path of each file whose
# name mentions "store" or "sales" (case-insensitive), to help locate the dataset
for root, dirs, files in os.walk("/kaggle"):
    for f in files:
        lowered = f.lower()
        if any(keyword in lowered for keyword in ("store", "sales")):
            print(os.path.join(root, f))



**Defining the dataset path and configuring key column identifiers for the analysis pipeline**


In [None]:
# --- Column roles used by the pipeline -------------------------------------
# Column the models are trained to predict
TARGET_COLUMN = "sales"

# Column converted to datetime during ingestion
DATE_COLUMN = "date"

# Identifier columns (e.g., primary keys) left out of the model features
ID_COLUMNS = ["id"]

# --- Dataset location --------------------------------------------------------
# CSV file read by the ingestion step inside the Kaggle environment
DATA_PATH = "/kaggle/input/store-sales/new_store_sales.csv"



**Adding Utility Tools**

In [None]:
def load_data(path: str) -> pd.DataFrame:
    """
    Read a CSV file and report its dimensions.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    frame = pd.read_csv(path)
    dimensions = frame.shape
    print(f"[TOOL] Loaded data with shape: {dimensions}")
    return frame


def basic_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of ``df``; the caller's frame is not modified.

    Steps, in order:
    - Removing duplicate rows.
    - Filling missing numeric values with the column median.
    - Trimming whitespace from text (object-dtype) columns. Non-null entries
      are converted to ``str`` and stripped; missing entries stay missing
      instead of being turned into the literal strings "nan" / "None".

    Args:
        df: Input frame to clean.

    Returns:
        A new DataFrame with the cleaning steps applied.
    """
    # Work on an independent, de-duplicated copy so the column assignments
    # below never touch the caller's data
    df = df.drop_duplicates().copy()

    # Filling missing values in numeric columns using the median
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].median())

    # Trimming extra whitespace from text columns.
    # astype(str) would stringify missing values, so nulls are kept as-is and
    # only the non-null entries are replaced by their stripped string form.
    for col in df.select_dtypes(include=["object"]).columns:
        column = df[col]
        stripped = column.astype(str).str.strip()
        df[col] = column.where(column.isna(), stripped)

    print("[TOOL] Basic cleaning complete.")
    return df


def parse_dates(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Convert column ``col`` of ``df`` to datetime, in place.

    Values that cannot be parsed become NaT (``errors="coerce"``). When the
    column is not present, the frame is returned untouched and nothing is
    printed.

    Args:
        df: Frame whose column is converted; it is modified directly.
        col: Name of the column holding the dates.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Guard clause: nothing to do when the column is absent
    if col not in df.columns:
        return df

    df[col] = pd.to_datetime(df[col], errors="coerce")
    print(f"[TOOL] Parsed {col} as datetime.")
    return df


def summarize_dataframe(df: pd.DataFrame) -> None:
    """
    Displaying summary information of the DataFrame:
    - Showing dataframe info (columns, types, null values).
    - Showing first few rows of the DataFrame.

    Args:
        df: Frame to summarize.

    Returns:
        None. Everything is written to standard output.
    """
    print("\nData Info:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()

    print("\nHead:")
    print(df.head())


**Memory, Logger and Base Agent**

In [None]:
class Memory:
    """Simple key–value store backed by a dictionary."""

    def __init__(self):
        # Backing dictionary holding every key–value pair written via set()
        self.store = dict()

    def set(self, key, value):
        """Save ``value`` under ``key``, replacing any earlier value."""
        self.store.update({key: value})

    def get(self, key):
        """Return the value saved under ``key``, or ``None`` when absent."""
        return self.store.get(key, None)


class Logger:
    """Collects log lines in memory and echoes each one to standard output."""

    def __init__(self):
        # Every formatted line is kept here, in the order it was logged
        self.logs = list()

    def log(self, agent, message):
        """Record and print ``message`` prefixed with ``[agent]``."""
        entry = "[{}] {}".format(agent, message)
        self.logs.append(entry)
        print(entry)


class Agent:
    """Base class for pipeline agents: holds a name, a memory and a logger."""

    def __init__(self, name, memory, logger):
        # Keep references to the collaborators handed in by the caller
        self.name, self.memory, self.logger = name, memory, logger

    def run(self, context):
        """Process ``context``; subclasses must override this method."""
        raise NotImplementedError

    def log(self, msg):
        """Forward ``msg`` to the logger, tagged with this agent's name."""
        self.logger.log(self.name, msg)



**Defining All Agents**

In [None]:
class DataIngestionAgent(Agent):
    """Agent that loads the dataset and stores it in the context as "raw"."""

    def run(self, context):
        """Load DATA_PATH, parse DATE_COLUMN, and save the frame under "raw"."""
        self.log("Loading data...")

        # Read the CSV, then convert the date column to datetime
        context["raw"] = parse_dates(load_data(DATA_PATH), DATE_COLUMN)
        return context


class DataCleaningAgent(Agent):
    """Agent that turns the "raw" frame into the "clean" frame used downstream."""

    def run(self, context):
        """
        Clean the raw data and store the result under context["clean"].

        Rows without a target value are removed BEFORE basic_cleaning runs.
        basic_cleaning fills missing numeric values with the column median, so
        dropping afterwards would never remove anything for a numeric target:
        unlabeled rows would instead be kept with a median-imputed label.

        Args:
            context: Pipeline context; must contain the "raw" DataFrame.

        Returns:
            The same context, with "clean" added.
        """
        # Logging that the agent is cleaning the data
        self.log("Cleaning data...")

        # Retrieving raw data stored by the ingestion agent
        raw = context["raw"]

        # Dropping unlabeled rows first so the target is never imputed
        labeled = raw.dropna(subset=[TARGET_COLUMN])

        # Performing basic data cleaning operations and storing the result
        context["clean"] = basic_cleaning(labeled)
        return context


class EDAAgent(Agent):
    """Agent that prints a summary of the cleaned data."""

    def run(self, context):
        """Summarize context["clean"] and return the context unchanged."""
        self.log("Performing EDA...")

        # Print info and the first rows of the cleaned frame
        summarize_dataframe(context["clean"])
        return context


class ModelingAgent(Agent):
    """Agent that trains a linear and a random-forest regressor and scores them."""

    def run(self, context):
        """
        Train both models on the cleaned data and record their test MAE.

        Features are all numeric columns of context["clean"] except the
        target and the ID columns. The data is split 80/20 with a fixed seed.

        Args:
            context: Pipeline context; must contain the "clean" DataFrame.

        Returns:
            The same context, with context["metrics"] set to
            {"linear": <MAE>, "rf": <MAE>}.

        Raises:
            ValueError: If no numeric feature column is left after excluding
                the target and ID columns.
        """
        # Logging that model training is beginning
        self.log("Training models...")

        df = context["clean"]

        # Selecting all numeric columns except the target and the identifiers
        excluded = {TARGET_COLUMN, *ID_COLUMNS}
        features = [
            col
            for col in df.select_dtypes(include=[np.number]).columns
            if col not in excluded
        ]

        # Failing early with a clear message instead of an obscure estimator error
        if not features:
            raise ValueError(
                "No numeric feature columns available for modeling after "
                f"excluding target/ID columns: {sorted(map(str, excluded))}"
            )

        # Splitting the data into features and target
        X = df[features]
        y = df[TARGET_COLUMN]

        # Splitting into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Fitting and scoring each model the same way (linear first, then forest)
        models = {
            "linear": LinearRegression(),
            "rf": RandomForestRegressor(random_state=42),
        }
        metrics = {}
        for key, model in models.items():
            model.fit(X_train, y_train)
            metrics[key] = mean_absolute_error(y_test, model.predict(X_test))

        # Storing model performance metrics in context
        context["metrics"] = metrics

        # Logging the performance of each model
        self.log(f"Linear MAE: {metrics['linear']}")
        self.log(f"RF MAE: {metrics['rf']}")

        return context


class DecisionAgent(Agent):
    """Agent that picks a model by comparing the recorded MAE values."""

    def run(self, context):
        """Store a recommendation string under context["decision"]."""
        self.log("Generating rule-based business decisions...")

        scores = context["metrics"]

        # The forest is chosen only when its MAE is strictly lower
        forest_wins = scores["rf"] < scores["linear"]
        context["decision"] = (
            "RandomForest is performing better. Using nonlinear model."
            if forest_wins
            else "Linear Regression is performing similarly. Using simpler model."
        )

        return context


In [None]:
from kaggle_secrets import UserSecretsClient
import google.generativeai as genai

# Loading Google API key securely from Kaggle secrets.
# The key is read from the secret named "GOOGLE_API_KEY" (so it is never
# hardcoded in the notebook) and handed to the google.generativeai client.
# NOTE(review): GeminiInsightAgent, as written in this notebook, builds its
# report from a template and does not call genai — confirm whether this
# configuration step is still required.
user_secrets = UserSecretsClient()
GOOGLE_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

**Adding Gemini Insight Agent**

In [None]:
class GeminiInsightAgent(Agent):
    """Agent that writes a template-based insight report into the context.

    No Gemini request is made here: the report is an f-string filled with the
    metrics and the decision found in the context.
    """

    def run(self, context):
        """Build the simulated report and store it under "gemini_insights"."""
        self.log("Simulating Gemini insights (Kaggle has no internet)...")

        # Missing upstream results fall back to an empty string / empty dict
        decision = context.get("decision", "")
        metrics = context.get("metrics", {})

        # The template is kept verbatim, including its leading indentation,
        # and written straight into the context for later use
        context["gemini_insights"] = f"""
        --- AI Business Insights (Simulated Gemini) ---

        Model says:
        {metrics}

        Recommended Strategy:
        {decision}

        Additional Insights:
        - Sales patterns are indicating that customer activity is being influenced by date, item, and store.
        - Revenue is increasing when focusing on high-sales items.
        - Customer engagement is varying across stores; targeted promotions are helping improve reach.

        ------------------------------------------------
        """

        return context


**Orchestrator**

In [None]:
class Orchestrator:
    """Builds every agent and runs them one after another on a shared context."""

    def __init__(self):
        # One memory and one logger instance are handed to every agent
        self.memory = Memory()
        self.logger = Logger()
        shared = (self.memory, self.logger)

        # Agents, created in the order they are later executed
        self.ingest = DataIngestionAgent("Ingest", *shared)
        self.clean = DataCleaningAgent("Clean", *shared)
        self.eda = EDAAgent("EDA", *shared)
        self.model = ModelingAgent("Model", *shared)
        self.decision = DecisionAgent("Decision", *shared)
        self.gemini = GeminiInsightAgent("Gemini", *shared)

    def run(self):
        """Run the full pipeline and return the final context dictionary.

        Stages run in this order: ingestion, cleaning, EDA, modeling,
        decision, simulated Gemini insights. Each stage receives the context
        returned by the previous one, starting from an empty dict.
        """
        pipeline = (
            self.ingest,
            self.clean,
            self.eda,
            self.model,
            self.decision,
            self.gemini,
        )

        context = {}
        for agent in pipeline:
            context = agent.run(context)

        # Final aggregated context containing all outputs
        return context


**Running the Full Pipeline**

In [None]:
# Build the orchestrator (which creates every agent) and execute all stages;
# `context` ends up holding the outputs of the whole pipeline
orch = Orchestrator()
context = orch.run()

# Show the recommendation written by the DecisionAgent
print("\n=== FINAL BUSINESS DECISION ===", context.get("decision"), sep="\n")

# Show the simulated report written by the GeminiInsightAgent
print("\n=== GEMINI INSIGHTS ===", context.get("gemini_insights"), sep="\n")

