In [None]:
from langchain_openai import AzureChatOpenAI
import os

# Azure OpenAI connection settings.
# setdefault() only fills in a value when the variable is not already set, so
# credentials that were exported in the shell / kernel environment are never
# overwritten by the placeholder strings below. Prefer exporting the real
# values before starting the kernel; never commit a real key to the notebook.
os.environ.setdefault("AZURE_OPENAI_API_KEY", "<your_azure_openai_api_key>")
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "<your_azure_openai_endpoint>")
os.environ.setdefault("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")

# No key/endpoint argument is passed here, so the client presumably picks them
# up from the environment variables above — TODO confirm against the
# langchain_openai documentation for the installed version.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o-mini",
    api_version="2024-02-15-preview",
    temperature=0
)

In [17]:

# 0. IMPORTS

import json
import numpy as np
import pandas as pd

from typing import TypedDict

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

from langgraph.graph import StateGraph
from langchain_openai import AzureChatOpenAI


# 1. OPENAI (AZURE) CONFIG

# Chat-model handle that the decision node calls via llm.invoke(...).
# Deployment name and API version are pinned here; temperature is set to 0.
llm = AzureChatOpenAI(
    temperature=0,
    api_version="2024-02-15-preview",
    azure_deployment="gpt-4o-mini",
)



# 2. LOAD DATA (5% SAMPLE)

# Location of the parquet file to load (placeholder value — point it at the real file).
PATH = r"..............."

# Read the whole file, keep a seeded 5% random sample of the rows, and
# renumber the index from 0 so the sampled frame has a clean RangeIndex.
df = (
    pd.read_parquet(PATH)
    .sample(frac=0.05, random_state=42)
    .reset_index(drop=True)
)



# 3. CLEANING + FEATURES

# Drop rows that lack any of the fields the model depends on.
# .copy() makes the result an independent frame, so the column assignments
# below cannot raise pandas' SettingWithCopyWarning or write into a temporary.
df = df.dropna(subset=["passenger_count", "fare_amount", "trip_distance"]).copy()

# Make sure both timestamps are real datetimes before doing arithmetic on them.
df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])

# Trip duration in minutes (dropoff minus pickup).
df["trip_duration_min"] = (
    (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"])
    .dt.total_seconds() / 60
)

# Keep only rows with a positive distance, a duration above one minute and a
# fare above 2.5; everything else is discarded before modelling.
valid_trip_mask = (
    (df["trip_distance"] > 0) &
    (df["trip_duration_min"] > 1) &
    (df["fare_amount"] > 2.5)
)
# Again take an explicit copy: the filtered frame receives new columns below.
df = df.loc[valid_trip_mask].copy()

# Calendar features derived from the pickup timestamp.
df["pickup_hour"] = df["tpep_pickup_datetime"].dt.hour
df["pickup_dayofweek"] = df["tpep_pickup_datetime"].dt.dayofweek


# Model inputs, in the column order fed to the estimators.
FEATURES = [
    "trip_distance",
    "trip_duration_min",
    "passenger_count",
    "pickup_hour",
    "pickup_dayofweek"
]

X = df[FEATURES]
y = df["fare_amount"]  # regression target



# 4. SPLITS

# Both splits hold out 20% and use the same seed.
_split_options = dict(test_size=0.2, random_state=42)

# First cut: hold out the final test set from everything else.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, **_split_options)

# Second cut: carve a validation set out of the remaining training data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, **_split_options
)



# 5. LANGGRAPH STATE

class State(TypedDict, total=False):
    """Shared state dictionary passed from node to node in the graph.

    Declared with ``total=False`` because the run is started with an empty
    dict and every key is added by the node that computes it, so no key is
    guaranteed to exist before that node has executed.
    """
    # Validation RMSE of the linear-regression pipeline (written by train_lr).
    lr_rmse: float
    # Validation RMSE of the MLP pipeline (written by train_nn).
    nn_rmse: float
    # Name of the model chosen by the LLM (written by agent_decide).
    best_model: str
    # The LLM's explanation for its choice (written by agent_decide).
    decision_reason: str
    # RMSE of the chosen model on the held-out test set (written by final_test).
    test_rmse: float



# 6. NODES

def train_lr(state: State):
    """Fit the linear-regression baseline and record its validation RMSE.

    Builds a StandardScaler -> LinearRegression pipeline, fits it on the
    module-level ``X_train`` / ``y_train`` and scores it on ``X_val`` / ``y_val``.

    Args:
        state: Graph state; it is updated in place.

    Returns:
        The same ``state`` object with ``"lr_rmse"`` set to the validation RMSE.
    """
    steps = [
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    val_mse = mean_squared_error(y_val, pipeline.predict(X_val))
    # Cast to a plain Python float rather than storing a numpy scalar.
    state["lr_rmse"] = float(np.sqrt(val_mse))
    return state


def train_nn(state: State):
    """Fit the neural-network candidate and record its validation RMSE.

    Builds a StandardScaler -> MLPRegressor pipeline (hidden layers 64 and 32,
    up to 300 iterations, seed 42), fits it on the module-level ``X_train`` /
    ``y_train`` and scores it on ``X_val`` / ``y_val``.

    Args:
        state: Graph state; it is updated in place.

    Returns:
        The same ``state`` object with ``"nn_rmse"`` set to the validation RMSE.
    """
    regressor = MLPRegressor(
        hidden_layer_sizes=(64, 32),
        max_iter=300,
        random_state=42,
    )
    pipeline = Pipeline([("scaler", StandardScaler()), ("model", regressor)])
    pipeline.fit(X_train, y_train)

    val_mse = mean_squared_error(y_val, pipeline.predict(X_val))
    # Cast to a plain Python float rather than storing a numpy scalar.
    state["nn_rmse"] = float(np.sqrt(val_mse))
    return state


def _parse_model_decision(raw_text):
    """Extract and validate the model choice from the LLM's raw reply.

    The reply is expected to contain one JSON object with ``best_model`` and
    ``reason``. Anything around the outermost ``{...}`` (for example a Markdown
    code fence or a sentence of prose) is ignored, and the model name is
    matched case-insensitively with spaces/punctuation removed.

    Args:
        raw_text: Reply text from the LLM (assumed to be a string).

    Returns:
        ``(best_model, reason)`` where ``best_model`` is exactly
        ``"LinearRegression"`` or ``"NeuralNetwork"``.

    Raises:
        json.JSONDecodeError: If no parseable JSON is found.
        ValueError: If the JSON is not an object or names an unknown model.
    """
    text = str(raw_text).strip()

    # Keep only the outermost {...} so code fences or stray prose do not break parsing.
    first_brace, last_brace = text.find("{"), text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        text = text[first_brace:last_brace + 1]

    decision = json.loads(text)
    if not isinstance(decision, dict):
        raise ValueError(f"Agent reply is not a JSON object: {decision!r}")

    canonical_names = {
        "linearregression": "LinearRegression",
        "neuralnetwork": "NeuralNetwork",
    }
    raw_choice = decision.get("best_model", "")
    choice_key = "".join(ch for ch in str(raw_choice).lower() if ch.isalnum())
    if choice_key not in canonical_names:
        # Fail loudly instead of letting an unexpected label pick a model by accident.
        raise ValueError(f"Agent returned an unrecognized best_model: {raw_choice!r}")

    return canonical_names[choice_key], str(decision.get("reason", ""))


def agent_decide(state: State):
    """Ask the LLM which model to use, based on the two validation RMSEs.

    Sends a prompt containing ``state["lr_rmse"]`` and ``state["nn_rmse"]`` to
    the module-level ``llm`` and parses the JSON reply.

    Args:
        state: Graph state holding ``"lr_rmse"`` and ``"nn_rmse"``; it is
            updated in place.

    Returns:
        The same ``state`` object with ``"best_model"`` (always
        ``"LinearRegression"`` or ``"NeuralNetwork"``) and ``"decision_reason"`` set.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON.
        ValueError: If the reply does not name one of the two models.
    """
    prompt = f"""
You are an ML decision agent.

Validation RMSE results:
- Linear Regression: {state["lr_rmse"]}
- Neural Network: {state["nn_rmse"]}

Decide which model should be used for final training and testing.

Return ONLY valid JSON:
{{
  "best_model": "LinearRegression or NeuralNetwork",
  "reason": "short technical explanation"
}}
"""

    response = llm.invoke(prompt)
    # NOTE(review): assumes response.content is a plain string — confirm for the client in use.
    best_model, reason = _parse_model_decision(response.content)

    state["best_model"] = best_model
    state["decision_reason"] = reason
    return state


def final_test(state: State):
    """Retrain the chosen model on train+validation data and score it on the test set.

    The model named in ``state["best_model"]`` is rebuilt with the same
    pipeline settings used during validation, fitted on the module-level
    ``X_train_full`` / ``y_train_full`` and evaluated on ``X_test`` / ``y_test``.

    Args:
        state: Graph state holding ``"best_model"``; it is updated in place.

    Returns:
        The same ``state`` object with ``"test_rmse"`` set to the test RMSE.

    Raises:
        ValueError: If ``state["best_model"]`` names neither supported model.
    """
    requested = state["best_model"]
    # Match on a normalized key so variants such as "Linear Regression" or
    # "neural-network" select the intended model instead of falling through.
    choice_key = "".join(ch for ch in str(requested).lower() if ch.isalnum())

    if choice_key == "linearregression":
        estimator = LinearRegression()
    elif choice_key == "neuralnetwork":
        estimator = MLPRegressor(
            hidden_layer_sizes=(64, 32),
            max_iter=300,
            random_state=42
        )
    else:
        # Refuse to guess: evaluating the wrong model would report a misleading score.
        raise ValueError(
            f"Unknown best_model {requested!r}; "
            "expected 'LinearRegression' or 'NeuralNetwork'"
        )

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("model", estimator)
    ])

    model.fit(X_train_full, y_train_full)
    preds = model.predict(X_test)

    state["test_rmse"] = float(np.sqrt(mean_squared_error(y_test, preds)))
    return state



# 7. LANGGRAPH FLOW

# Linear pipeline: train_lr -> train_nn -> decide -> final_test.
graph = StateGraph(State)

# Register each stage under the name used when wiring the edges below.
graph.add_node("train_lr", train_lr)
graph.add_node("train_nn", train_nn)
graph.add_node("decide", agent_decide)
graph.add_node("final_test", final_test)

graph.set_entry_point("train_lr")

graph.add_edge("train_lr", "train_nn")
graph.add_edge("train_nn", "decide")
graph.add_edge("decide", "final_test")

# Declare the last node as the finish point explicitly, so termination does
# not depend on how the installed LangGraph version treats a node that has no
# outgoing edge.
graph.set_finish_point("final_test")

app = graph.compile()



# 8. RUN

# Execute the compiled graph starting from an empty state; every key is
# filled in by the nodes as they run.
result = app.invoke({})

# (label, state key) pairs, printed in pipeline order.
_report_rows = (
    ("LR RMSE (val):", "lr_rmse"),
    ("NN RMSE (val):", "nn_rmse"),
    ("AGENT DECISION:", "best_model"),
    ("REASON:", "decision_reason"),
    ("TEST RMSE:", "test_rmse"),
)
for _label, _key in _report_rows:
    print(_label, result[_key])


LR RMSE (val): 4.01038598204457
NN RMSE (val): 3.24534490588303
AGENT DECISION: NeuralNetwork
REASON: The Neural Network has a lower RMSE (3.245) compared to the Linear Regression (4.010), indicating better predictive performance on the validation set.
TEST RMSE: 3.37103102769881
