In [None]:
# Add imports here
!pip install imbalanced-learn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the raw wildfire CSV, then pull out the text (object-dtype) columns
df = pd.read_csv("California-Wildfire-Data.csv", low_memory=False)
df_obj = df.select_dtypes(include="object")

# Preview the first rows of the text columns
df_obj.head()

In [None]:
# Numeric data
# Work on an explicit copy so the column assignments below modify an
# independent frame rather than a selection derived from `df`.
df_num = df.select_dtypes("number").copy()

# In these columns a value of 0 is replaced with NaN (i.e. treated as missing)
# and the resulting number of missing entries is reported.
# Maps column name -> label used in the printed message.
zero_as_missing = {
    "* Street Number": "Street Number",
    "Assessed Improved Value (parcel)": "Assessed Improved Value (parcel)",
    "Year Built (parcel)": "Year Built (parcel)",
}
for col, label in zero_as_missing.items():
    df_num[col] = df_num[col].replace(to_replace=0, value=np.nan)
    # The count is computed outside the f-string: reusing double quotes inside
    # a double-quoted f-string is a SyntaxError before Python 3.12.
    n_missing = df_num[col].isna().sum()
    print(f"Missing values in {label}: {n_missing}")

#df_num = df_num.dropna()
#print(f"Rows remaining after dropping na: {len(df_num.index)}")

df_num.head()

# Exploratory Data Analysis

In [None]:
# Exploratory Data Analysis of the text (object-dtype) columns:
# for each column print its number of distinct values, how many entries equal
# the placeholder "unknown"/"Unknown", and up to 20 of the distinct values.
for col in df_obj.columns:
    unique_vals = df_obj[col].unique()
    n_unique = len(unique_vals)
    print(f"\n{col}:")
    print(f"Number of unique values: {n_unique}")
    # Both casings of the placeholder are checked; each one that occurs gets
    # its own line. The count is computed outside the f-string because nested
    # double quotes inside an f-string are a SyntaxError before Python 3.12.
    for placeholder in ("unknown", "Unknown"):
        n_placeholder = int((df_obj[col] == placeholder).sum())
        if n_placeholder > 0:
            print(f"Number of missing values: {n_placeholder}")
    if n_unique <= 20:
        print(f"Values: {list(unique_vals)}")
    else:
        print(f"Sample values (first 20): {list(unique_vals[:20])}")

In [None]:
# Print basic summary statistics for every numeric column
for col in df_num.columns:
    series = df_num[col]
    modes = series.mode()
    # mode() returns an empty Series when there is no value to report
    mode_text = f"{modes[0]:.2f}" if len(modes) > 0 else "N/A"

    print(f"\n{col}:")
    print(f"Mean: {series.mean():.2f}")
    print(f"Median: {series.median():.2f}")
    print(f"Mode: {mode_text}")
    print(f"Std Dev: {series.std():.2f}")
    print(f"Min: {series.min():.2f}")
    print(f"Max: {series.max():.2f}")

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Dedicated encoder for the target; it is kept so predictions can be decoded later
le_target = LabelEncoder().fit(df_obj['* Damage'].astype(str))

# Integer-encode every text column, each with its own throwaway encoder
df_encoded = pd.DataFrame(
    {
        name: LabelEncoder().fit_transform(df_obj[name].astype(str))
        for name in df_obj.columns
    },
    index=df_obj.index,
)

# Features = encoded text columns (minus the target) + numeric columns;
# no feature selection has been applied yet
target = df_encoded['* Damage']
attr = pd.concat([df_encoded.drop(columns=['* Damage']), df_num], axis=1)

attr.head()

In [None]:


from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Keep the 15 features with the highest mutual information with the target.
# mutual_info_classif uses random numbers internally, so its random_state is
# pinned; otherwise the selected feature set can change from run to run.
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=6), k=15)

# optimal attributes/features
# NaNs are replaced with the sentinel -1 before scoring, so the selected
# matrix contains no missing values.
out_feats = selector.fit_transform(attr.fillna(-1), target)

# Rebuild a DataFrame that carries the original row index, so it stays aligned
# with `target`.
opt_attr = pd.DataFrame(out_feats, columns=selector.get_feature_names_out(), index=attr.index)
print("Best features:", selector.get_feature_names_out())

# Split data into training and testing sets 80-20
attr_train, attr_test, target_train, target_test = train_test_split(opt_attr, target, test_size=0.2, random_state=6)


# KNN


In [None]:
# KNN Implementation with SMOTE, Scaling and Best Params
import os
# NOTE(review): presumably set to pin loky/joblib to a single CPU — confirm intent.
os.environ['LOKY_MAX_CPU_COUNT'] = '1'

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Apply SMOTE to Training Data Only
print("Applying SMOTE to training data...")
smote = SMOTE(random_state=42)
attr_train_res, target_train_res = smote.fit_resample(attr_train, target_train)

# SMOTE synthesises samples by interpolation, so label-encoded columns can come
# back with fractional codes. Round ONLY the columns that were integer-valued
# before resampling: rounding every column would also truncate any continuous
# feature in the training set while the test set keeps its original values.
attr_train_res = pd.DataFrame(attr_train_res, columns=attr_train.columns)
int_valued_cols = [c for c in attr_train.columns if (attr_train[c] % 1 == 0).all()]
if int_valued_cols:
    attr_train_res[int_valued_cols] = attr_train_res[int_valued_cols].round()

# Scaling (Fit on Resampled Training Data)
scaler = MinMaxScaler()
attr_train_scaled = scaler.fit_transform(attr_train_res)
attr_test_scaled = scaler.transform(attr_test)

# # Grid Search (Commented out as requested)
# print("Starting Grid Search for KNN...")
# param_grid = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}
# grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3, n_jobs=1, verbose=1)
# grid_knn.fit(attr_train_scaled, target_train_res)
# print(f"Best KNN Parameters: {grid_knn.best_params_}")
# target_pred = grid_knn.predict(attr_test_scaled)

# Using Best Parameters directly
print("Training KNN with best parameters (n_neighbors=3, weights='distance')...")
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn.fit(attr_train_scaled, target_train_res)
target_pred = knn.predict(attr_test_scaled)

# Decode the integer class codes back to the original damage labels for the report
target_test_decoded = le_target.inverse_transform(target_test)
target_pred_decoded = le_target.inverse_transform(target_pred)
labels = le_target.classes_

# Evaluation
print("KNN Classification Report (with SMOTE):")
print(classification_report(target_test_decoded, target_pred_decoded))

# Confusion Matrix (rows = actual, columns = predicted)
cm = confusion_matrix(target_test_decoded, target_pred_decoded, labels=labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix - KNN (SMOTE + Best Params)')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.xticks(rotation=45, ha='right')
plt.show()

K-Nearest Neighbors (KNN) is a distance-based classifier that predicts damage by finding the most similar examples in the training set; here, we use SMOTE to balance the classes and `weights='distance'` to prioritize closer neighbors. Our accuracy is 83%, but recall for the minority classes improved.

# CART

In [None]:
# Using a label encoder here because using the get_dummies() method takes too much memory and crashes the kernel.
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


# The selected features in `opt_attr` are already numeric and NaN-free (text
# columns were label-encoded and NaNs filled before selection), so they are
# used as-is. Re-encoding them through strings would sort numeric values
# lexicographically ("10" before "9") and hide their order from the tree.
df_encoded = opt_attr.copy()
le_target = LabelEncoder()

# Fit and save the target encoder (used below to decode predictions)
le_target.fit(df_obj['* Damage'].astype(str))

# Train on the selected features. Previously the split was taken from `attr`,
# the full unselected feature set (which still contains NaN), and the frame
# prepared above was never used.
attr_train, attr_test, target_train, target_test = train_test_split(df_encoded, target, test_size=0.3, random_state=6)

# class_weight='balanced' handles class imbalance; random_state makes the
# fitted tree reproducible across runs.
model = DecisionTreeClassifier(class_weight='balanced', random_state=6)
model.fit(attr_train, target_train)
target_pred = model.predict(attr_test)

# Decode predictions and targets
target_test_decoded = le_target.inverse_transform(target_test)
target_pred_decoded = le_target.inverse_transform(target_pred)

# Get class labels
labels = le_target.classes_

# Create confusion matrix (rows = actual, columns = predicted)
cm = confusion_matrix(target_test_decoded, target_pred_decoded, labels=labels)

# Plot
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - CART (Balanced)')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print(classification_report(target_test_decoded, target_pred_decoded))

Using the CART method, we see that our model is 89% accurate. We can also see that most of our data either falls into "No Damage" or "Destroyed". There is significantly less data for any other categories.

# Naive Bayes


In [None]:
# Naive Bayes Implementation (with best parameters from tuning)
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split  # , GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# 1. Encode the data (similar to CART approach)
# Use LabelEncoder on each column to convert to non-negative integers
df_encoded_nb = opt_attr.copy()

for col in df_encoded_nb.columns:
    df_encoded_nb[col] = LabelEncoder().fit_transform(df_encoded_nb[col].astype(str))

# 2. Split Data (80-20 split)
attr_train_nb, attr_test_nb, target_train_nb, target_test_nb = train_test_split(
    df_encoded_nb, target, test_size=0.2, random_state=6
)

# 3. Compute sample weights for class balancing
sample_weights = compute_sample_weight(class_weight='balanced', y=target_train_nb)

# 4. Calculate min_categories from the full encoded dataset
# (largest code + 1 per column, so codes that only occur in the test split
# are still within the model's category range)
min_categories = [int(df_encoded_nb[col].max() + 1) for col in df_encoded_nb.columns]

# # 5. Hyperparameter Tuning with GridSearchCV (COMMENTED OUT - using best params below)
# print("Performing hyperparameter tuning for CategoricalNB...")
# param_grid = {
#     'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
# }
# 
# nb_base = CategoricalNB(min_categories=min_categories)
# grid_search = GridSearchCV(
#     nb_base, 
#     param_grid, 
#     cv=5, 
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=1
# )
# 
# grid_search.fit(attr_train_nb, target_train_nb, sample_weight=sample_weights)
# 
# print(f"\nBest parameters: {grid_search.best_params_}")
# print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
# 
# nb = grid_search.best_estimator_

# 5. Train model with a fixed smoothing parameter
# NOTE(review): the log message used to claim alpha=1.0 while the model was
# built with alpha=0.01. The value actually used (0.01) is kept and the message
# now reports it — confirm which value the grid search really selected.
nb_alpha = 0.01
print(f"Training CategoricalNB with best parameters (alpha={nb_alpha})...")
nb = CategoricalNB(alpha=nb_alpha, min_categories=min_categories)
nb.fit(attr_train_nb, target_train_nb, sample_weight=sample_weights)

# 6. Predict
target_pred_nb = nb.predict(attr_test_nb)

# 7. Evaluate
# Decode targets for readable report
target_test_decoded = le_target.inverse_transform(target_test_nb)
target_pred_decoded = le_target.inverse_transform(target_pred_nb)

print("\nNaive Bayes Classification Report:")
print(classification_report(target_test_decoded, target_pred_decoded))

# Confusion Matrix (rows = actual, columns = predicted)
cm_nb = confusion_matrix(target_test_decoded, target_pred_decoded, labels=le_target.classes_)

plt.figure(figsize=(10, 8))
sns.heatmap(cm_nb, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le_target.classes_, yticklabels=le_target.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Naive Bayes')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


Naive Bayes is a probabilistic classifier based on Bayes' theorem; we use `CategoricalNB`, which is suitable for our label-encoded data, and apply class-balanced sample weights to improve the detection of rare damage categories. Our accuracy is 73%, but recall for the minority classes improved.

# Decision Trees

In [None]:
# Code here

# ANN

In [None]:
# Code here

# HClust

In [None]:
# Code here

# KMeans

In [None]:
# Code here

# Agentic Model
The Agentic model works using a few external technologies:
1. Anthropic's Claude LLM for generating Python code
2. E2B -  a secure sandbox for running AI generated code.
Basically, we instruct the LLM to do data analysis tasks on our data set by generating Python code. Then, we run that code, collect its output, and send both back to the LLM for further generation/analysis. We do this in a series of steps, starting with EDA, moving on to cleaning, and then to model selection and evaluation. Finally, we use an LLM call to summarize all the code that has been generated and run, in order to determine the best model.

In [None]:
%pip install e2b-code-interpreter
%pip install anthropic
%pip install python-dotenv

In [None]:
# This is the prompt that we will give to our agent.
# It is the first prompt of the agentic pipeline: it asks the model to load the
# CSV (expected at /home/user/ inside the sandbox) and print its head, lists the
# packages the generated code may use, and tells the model to reply with Python
# code only, because the reply is executed as-is.
eda_prompt = f"""You are a Python data analyst. Your job is to analyze the `California-Wildfire-Data.csv`
and generate a model to predict the damage done to houses based on the other features. The output of the Python code should
provide useful insights. Your first task is to do an Exploritory Data Analysis on the data. You can assume that the 
csv is accessible to you at `/home/user/California-Wildfire-Data.csv`. 

**Task**
For your first task, just load it and print the head.

Packages you are allowed to use:
- scikit-learn 
- pandas 
- numpy 
- seaborn 
- matplotlib

**CRITICAL**: Return only python code, your text response will be immediately sent to run in a secure sandbox."""

In [None]:
# Send the first prompt to Claude; the reply holds the generated Python code for the EDA step.
import os

from anthropic import Anthropic
from dotenv import load_dotenv

# Load variables from a local .env file (if present) before reading the API key
load_dotenv()

client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

message = client.messages.create(
    model="claude-haiku-4-5-20251001",
    max_tokens=10000,
    messages=[{"role": "user", "content": eda_prompt}],
)

In [None]:
# Extract the generated Python source from the reply by removing an optional
# markdown code fence. str.lstrip/rstrip strip a *set of characters*, not a
# prefix/suffix, so lstrip("```python") would also eat the leading "p" of an
# unfenced reply such as "print(...)"; removeprefix/removesuffix only remove
# the exact fence text when it is present.
eda_code = (
    message.content[0].text.strip()
    .removeprefix("```python")
    .removeprefix("```")
    .removesuffix("```")
)
print(f"Proposed code: \n {eda_code}")

**Important note**
E2B Sandboxes only run for 5 minutes by default, so the following cells should be run sequentially within 5 minutes: each cell depends on the state left in the sandbox by the code that ran before it.

In [None]:
from e2b_code_interpreter import Sandbox

# Start a sandbox and install the packages the generated code is allowed to use
sbx = Sandbox.create()
sbx.commands.run("pip install scikit-learn pandas numpy seaborn matplotlib")

# Upload the dataset to the path the prompts refer to
with open("California-Wildfire-Data.csv", "rb") as csv_file:
    sbx.files.write("/home/user/California-Wildfire-Data.csv", csv_file)

# Run the generated EDA code inside the sandbox
result = sbx.run_code(eda_code)

In [None]:
# Echo each stdout chunk from the EDA run and keep the concatenated text for the next prompt
eda_stdout_chunks = []
for chunk in result.logs.stdout:
    print(chunk)
    eda_stdout_chunks.append(chunk)
eda_output = "".join(eda_stdout_chunks)

In [None]:
# Second prompt: hand the EDA code and its output back to the agent and ask for data-cleaning code.
cleaning_prompt = f"""You are a Python data analyst. A Jupyter notebook cell has just been run to load in and do an EDA of `California-Wildfire-Data.csv`.
Based on the results of the code, please generate Python code to follow which will clean the data and report on what has been cleaned. Assume that your
code will run in the next Jupyter notebook cell.

**Task**
Generate Python code to clean the data described by the code and output below.

**Previous code block**
{eda_code}

**Output from that code**
{eda_output}

Packages you are allowed to use:
- scikit-learn 
- pandas 
- numpy 
- seaborn 
- matplotlib

**CRITICAL**: Return only python code, your text response will be immediately sent to run in a secure sandbox."""

# Ask Claude for the cleaning code
message = client.messages.create(
    model="claude-haiku-4-5-20251001",
    max_tokens=10000,
    messages=[{"role": "user", "content": cleaning_prompt}],
)

In [None]:
# Extract the generated cleaning code from the reply by removing an optional
# markdown code fence. str.lstrip/rstrip strip a *set of characters*, not a
# prefix/suffix, so lstrip("```python") would also eat leading characters of
# an unfenced reply (e.g. the "p" of "print(...)"); removeprefix/removesuffix
# only remove the exact fence text when it is present.
cleaning_code = (
    message.content[0].text.strip()
    .removeprefix("```python")
    .removeprefix("```")
    .removesuffix("```")
)
print(f"Proposed code: \n {cleaning_code}")

In [None]:
# Run the generated cleaning code in the sandbox, echo its stdout,
# and keep the concatenated text for later prompts
result = sbx.run_code(cleaning_code)
cleaning_stdout_chunks = []
for chunk in result.logs.stdout:
    print(chunk)
    cleaning_stdout_chunks.append(chunk)
cleaning_output = "".join(cleaning_stdout_chunks)

In [None]:
# This is the prompt that we will give to our agent.
# Third prompt: give the agent the EDA and cleaning code plus the output each
# produced, and ask for code that compares CART, KNN and Naive Bayes.
# The "Output from that code" section interpolates `cleaning_output`; it
# previously repeated `cleaning_code`, so the model never saw what the cleaning
# step actually printed.
model_prompt = f"""You are a Python data analyst. EDA and Data Cleaning have just been run on our dataset in `California-Wildfire-Data.csv`.
Your job is to compare models based on their ability to predict Damage. You must generate Python code which should run each model and create
a comprehensive report based on the results, detailing which model should be used and how (which parameters)

**Task**
Based on the EDA and Data Cleaning Results, run each of the models below on the data to find the best at predicting damage based on the other features.

**Models**
- CART
- KNN
- Naive Bayes

**Previous code blocks**
{eda_code}
{cleaning_code}

**Output from that code**
{eda_output}
{cleaning_output}

Packages you are allowed to use:
- scikit-learn 
- pandas 
- numpy 
- seaborn 
- matplotlib

**CRITICAL**: Return only python code, your text response will be immediately sent to run in a secure sandbox."""

# Ask Claude for the model-comparison code
message = client.messages.create(
    max_tokens=10000,
    messages=[
        {
            "role": "user",
            "content": model_prompt
           ,
        }
    ],
    model="claude-haiku-4-5-20251001",
)

In [None]:
# Extract the generated modelling code from the reply by removing an optional
# markdown code fence. str.lstrip/rstrip strip a *set of characters*, not a
# prefix/suffix, so lstrip("```python") would also eat leading characters of
# an unfenced reply (e.g. the "p" of "print(...)"); removeprefix/removesuffix
# only remove the exact fence text when it is present.
model_code = (
    message.content[0].text.strip()
    .removeprefix("```python")
    .removeprefix("```")
    .removesuffix("```")
)
print(f"Proposed code: \n {model_code}")

In [None]:
# Run the generated model-comparison code in the sandbox, echo its stdout,
# and keep the concatenated text for the summary prompt
result = sbx.run_code(model_code)
model_stdout_chunks = []
for chunk in result.logs.stdout:
    print(chunk)
    model_stdout_chunks.append(chunk)
model_output = "".join(model_stdout_chunks)

In [None]:
# Final prompt: give the agent every generated code block together with the
# output each one produced, and ask for a concise markdown report.
# The "Output from that code" section interpolates `cleaning_output`; it
# previously repeated `cleaning_code`, so the cleaning step's printed results
# were missing from the summary input.
summary_prompt = f"""You are a Python data analyst. You are receiving the results of a data analysis on comparing models, working with California
Wildfire data. Based on the code and output, generate a markdown report summarizing the findings. Be concise, and be sure to explain which model 
performed the best, and why.

**Task**
Generate a markdown summary of the Python data analysis below

**Previous code blocks**
{eda_code}
{cleaning_code}
{model_code}

**Output from that code**
{eda_output}
{cleaning_output}
{model_output}

**CRITICAL**: Return only the markdown report."""

# Ask Claude for the summary report
message = client.messages.create(
    max_tokens=10000,
    messages=[
        {
            "role": "user",
            "content": summary_prompt
           ,
        }
    ],
    model="claude-haiku-4-5-20251001",
)

In [None]:
# Print the text of the first content block of the most recent reply
# (the markdown report requested by the summary prompt).
print(message.content[0].text)