In [None]:
# Add imports here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the wildfire dataset and keep a separate view of its text columns.
# df holds every column; df_obj holds only the object-dtype ones.
wildfire_csv = "California-Wildfire-Data.csv"
df = pd.read_csv(wildfire_csv)
df_obj = df.select_dtypes(include="object")

df_obj.head()

In [None]:
# Numeric data
# Take an explicit copy: the column assignments below must modify df_num
# itself, not a possible view of df (avoids SettingWithCopyWarning).
df_num = df.select_dtypes("number").copy()

# Columns in which this notebook treats a value of 0 as "missing".
# Maps the column name to the label used in the printed summary.
zero_as_missing = {
    "* Street Number": "Street Number",
    "Assessed Improved Value (parcel)": "Assessed Improved Value (parcel)",
    "Year Built (parcel)": "Year Built (parcel)",
}

for column, label in zero_as_missing.items():
    df_num[column] = df_num[column].replace(to_replace=0, value=np.nan)
    # The count is computed outside the f-string so the cell also parses on
    # Python versions older than 3.12 (no same-quote nesting in f-strings).
    n_missing = df_num[column].isna().sum()
    print(f"Missing values in {label}: {n_missing}")

# Rows with NaN are intentionally kept here; the feature-selection step
# fills them before fitting.
df_num.head()

# Exploratory Data Analysis

In [None]:
# Exploratory Data Analysis
# For every text column: number of distinct values, how many entries are an
# "unknown" placeholder, and the distinct values (or a sample of them).

# Spellings of the placeholder that are counted as missing.
unknown_markers = ["unknown", "Unknown"]

for col in df_obj.columns:
    unique_vals = df_obj[col].unique()
    n_unique = len(unique_vals)
    print(f"\n{col}:")
    print(f"Number of unique values: {n_unique}")

    # Count both spellings together so a column that contains both reports a
    # single, complete total instead of two partial lines.
    n_unknown = df_obj[col].isin(unknown_markers).sum()
    if n_unknown > 0:
        print(f"Number of missing values: {n_unknown}")

    if n_unique <= 20:
        print(f"Values: {list(unique_vals)}")
    else:
        print(f"Sample values (first 20): {list(unique_vals[:20])}")

In [None]:
# Print basic descriptive statistics for every numeric column.
for col in df_num.columns:
    values = df_num[col]
    print(f"\n{col}:")
    print(f"Mean: {values.mean():.2f}")
    print(f"Median: {values.median():.2f}")
    # mode() returns a Series that may be empty; report its first entry when
    # there is one.
    mode_vals = values.mode()
    mode_text = f"{mode_vals[0]:.2f}" if len(mode_vals) > 0 else "N/A"
    print(f"Mode: {mode_text}")
    print(f"Std Dev: {values.std():.2f}")
    print(f"Min: {values.min():.2f}")
    print(f"Max: {values.max():.2f}")

## Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Dedicated encoder for the target column, kept so that integer predictions
# can be turned back into their original labels later on.
le_target = LabelEncoder()
le_target.fit(df_obj['* Damage'].astype(str))

# Integer-encode every text column (target included), each with its own
# freshly fitted encoder; values are cast to str before encoding.
df_encoded = pd.DataFrame(
    {
        name: LabelEncoder().fit_transform(df_obj[name].astype(str))
        for name in df_obj.columns
    },
    index=df_obj.index,
)

# Feature matrix: encoded text columns (minus the target) next to the numeric
# columns. No feature selection has been applied yet.
target = df_encoded['* Damage']
attr = pd.concat([df_encoded.drop(columns=['* Damage']), df_num], axis=1)

attr.head()

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# mutual_info_classif adds small random noise to continuous features, so its
# scores (and therefore the selected columns) change from run to run unless
# the seed is fixed. The same seed as the train/test split is used.
seeded_mutual_info = partial(mutual_info_classif, random_state=6)
selector = SelectKBest(score_func=seeded_mutual_info, k=15)

# optimal attributes/features
# Missing values are replaced by -1 because the selector cannot handle NaN.
# NOTE(review): the selector is fitted on all rows, i.e. before the split
# below, so the test rows influence which features are chosen.
out_feats = selector.fit_transform(attr.fillna(-1), target)

# Keep the original row index so the features stay aligned with `target`.
# (No dropna is needed: the selector input had every NaN filled already.)
opt_attr = pd.DataFrame(
    out_feats,
    columns=selector.get_feature_names_out(),
    index=attr.index,
)
print("Best features:", selector.get_feature_names_out())

# Split data into training and testing sets 80-20
attr_train, attr_test, target_train, target_test = train_test_split(
    opt_attr, target, test_size=0.2, random_state=6
)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Neighbourhood sizes to compare.
k_values = [1, 3, 5, 7, 10, 25, 50]

# Fitted model with the highest test accuracy seen so far, and that accuracy.
# NOTE(review): k is chosen using test-set accuracy, so the score reported for
# the winning model is optimistic; consider cross-validating on the training
# split instead.
best_knn, best_accuracy = None, 0

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(attr_train, target_train)
    target_pred = knn.predict(attr_test)
    accuracy = accuracy_score(target_test, target_pred)
    print(f'Testing accuracy of model with k = {k}: {accuracy}')
    print('')
    # Strict comparison: on a tie the earlier (smaller) k is kept.
    if accuracy > best_accuracy:
        best_knn, best_accuracy = knn, accuracy

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the best KNN model on the test split and decode the integer classes
# back to their original labels for the report.
target_pred = best_knn.predict(attr_test)
target_test_dec = le_target.inverse_transform(target_test)
target_pred_dec = le_target.inverse_transform(target_pred)

# Kept in `cm` because the plotting cell below reads it.
cm = confusion_matrix(target_test_dec, target_pred_dec)

print('Confusion Matrix', cm, '', sep='\n')
print('Accuracy score', accuracy_score(target_test, target_pred), '', sep='\n')
print('Classification Report')
print(classification_report(target_test_dec, target_pred_dec))



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Build the matrix with an explicit label order and reuse that same order for
# the tick labels. Previously the ticks came from target.unique(), which is in
# order of first appearance, while the matrix rows/columns are in sorted label
# order, so the axes could be mislabeled.
knn_labels = le_target.classes_
knn_cm = confusion_matrix(target_test_dec, target_pred_dec, labels=knn_labels)

fig, ax = plt.subplots(figsize=(12, 10))
# annot=True writes the count in each cell; fmt='g' avoids scientific notation
sns.heatmap(
    knn_cm,
    annot=True,
    fmt='g',
    ax=ax,
    xticklabels=knn_labels,
    yticklabels=knn_labels,
)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

Looking at the accuracy, we can see that KNN does well overall, at about 86%. However, the confusion matrix shows that it does not do as well for some categories: for the "Destroyed" category, the model has trouble telling "No Damage" apart from "Destroyed". It also has some trouble discerning "No Damage" from "Inaccessible". Some of this could be due to the imbalanced number of data points per category in the dataset, which has many more "Inaccessible" and "No Damage" homes than homes in the other categories.

# CART

In [None]:
# Using a label encoder here because using the get_dummies() method takes too much memory and crashes the kernel.
import matplotlib.pyplot as plt
import seaborn as sns

# Fit a CART decision tree on the selected features.
# The seed is fixed because the tree permutes features at every split; without
# it, ties between equally good splits can resolve differently on each run and
# the reported scores are not reproducible. Same seed as the train/test split.
model = DecisionTreeClassifier(random_state=6)
model.fit(attr_train, target_train)
target_pred = model.predict(attr_test)

# Decode predictions and targets
target_test_decoded = le_target.inverse_transform(target_test)
target_pred_decoded = le_target.inverse_transform(target_pred)

# Get class labels (also fixes the row/column order of the matrix below)
labels = le_target.classes_

# Create confusion matrix
cm = confusion_matrix(target_test_decoded, target_pred_decoded, labels=labels)

# Plot
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - CART')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print(classification_report(target_test_decoded, target_pred_decoded))

Using the CART method, our model is 87% accurate. We can also see that most of our data falls into either "No Damage" or "Destroyed"; there is significantly less data for the other categories.

# Naive Bayes

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# MinMaxScaler is no longer used in this cell; the import is kept so that any
# later cell relying on it still runs.
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score

# Coordinate columns fed to the Naive Bayes model. The variable name is kept
# for compatibility; the columns only become categorical after binning below.
categorical_columns=['Latitude', 'Longitude', 'x', 'y']

# CategoricalNB expects every feature to be an integer category code
# (0 .. n_categories-1). The previous MinMaxScaler produced floats in [0, 1],
# which the classifier casts to integers, collapsing nearly every row into a
# single category. Quantile binning turns each coordinate into a small set of
# ordinal codes, which is the input CategoricalNB is designed for.
n_bins = 10
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',
         KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile'),
         categorical_columns) ])

# Create a Categorical Naive Bayes model
cnb = CategoricalNB()

# Create a pipeline with preprocessing and model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', cnb)])

# Train the model
model.fit(attr_train, target_train)

# Make predictions on the test set
target_pred = model.predict(attr_test)

# Evaluate the accuracy
accuracy = accuracy_score(target_test, target_pred)
print(f'Accuracy: {accuracy}')

As the accuracy score above shows, Naive Bayes reaches only 54.73% accuracy, which is quite low compared with the other models. Thus, we conclude that Naive Bayes is unsuitable for our purposes.

# Decision Trees

In [None]:
# Code here

# ANN

In [None]:
# Code here

# HClust

In [None]:
# Code here

# KMeans

In [None]:
# Code here