In [24]:
# Add imports here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [25]:
# Loading and "cleaning" data
df = pd.read_csv("California-Wildfire-Data.csv")
df_obj = df.select_dtypes("object")

df_obj.head()

  df = pd.read_csv("California-Wildfire-Data.csv")


Unnamed: 0,* Damage,* Street Name,"* Street Type (e.g. road, drive, lane, etc.)",* City,State,* CAL FIRE Unit,County,* Incident Name,Incident Number (e.g. CAAEU 123456),Incident Start Date,...,* Vent Screen,* Exterior Siding,* Window Pane,* Deck/Porch On Grade,* Deck/Porch Elevated,* Patio Cover/Carport Attached to Structure,* Fence Attached to Structure,APN (parcel),Site Address (parcel),GLOBALID
0,Destroyed (>50%),unknown,Road,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,"Mesh Screen <= 1/8""""",Wood,Single Pane,No Deck/Porch,No Deck/Porch,No Patio Cover/Carport,No Fence,11616045,,ce752f2c-dea2-4647-b0cd-e4ed01f8619d
1,Destroyed (>50%),unknown,Road,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,"Mesh Screen <= 1/8""""",Wood,Single Pane,Wood,No Deck/Porch,No Patio Cover/Carport,No Fence,11616015,,1ddec00c-ab14-477a-a48d-cd86379d6883
2,No Damage,unknown,,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,others,11616045,,cf41a0c1-8150-4052-96cf-60a97076033c
3,Destroyed (>50%),unknown,,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,Unknown,Wood,Multi Pane,Wood,No Deck/Porch,No Patio Cover/Carport,No Fence,11616061,,d6664da1-7df2-402c-8927-bbd2f845f1de
4,Destroyed (>50%),unknown,,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,No Vents,Stucco Brick Cement,Multi Pane,No Deck/Porch,No Deck/Porch,No Patio Cover/Carport,No Fence,11616061,,cba02c94-02f9-4a24-a22c-d34ab2b49588


In [26]:
# Numeric data
df_num = df.select_dtypes("number")

df_num["* Street Number"] = df_num["* Street Number"].replace(to_replace=0, value=np.nan)
print(f"Missing values in Street Number: {df_num["* Street Number"].isna().sum()}")

df_num["Assessed Improved Value (parcel)"] = df_num["Assessed Improved Value (parcel)"].replace(to_replace=0, value=np.nan)
print(f"Missing values in Assessed Improved Value (parcel): {df_num["Assessed Improved Value (parcel)"].isna().sum()}")

df_num["Year Built (parcel)"] = df_num["Year Built (parcel)"].replace(to_replace=0, value=np.nan)
print(f"Missing values in Year Built (parcel): {df_num["Year Built (parcel)"].isna().sum()}")

#df_num = df_num.dropna()
#print(f"Rows remaining after dropping na: {len(df_num.index)}")

df_num.head()

Missing values in Street Number: 2606
Missing values in Assessed Improved Value (parcel): 3208
Missing values in Year Built (parcel): 6731


Unnamed: 0,_id,* Street Number,Assessed Improved Value (parcel),Year Built (parcel),Latitude,Longitude,x,y
0,6092,,,,37.111043,-119.423521,-13294165.6,4454596.026
1,6094,,,,37.10664,-119.422092,-13294006.5,4453981.424
2,6165,,,,37.111133,-119.423931,-13294211.18,4454608.627
3,6171,,,,37.10873,-119.41896,-13293657.81,4454273.178
4,6172,,,,37.10887,-119.418843,-13293644.77,4454292.743


# Exploritory Data Analysis

In [27]:
# Exploratory Data Analysis
for col in df_obj.columns:
    unique_vals = df_obj[col].unique()
    n_unique = len(unique_vals)
    print(f"\n{col}:")
    print(f"Number of unique values: {n_unique}")
    if "unknown" in list(unique_vals):
        print(f"Number of missing values: {0 + df_obj[col].value_counts()["unknown"]}")
    if "Unknown" in list(unique_vals):
        print(f"Number of missing values: {0 + df_obj[col].value_counts()["Unknown"]}")
    if n_unique <= 20:
        print(f"Values: {list(unique_vals)}")
    else:
        print(f"Sample values (first 20): {list(unique_vals[:20])}")


* Damage:
Number of unique values: 6
Values: ['Destroyed (>50%)', 'No Damage', 'Minor (10-25%)', 'Affected (1-9%)', 'Major (26-50%)', 'Inaccessible']

* Street Name:
Number of unique values: 4354
Number of missing values: 63
Sample values (first 20): ['unknown', 'Mammoth', 'Sunset Rock', 'Minaret', 'Meadow North', 'North Meadow', 'Minarets', 'Mammoth Falls', 'Mammoth Pool', '4S81', 'Tamarack', 'Pine Cone Path', 'Beasore', 'North Silvertip', 'Meadow', 'Jose Basin', 'Sugarloaf Road', 'Kinsman Flat', 'Arrowhead', 'Point']

* Street Type (e.g. road, drive, lane, etc.):
Number of unique values: 19
Values: ['Road', nan, 'Lane', 'Trail', 'Way', 'Other', 'Drive', 'Circle', 'Place', 'Court', 'Loop', 'Route', 'Parkway', 'Terrace', 'Street', 'Avenue', 'Boulevard', 'Alley', 'Hwy']

* City:
Number of unique values: 213
Number of missing values: 1
Sample values (first 20): [nan, 'Los Angeles', 'LANCASTER', 'Gorman', 'Llano', 'Malibu', 'Pismo Beach', 'Browns Valley', 'Unincorporated', 'Potrero', 'Ch

In [28]:
for col in df_num.columns:
    print(f"\n{col}:")
    print(f"Mean: {df_num[col].mean():.2f}")
    print(f"Median: {df_num[col].median():.2f}")
    mode_vals = df_num[col].mode()
    if len(mode_vals) > 0:
        print(f"Mode: {mode_vals[0]:.2f}")
    else:
        print(f"Mode: N/A")
    print(f"Std Dev: {df_num[col].std():.2f}")
    print(f"Min: {df_num[col].min():.2f}")
    print(f"Max: {df_num[col].max():.2f}")


_id:
Mean: 40070.61
Median: 34504.50
Mode: 1.00
Std Dev: 29739.40
Min: 1.00
Max: 100230.00

* Street Number:
Mean: 37972.76
Median: 5501.00
Mode: 580.00
Std Dev: 6116971.87
Min: 1.00
Max: 1410065407.00

Assessed Improved Value (parcel):
Mean: 826885.65
Median: 159480.00
Mode: 54139901.00
Std Dev: 7729184.52
Min: 100.00
Max: 393602009.00

Year Built (parcel):
Mean: 1971.85
Median: 1976.00
Mode: 1979.00
Std Dev: 30.96
Min: 1.00
Max: 2022.00

Latitude:
Mean: 38.17
Median: 38.69
Mode: 39.75
Std Dev: 2.14
Min: 32.59
Max: 41.94

Longitude:
Mean: -121.03
Median: -121.60
Mode: -122.78
Std Dev: 1.57
Min: -123.45
Max: -116.62

x:
Mean: -13472711.53
Median: -13536060.62
Mode: -13534734.32
Std Dev: 175063.62
Min: -13741873.56
Max: -12982511.63

y:
Mean: 4608113.37
Median: 4678024.00
Mode: 4829244.90
Std Dev: 298730.14
Min: 3841345.56
Max: 5151260.83


## Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df_obj.copy()
le_target = LabelEncoder()

# Fit and save the target encoder
le_target.fit(df_obj['* Damage'].astype(str))

# Encode all columns
for col in df_encoded.columns:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col].astype(str))

# combine numeric and encoded categorical data, not feature selected yet
attr = pd.concat([df_encoded.drop(['* Damage'], axis=1), df_num], axis=1)
target = df_encoded['* Damage']

attr.head()

Unnamed: 0,* Street Name,"* Street Type (e.g. road, drive, lane, etc.)",* City,State,* CAL FIRE Unit,County,* Incident Name,Incident Number (e.g. CAAEU 123456),Incident Start Date,Hazard Type,...,Site Address (parcel),GLOBALID,_id,* Street Number,Assessed Improved Value (parcel),Year Built (parcel),Latitude,Longitude,x,y
0,4351,12,211,0,5,7,38,44,61,0,...,30698,45026,6092,,,,37.111043,-119.423521,-13294165.6,4454596.026
1,4351,12,211,0,5,7,38,44,61,0,...,30698,6566,6094,,,,37.10664,-119.422092,-13294006.5,4453981.424
2,4351,18,211,0,5,7,38,44,61,0,...,30698,45205,6165,,,,37.111133,-119.423931,-13294211.18,4454608.627
3,4351,18,211,0,5,7,38,44,61,0,...,30698,46702,6171,,,,37.10873,-119.41896,-13293657.81,4454273.178
4,4351,18,211,0,5,7,38,44,61,0,...,30698,44421,6172,,,,37.10887,-119.418843,-13293644.77,4454292.743


In [53]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

selector = SelectKBest(score_func=mutual_info_classif, k=15)
# optimal attributes/features
out_feats = selector.fit_transform(attr.fillna(-1), target)
opt_attr = pd.DataFrame(opt_feats, columns=selector.get_feature_names_out()).dropna(axis=1)
print("Best features:", selector.get_feature_names_out())
# Split data into training and testing sets 80-20
attr_train, attr_test, target_train, target_test = train_test_split(opt_attr, target, test_size=0.2, random_state=6)

Best features: ['* Street Name' '* City' '* CAL FIRE Unit' 'County' '* Incident Name'
 'Incident Number (e.g. CAAEU 123456)' 'Incident Start Date'
 '* Exterior Siding' 'APN (parcel)' 'Site Address (parcel)' '_id'
 'Latitude' 'Longitude' 'x' 'y']


# KNN

In [54]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
k_values = [1,3, 5, 7, 10, 25, 50]

# for i in range(1,20):
#     print(i)

best_knn = None
best_accuracy = 0
 
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors = k)
#     knn = KNeighborsClassifier(n_neighbors = k,weights='distance')
    knn.fit(attr_train, target_train)
    target_pred = knn.predict(attr_test)
#     accuracy = round(np.mean(target_test==target_pred ) * 100, 2)
    accuracy = accuracy_score(target_test,target_pred ) 
    print(f'Testing accuracy of model with k = {k}: {accuracy}')
    print('')
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_knn = knn

Testing accuracy of model with k = 1: 0.8604484304932736

Testing accuracy of model with k = 3: 0.8600896860986547

Testing accuracy of model with k = 5: 0.8646636771300449

Testing accuracy of model with k = 7: 0.8625112107623318

Testing accuracy of model with k = 10: 0.8624215246636772

Testing accuracy of model with k = 25: 0.8506726457399103

Testing accuracy of model with k = 50: 0.83847533632287



In [55]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

target_pred = best_knn.predict(attr_test)
cm=confusion_matrix(target_test, target_pred)

print('Confusion Matrix')
print(confusion_matrix(target_test, target_pred))
print()
print('Accuracy score')
print(accuracy_score(target_test, target_pred))
print()
print('Classification Report')
print(classification_report(target_test, target_pred))


Confusion Matrix
[[  23  246    2    2    2  105]
 [  67 5559    3    3    2  468]
 [   1    9   11    0    0   13]
 [   3   32    0    0    0   15]
 [   7   49    0    0    1   26]
 [  27  417    8    2    0 4047]]

Accuracy score
0.8646636771300449

Classification Report
              precision    recall  f1-score   support

           0       0.18      0.06      0.09       380
           1       0.88      0.91      0.90      6102
           2       0.46      0.32      0.38        34
           3       0.00      0.00      0.00        50
           4       0.20      0.01      0.02        83
           5       0.87      0.90      0.88      4501

    accuracy                           0.86     11150
   macro avg       0.43      0.37      0.38     11150
weighted avg       0.84      0.86      0.85     11150



In [61]:
import seaborn as sns
import matplotlib.pyplot as plt     

ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax);  #annot=True to annotate cells, ftm='g' to disable scientific notation

# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix'); 
ax.xaxis.set_ticklabels(selector.get_feature_names_out())
ax.yaxis.set_ticklabels(selector.get_feature_names_out())

ImportError: /usr/lib/libc.so.6: version `GLIBC_ABI_DT_X86_64_PLT' not found (required by /usr/lib/libdl.so.2)

# CART

In [56]:
# Using a label encoder here because using the get_dummies() method takes too much memory and crashes the kernel.
import matplotlib.pyplot as plt
import seaborn as sns


model = DecisionTreeClassifier()
model.fit(attr_train, target_train)
target_pred = model.predict(attr_test)

# Decode predictions and targets
target_test_decoded = le_target.inverse_transform(target_test)
target_pred_decoded = le_target.inverse_transform(target_pred)

# Get class labels
labels = le_target.classes_

# Create confusion matrix
cm = confusion_matrix(target_test_decoded, target_pred_decoded, labels=labels)

# Plot
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - CART')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print(classification_report(target_test_decoded, target_pred_decoded))

ImportError: /usr/lib/libc.so.6: version `GLIBC_ABI_DT_X86_64_PLT' not found (required by /usr/lib/libdl.so.2)

Using the CART method, we see that our model is 88% accurate. We can also see that most of our data either falls into "No Damage" or "Destroyed". There is significantly less data for any other categories.

# Naive Bayes

In [60]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler 
from sklearn.naive_bayes import CategoricalNB

categorical_columns=['Latitude', 'Longitude', 'x', 'y']
# Create preprocessor for categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', MinMaxScaler(), categorical_columns) ])
# Create a Categorical Naive Bayes model
cnb = CategoricalNB()

# Create a pipeline with preprocessing and model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', cnb)])
# Create a Categorical Naive Bayes model

# Train the model
model.fit(attr_train, target_train)

# Make predictions on the test set
target_pred = model.predict(attr_test)

# Evaluate the accuracy
accuracy = accuracy_score(target_test, target_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.5472645739910313


# Decision Trees

In [None]:
# Code here

# ANN

In [None]:
# Code here

# HClust

In [None]:
# Code here

# KMeans

In [None]:
# Code here