In [1]:
# Add imports here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Loading and "cleaning" data
df = pd.read_csv("California-Wildfire-Data.csv")
df_obj = df.select_dtypes("object")

df_obj.head()

  df = pd.read_csv("California-Wildfire-Data.csv")


Unnamed: 0,* Damage,* Street Name,"* Street Type (e.g. road, drive, lane, etc.)",* City,State,* CAL FIRE Unit,County,* Incident Name,Incident Number (e.g. CAAEU 123456),Incident Start Date,...,* Vent Screen,* Exterior Siding,* Window Pane,* Deck/Porch On Grade,* Deck/Porch Elevated,* Patio Cover/Carport Attached to Structure,* Fence Attached to Structure,APN (parcel),Site Address (parcel),GLOBALID
0,Destroyed (>50%),unknown,Road,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,"Mesh Screen <= 1/8""""",Wood,Single Pane,No Deck/Porch,No Deck/Porch,No Patio Cover/Carport,No Fence,11616045,,ce752f2c-dea2-4647-b0cd-e4ed01f8619d
1,Destroyed (>50%),unknown,Road,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,"Mesh Screen <= 1/8""""",Wood,Single Pane,Wood,No Deck/Porch,No Patio Cover/Carport,No Fence,11616015,,1ddec00c-ab14-477a-a48d-cd86379d6883
2,No Damage,unknown,,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,others,11616045,,cf41a0c1-8150-4052-96cf-60a97076033c
3,Destroyed (>50%),unknown,,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,Unknown,Wood,Multi Pane,Wood,No Deck/Porch,No Patio Cover/Carport,No Fence,11616061,,d6664da1-7df2-402c-8927-bbd2f845f1de
4,Destroyed (>50%),unknown,,,CA,FKU,Fresno,Creek,CAFKU 013369,09-04-2020 00:00,...,No Vents,Stucco Brick Cement,Multi Pane,No Deck/Porch,No Deck/Porch,No Patio Cover/Carport,No Fence,11616061,,cba02c94-02f9-4a24-a22c-d34ab2b49588


In [3]:
# Numeric data
df_num = df.select_dtypes("number")

df_num["* Street Number"] = df_num["* Street Number"].replace(to_replace=0, value=np.nan)
print(f"Missing values in Street Number: {df_num["* Street Number"].isna().sum()}")

df_num["Assessed Improved Value (parcel)"] = df_num["Assessed Improved Value (parcel)"].replace(to_replace=0, value=np.nan)
print(f"Missing values in Assessed Improved Value (parcel): {df_num["Assessed Improved Value (parcel)"].isna().sum()}")

df_num["Year Built (parcel)"] = df_num["Year Built (parcel)"].replace(to_replace=0, value=np.nan)
print(f"Missing values in Year Built (parcel): {df_num["Year Built (parcel)"].isna().sum()}")

df_num = df_num.dropna()
print(f"Rows remaining after dropping na: {len(df_num.index)}")

df_num.head()

Missing values in Street Number: 2606
Missing values in Assessed Improved Value (parcel): 3208
Missing values in Year Built (parcel): 6731
Rows remaining after dropping na: 47395


Unnamed: 0,_id,* Street Number,Assessed Improved Value (parcel),Year Built (parcel),Latitude,Longitude,x,y
2127,1556,27939.0,132406.0,1952.0,34.416623,-117.864967,-13120668.1,4084882.485
2128,1557,27940.0,258900.0,1957.0,34.416593,-117.86658,-13120847.73,4084878.413
2129,1560,27939.0,83933.0,1974.0,34.416921,-117.869039,-13121121.42,4084922.616
2130,1564,27811.0,100744.0,1974.0,34.415676,-117.864654,-13120633.23,4084754.699
2131,1570,29329.0,173161.0,1955.0,34.436117,-117.936557,-13128637.45,4087513.343


# Exploritory Data Analysis

In [4]:
# Exploratory Data Analysis
for col in df_obj.columns:
    unique_vals = df_obj[col].unique()
    n_unique = len(unique_vals)
    print(f"\n{col}:")
    print(f"Number of unique values: {n_unique}")
    if "unknown" in list(unique_vals):
        print(f"Number of missing values: {0 + df_obj[col].value_counts()["unknown"]}")
    if "Unknown" in list(unique_vals):
        print(f"Number of missing values: {0 + df_obj[col].value_counts()["Unknown"]}")
    if n_unique <= 20:
        print(f"Values: {list(unique_vals)}")
    else:
        print(f"Sample values (first 20): {list(unique_vals[:20])}")


* Damage:
Number of unique values: 6
Values: ['Destroyed (>50%)', 'No Damage', 'Minor (10-25%)', 'Affected (1-9%)', 'Major (26-50%)', 'Inaccessible']

* Street Name:
Number of unique values: 4354
Number of missing values: 63
Sample values (first 20): ['unknown', 'Mammoth', 'Sunset Rock', 'Minaret', 'Meadow North', 'North Meadow', 'Minarets', 'Mammoth Falls', 'Mammoth Pool', '4S81', 'Tamarack', 'Pine Cone Path', 'Beasore', 'North Silvertip', 'Meadow', 'Jose Basin', 'Sugarloaf Road', 'Kinsman Flat', 'Arrowhead', 'Point']

* Street Type (e.g. road, drive, lane, etc.):
Number of unique values: 19
Values: ['Road', nan, 'Lane', 'Trail', 'Way', 'Other', 'Drive', 'Circle', 'Place', 'Court', 'Loop', 'Route', 'Parkway', 'Terrace', 'Street', 'Avenue', 'Boulevard', 'Alley', 'Hwy']

* City:
Number of unique values: 213
Number of missing values: 1
Sample values (first 20): [nan, 'Los Angeles', 'LANCASTER', 'Gorman', 'Llano', 'Malibu', 'Pismo Beach', 'Browns Valley', 'Unincorporated', 'Potrero', 'Ch

In [5]:
for col in df_num.columns:
    print(f"\n{col}:")
    print(f"Mean: {df_num[col].mean():.2f}")
    print(f"Median: {df_num[col].median():.2f}")
    mode_vals = df_num[col].mode()
    if len(mode_vals) > 0:
        print(f"Mode: {mode_vals[0]:.2f}")
    else:
        print(f"Mode: N/A")
    print(f"Std Dev: {df_num[col].std():.2f}")
    print(f"Min: {df_num[col].min():.2f}")
    print(f"Max: {df_num[col].max():.2f}")


_id:
Mean: 40299.21
Median: 35128.00
Mode: 1.00
Std Dev: 28254.95
Min: 1.00
Max: 100230.00

* Street Number:
Mean: 39784.73
Median: 5382.00
Mode: 580.00
Std Dev: 6477053.27
Min: 1.00
Max: 1410065407.00

Assessed Improved Value (parcel):
Mean: 846757.99
Median: 163419.00
Mode: 54139901.00
Std Dev: 7763757.66
Min: 100.00
Max: 393602009.00

Year Built (parcel):
Mean: 1972.28
Median: 1976.00
Mode: 1979.00
Std Dev: 30.75
Min: 1.00
Max: 2022.00

Latitude:
Mean: 38.33
Median: 39.56
Mode: 33.99
Std Dev: 2.09
Min: 32.59
Max: 41.94

Longitude:
Mean: -121.18
Median: -121.60
Mode: -122.64
Std Dev: 1.48
Min: -123.20
Max: -116.62

x:
Mean: -13489443.30
Median: -13536623.49
Mode: -13534734.32
Std Dev: 165277.64
Min: -13714761.62
Max: -12982511.63

y:
Mean: 4630033.20
Median: 4802665.77
Mode: 4027491.15
Std Dev: 291640.67
Min: 3841345.56
Max: 5151260.83


# KNN

In [6]:
# Code here

# CART

In [None]:
# Using a label encoder here because using the get_dummies() method takes too much memory and crashes the kernel.
from sklearn.preprocessing import LabelEncoder

df_encoded = df_obj.copy()
le = LabelEncoder()

for col in df_encoded.columns:
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

attr = df_encoded.drop(['* Damage'], axis=1)
target = df_encoded['* Damage']

attr_train, attr_test, target_train, target_test = train_test_split(attr, target, test_size=0.3, random_state=6)

model = DecisionTreeClassifier()
model.fit(attr_train, target_train)
target_pred = model.predict(attr_test)

print(confusion_matrix(target_test, target_pred))
print(classification_report(target_test, target_pred))

[[ 126  261    0   14   18  145]
 [ 304 8477    9   37   52  322]
 [   4    2   28    0    1   19]
 [  11   29    0    9    6   19]
 [  23   55    0    3   11   31]
 [ 170  411   25   24   41 6038]]
              precision    recall  f1-score   support

           0       0.20      0.22      0.21       564
           1       0.92      0.92      0.92      9201
           2       0.45      0.52      0.48        54
           3       0.10      0.12      0.11        74
           4       0.09      0.09      0.09       123
           5       0.92      0.90      0.91      6709

    accuracy                           0.88     16725
   macro avg       0.45      0.46      0.45     16725
weighted avg       0.88      0.88      0.88     16725



# Naive Bayes

In [8]:
# Code here

# Decision Trees

In [9]:
# Code here

# ANN

In [10]:
# Code here

# HClust

In [11]:
# Code here

# KMeans

In [12]:
# Code here