In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [33]:
# import data
df = pd.read_csv('/Users/rachanon.cho/Documents/Kaggle Practice/California Wildfires/data/cal_wildfire.csv')
df.info()

  df = pd.read_csv('/Users/rachanon.cho/Documents/Kaggle Practice/California Wildfires/data/cal_wildfire.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100230 entries, 0 to 100229
Data columns (total 47 columns):
 #   Column                                                        Non-Null Count   Dtype  
---  ------                                                        --------------   -----  
 0   _id                                                           100230 non-null  int64  
 1   OBJECTID                                                      100230 non-null  int64  
 2   * Damage                                                      100230 non-null  object 
 3   * Street Number                                               95810 non-null   float64
 4   * Street Name                                                 94752 non-null   object 
 5   * Street Type (e.g. road, drive, lane, etc.)                  93525 non-null   object 
 6   Street Suffix (e.g. apt. 23, blding C)                        44149 non-null   object 
 7   * City                                                  

In [34]:
# Define the target variable
TARGET_COLUMN = '* Damage'

# Remove 'Inaccessible' entries from target variable
df = df[df[TARGET_COLUMN] != 'Inaccessible']

# Merge similar damage categories
df[TARGET_COLUMN] = df[TARGET_COLUMN].replace(
    ['Minor (10-25%)', 'Major (26-50%)', 'Affected (1-9%)'], 'Affected')
df[TARGET_COLUMN] = df[TARGET_COLUMN].replace(['Destroyed (>50%)'], 'Destroyed')


In [35]:
# Encoding target variable (convert labels to numbers)
label_encoder = LabelEncoder()
df[TARGET_COLUMN] = label_encoder.fit_transform(df[TARGET_COLUMN])

# Select relevant features (modify as needed)
FEATURE_COLUMNS = ['# Units in Structure (if multi unit)', 'Assessed Improved Value (parcel)', 'Latitude', 'Longitude']
df = df[FEATURE_COLUMNS + [TARGET_COLUMN]].dropna()

In [36]:
# Split dataset into train and test sets
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [37]:
# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [38]:
# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Dataset prepared. Ready for model training.")

Dataset prepared. Ready for model training.


In [39]:
# Hyper parameter tuning using GridSearhCV
param_grid = {
    'n_estimators ': [100,200,300],
    'max_depth': [10,20,None],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,2,4]
}

In [40]:
# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
grid_search = GridSearchCV(rf_model, param_grid,cv=3,scoring='accuracy',n_jobs=1)
grid_search.fit(X_train,y_train)

# best model after tuning
rf_model = grid_search.best_estimator_
rf_model.fit(X_train, y_train)

In [41]:
# Make predictions
y_pred = rf_model.predict(X_test)

In [42]:
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.8662
Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.25      0.22       215
           1       0.86      0.84      0.85      2030
           2       0.92      0.92      0.92      3630

    accuracy                           0.87      5875
   macro avg       0.66      0.67      0.66      5875
weighted avg       0.87      0.87      0.87      5875

