## Classifier model 
(Linear regression does not seem to work when we only look at the weather, so we try a classifier instead.)

#### Load the dataframes, filter to 1984-2023, and convert the date columns to datetime

In [16]:
# Load the weather and fire tables, then restrict the weather rows to 1984-2023.
import pandas as pd

weather = pd.read_csv(r'weather_pivoted.csv')
fire = pd.read_csv(r'fire data with 83 firestn.csv')

# Parse the date columns so they can be compared and joined as datetimes.
weather['date'] = pd.to_datetime(weather['date'])
fire['ALARM_DATE'] = pd.to_datetime(fire['ALARM_DATE'])

# Keep weather observations dated 1984-01-01 through 2023-12-31 (both ends inclusive).
in_study_period = weather['date'].between('1984-01-01', '2023-12-31')
weather = weather.loc[in_study_period]



  fire['ALARM_DATE'] = pd.to_datetime(fire['ALARM_DATE'])


### Merge the two dataframes and add a column indicating fire occurrence (1/0)

In [17]:
# Left-join the fire records onto the weather rows by calendar date: every
# weather row is kept, and the fire columns are filled only where a fire
# record has an ALARM_DATE equal to the weather row's date.
weather_fire_merged = weather.merge(fire, left_on='date', right_on='ALARM_DATE', how='left')

# Flag the rows that matched a fire record (1 = fire on that date, 0 = none).
# The right-hand join key is used for the flag because it is populated on every
# matched row; a fire attribute such as GIS_ACRES can itself be missing on a
# real fire record, which would mislabel that row as "no fire".
weather_fire_merged['Fire_occur'] = weather_fire_merged['ALARM_DATE'].notna().astype(int)




In [18]:
# Load the merged weather/fire table from disk, rebinding weather_fire_merged.
weather_fire_merged = pd.read_csv(r'weather_fire_merged pivoted.csv')

# Missing-value count per column, computed once and reused for the filter below.
a = weather_fire_merged.isnull().sum()

# Columns with more than this many missing values are dropped as too sparse.
MAX_NULLS_PER_COLUMN = 400000
sparse_columns = a.index[a > MAX_NULLS_PER_COLUMN]
weather_fire_merged = weather_fire_merged.drop(columns=sparse_columns)

# Drop the fire-geometry columns to build the frame used for classification.
weather_fire_class = weather_fire_merged.drop(columns=['GIS_ACRES', 'Shape__Area', 'Shape__Length'])



Note: `pd.merge` on plain columns is too slow here; the next thing to try is setting `id` and `date` as the index before merging.

### Classification model: Random forest classifier

In [19]:
# Class balance check: number of rows for each Fire_occur value.
fire_occur_counts = weather_fire_merged['Fire_occur'].value_counts()
print(fire_occur_counts)



Fire_occur
0    270065
1    225800
Name: count, dtype: int64


## with all the features, with feature importance (finalised)

In [20]:
# Read the merged weather/fire CSV into its own frame for the all-features model.
weather_fire_merged_all = pd.read_csv(
    r'weather_fire_merged pivoted.csv'
)


In [21]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Ensure ALARM_DATE is in datetime format
weather_fire_merged_all['ALARM_DATE'] = pd.to_datetime(weather_fire_merged_all['ALARM_DATE'])

# Calendar features and the per-row reference date come from the weather
# observation date, not from ALARM_DATE. ALARM_DATE is a fire-side column, so
# (assuming this CSV is the left merge of fire onto weather - TODO confirm) it is
# missing on no-fire rows: useless as a row reference there, and a direct leak of
# the target if month/year were ever used as features.
# The 'date' column itself is left unparsed so later cells see it unchanged.
observation_dates = pd.to_datetime(weather_fire_merged_all['date'])
weather_fire_merged_all['month'] = observation_dates.dt.month  # Capture seasonality
weather_fire_merged_all['year'] = observation_dates.dt.year    # Capture trend

# Define Features and Target
# Features are the label-based column range ACMH..WT22 (both ends inclusive).
X = weather_fire_merged_all.loc[:, 'ACMH':'WT22']
# Target as integers: 1 = Fire occurred, 0 = No Fire
y = weather_fire_merged_all['Fire_occur'].astype(int)

# Observation date of every row, split alongside X and y for later reference
dates = observation_dates

# Split the data into train and test (stratified by 'Fire_occur' to keep class balance)
# NOTE(review): rows are split at random, so rows from the same date can land in
# both train and test; confirm whether a time- or group-based split is needed.
X_train, X_test, y_train, y_test, train_dates, test_dates = train_test_split(
    X, y, dates, test_size=0.2, random_state=42, stratify=y
)

# Check the class distribution in the training and testing sets
print("Train class distribution:\n", y_train.value_counts())
print("Test class distribution:\n", y_test.value_counts())

# Define and Train the RandomForestClassifier model
model = RandomForestClassifier(random_state=42, n_estimators=50, class_weight='balanced')
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Evaluate the Classification Model
print("🔥 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Train class distribution:
 Fire_occur
0    216052
1    180640
Name: count, dtype: int64
Test class distribution:
 Fire_occur
0    54013
1    45160
Name: count, dtype: int64
🔥 Accuracy: 0.7777116755568552

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.78      0.79     54013
           1       0.74      0.78      0.76     45160

    accuracy                           0.78     99173
   macro avg       0.78      0.78      0.78     99173
weighted avg       0.78      0.78      0.78     99173


🧩 Confusion Matrix:
 [[41976 12037]
 [10008 35152]]


### Feature importance

In [23]:
# Rank the model's features from most to least important and print each score.
feature_importances = model.feature_importances_
features = X.columns

# argsort is ascending; reversing it gives positions in descending importance.
sorted_idx = np.argsort(feature_importances)[::-1]

print("Feature importances (sorted):")
for name, score in zip(features[sorted_idx], feature_importances[sorted_idx]):
    print(f"{name}: {score}")


Feature importances (sorted):
TMAX: 0.29141924814145226
TMIN: 0.24647869058268823
TAVG: 0.178644456387171
TOBS: 0.1189346988571379
PRCP: 0.04135673249044761
SNWD: 0.01802602677303745
WDMV: 0.01161796265505858
EVAP: 0.010365830985421426
MXPN: 0.00856584331447554
SNOW: 0.007847634525315233
WESD: 0.007561139340960549
PGTM: 0.007430125864413406
MNPN: 0.007220720737543712
AWND: 0.007157029462290783
WDF5: 0.004230749655858599
WDF2: 0.004009159301187329
WSF5: 0.003965164011390282
WSFG: 0.0037357971830394277
WSF2: 0.003661813451163019
WDFG: 0.003241405112120282
RHAV: 0.0024182546125888615
AWBT: 0.002225762355102353
RHMX: 0.0016223254783401398
ASLP: 0.0015075731379648512
RHMN: 0.0014599672910270384
ASTP: 0.0014472217750699074
FMTM: 0.0012638044784920054
ADPT: 0.001123792018748994
MDPR: 0.0010239591375674154
DAPR: 0.000361493342823275
WESF: 4.792322844308201e-05
MDSF: 2.230198366127509e-05
DASF: 2.8658421332982403e-06
MDWM: 1.8340557550621575e-06
DAWM: 6.550768589264064e-07
DAEV: 1.9384073495767

### Getting fire probabilities from the classifier instead of a binary yes/no

In [27]:
# Build a per-row table of predicted fire probability for the test set, tagged
# with the observation date and station id of each row.

# Start from a copy of the test features themselves. Re-selecting a hard-coded
# column range here can silently disagree with the columns the model was fit on.
X_test_df = X_test.copy()

# Get predicted probabilities (one column per class, ordered as model.classes_)
y_probs = model.predict_proba(X_test)

# Add fire probability column: look up which column belongs to class 1
fire_class_col = list(model.classes_).index(1)
X_test_df['Fire_Prob'] = y_probs[:, fire_class_col]

# Restore the date and location columns from the original dataset.
# .values is used so the assignment is positional; X_test_df shares X_test's index.
X_test_df['Date'] = weather_fire_merged_all.loc[X_test.index, 'date'].values
X_test_df['Location'] = weather_fire_merged_all.loc[X_test.index, 'id'].values

# Create a final DataFrame with Date, Location, and Fire Probability
fire_prob_date = X_test_df[['Date', 'Location', 'Fire_Prob']].sort_values(by=['Date'])

# Show the first few rows
print(fire_prob_date.head())

# Extract and print the first 10 fire probabilities (in date order)
fire_probs = fire_prob_date['Fire_Prob'].values
print("\n🔥 Fire occurrence probabilities (first 10 predictions):")
print(fire_probs[:10])


              Date     Location  Fire_Prob
189952  1984-01-01  USC00048135   0.194104
133390  1984-01-02  USC00046144   0.451906
171888  1984-01-02  USC00047085   0.020000
223859  1984-01-02  USC00049053   0.040000
133391  1984-01-03  USC00046144   0.000000

🔥 Fire occurrence probabilities (first 10 predictions):
[0.1941039  0.45190573 0.02       0.04       0.         0.44742164
 0.02       0.44742164 0.02       0.05694334]


### Making a map from coordinates showing whether a fire occurred (yes/no)

In [None]:
fire_coord_occurence = pd.read_csv