In [2]:
import pandas as pd

# CSV holding the fire records; the relative path is resolved against the
# notebook's working directory.
file_path = 'FireData.csv'

# Parse the file into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

Unnamed: 0,OBJECTID,Shape,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,...,LATITUDE,LONGITUDE,OWNER_DESCR,STATE,COUNTY,FIPS_CODE,FIPS_NAME,DURATION_HOURS,Precipitation_In_Month,Avg_Temp_In_Month
0,1,b'\x00\x01\xad\x10\x00\x00\xc8\xce\n[_@^\xc0\x...,1,FS-1418826,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,511,...,40.036944,-121.005833,USFS,CA,63.0,6063.0,Plumas County,4.5,3.69,45.6
1,2,b'\x00\x01\xad\x10\x00\x00\xc8\xe594\xe2\x19^\...,2,FS-1418827,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,...,38.933056,-120.404444,USFS,CA,61.0,6061.0,Placer County,6.75,0.08,60.2
2,3,b'\x00\x01\xad\x10\x00\x00x{\xac \x13/^\xc0@\x...,3,FS-1418835,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,...,38.984167,-120.735556,STATE OR PRIVATE,CA,17.0,6017.0,El Dorado County,1.05,0.08,60.2
3,4,b'\x00\x01\xad\x10\x00\x00\xc8\x13u\xd7s\xfa]\...,4,FS-1418845,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,...,38.559167,-119.913333,USFS,CA,3.0,6003.0,Alpine County,118.0,0.06,66.8
4,5,b'\x00\x01\xad\x10\x00\x00\xd0\x11y\xf8\xb6\xf...,5,FS-1418847,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,...,38.559167,-119.933056,USFS,CA,3.0,6003.0,Alpine County,116.0,0.06,66.8


In [3]:
# Count how many rows carry each distinct NWCG_GENERAL_CAUSE label,
# most frequent first.
data.loc[:, 'NWCG_GENERAL_CAUSE'].value_counts()


Natural                                       4820
Recreation and ceremony                       1370
Arson/incendiarism                            1193
Missing data/not specified/undetermined        808
Debris and open burning                        679
Equipment and vehicle use                      525
Power generation/transmission/distribution     175
Smoking                                        164
Fireworks                                       77
Misuse of fire by a minor                       62
Railroad operations and maintenance             52
Other causes                                    33
Firearms and explosives use                      3
Name: NWCG_GENERAL_CAUSE, dtype: int64

In [4]:
# Binary target: "Natural" maps to 0; every other known cause is treated as
# human-induced and maps to 1. Undetermined causes map to None and are dropped below.

def map_to_binary_classification(cause):
    """Map an NWCG general cause label to a binary class.

    Parameters
    ----------
    cause : str or null
        A single value taken from the ``NWCG_GENERAL_CAUSE`` column.

    Returns
    -------
    int or None
        ``0`` when the cause is ``"Natural"``; ``None`` when the cause is
        unknown, i.e. the explicit "Missing data/not specified/undetermined"
        label or a null value (``None``/``NaN``/``pd.NA``); ``1`` for every
        other label.
    """
    # A true null carries no cause information. Without this guard it fails
    # both string comparisons below and would be labeled 1 by the fallback.
    if cause is None or pd.isna(cause):
        return None
    if cause == "Natural":
        return 0
    if cause == "Missing data/not specified/undetermined":
        # Undetermined causes cannot be assigned to either class.
        return None
    return 1

# Label every row, discard rows whose cause could not be classified (label is
# null), then store the label as an integer.
# Written as a single chain that rebinds `data` instead of calling
# `dropna(inplace=True)` and assigning columns afterwards: each step returns a
# new frame, so no object is mutated in place.
data = (
    data
    .assign(
        Cause_Classification=lambda frame: frame['NWCG_GENERAL_CAUSE'].apply(
            map_to_binary_classification
        )
    )
    .dropna(subset=['Cause_Classification'])
    .astype({'Cause_Classification': int})
)

# Show the first few label pairs alongside the resulting class balance.
data[['NWCG_GENERAL_CAUSE', 'Cause_Classification']].head(), data['Cause_Classification'].value_counts()


(                           NWCG_GENERAL_CAUSE  Cause_Classification
 0  Power generation/transmission/distribution                     1
 1                                     Natural                     0
 2                     Debris and open burning                     1
 3                                     Natural                     0
 4                                     Natural                     0,
 0    4820
 1    4333
 Name: Cause_Classification, dtype: int64)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Predictor columns: a mix of temporal, geographical and environmental
# attributes that could plausibly relate to how a fire started.
features = [
    'FIRE_YEAR',
    'FIRE_SIZE',
    'LATITUDE',
    'LONGITUDE',
    'Avg_Temp_In_Month',
    'Precipitation_In_Month',
]
X, y = data[features], data['Cause_Classification']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic regression on the training portion, allowing up to 1000 solver iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred = model.predict(X_test)

# Score the held-out predictions.
# Note: with 0/1 labels every wrong prediction has squared error 1 and every
# correct one 0, so the MSE is the misclassification rate (1 - accuracy).
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(accuracy)
print(f"MSE: {mse}")


0.7307482250136538
MSE: 0.2692517749863463


In [8]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


# Confusion matrix of the held-out predictions (rows = actual, columns = predicted).
# `labels=[0, 1]` pins the row/column order to the target encoding
# (0 = Natural, 1 = human-caused), so the hand-written axis names below always
# line up and the matrix stays 2x2 even if a class is absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(
    cm,
    index=['Actual Natural', 'Actual Human'],
    columns=['Predicted Natural', 'Predicted Human'],
)

cm_df


Unnamed: 0,Predicted Natural,Predicted Human
Actual Natural,831,144
Actual Human,349,507


In [9]:
from sklearn.metrics import classification_report

# Precision, recall, F1 and support for the test-set predictions, requested as
# a dict so the figures can be tabulated.
report = classification_report(y_test, y_pred, output_dict=True)

# Flip the frame so each class / summary entry is a row and each metric a column.
report_df = pd.DataFrame(report).T

report_df


Unnamed: 0,precision,recall,f1-score,support
0,0.704237,0.852308,0.77123,975.0
1,0.778802,0.59229,0.67286,856.0
accuracy,0.730748,0.730748,0.730748,0.730748
macro avg,0.74152,0.722299,0.722045,1831.0
weighted avg,0.739097,0.730748,0.725241,1831.0
