FOLLOW-UP ANALYSIS of indices [COMPLETE, but not very conclusive]

How can we predict the costs that will follow from the damages? Let's build a model for it :D First, import the data!

In [None]:
import pandas as pd
import numpy as np
import joblib
# Read the wildfire damage records into a DataFrame; the bare `df` on the last
# line makes the notebook render the frame as this cell's output.
df = pd.read_csv(filepath_or_buffer="../DATASETS/California Wildfire Damage.csv")
df

Unnamed: 0,Incident_ID,Date,Location,Area_Burned (Acres),Homes_Destroyed,Businesses_Destroyed,Vehicles_Damaged,Injuries,Fatalities,Estimated_Financial_Loss (Million $),Cause
0,INC1000,2020-11-22,Sonoma County,14048,763,474,235,70,19,2270.57,Lightning
1,INC1001,2021-09-23,Sonoma County,33667,1633,4,263,100,2,1381.14,Lightning
2,INC1002,2022-02-10,Shasta County,26394,915,291,31,50,6,2421.96,Human Activity
3,INC1003,2021-05-17,Sonoma County,20004,1220,128,34,28,0,3964.16,Unknown
4,INC1004,2021-09-22,Sonoma County,40320,794,469,147,0,15,1800.09,Unknown
...,...,...,...,...,...,...,...,...,...,...,...
95,INC1095,2018-01-16,Mendocino County,13112,1468,46,281,89,17,4008.58,Human Activity
96,INC1096,2022-07-07,Shasta County,39209,710,282,189,34,14,376.72,Human Activity
97,INC1097,2016-05-18,Shasta County,11863,702,9,40,100,1,1474.78,Human Activity
98,INC1098,2014-06-24,Napa Valley,1338,1619,262,293,94,3,3308.74,Human Activity


We will need a regression model that takes the damage counts as inputs and outputs the cost. We should also allow the user to leave an input value out, in which case a scikit-learn imputer will simply fill in a value for it.

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Discard every row that has a missing value in any column, then list the
# column names that remain available for modelling.
df = df.dropna(axis=0, how="any")
cols = df.columns.tolist()
print(cols)

['Incident_ID', 'Date', 'Location', 'Area_Burned (Acres)', 'Homes_Destroyed', 'Businesses_Destroyed', 'Vehicles_Damaged', 'Injuries', 'Fatalities', 'Estimated_Financial_Loss (Million $)', 'Cause']


In [33]:
# All candidate input columns, including the two categorical ones.
feature_columns = ['Location', 'Area_Burned (Acres)', 'Homes_Destroyed', 'Businesses_Destroyed', 'Vehicles_Damaged', 'Injuries', 'Fatalities', 'Cause']
# Numeric damage columns that are actually fed to the model.
focus_columns = ['Area_Burned (Acres)', 'Homes_Destroyed', 'Businesses_Destroyed', 'Vehicles_Damaged', 'Injuries', 'Fatalities']
outcome = 'Estimated_Financial_Loss (Million $)'

X = df[focus_columns]
y = df[outcome]

# Split BEFORE fitting any preprocessing step so that the imputer and the scaler
# learn their statistics from the training rows only; fitting them on the full
# matrix would leak information from the held-out rows into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mean imputation: a missing input is replaced by that column's training mean.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Standardise with statistics learned from the training split, then apply the
# same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def train_dynamic_regression(
    df,
    target_feature,
    non_feature_columns=("Incident_ID", "Date", "Location", "Cause"),
    test_size=0.2,
    random_state=42,
    return_scaler=False,
):
    """Train and evaluate a linear regression that predicts ``target_feature``.

    Every column of ``df`` except ``target_feature`` and ``non_feature_columns``
    is used as an input. The rows are split into train and test sets, the inputs
    are standardised with a ``StandardScaler`` fitted on the training split, a
    ``LinearRegression`` is fitted, and the test-set mean squared error and
    R-squared are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the target column, the input columns and the columns
        named in ``non_feature_columns``.
    target_feature : str
        Name of the column to predict.
    non_feature_columns : iterable of str, optional
        Columns to exclude from the inputs in addition to the target. The
        default matches the identifier, date and categorical columns that were
        previously hard-coded.
    test_size : float, optional
        Passed to ``train_test_split``; defaults to 0.2.
    random_state : int, optional
        Seed passed to ``train_test_split``; defaults to 42.
    return_scaler : bool, optional
        When True, also return the fitted scaler. The model is trained on
        scaled inputs, so new rows must go through this same scaler before
        ``model.predict``. Defaults to False, which keeps the original
        single-value return.

    Returns
    -------
    model : LinearRegression
        The fitted model (when ``return_scaler`` is False).
    (model, scaler) : tuple
        The fitted model and the fitted ``StandardScaler`` (when
        ``return_scaler`` is True).

    Raises
    ------
    KeyError
        If ``target_feature`` or any of ``non_feature_columns`` is not a column
        of ``df``.
    """
    X = df.drop(columns=[target_feature, *non_feature_columns])
    y = df[target_feature]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only; the test split is transformed
    # with the training statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Model trained to predict '{target_feature}'")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

    if return_scaler:
        return model, scaler
    return model

In [57]:
# The financial-loss column is the quantity the regression is trained to predict.
target_feature = 'Estimated_Financial_Loss (Million $)'
model = train_dynamic_regression(df=df, target_feature=target_feature)

Model trained to predict 'Estimated_Financial_Loss (Million $)'
Mean Squared Error: 2088188.523564085
R-squared: -0.07344321269291099


Let's try an example of using it. First, without an imputer.

In [65]:
# Compare the model's prediction with the recorded loss for 50 sampled incidents.
for i in range(50):
    # Seeding the sampler with the loop index makes each draw reproducible.
    random_row = df.sample(n=1, random_state=i)
    actual_value = random_row[target_feature].to_numpy()[0]
    features = random_row.drop(
        columns=[target_feature, "Incident_ID", "Date", "Location", "Cause"]
    ).to_numpy()

    # Keep an untouched copy of the raw inputs so they can be shown next to
    # their scaled counterparts.
    features_interrupted = features.copy()
    print(features_interrupted)
    # `scaler` is the notebook-level StandardScaler from the preprocessing cell;
    # the one fitted inside train_dynamic_regression is not returned.
    features = scaler.transform(features)
    print(features)

    predicted_value = model.predict(features)[0]

    print(
        f"Actual Value: {actual_value}",
        f"Predicted Value: {predicted_value}",
        f"Difference: {abs(actual_value - predicted_value)}",
        "---------------------",
        sep="\n",
    )

[[14713   465    56   163    59    10]]
[[-0.8337964  -0.80316786 -1.33851556  0.07040311  0.53517112  0.08327105]]
Actual Value: 1840.53
Predicted Value: 2625.695543632556
Difference: 785.1655436325561
---------------------
[[6620 1054  354   13   16   15]]
[[-1.38903005  0.27911507  0.83876412 -1.66437578 -0.91101007  0.98349864]]
Actual Value: 3685.18
Predicted Value: 2573.20462966468
Difference: 1111.97537033532
---------------------
[[18920   896   144   200    35     6]]
[[-0.5451682  -0.0112087  -0.69556049  0.49831524 -0.27199978 -0.63691102]]
Actual Value: 1177.54
Predicted Value: 2297.7370473170486
Difference: 1120.1970473170486
---------------------
[[2047 1401   75  153   37    9]]
[[-1.70276828  0.91672488 -1.19969572 -0.04524882 -0.20473554 -0.09677447]]
Actual Value: 4508.65
Predicted Value: 2711.043171865596
Difference: 1797.6068281344037
---------------------
[[36784   461   215   195    33     9]]
[[ 0.68042107 -0.81051783 -0.17681265  0.44048927 -0.33926402 -0.096774

Then with an imputer.

In [None]:
# --- Example 1: prediction from a complete feature row ------------------------
# The seed is pinned explicitly. 49 is the value the loop variable `i` holds once
# the preceding 50-iteration loop has finished, so the sampled row is unchanged
# while the cell no longer depends on a variable leaked from another cell.
random_row = df.sample(n=1, random_state=49)
actual_value = random_row[target_feature].values[0]
features = random_row.drop(columns=[target_feature, "Incident_ID", "Date", "Location", "Cause"]).values
features_interrupted = features.copy()
print(features_interrupted)
features = scaler.transform(features)
print(features)

predicted_value = model.predict(features)[0]

print(f"Actual Value: {actual_value}")
print(f"Predicted Value: {predicted_value}")
print(f"Difference: {abs(actual_value - predicted_value)}")
print("---------------------")

# --- Example 2: prediction when two inputs are missing ------------------------
random_row = df.sample(n=1, random_state=40)
actual_value = random_row[target_feature].values[0]
features = random_row.drop(columns=[target_feature, "Incident_ID", "Date", "Location", "Cause"]).values

# Blank out two inputs to simulate a user who did not provide them. Positions
# follow the column order of `focus_columns`.
features_interrupted = list(features.copy()[0])
print(features_interrupted)
features_interrupted[1] = np.nan  # Homes_Destroyed
features_interrupted[4] = np.nan  # Injuries
print(features_interrupted)
print()

features_interrupted = np.array(features_interrupted).reshape(1, -1)

# Fit the imputer BEFORE using it: each missing entry is replaced by the mean of
# the corresponding column of `df`.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df[focus_columns])

# Impute first (on the raw scale), then standardise, then predict.
features = imputer.transform(features_interrupted)
features = scaler.transform(features)
print(features)

predicted_value = model.predict(features)[0]

print(f"Actual Value: {actual_value}")
print(f"Predicted Value: {predicted_value}")
print(f"Difference: {abs(actual_value - predicted_value)}")

# Only the regression model is written to disk here; `scaler` and `imputer` are
# not persisted by this cell even though predictions above pass through both.
joblib.dump(model, 'regression_model.pkl')

[[9866  705  101    5    4   14]]
[[-1.16633285 -0.36216972 -1.00973172 -1.75689732 -1.31459553  0.80345312]]
Actual Value: 2987.13
Predicted Value: 2740.561297584057
Difference: 246.56870241594333
---------------------
[41584, 1609, 12, 212, 42, 7]
[41584, nan, 12, 212, nan, 7]

[[ 1.00973301  0.07311382 -1.6599931   0.63709755 -0.03522965 -0.4568655 ]]
Actual Value: 1314.42
Predicted Value: 2571.835927571749
Difference: 1257.4159275717489




['regression_model.pkl']