# Climate Risk and Disaster Management
**Author:** Varshita  
**Course:** B Tech  
**Date:** September 2025

**Notebook:** Step-by-step pipeline to load the data, perform exploratory data analysis (EDA), preprocess features, train regression/classification models, evaluate and visualize the results, and save artifacts.


## Problem Statement
Climate change has significantly increased the frequency and severity of natural disasters. A lack of accurate and scalable predictive systems leads to delayed response, poor preparedness, and higher economic and human losses. This project builds an AI-based solution to analyze climate and disaster-related data, predict a risk score (or risk level), and produce visualizations to help decision-makers.


In [10]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix

import joblib


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [4]:
# Location of the Climate Risk Index CSV.
# Set the CLIMATE_RISK_CSV environment variable to point at your own copy of
# the file; the original author's local path is kept only as the fallback so
# existing setups keep working unchanged.
file_path = os.environ.get(
    "CLIMATE_RISK_CSV",
    r"C:\Users\hp\Downloads\climate-risk-index-1.csv",
)
df = pd.read_csv(file_path)

# Normalise column names to snake_case: trim whitespace, lower-case, collapse
# every run of non-alphanumeric characters into a single underscore, and strip
# leading/trailing underscores. Later cells rely on lower-case names when
# searching for columns by substring.
df.columns = (df.columns.str.strip()
                           .str.lower()
                           .str.replace(r'[^0-9a-zA-Z]+', '_', regex=True)
                           .str.strip('_'))

print("Loaded dataset with shape:", df.shape)
df.head()


Loaded dataset with shape: (182, 17)


Unnamed: 0,index,cartodb_id,the_geom,the_geom_webmercator,country,cri_rank,cri_score,fatalities_per_100k_rank,fatalities_per_100k_total,fatalities_rank,fatalities_total,losses_per_gdp_rank,losses_per_gdp_total,losses_usdm_ppp_rank,losses_usdm_ppp_total,rw_country_code,rw_country_name
0,0,1,,,Saudi Arabia,79,72.5,18,0.45,18,140,131,0.0001,119,1.229,SAU,Saudi Arabia
1,1,2,,,Romania,61,61.5,112,0.01,102,1,16,0.6746,11,2797.884,ROU,Romania
2,2,3,,,Spain,69,66.33,74,0.05,47,22,86,0.0394,31,637.07,ESP,Spain
3,3,4,,,Slovenia,135,124.5,114,0.0,114,0,135,,135,0.0,SVN,Slovenia
4,4,5,,,South Sudan,133,117.33,114,0.0,114,0,120,0.0021,122,0.508,SSD,South Sudan


In [5]:
# Structural overview of the loaded frame: first rows, per-column dtypes,
# and the number of null entries in each column.
preview_rows = df.head(5)
display(preview_rows)

column_types = df.dtypes
print("\nData types:\n", column_types)

null_counts = df.isnull().sum()
print("\nMissing values per column:\n", null_counts)


Unnamed: 0,index,cartodb_id,the_geom,the_geom_webmercator,country,cri_rank,cri_score,fatalities_per_100k_rank,fatalities_per_100k_total,fatalities_rank,fatalities_total,losses_per_gdp_rank,losses_per_gdp_total,losses_usdm_ppp_rank,losses_usdm_ppp_total,rw_country_code,rw_country_name
0,0,1,,,Saudi Arabia,79,72.5,18,0.45,18,140,131,0.0001,119,1.229,SAU,Saudi Arabia
1,1,2,,,Romania,61,61.5,112,0.01,102,1,16,0.6746,11,2797.884,ROU,Romania
2,2,3,,,Spain,69,66.33,74,0.05,47,22,86,0.0394,31,637.07,ESP,Spain
3,3,4,,,Slovenia,135,124.5,114,0.0,114,0,135,,135,0.0,SVN,Slovenia
4,4,5,,,South Sudan,133,117.33,114,0.0,114,0,120,0.0021,122,0.508,SSD,South Sudan



Data types:
 index                          int64
cartodb_id                     int64
the_geom                     float64
the_geom_webmercator         float64
country                       object
cri_rank                       int64
cri_score                    float64
fatalities_per_100k_rank       int64
fatalities_per_100k_total    float64
fatalities_rank                int64
fatalities_total               int64
losses_per_gdp_rank            int64
losses_per_gdp_total         float64
losses_usdm_ppp_rank           int64
losses_usdm_ppp_total        float64
rw_country_code               object
rw_country_name               object
dtype: object

Missing values per column:
 index                          0
cartodb_id                     0
the_geom                     182
the_geom_webmercator         182
country                        0
cri_rank                       0
cri_score                      0
fatalities_per_100k_rank       0
fatalities_per_100k_total      0
fatalities_rank  

In [6]:
# Summary statistics for every column (numeric and non-numeric), one row per column
display(df.describe(include='all').T)

# Any column whose name mentions "risk" is a candidate risk measure
risk_cols = [name for name in df.columns if 'risk' in name]
print("Columns with 'risk' in name:", risk_cols)

# Histogram of 'risk_score' when it exists, otherwise of the first "risk" column.
# Axis labels are only set for the exact 'risk_score' column.
has_risk_score = 'risk_score' in df.columns
if not has_risk_score and len(risk_cols) == 0:
    print("No risk_score-like column found; continue with your numeric features.")
else:
    plot_col = 'risk_score' if has_risk_score else risk_cols[0]
    plt.figure(figsize=(6,4))
    df[plot_col].hist(bins=30)
    plt.title(f'{plot_col} distribution')
    if has_risk_score:
        plt.xlabel('risk_score')
        plt.ylabel('count')
    plt.tight_layout()
    plt.show()


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
index,182.0,,,,90.5,52.683014,0.0,45.25,90.5,135.75,181.0
cartodb_id,182.0,,,,91.5,52.683014,1.0,46.25,91.5,136.75,182.0
the_geom,0.0,,,,,,,,,,
the_geom_webmercator,0.0,,,,,,,,,,
country,182.0,182.0,Saudi Arabia,1.0,,,,,,,
cri_rank,182.0,,,,85.230769,44.708529,1.0,46.25,91.0,135.0,135.0
cri_score,182.0,,,,81.791923,34.582412,12.17,52.8725,77.5,124.5,124.5
fatalities_per_100k_rank,182.0,,,,78.60989,37.858511,1.0,46.25,91.5,114.0,114.0
fatalities_per_100k_total,182.0,,,,0.51967,3.393934,0.0,0.0,0.02,0.12,43.66
fatalities_rank,182.0,,,,77.626374,37.500688,1.0,46.25,90.0,114.0,114.0


Columns with 'risk' in name: []
No risk_score-like column found; continue with your numeric features.


In [7]:
# Derive a three-way categorical 'risk_level' from a continuous 'risk_score',
# but only when the score is present and no level column exists yet.
score_present = 'risk_score' in df.columns
level_present = 'risk_level' in df.columns

if level_present or not score_present:
    print("No change: either 'risk_score' missing or 'risk_level' already exists.")
else:
    # Bin edges: (-inf, 33] -> Low, (33, 66] -> Medium, (66, inf) -> High
    df['risk_level'] = pd.cut(
        df['risk_score'],
        bins=[-np.inf,33,66,np.inf],
        labels=['Low','Medium','High'],
    )
    print("Created 'risk_level' from 'risk_score' with bins Low/Medium/High.")

# Class balance, including missing labels
if 'risk_level' in df.columns:
    print("\nRisk level distribution:")
    level_counts = df['risk_level'].value_counts(dropna=False)
    print(level_counts)


No change: either 'risk_score' missing or 'risk_level' already exists.


In [8]:
# Helper to find first matching column given candidate substrings
def find_col(df, candidate_list):
    """Return the first column of ``df`` whose name contains a candidate substring.

    Candidates are tried in the order given; for each candidate the columns
    are scanned left to right, so an earlier candidate always wins over a
    later one.

    Matching is done in two passes:

    1. exact-case substring match (the original behaviour);
    2. case-insensitive substring match, used only when pass 1 finds nothing,
       so frames whose headers were not lower-cased still resolve.

    Column labels are compared via ``str(label)``, so non-string labels
    (e.g. integer headers) are skipped safely instead of raising ``TypeError``.

    Parameters
    ----------
    df : object exposing a ``columns`` iterable (typically a pandas DataFrame)
    candidate_list : iterable of str
        Substrings to look for, in priority order.

    Returns
    -------
    The original column label of the first match, or ``None`` if nothing matches.
    """
    labels = list(df.columns)
    names = [str(label) for label in labels]
    candidates = [str(cand) for cand in candidate_list]

    # Pass 1: exact-case match keeps results identical for already-clean headers.
    for cand in candidates:
        for label, name in zip(labels, names):
            if cand in name:
                return label

    # Pass 2: fall back to case-insensitive matching.
    lowered_names = [name.lower() for name in names]
    for cand in candidates:
        cand_lower = cand.lower()
        for label, name in zip(labels, lowered_names):
            if cand_lower in name:
                return label

    return None

# Candidate substrings to search for (will adapt to your file).
# Within each list the earlier substring has priority.
col_mag = find_col(df, ['magnitude','mag','intensity'])
col_fatal = find_col(df, ['fatalitie','death','fatal'])
col_econ = find_col(df, ['economic','economic_loss','loss'])
col_pop = find_col(df, ['population','population_density','pop_density'])
col_prec = find_col(df, ['precip','rain','precipitation'])
col_temp = find_col(df, ['temp','temperature','anomaly'])
col_lat = find_col(df, ['lat','latitude'])
col_lon = find_col(df, ['lon','longitude'])
col_date = find_col(df, ['date','year','timestamp'])

# Build the feature list from the detected columns.
# - Substring matching can make two detectors land on the same column
#   (e.g. 'lat' is a substring of 'population'), so duplicates are removed
#   while keeping first-seen order.
# - Only numeric columns are kept, because the downstream median imputation
#   cannot handle non-numeric data.
numeric_cols = set(df.select_dtypes(include='number').columns)
features = []
for detected in [col_mag, col_fatal, col_econ, col_pop, col_prec, col_temp, col_lat, col_lon]:
    if detected is None or detected in features:
        continue
    if detected not in numeric_cols:
        print(f"Skipping non-numeric detected column: {detected}")
        continue
    features.append(detected)

print("Auto-detected feature columns:", features)
if col_date:
    print("Detected date column:", col_date)


Auto-detected feature columns: ['fatalities_per_100k_rank', 'losses_per_gdp_rank']


In [12]:
# Select target columns (prefer continuous risk_score).
# Either target is None when the corresponding column is absent from df
# at the time this cell runs.
target_reg = 'risk_score' if 'risk_score' in df.columns else None
target_clf = 'risk_level' if 'risk_level' in df.columns else None

# Safety check: if no features detected, ask user to manually specify
if len(features)==0:
    raise ValueError("No numeric feature columns auto-detected. Edit the `features` list manually using actual column names from df.columns.")

# Subset X and targets
X = df[features].copy()
y_reg = df[target_reg] if target_reg else None
y_clf = df[target_clf] if target_clf else None

# A median SimpleImputer discards columns that contain no observed values,
# which would make the imputed array narrower than X and break the DataFrame
# reconstruction below. Drop such columns explicitly so the shapes agree.
empty_cols = X.columns[X.isnull().all()].tolist()
if empty_cols:
    print("Dropping feature columns with no observed values:", empty_cols)
    X = X.drop(columns=empty_cols)
if X.shape[1] == 0:
    raise ValueError("All auto-detected feature columns are empty. Edit the `features` list manually using actual column names from df.columns.")

# Impute numeric columns using median
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
print("Imputed missing numeric values with median. Any NaNs left:", X_imputed.isnull().sum().sum())

# Optionally scale later when training
X_imputed.head()


Imputed missing numeric values with median. Any NaNs left: 0


Unnamed: 0,fatalities_per_100k_rank,losses_per_gdp_rank
0,18.0,131.0
1,112.0,16.0
2,74.0,86.0
3,114.0,135.0
4,114.0,120.0


In [13]:
# Shared hold-out settings so every model is evaluated on the same kind of split
RANDOM_STATE = 42
TEST_SIZE = 0.2

def make_split_and_scale(X_df, y_series):
    """Split features/target into train and test parts and standardise the features.

    The split uses the module-level ``TEST_SIZE`` and ``RANDOM_STATE``.
    The ``StandardScaler`` is fitted on the training features only and then
    applied to both parts.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_train_s, X_test_s, y_train, y_test, scaler)``
        where the ``*_s`` entries are the scaled versions of the corresponding
        unscaled feature sets and ``scaler`` is the fitted ``StandardScaler``.
    """
    split_parts = train_test_split(
        X_df,
        y_series,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )
    X_train, X_test, y_train, y_test = split_parts

    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)  # fit on training data only
    X_test_s = scaler.transform(X_test)

    return X_train, X_test, X_train_s, X_test_s, y_train, y_test, scaler


In [14]:
# Run regression if target available
if y_reg is not None:
    # savefig / joblib.dump do not create missing directories, so make sure
    # the output folders exist before anything is written.
    os.makedirs("plots", exist_ok=True)
    os.makedirs("models", exist_ok=True)

    X_train, X_test, X_train_s, X_test_s, y_train, y_test, scaler_reg = make_split_and_scale(X_imputed, y_reg)
    rfr = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
    rfr.fit(X_train_s, y_train)
    y_pred = rfr.predict(X_test_s)

    # Metrics.
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`: that
    # keyword was deprecated and later removed from mean_squared_error, so the
    # explicit square root works on both old and new scikit-learn releases.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2 = r2_score(y_test, y_pred)
    print("Regression results → MAE: {:.3f}, RMSE: {:.3f}, R2: {:.3f}".format(mae, rmse, r2))

    # Feature importances, largest first
    fi = pd.Series(rfr.feature_importances_, index=X_imputed.columns).sort_values(ascending=False)
    print("\nFeature importances:\n", fi)

    # Plot actual vs predicted; the dashed red diagonal marks perfect predictions
    plt.figure(figsize=(6,5))
    plt.scatter(y_test, y_pred, alpha=0.6)
    mn = min(y_test.min(), y_pred.min()); mx = max(y_test.max(), y_pred.max())
    plt.plot([mn,mx],[mn,mx], 'r--', lw=1)
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Actual vs Predicted (Regression)")
    plt.tight_layout()
    plt.savefig("plots/actual_vs_pred_regression.png", dpi=150)
    plt.show()

    # Residuals histogram
    plt.figure(figsize=(6,4))
    (y_test - y_pred).hist(bins=30)
    plt.title("Regression residuals (y_test - y_pred)")
    plt.tight_layout()
    plt.savefig("plots/residuals_regression.png", dpi=150)
    plt.show()

    # Save model & scaler (the scaler is needed to transform new inputs the same way)
    joblib.dump(rfr, "models/rfr_risk_score.joblib")
    joblib.dump(scaler_reg, "models/scaler_reg.joblib")
    print("Saved RandomForestRegressor and scaler in /models.")
else:
    print("No continuous 'risk_score' target found — skipping regression.")


No continuous 'risk_score' target found — skipping regression.


In [15]:
# Run classification if categorical risk_level exists
if y_clf is not None:
    # savefig / joblib.dump do not create missing directories, so make sure
    # the output folders exist before anything is written.
    os.makedirs("plots", exist_ok=True)
    os.makedirs("models", exist_ok=True)

    # Remove possible NaN labels
    mask = y_clf.notnull() & X_imputed.index.isin(y_clf.index)
    Xc = X_imputed.loc[mask]
    yc = y_clf.loc[mask]

    # Encode labels and split
    le = LabelEncoder()
    y_enc = le.fit_transform(yc.astype(str))
    X_train, X_test, X_train_s, X_test_s, y_train, y_test, scaler_clf = make_split_and_scale(Xc, y_enc)

    clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
    clf.fit(X_train_s, y_train)
    y_pred = clf.predict(X_test_s)

    # Encoded ids of every known class. Passing them explicitly keeps the
    # report and the confusion matrix aligned with le.classes_ even when a
    # class happens to be absent from the test split; without this,
    # `target_names` can disagree in length with the classes actually present
    # and classification_report raises.
    label_ids = np.arange(len(le.classes_))

    # Metrics and report
    print("Classification report:")
    print(classification_report(y_test, y_pred, labels=label_ids, target_names=le.classes_, zero_division=0))

    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    print("Confusion matrix:\n", cm)

    # Plot confusion matrix with per-cell counts
    plt.figure(figsize=(5,4))
    plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title("Confusion matrix")
    plt.colorbar()
    tick_marks = np.arange(len(le.classes_))
    plt.xticks(tick_marks, le.classes_, rotation=45)
    plt.yticks(tick_marks, le.classes_)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, cm[i,j], ha="center", va="center", color="black")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig("plots/confusion_matrix_clf.png", dpi=150)
    plt.show()

    # Feature importances, largest first
    fi_clf = pd.Series(clf.feature_importances_, index=Xc.columns).sort_values(ascending=False)
    display(fi_clf)

    # Save classifier & scaler
    joblib.dump(clf, "models/rfc_risk_level.joblib")
    joblib.dump(scaler_clf, "models/scaler_clf.joblib")
    print("Saved RandomForestClassifier and scaler in /models.")
else:
    print("No categorical 'risk_level' target found — skipping classification.")


No categorical 'risk_level' target found — skipping classification.


In [16]:
# Quick geo-plot: scatter latitude/longitude color by risk_score (if lat/lon exist).
# The outer check requires a feature column named exactly 'latitude'/'lat' and
# one named exactly 'longitude'/'lon'.
has_lat_feature = 'latitude' in X_imputed.columns or 'lat' in X_imputed.columns
has_lon_feature = 'longitude' in X_imputed.columns or 'lon' in X_imputed.columns

if has_lat_feature and has_lon_feature:
    # Resolve the actual column names in df (find_col already returns None on no match)
    lat_col = find_col(df, ['latitude','lat'])
    lon_col = find_col(df, ['longitude','lon'])

    if lat_col and lon_col and 'risk_score' in df.columns:
        # savefig does not create missing directories
        os.makedirs("plots", exist_ok=True)

        plt.figure(figsize=(8,4))
        sc = plt.scatter(df[lon_col], df[lat_col], c=df['risk_score'], s=20, cmap='viridis', alpha=0.7)
        plt.colorbar(sc, label='risk_score')
        plt.xlabel('longitude')
        plt.ylabel('latitude')
        plt.title('Geo Scatter colored by risk_score')
        plt.tight_layout()
        plt.savefig("plots/geo_scatter_risk_score.png", dpi=150)
        plt.show()
    else:
        print("Latitude/Longitude exist but no 'risk_score' to color by (or columns not detected).")
else:
    print("No latitude/longitude columns detected for geo-scatter.")


No latitude/longitude columns detected for geo-scatter.


In [17]:
# Fallback: build a synthetic target when the dataset has neither
# 'risk_score' nor 'risk_level'.
#
# NOTE(review): this cell sits after the cells that assign target_reg /
# target_clf and run the models, so on a top-to-bottom run those cells see no
# target and skip training. Re-run them after this cell (or move this cell
# above them) for the models to train.
# NOTE(review): the score is built from the same detected columns that feed
# `features` (e.g. col_fatal, col_econ), so a model trained on `features` to
# predict it is fitting a function of its own inputs — demo/testing only.
if 'risk_score' not in df.columns and 'risk_level' not in df.columns:
    # create a synthetic risk_score using a simple heuristic — only use if you understand it's an approximation
    print("No risk_score or risk_level in dataset. Creating a simple heuristic risk_score (approx).")

    # Columns to combine, without duplicates: substring detection can resolve
    # two roles to the same column, which would otherwise be counted twice.
    heuristic_cols = []
    for col in [col_mag, col_fatal, col_econ, col_pop, col_prec, col_temp]:
        if col is not None and col not in heuristic_cols:
            heuristic_cols.append(col)

    # Min-max normalise each column to the 0-1 range
    normalized = {}
    for col in heuristic_cols:
        s = df[col].astype(float)
        span = s.max() - s.min()
        if pd.isna(span):
            # column has no observed values; it cannot contribute to the score
            continue
        # a constant column carries no information and contributes zeros
        normalized[col] = (s - s.min()) / span if span != 0 else s * 0

    if normalized:
        # Row-wise mean over the available (non-NaN) components, so a single
        # missing value does not turn the whole row's score into NaN.
        mean_component = pd.DataFrame(normalized, index=df.index).mean(axis=1, skipna=True)
        df['risk_score'] = 100 * mean_component  # scaled to 0-100
        df['risk_level'] = pd.cut(df['risk_score'], bins=[-np.inf,33,66,np.inf], labels=['Low','Medium','High'])
        print("Created heuristic 'risk_score' and 'risk_level' (use only for demo/testing).")
    else:
        print("Not enough numeric features to create heuristic risk_score. Please provide a risk column or appropriate features.")


No risk_score or risk_level in dataset. Creating a simple heuristic risk_score (approx).
Created heuristic 'risk_score' and 'risk_level' (use only for demo/testing).
