Import Packages

In [None]:
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import kurtosis, skew, pearsonr

Set theme for seaborn

In [None]:
# Activate seaborn's default theme (called with no arguments) before any
# figure in this notebook is drawn.
sns.set_theme()

Load Data

In [None]:
# Input locations are relative to the notebook's working directory.
AGGREGATED_DATA_PATH = "../data/aggregated_data.parquet"
STATE_SHAPES_PATH = "../data/maps/cb_2022_us_state_500k.zip"

# Tabular data analysed in the rest of the notebook.
df = pd.read_parquet(AGGREGATED_DATA_PATH)
# State geometries read with geopandas from the zipped archive.
us_states_map = gpd.read_file(STATE_SHAPES_PATH)

Display basic dataset information

In [None]:
# Quick sanity check of both inputs: first rows and (rows, columns) of the
# tabular data, followed by the first rows of the state geometries.
for preview in (df.head(), df.shape, us_states_map.head()):
    print(preview)

Univariate Analysis: Compute Basic Statistics

In [None]:
# Summary table: one column per variable, one row per statistic.
# describe() already limits itself to numeric columns when any are present.
stats = df.describe(percentiles=[0.25, 0.75])
# numeric_only=True keeps the extra rows aligned with describe(): since
# pandas 2.0 these reductions default to numeric_only=False and raise a
# TypeError if the frame holds any non-numeric column.
stats.loc["variance"] = df.var(numeric_only=True)
stats.loc["skewness"] = df.skew(numeric_only=True)
stats.loc["kurtosis"] = df.kurtosis(numeric_only=True)
# Keep mean/std/min/max plus the three rows added above.
stats = stats.drop(index=["count", "25%", "50%", "75%"])
# Transposed so that each variable is printed on its own row.
print(stats.T)

Function to plot variable distribution and save plots

In [None]:
def plot_and_save_variable(data, var_name, output_folder=None):
    """Plot one variable as a histogram, an ECDF and a box plot side by side.

    The figure title reports the variable's skewness and kurtosis as computed
    by ``scipy.stats.skew`` and ``scipy.stats.kurtosis`` on its non-missing
    values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to plot.
    var_name : str
        Name of the column in ``data``.
    output_folder : str or path-like, optional
        When given (and truthy), the figure is written to
        ``<output_folder>/<var_name>_distribution.png``. The folder is created
        if it does not exist yet.
    """
    fig, axs = plt.subplots(ncols=3, figsize=(15, 5))

    # NOTE(review): histplot is called without `stat=`, so this panel shows
    # whatever seaborn's default statistic is; the "PDF" title is kept as-is.
    sns.histplot(data=data, x=var_name, bins=10, ax=axs[0])
    axs[0].set_title("PDF")
    sns.ecdfplot(data=data, x=var_name, ax=axs[1])
    axs[1].set_title("CDF")
    sns.boxplot(data=data, x=var_name, ax=axs[2])
    axs[2].set_title("Box Plot")

    # scipy propagates NaN by default, so a single missing value would turn
    # both statistics in the title into "nan"; compute them on observed values.
    observed = data[var_name].dropna()
    skewness_value = skew(observed)
    kurtosis_value = kurtosis(observed)
    fig.suptitle(
        f"{var_name}: Skewness={skewness_value:.2f}, Kurtosis={kurtosis_value:.2f}"
    )
    fig.tight_layout()

    if output_folder:
        target_dir = Path(output_folder)
        # savefig does not create missing directories.
        target_dir.mkdir(parents=True, exist_ok=True)
        # Save through the figure object rather than pyplot's "current figure".
        fig.savefig(target_dir / f"{var_name}_distribution.png", bbox_inches="tight")

    plt.show()
    # Release the figure: this function is called once per column in a loop.
    plt.close(fig)

Plot and save all variables in the dataset

In [None]:
# Only numeric/boolean columns are plotted: plot_and_save_variable computes
# scipy skewness and kurtosis, which cannot be evaluated on text or other
# non-numeric columns.
for col in df.select_dtypes(include=["number", "bool"]).columns:
    plot_and_save_variable(df, col, "../figures")

Bivariate Analysis: Correlation Matrix and Heatmap

In [None]:
# Pairwise correlation matrix of the numeric/boolean columns. Selecting them
# explicitly avoids the error pandas >= 2.0 raises from DataFrame.corr() when
# the frame contains a non-numeric column.
corr = df.select_dtypes(include=["number", "bool"]).corr()
fig = plt.figure(figsize=(20, 9))
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax = sns.heatmap(
    corr,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    annot=True,
    mask=mask,
    # Fix the colour scale to the full correlation range, centred on zero.
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    annot_kws={"size": 10},
)
# Slant the x tick labels so long variable names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.tight_layout()
plt.savefig("../figures/correlation_heatmap.png", bbox_inches="tight")
plt.show()

Pearson Correlation with Drought

In [None]:
def compute_pearson_correlations(df, x_columns, target_column, alpha=0.05):
    """Pearson correlation of each column in ``x_columns`` with ``target_column``.

    Rows where either of the two values is missing are left out of that pair's
    computation, so a single NaN no longer turns the result into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictor columns and the target column.
    x_columns : iterable of str
        Columns to correlate with the target. May be empty.
    target_column : str
        Column every predictor is compared against.
    alpha : float, default 0.05
        Significance level used for the "Hypothesis Testing" label.

    Returns
    -------
    pandas.DataFrame
        Indexed by predictor name, with columns "Correlation", "P-value" and
        "Hypothesis Testing" ("Reject" when the p-value is below ``alpha``,
        otherwise "Fail to reject").
    """
    target = df[target_column]
    results = {}
    for col in x_columns:
        feature = df[col]
        # Boolean mask instead of df[[col, target]].dropna() so the function
        # also works when col and target_column are the same column.
        complete = feature.notna() & target.notna()
        correlation, p_value = pearsonr(feature[complete], target[complete])
        results[col] = {"Correlation": correlation, "P-value": p_value}

    # Explicit columns keep the schema stable even when x_columns is empty.
    results_df = pd.DataFrame(
        list(results.values()),
        index=list(results.keys()),
        columns=["Correlation", "P-value"],
    )
    results_df["Hypothesis Testing"] = results_df["P-value"].apply(
        lambda p: "Reject" if p < alpha else "Fail to reject"
    )
    return results_df

In [None]:
# Predictors compared against the "drought" column; this list is reused by
# later cells.
x_columns = ["PRECTOT", "PS", "T2M_MAX", "T2M_RANGE", "WS10M_RANGE"]
correlation_results_df = compute_pearson_correlations(df, x_columns, "drought")
print(correlation_results_df)
# LaTeX version of the same table. `escape` is a boolean flag in
# DataFrame.to_latex; True escapes LaTeX special characters such as the
# underscores in the variable names. Floats are rendered with two decimals,
# so very small p-values appear as 0.00.
print(
    correlation_results_df.to_latex(
        index=True, escape=True, float_format="{:.2f}".format
    )
)

Scatterplots for Drought vs Other Variables

In [None]:
# One scatterplot per predictor against drought; the low alpha keeps dense
# regions readable when many points overlap.
for feature in x_columns:
    scatter_ax = sns.scatterplot(data=df, x=feature, y="drought", alpha=0.1)
    scatter_ax.set_title(f"Scatterplot: {feature} vs Drought")
    plt.show()

Spatial Analysis

In [None]:
# State -> census region lookup, fetched from GitHub (requires network access;
# the URL points at the repository's master branch, not a pinned revision).
us_state_region_mapping = pd.read_csv(
    "https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv"
)
# Attach the "Region" of each row's state to df.
# - Any "Region" column left over from a previous run of this cell is dropped
#   first; otherwise re-running the cell would yield "Region_x"/"Region_y"
#   and the region plots below would no longer find "Region".
# - Only the join key and "Region" are taken from the lookup, and the
#   redundant "State" key is removed afterwards.
# - merge() defaults to an inner join, so rows whose state_name has no match
#   in the lookup are dropped.
df = (
    df.drop(columns=["Region"], errors="ignore")
    .merge(
        right=us_state_region_mapping[["State", "Region"]],
        left_on="state_name",
        right_on="State",
    )
    .drop(columns=["State"])
)

In [None]:
# Precipitation against drought, with points coloured by census region.
region_ax = sns.scatterplot(data=df, x="PRECTOT", y="drought", hue="Region")
region_ax.set_title("Scatterplot: PRECTOT vs Drought by Region")
plt.show()

In [None]:
# One bivariate PRECTOT/drought panel per region; axes are not shared, so
# each panel is scaled to its own region's data.
independent_axes = {"sharex": False, "sharey": False}
sns.displot(
    data=df, x="PRECTOT", y="drought", col="Region", facet_kws=independent_axes
)
plt.show()

Detailed Correlation and P-Value Table Creation

In [None]:
def compute_detailed_correlations(df, x_columns, target_column, alpha=0.05):
    """Build a flat table of Pearson correlations against ``target_column``.

    Rows where either of the two values is missing are left out of that pair's
    computation, so a single NaN does not turn the result into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictor columns and the target column.
    x_columns : iterable of str
        Columns to correlate with the target. May be empty.
    target_column : str
        Column every predictor is compared against.
    alpha : float, default 0.05
        Significance level used for the "Significant" flag.

    Returns
    -------
    pandas.DataFrame
        One row per predictor with columns "Variable", "Correlation",
        "P-value" and "Significant" ("Yes" when the p-value is below
        ``alpha``, otherwise "No").
    """
    target = df[target_column]
    rows = []
    for col in x_columns:
        feature = df[col]
        # Pairwise-complete observations only.
        complete = feature.notna() & target.notna()
        correlation, p_value = pearsonr(feature[complete], target[complete])
        rows.append(
            {
                "Variable": col,
                "Correlation": correlation,
                "P-value": p_value,
                "Significant": "Yes" if p_value < alpha else "No",
            }
        )
    # Explicit columns keep the schema (and the CSV header written from it)
    # stable even when x_columns is empty.
    return pd.DataFrame(
        rows, columns=["Variable", "Correlation", "P-value", "Significant"]
    )

In [None]:
# NOTE(review): `df` was reassigned by the region merge in the Spatial
# Analysis section, so these correlations are computed on the merged frame
# rather than on the frame used for the earlier Pearson table.
detailed_results_path = "../figures/detailed_correlation_results.csv"
detailed_correlation_df = compute_detailed_correlations(
    df, x_columns, target_column="drought"
)
print(detailed_correlation_df)
# Written without the positional index; "Variable" identifies each row.
detailed_correlation_df.to_csv(detailed_results_path, index=False)