<a href="https://colab.research.google.com/github/SkyRanger2010/DE2025_ETL_HW/blob/main/HW%2314/PSI_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
import pandas as pd
import datetime as dt

In [18]:

def _psi(expected: np.ndarray, actual: np.ndarray, bucket_type: str = "bins", n_bins: int = 10) -> float:
    """Calculate PSI metric for two arrays.

    Parameters
    ----------
        expected : list-like
            Array of expected values
        actual : list-like
            Array of actual values
        bucket_type : str
            Binning strategy. Accepts two options: 'bins' and 'quantiles'. Defaults to 'bins'.
            'bins': input arrays are splitted into bins with equal
                and fixed steps based on 'expected' array
            'quantiles': input arrays are binned according to 'expected' array
                with given number of n_bins
        n_bins : int
            Number of buckets for binning. Defaults to 10.

    Returns
    -------
        A single float number
    """
    breakpoints = np.arange(0, n_bins + 1) / (n_bins) * 100
    if bucket_type == "bins":
        breakpoints = np.histogram(expected, n_bins)[1]
    elif bucket_type == "quantiles":
        breakpoints = np.percentile(expected, breakpoints)

    # Calculate frequencies
    expected_percents = np.histogram(expected, breakpoints)[0] / len(expected)
    actual_percents = np.histogram(actual, breakpoints)[0] / len(actual)
    # Clip freaquencies to avoid zero division
    expected_percents = np.clip(expected_percents, a_min=0.0001, a_max=None)
    actual_percents = np.clip(actual_percents, a_min=0.0001, a_max=None)
    # Calculate PSI
    psi_value = (expected_percents - actual_percents) * np.log(expected_percents / actual_percents)
    psi_value = sum(psi_value)

    return psi_value


def calculate_psi(
        expected: np.ndarray, actual: np.ndarray, bucket_type: str = "bins", n_bins: int = 10, axis: int = 0
) -> np.ndarray:
    """Apply PSI calculation to 2 1-d or 2-d arrays.

    Parameters
    ----------
    expected : list-like
        Array of expected values
    actual : list-like
        Array of actual values
    bucket_type : str
        Binning strategy. Accepts two options: 'bins' and 'quantiles'. Defaults to 'bins'.
            'bins' - input arrays are splitted into bins with equal
                and fixed steps based on ’expected' array
            'quantiles' - input arrays are binned according to ’expected’ array
                with given number of n_bins
    n_bins : int
        Number of buckets for binning. Defaults to 10.

    Returns
    -------
        np.ndarray
    """
    if len(expected.shape) == 1:
        psi_values = np.empty(len(expected.shape))
    else:
        psi_values = np.empty(expected.shape[axis])

    for i in range(0, len(psi_values)):
        if len(psi_values) == 1:
            psi_values = _psi(expected, actual, bucket_type, n_bins)
        elif axis == 0:
            psi_values[i] = _psi(expected[:, i], actual[:, i], bucket_type, n_bins)
        elif axis == 1:
            psi_values[i] = _psi(expected[i, :], actual[i, :], bucket_type, n_bins)
        return np.array(psi_values)

In [96]:
df=pd.read_csv("weather_data.csv")

In [97]:
df['Day'] = pd.DatetimeIndex(df['Date_Time']).day
df.head(1)

Unnamed: 0,Location,Date_Time,Temperature_C,Humidity_pct,Precipitation_mm,Wind_Speed_kmh,Day
0,San Diego,2024-01-14 21:12:46,10.683001,41.195754,4.020119,8.23354,14


In [98]:
df_expected = df[df['Day'] == 1]['Temperature_C']
df_expected.describe()

Unnamed: 0,Temperature_C
count,36042.0
mean,14.753494
std,14.533268
min,-19.478652
25%,2.14791
50%,14.757672
75%,27.233637
max,39.997953


In [111]:
df_actual = df[df['Day'] == 2]['Temperature_C']
df_actual.describe()

Unnamed: 0,Temperature_C
count,36304.0
mean,14.733945
std,14.451686
min,-19.57201
25%,2.275788
50%,14.704125
75%,27.138387
max,39.999015


In [112]:
calculate_psi(df_expected, df_actual, bucket_type="bins", n_bins=10,axis=0)

array(0.0006413)

In [113]:
df.loc[(df['Day'] == 2) & (df['Temperature_C'] > 30), 'Temperature_C'] = 30
df_damage_actual=df[df['Day'] == 2]['Temperature_C']
df_damage_actual.describe()

Unnamed: 0,Temperature_C
count,36304.0
mean,13.775086
std,13.123984
min,-19.57201
25%,2.275788
50%,14.704125
75%,27.138387
max,30.0


In [114]:
calculate_psi(df_expected, df_damage_actual, bucket_type="bins", n_bins=10,axis=0)

array(0.88657246)