In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the loan-application training table from the project's data folder.
csv_path = "data/application_train.csv"
df = pd.read_csv(csv_path)

In [4]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# List the column labels (the repr is abbreviated because the frame is wide).
df.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(307511, 122)

In [9]:
# Class balance of TARGET, shown as proportions rather than raw counts.
df["TARGET"].value_counts(normalize=True)

TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64

In [11]:
# Create a business-facing "decision" column from the numeric TARGET flag.
decision_labels = {0: "APPROVE", 1: "REJECT"}
df["decision"] = df["TARGET"].map(decision_labels)

In [15]:

# Check the label proportions after the mapping; they should mirror TARGET.
df["decision"].value_counts(normalize=True)

decision
APPROVE    0.919271
REJECT     0.080729
Name: proportion, dtype: float64

In [22]:
# The source data is a static snapshot, so simulate a decision timeline:
# 1) create a synthetic decision-date column
# 2) split the data by time period
# 3) treat the earliest data as the baseline
# 4) treat the later data as the monitoring period

# Fixed seed so the synthetic dates -- and every monthly rate derived from
# them in later cells -- are identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

start_date = pd.to_datetime("2020-01-01")
end_date = pd.to_datetime("2023-12-31")

# Every calendar day in the window is a candidate; dates are drawn uniformly
# with replacement, one per row.
candidate_dates = pd.date_range(start_date, end_date, freq="D").to_numpy()
df["decision_date"] = rng.choice(candidate_dates, size=len(df))


In [23]:
# Put the rows in chronological order for the time-based analysis.
df = df.sort_values(by="decision_date", ascending=True)

In [32]:
# Bucket each decision date into its calendar month (monthly Period).
decision_dates = df["decision_date"]
df["month"] = decision_dates.dt.to_period("M")


In [None]:
# Inspect the derived monthly period column.
df["month"]

282183    2020-01
184852    2020-01
260670    2020-01
11149     2020-01
296998    2020-01
           ...   
204828    2023-12
218835    2023-12
302170    2023-12
172983    2023-12
30740     2023-12
Name: month, Length: 307511, dtype: period[M]

In [38]:
# Monthly approval rate: the share of rows labelled APPROVE within each month.
is_approved = df["decision"].eq("APPROVE")
approval_trend = is_approved.groupby(df["month"]).mean()
approval_trend.head()

month
2020-01    0.923960
2020-02    0.917595
2020-03    0.920357
2020-04    0.924150
2020-05    0.917440
Freq: M, Name: decision, dtype: float64

# splitting into baseline and monitoring

In [None]:
# Number of earliest calendar months that make up the baseline window.
BASELINE_MONTH_COUNT = 6

# De-duplicate before sorting so only the distinct months are ordered,
# not every row of the frame.
baseline_months = (
    df["month"].drop_duplicates().sort_values().array[:BASELINE_MONTH_COUNT]
)


In [42]:
# Split the data: rows in the baseline months vs. everything after them.
in_baseline = df["month"].isin(baseline_months)

baseline_df = df.loc[in_baseline]
monitor_df = df.loc[~in_baseline]

In [43]:
# Overall approval share in each window, for a quick side-by-side comparison.
baseline_rate = baseline_df["decision"].eq("APPROVE").mean()
monitoring_rate = monitor_df["decision"].eq("APPROVE").mean()

print("Baseline approval rate:", baseline_rate)
print("Monitoring approval rate:", monitoring_rate)


Baseline approval rate: 0.9210169137607016
Monitoring approval rate: 0.9190227303964725


# decision drift detection

In [62]:
# Approval share across the whole baseline window; the reference value
# that each monitoring month is compared against.
baseline_approval_rate = baseline_df["decision"].eq("APPROVE").mean()

baseline_approval_rate

np.float64(0.9210169137607016)

In [72]:
# Approval rate for each month of the monitoring period.
monitor_is_approved = monitor_df["decision"].eq("APPROVE")
monitoring_trend = monitor_is_approved.groupby(monitor_df["month"]).mean()
print(monitoring_trend)


month
2020-07    0.917873
2020-08    0.922434
2020-09    0.924029
2020-10    0.917961
2020-11    0.924152
2020-12    0.919835
2021-01    0.918649
2021-02    0.913863
2021-03    0.923645
2021-04    0.919449
2021-05    0.920012
2021-06    0.913569
2021-07    0.922617
2021-08    0.914703
2021-09    0.916890
2021-10    0.922956
2021-11    0.917989
2021-12    0.922587
2022-01    0.922450
2022-02    0.920492
2022-03    0.918756
2022-04    0.918769
2022-05    0.916998
2022-06    0.919917
2022-07    0.917541
2022-08    0.917513
2022-09    0.912910
2022-10    0.919400
2022-11    0.912819
2022-12    0.916769
2023-01    0.915272
2023-02    0.920240
2023-03    0.918981
2023-04    0.918700
2023-05    0.916261
2023-06    0.924284
2023-07    0.920166
2023-08    0.918753
2023-09    0.922235
2023-10    0.917311
2023-11    0.917786
2023-12    0.921345
Freq: M, Name: decision, dtype: float64


# comparing baseline vs monitoring (this is DRIFT)

In [73]:
# Sign convention: baseline rate minus monthly rate, so a positive value
# means that month's approval rate is BELOW the baseline.
approval_drift = monitoring_trend.rsub(baseline_approval_rate)
print(approval_drift)

month
2020-07    0.003144
2020-08   -0.001417
2020-09   -0.003012
2020-10    0.003055
2020-11   -0.003135
2020-12    0.001182
2021-01    0.002368
2021-02    0.007154
2021-03   -0.002628
2021-04    0.001568
2021-05    0.001005
2021-06    0.007448
2021-07   -0.001600
2021-08    0.006314
2021-09    0.004127
2021-10   -0.001939
2021-11    0.003028
2021-12   -0.001570
2022-01   -0.001433
2022-02    0.000525
2022-03    0.002261
2022-04    0.002248
2022-05    0.004018
2022-06    0.001100
2022-07    0.003475
2022-08    0.003504
2022-09    0.008106
2022-10    0.001617
2022-11    0.008198
2022-12    0.004248
2023-01    0.005745
2023-02    0.000777
2023-03    0.002036
2023-04    0.002316
2023-05    0.004756
2023-06   -0.003267
2023-07    0.000851
2023-08    0.002264
2023-09   -0.001218
2023-10    0.003705
2023-11    0.003231
2023-12   -0.000328
Freq: M, Name: decision, dtype: float64


In [74]:
# Alert cut-off on the absolute gap between a month's approval rate and the
# baseline rate (rates are fractions, so 0.10 is ten percentage points).
DRIFT_THRESHOLD = 0.10

# True for every month whose gap exceeds the cut-off in either direction.
drift_alerts = approval_drift.abs().gt(DRIFT_THRESHOLD)
drift_alerts


month
2020-07    False
2020-08    False
2020-09    False
2020-10    False
2020-11    False
2020-12    False
2021-01    False
2021-02    False
2021-03    False
2021-04    False
2021-05    False
2021-06    False
2021-07    False
2021-08    False
2021-09    False
2021-10    False
2021-11    False
2021-12    False
2022-01    False
2022-02    False
2022-03    False
2022-04    False
2022-05    False
2022-06    False
2022-07    False
2022-08    False
2022-09    False
2022-10    False
2022-11    False
2022-12    False
2023-01    False
2023-02    False
2023-03    False
2023-04    False
2023-05    False
2023-06    False
2023-07    False
2023-08    False
2023-09    False
2023-10    False
2023-11    False
2023-12    False
Freq: M, Name: decision, dtype: bool

In [76]:
# approval_drift holds (baseline rate - monthly rate), so a positive value
# means the approval rate FELL. Report the negated value with an explicit
# sign so the printed change reads naturally: "+" = rose, "-" = fell.
alert_count = 0
for month, drift in approval_drift.items():
    if abs(drift) > DRIFT_THRESHOLD:
        alert_count += 1
        print(
            f"⚠️ Drift detected in {month}: "
            f"approval rate changed by {-drift:+.2%}"
        )

# Say so explicitly when nothing fired, so an empty output is not ambiguous.
if alert_count == 0:
    print(
        "No month deviates from the baseline approval rate by more than "
        f"{DRIFT_THRESHOLD:.2%}."
    )
