# 1. Import libraries and define global variables

In [1]:
import os

from sympy.logic.boolalg import Boolean

# Import other libraries where needed.

# Relative path of the input CSV; being relative, it is resolved against the
# kernel's current working directory when the file is opened.
# NOTE(review): "FILPATH" looks like a typo for "FILEPATH"; the name is kept
# as-is because the data-loading cell reads this constant.
INPUT_DATA_FILPATH = os.path.join("database", "application_bureau_general.csv")


# 2. Import data

In [2]:
import pandas as pd

# Read the CSV at INPUT_DATA_FILPATH into a DataFrame using read_csv defaults.
df = pd.read_csv(INPUT_DATA_FILPATH)

# Bare last expression: the notebook renders the frame as this cell's output.
df


Unnamed: 0,APP_ID,BUREAU_ID,CREDIT_ACTIVE,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,-497,0,-153.0,-153.0,,0,91323.00,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,-208,0,1075.0,,,0,225000.00,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,-203,0,528.0,,,0,464323.50,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,-203,0,,,,0,90000.00,,,0.0,Credit card,-16,
4,215354,5714466,Active,-629,0,1197.0,,77674.5,0,2700000.00,,,0.0,Consumer credit,-21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1716423,259355,5057750,Active,-44,0,-30.0,,0.0,0,11250.00,11250.0,0.0,0.0,Microloan,-19,
1716424,100044,5057754,Closed,-2648,0,-2433.0,-2493.0,5476.5,0,38130.84,0.0,0.0,0.0,Consumer credit,-2493,
1716425,100044,5057762,Closed,-1809,0,-1628.0,-970.0,,0,15570.00,,,0.0,Consumer credit,-967,
1716426,246829,5057770,Closed,-1878,0,-1513.0,-1513.0,,0,36000.00,0.0,0.0,0.0,Consumer credit,-1508,


# 3. Treat data

## 3.1. Ensure APP_ID and BUREAU_ID are strings

In [3]:
# Cast both identifier columns to str in one pass via a column -> dtype mapping;
# every other column keeps its dtype.
df = df.astype({"APP_ID": str, "BUREAU_ID": str})

# Show the resulting dtypes as the cell output.
df.dtypes


APP_ID                     object
BUREAU_ID                  object
CREDIT_ACTIVE              object
DAYS_CREDIT                 int64
CREDIT_DAY_OVERDUE          int64
DAYS_CREDIT_ENDDATE       float64
DAYS_ENDDATE_FACT         float64
AMT_CREDIT_MAX_OVERDUE    float64
CNT_CREDIT_PROLONG          int64
AMT_CREDIT_SUM            float64
AMT_CREDIT_SUM_DEBT       float64
AMT_CREDIT_SUM_LIMIT      float64
AMT_CREDIT_SUM_OVERDUE    float64
CREDIT_TYPE                object
DAYS_CREDIT_UPDATE          int64
AMT_ANNUITY               float64
dtype: object

## 3.2. Filter the dataset: CREDIT_ACTIVE is "Active", CREDIT_TYPE is "Credit card", and AMT_CREDIT_SUM_LIMIT > 0

In [5]:
# Keep only rows that satisfy all three conditions at once. Rows whose
# AMT_CREDIT_SUM_LIMIT is missing fail the "> 0" comparison and are dropped.
df = df.loc[
    df["CREDIT_ACTIVE"].eq("Active")
    & df["CREDIT_TYPE"].eq("Credit card")
    & df["AMT_CREDIT_SUM_LIMIT"].gt(0)
]

print(f"Filtered dataset shape: {df.shape}")
df.head()

Filtered dataset shape: (68752, 16)


Unnamed: 0,APP_ID,BUREAU_ID,CREDIT_ACTIVE,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
5,215354,5714467,Active,-273,0,27460.0,,0.0,0,180000.0,71017.38,108982.62,0.0,Credit card,-31,
20,238881,5714489,Active,-392,0,,,0.0,0,252000.0,23679.0,228320.1,0.0,Credit card,-22,
21,222183,5714491,Active,-784,0,1008.0,,0.0,0,0.0,-411.615,411.615,0.0,Credit card,-694,
75,303740,5714554,Active,-581,0,,,0.0,0,384750.0,263056.5,121690.17,0.0,Credit card,-22,
89,119939,5714570,Active,-1390,0,-696.0,,0.0,0,4500.0,-2.565,2.565,0.0,Credit card,-691,


## 3.3. Select columns of interest: ["APP_ID", "DAYS_CREDIT", "AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT", "AMT_CREDIT_SUM_LIMIT"]

In [6]:
# Columns carried forward into feature engineering, in display order.
columns_of_interest = [
    "APP_ID",
    "DAYS_CREDIT",
    "AMT_CREDIT_SUM",
    "AMT_CREDIT_SUM_DEBT",
    "AMT_CREDIT_SUM_LIMIT",
]
df = df.loc[:, columns_of_interest]

print(f"Selected columns shape: {df.shape}")
df.head()


Selected columns shape: (68752, 5)


Unnamed: 0,APP_ID,DAYS_CREDIT,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT
5,215354,-273,180000.0,71017.38,108982.62
20,238881,-392,252000.0,23679.0,228320.1
21,222183,-784,0.0,-411.615,411.615
75,303740,-581,384750.0,263056.5,121690.17
89,119939,-1390,4500.0,-2.565,2.565


## 3.4. Create new features: 
1. SUM_TO_LIMIT: credit sum to credit limit.
2. DEBT_TO_LIMIT: credit sum debt to credit sum limit.
3. DEBT_TO_LIMIT_GRADE: "LOW" if DEBT_TO_LIMIT <= 0.3, "MEDIUM" if in (0.3, 0.6], "HIGH" if > 0.6.

In [8]:
# Both ratios are element-wise divisions by the same denominator,
# AMT_CREDIT_SUM_LIMIT.

# 1. SUM_TO_LIMIT: credit sum relative to the credit limit
df["SUM_TO_LIMIT"] = df["AMT_CREDIT_SUM"].div(df["AMT_CREDIT_SUM_LIMIT"])

# 2. DEBT_TO_LIMIT: outstanding debt relative to the credit limit
df["DEBT_TO_LIMIT"] = df["AMT_CREDIT_SUM_DEBT"].div(df["AMT_CREDIT_SUM_LIMIT"])

# 3. DEBT_TO_LIMIT_GRADE: categorize based on DEBT_TO_LIMIT
def categorize_debt_ratio(ratio: float) -> "str | None":
    """Map a debt-to-limit ratio onto a coarse grade.

    Parameters
    ----------
    ratio : float
        The ratio to grade. May be missing (anything ``pd.isna`` treats as NA,
        e.g. ``NaN`` or ``None``).

    Returns
    -------
    str or None
        ``None`` when ``ratio`` is missing; otherwise ``"LOW"`` for
        ``ratio <= 0.3``, ``"MEDIUM"`` for ``0.3 < ratio <= 0.6`` and
        ``"HIGH"`` for ``ratio > 0.6``.

    Notes
    -----
    There is no lower bound on the "LOW" band, so negative ratios are
    graded ``"LOW"`` as well.
    """
    # Missing ratios stay ungraded rather than being forced into a band.
    if pd.isna(ratio):
        return None
    if ratio <= 0.3:
        return "LOW"
    if ratio <= 0.6:
        return "MEDIUM"
    return "HIGH"

# Grade each row's DEBT_TO_LIMIT element-wise; rows with a missing ratio
# receive None from categorize_debt_ratio.
df["DEBT_TO_LIMIT_GRADE"] = df["DEBT_TO_LIMIT"].apply(categorize_debt_ratio)

print("New features created:")
# Preview the first rows, now including the three new feature columns.
df.head()

New features created:


Unnamed: 0,APP_ID,DAYS_CREDIT,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,SUM_TO_LIMIT,DEBT_TO_LIMIT,DEBT_TO_LIMIT_GRADE
5,215354,-273,180000.0,71017.38,108982.62,1.651639,0.651639,HIGH
20,238881,-392,252000.0,23679.0,228320.1,1.103714,0.10371,LOW
21,222183,-784,0.0,-411.615,411.615,0.0,-1.0,LOW
75,303740,-581,384750.0,263056.5,121690.17,3.161718,2.161691,HIGH
89,119939,-1390,4500.0,-2.565,2.565,1754.385965,-1.0,LOW


## 3.5. Sort by APP_ID and DAYS_CREDIT

In [9]:
# APP_ID ascending, DAYS_CREDIT descending. APP_ID was cast to str earlier, so
# its ordering is lexicographic rather than numeric. Within each APP_ID the
# largest DAYS_CREDIT comes first; the later per-APP_ID "first"/"last" steps
# depend on this row order.
df = df.sort_values(by=["APP_ID", "DAYS_CREDIT"], ascending=[True, False])

print("Dataset sorted by APP_ID and DAYS_CREDIT:")
df.head(10)


Dataset sorted by APP_ID and DAYS_CREDIT:


Unnamed: 0,APP_ID,DAYS_CREDIT,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,SUM_TO_LIMIT,DEBT_TO_LIMIT,DEBT_TO_LIMIT_GRADE
675689,100002,-103,31988.565,0.0,31988.565,1.0,0.0,LOW
1024669,100003,-606,810000.0,0.0,810000.0,1.0,0.0,LOW
201486,100019,-495,270000.0,0.0,270000.0,1.0,0.0,LOW
829617,100028,-269,128971.08,27576.0,101390.76,1.27202,0.271977,LOW
105466,100030,-1053,45000.0,19341.0,25656.03,1.753974,0.753858,HIGH
272932,100044,-1739,22500.0,-78.885,247578.885,0.09088,-0.000319,LOW
272931,100044,-1827,135000.0,-130.005,418630.005,0.32248,-0.000311,LOW
126564,100049,-700,45000.0,36180.0,8819.235,5.102483,4.102397,HIGH
1186867,100065,-358,180000.0,176831.235,3168.765,56.804465,55.804465,HIGH
1186855,100065,-1060,63000.0,60218.775,2781.225,22.651889,21.651889,HIGH


# 4. Get count, average, min, max, first, and last per APP_ID for the following fields:
["AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT", "AMT_CREDIT_SUM_LIMIT", "SUM_TO_LIMIT", "DEBT_TO_LIMIT"]

In [12]:
# Numeric fields summarised per APP_ID.
agg_fields = [
    "AMT_CREDIT_SUM",
    "AMT_CREDIT_SUM_DEBT",
    "AMT_CREDIT_SUM_LIMIT",
    "SUM_TO_LIMIT",
    "DEBT_TO_LIMIT",
]

# Each pair is (output suffix, pandas reducer). "first"/"last" follow the
# current row order of df within each APP_ID.
aggregated = (
    df.groupby("APP_ID")[agg_fields]
    .agg([
        ("count", "count"),
        ("avg", "mean"),
        ("min", "min"),
        ("max", "max"),
        ("first", "first"),
        ("last", "last"),
    ])
)

# Collapse the two-level (field, statistic) header into "<field>_<statistic>".
aggregated.columns = [f"{field}_{stat}" for field, stat in aggregated.columns]
aggregated = aggregated.reset_index()

print(f"Aggregated data shape: {aggregated.shape}")
aggregated.head()

Aggregated data shape: (54147, 31)


Unnamed: 0,APP_ID,AMT_CREDIT_SUM_count,AMT_CREDIT_SUM_avg,AMT_CREDIT_SUM_min,AMT_CREDIT_SUM_max,AMT_CREDIT_SUM_first,AMT_CREDIT_SUM_last,AMT_CREDIT_SUM_DEBT_count,AMT_CREDIT_SUM_DEBT_avg,AMT_CREDIT_SUM_DEBT_min,...,SUM_TO_LIMIT_min,SUM_TO_LIMIT_max,SUM_TO_LIMIT_first,SUM_TO_LIMIT_last,DEBT_TO_LIMIT_count,DEBT_TO_LIMIT_avg,DEBT_TO_LIMIT_min,DEBT_TO_LIMIT_max,DEBT_TO_LIMIT_first,DEBT_TO_LIMIT_last
0,100002,1,31988.565,31988.565,31988.565,31988.565,31988.565,1,0.0,0.0,...,1.0,1.0,1.0,1.0,1,0.0,0.0,0.0,0.0,0.0
1,100003,1,810000.0,810000.0,810000.0,810000.0,810000.0,1,0.0,0.0,...,1.0,1.0,1.0,1.0,1,0.0,0.0,0.0,0.0,0.0
2,100019,1,270000.0,270000.0,270000.0,270000.0,270000.0,1,0.0,0.0,...,1.0,1.0,1.0,1.0,1,0.0,0.0,0.0,0.0,0.0
3,100028,1,128971.08,128971.08,128971.08,128971.08,128971.08,1,27576.0,27576.0,...,1.27202,1.27202,1.27202,1.27202,1,0.271977,0.271977,0.271977,0.271977,0.271977
4,100030,1,45000.0,45000.0,45000.0,45000.0,45000.0,1,19341.0,19341.0,...,1.753974,1.753974,1.753974,1.753974,1,0.753858,0.753858,0.753858,0.753858,0.753858


# 5. Get the most recent debt-to-limit and least recent debt-to-limit

In [13]:
# One grouping serves both lookups. The result depends on the row order of df
# within each APP_ID (sorted by DAYS_CREDIT descending in step 3.5).
# NOTE(review): GroupBy.first()/last() return the first/last non-null value per
# group, so a missing ratio on the boundary record is skipped - confirm that
# this is the intended meaning of "most recent" / "least recent".
debt_ratio_by_app = df.groupby("APP_ID")["DEBT_TO_LIMIT"]

most_recent = (
    debt_ratio_by_app.first()
    .rename("MOST_RECENT_DEBT_TO_LIMIT")
    .reset_index()
)
least_recent = (
    debt_ratio_by_app.last()
    .rename("LEAST_RECENT_DEBT_TO_LIMIT")
    .reset_index()
)

# Inner join on APP_ID puts both values side by side, one row per APP_ID.
recent_comparison = pd.merge(most_recent, least_recent, on="APP_ID")

print("Most recent vs least recent debt-to-limit:")
recent_comparison.head()


Most recent vs least recent debt-to-limit:


Unnamed: 0,APP_ID,MOST_RECENT_DEBT_TO_LIMIT,LEAST_RECENT_DEBT_TO_LIMIT
0,100002,0.0,0.0
1,100003,0.0,0.0
2,100019,0.0,0.0
3,100028,0.271977,0.271977
4,100030,0.753858,0.753858


# 6. Filter so that days credit is within the last 12 months (DAYS_CREDIT > -365), then get min and max debt-to-limit

In [14]:
# Rows whose DAYS_CREDIT is strictly greater than -365.
df_12_months = df.loc[df["DAYS_CREDIT"].gt(-365)]

# Per-APP_ID extremes of DEBT_TO_LIMIT; APP_IDs with no row in the window are
# absent from the result.
min_max_12_months = (
    df_12_months.groupby("APP_ID")["DEBT_TO_LIMIT"]
    .agg(min_debt_to_limit_12m="min", max_debt_to_limit_12m="max")
    .reset_index()
)

print(f"12-month filtered data shape: {df_12_months.shape}")
print("Min and max debt-to-limit for last 12 months:")
min_max_12_months.head()


12-month filtered data shape: (21367, 8)
Min and max debt-to-limit for last 12 months:


Unnamed: 0,APP_ID,min_debt_to_limit_12m,max_debt_to_limit_12m
0,100002,0.0,0.0
1,100028,0.271977,0.271977
2,100065,55.804465,55.804465
3,100067,18.912301,18.912301
4,100073,4.647225,4.647225


# 7. Filter so that days credit is within the last 30 days (DAYS_CREDIT > -30), then calculate the percentage distribution of "LOW", "MEDIUM", and "HIGH" debt-to-limit grades

In [16]:
# Rows whose DAYS_CREDIT is strictly greater than -30.
df_30_days = df.loc[df["DAYS_CREDIT"].gt(-30)]

# Share of each grade, scaled from a fraction to a percentage. value_counts
# drops missing grades by default, so the shares are over graded rows only.
grade_distribution = (
    df_30_days["DEBT_TO_LIMIT_GRADE"].value_counts(normalize=True).mul(100)
)

print("Percentage distribution of debt-to-limit grades (last 30 days):")
print(grade_distribution)


Percentage distribution of debt-to-limit grades (last 30 days):
DEBT_TO_LIMIT_GRADE
HIGH      54.644809
LOW       39.890710
MEDIUM     5.464481
Name: proportion, dtype: float64


# 8. Get data of the APP_IDs with the oldest day credit.

In [17]:
# Smallest (most negative) DAYS_CREDIT per APP_ID.
oldest_day_credit = (
    df.groupby("APP_ID")["DAYS_CREDIT"]
    .min()
    .rename("OLDEST_DAYS_CREDIT")
    .reset_index()
)

# Smallest value across all APP_IDs.
min_days_credit = oldest_day_credit["OLDEST_DAYS_CREDIT"].min()

# APP_IDs whose own minimum equals that overall minimum.
app_ids_with_oldest = oldest_day_credit.loc[
    oldest_day_credit["OLDEST_DAYS_CREDIT"].eq(min_days_credit), "APP_ID"
].tolist()

# Every row of those APP_IDs, not only the row holding the minimum.
oldest_data = df.loc[df["APP_ID"].isin(app_ids_with_oldest)]

print(f"Number of APP_IDs with oldest day credit ({min_days_credit}): {len(app_ids_with_oldest)}")
print("Data for APP_IDs with oldest day credit:")
oldest_data

Number of APP_IDs with oldest day credit (-2922): 9
Data for APP_IDs with oldest day credit:


Unnamed: 0,APP_ID,DAYS_CREDIT,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,SUM_TO_LIMIT,DEBT_TO_LIMIT,DEBT_TO_LIMIT_GRADE
548988,203763,-2922,180000.0,63575.505,282924.495,0.636212,0.224708,LOW
1429151,220662,-2922,225000.0,-425.79,112925.79,1.992459,-0.003771,LOW
1207341,268011,-2922,135000.0,472504.77,22495.23,6.001272,21.004665,HIGH
450051,309230,-326,180000.0,120883.5,59115.735,3.044875,2.044862,HIGH
450040,309230,-2922,135000.0,8702.73,22797.27,5.921762,0.381744,MEDIUM
300853,340131,-2922,135000.0,-457.425,108457.425,1.244728,-0.004218,LOW
1369974,344182,-2922,135000.0,-183.465,13683.465,9.865922,-0.013408,LOW
284956,357750,-2922,110191.5,0.0,110191.5,1.0,0.0,LOW
366605,406064,-2922,157500.0,55571.4,79458.615,1.982164,0.699375,HIGH
388089,436177,-1536,4500.0,-514.485,5014.485,0.8974,-0.1026,LOW


# 9. Export data

In [18]:
# Destination of each export, all under the "database" directory.
output_path_main = os.path.join("database", "processed_credit_data.csv")
output_path_agg = os.path.join("database", "aggregated_credit_data.csv")
output_path_recent = os.path.join("database", "recent_comparison.csv")
output_path_12m = os.path.join("database", "twelve_month_analysis.csv")
output_path_oldest = os.path.join("database", "oldest_credit_data.csv")

# (label used in the log line, frame to write, destination), in write order.
export_plan = [
    ("Main processed data", df, output_path_main),
    ("Aggregated data", aggregated, output_path_agg),
    ("Recent comparison", recent_comparison, output_path_recent),
    ("12-month analysis", min_max_12_months, output_path_12m),
    ("Oldest credit data", oldest_data, output_path_oldest),
]

# Write each frame without its index, reporting each file as it is written.
for export_label, export_frame, export_path in export_plan:
    export_frame.to_csv(export_path, index=False)
    print(f"{export_label} exported to: {export_path}")

print("\nAll exports completed successfully!")

Main processed data exported to: database\processed_credit_data.csv
Aggregated data exported to: database\aggregated_credit_data.csv
Recent comparison exported to: database\recent_comparison.csv
12-month analysis exported to: database\twelve_month_analysis.csv
Oldest credit data exported to: database\oldest_credit_data.csv

All exports completed successfully!
