In [4]:
import pandas as pd

# Load CSVs (make sure they are inside the 'data/' folder)
app_df = pd.read_csv('../data/application_record.csv')
credit_df = pd.read_csv('../data/credit_record.csv')

# Preview
print("Application Record:")
print(app_df.head(), "\n")

print("Credit Record:")
print(credit_df.head())


Application Record:
        ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
0  5008804           M            Y               Y             0   
1  5008805           M            Y               Y             0   
2  5008806           M            Y               Y             0   
3  5008808           F            N               Y             0   
4  5008809           F            N               Y             0   

   AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0          427500.0               Working               Higher education   
1          427500.0               Working               Higher education   
2          112500.0               Working  Secondary / secondary special   
3          270000.0  Commercial associate  Secondary / secondary special   
4          270000.0  Commercial associate  Secondary / secondary special   

     NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
0        Civil marriage   Rented ap

In [12]:
print("Application data shape:", app_df.shape)


Application data shape: (438557, 18)


In [7]:
app_df.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134203
CNT_FAM_MEMBERS             0
dtype: int64

In [13]:
print("Info:")
print(app_df.info())


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL 

In [11]:
# Unique values for categorical columns
cat_cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
            'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
            'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']

for col in cat_cols:
    print(f"\n{col} unique values: ")
    print(app_df[col].value_counts())


CODE_GENDER unique values: 
F    294440
M    144117
Name: CODE_GENDER, dtype: int64

FLAG_OWN_CAR unique values: 
N    275459
Y    163098
Name: FLAG_OWN_CAR, dtype: int64

FLAG_OWN_REALTY unique values: 
Y    304074
N    134483
Name: FLAG_OWN_REALTY, dtype: int64

NAME_INCOME_TYPE unique values: 
Working                 226104
Commercial associate    100757
Pensioner                75493
State servant            36186
Student                     17
Name: NAME_INCOME_TYPE, dtype: int64

NAME_EDUCATION_TYPE unique values: 
Secondary / secondary special    301821
Higher education                 117522
Incomplete higher                 14851
Lower secondary                    4051
Academic degree                     312
Name: NAME_EDUCATION_TYPE, dtype: int64

NAME_FAMILY_STATUS unique values: 
Married                 299828
Single / not married     55271
Civil marriage           36532
Separated                27251
Widow                    19675
Name: NAME_FAMILY_STATUS, dtype: int64

N

In [16]:
app_df['OCCUPATION_TYPE'] = app_df ['OCCUPATION_TYPE'].fillna('Unknown')

print(app_df['OCCUPATION_TYPE'].isnull().sum())

0


In [17]:
print(app_df['OCCUPATION_TYPE'].value_counts())

Unknown                  134203
Laborers                  78240
Core staff                43007
Sales staff               41098
Managers                  35487
Drivers                   26090
High skill tech staff     17289
Accountants               15985
Medicine staff            13520
Cooking staff              8076
Security staff             7993
Cleaning staff             5845
Private service staff      3456
Low-skill Laborers         2140
Secretaries                2044
Waiters/barmen staff       1665
Realty agents              1041
HR staff                    774
IT staff                    604
Name: OCCUPATION_TYPE, dtype: int64


In [21]:
# Basic info
print("Credit record shape:", credit_df.shape)
print("\nNull Values: ", credit_df.isnull().sum())
print("\nUnique STATUS values:", credit_df['STATUS'].unique())

# Distribution of status values
print("\nSTATUS value counts:")
print(credit_df['STATUS'].value_counts())


Credit record shape: (1048575, 3)

Null Values:  ID                0
MONTHS_BALANCE    0
STATUS            0
dtype: int64

Unique STATUS values: ['X' '0' 'C' '1' '2' '3' '4' '5']

STATUS value counts:
C    442031
0    383120
X    209230
1     11090
5      1693
2       868
3       320
4       223
Name: STATUS, dtype: int64


In [24]:
# Create binary label
credit_df['bad_credit'] = credit_df['STATUS'].isin(['2', '3', '4', '5']).astype(int)

# One row per person (1 if ever bad credit)
target_df = credit_df.groupby('ID')['bad_credit'].max().reset_index()
target_df.rename(columns={'bad_credit': 'TARGET'}, inplace=True)

print(target_df['TARGET'].value_counts())


0    45318
1      667
Name: TARGET, dtype: int64


In [25]:
# Merge target back with applicant data
merged_df = pd.merge(app_df, target_df, on='ID', how='inner')

# Check merged result
print("Merged shape:", merged_df.shape)
print(merged_df.head())


Merged shape: (36457, 19)
        ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
0  5008804           M            Y               Y             0   
1  5008805           M            Y               Y             0   
2  5008806           M            Y               Y             0   
3  5008808           F            N               Y             0   
4  5008809           F            N               Y             0   

   AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0          427500.0               Working               Higher education   
1          427500.0               Working               Higher education   
2          112500.0               Working  Secondary / secondary special   
3          270000.0  Commercial associate  Secondary / secondary special   
4          270000.0  Commercial associate  Secondary / secondary special   

     NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
0        Civil marriage   Ren

In [33]:
# Check for missing values
print("Missing values in merged dataset:")
print(merged_df.isnull().sum())


# Convert DAYS_BIRTH and DAYS_EMPLOYED to positive numbers (years)
merged_df['AGE_YEARS'] = (-merged_df['DAYS_BIRTH']) // 365
merged_df['EMPLOYED_YEARS'] = (-merged_df['DAYS_EMPLOYED']) // 365

# Drop original days columns if you want
merged_df.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)



Missing values in merged dataset:
ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
OCCUPATION_TYPE        0
CNT_FAM_MEMBERS        0
TARGET                 0
AGE_YEARS              0
EMPLOYED_YEARS         0
dtype: int64


KeyError: 'DAYS_BIRTH'

In [34]:
# Class balance
print("\nTarget distribution:")
print(merged_df['TARGET'].value_counts(normalize=True))

# Save cleaned dataset for modeling
merged_df.to_csv('../data/cleaned_credit_data.csv', index=False)


Target distribution:
0    0.983103
1    0.016897
Name: TARGET, dtype: float64
