# Data Collection and Preparation
In this notebook, we will be collating and processing our dataset.

---

### Importing the dataset
We are using the **Credit Card Approval Prediction** dataset from https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction

In [1]:
# Import libraries
import pandas as pd
import numpy as np

---
#### Import applications dataset
This dataset contains information about the applicants

In [2]:
# Load the applicant information into a DataFrame
application_record = pd.read_csv("data/application_record.csv")
# info() reports the number of entries and columns along with dtypes and
# non-null counts, so no separate shape expression is needed
application_record.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

---
#### Import credit record dataset
This dataset contains a history of the applicant's credit payments

In [3]:
# Import credit record dataset
credit_record = pd.read_csv("data/credit_record.csv")
# info() reports the number of entries and columns along with dtypes and
# non-null counts, so no separate shape expression is needed
credit_record.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


---
### Basic data processing and cleaning

In [4]:
# Remove duplicated IDs.
# keep=False discards *every* row whose ID occurs more than once, rather
# than keeping one representative row per duplicated ID.
application_record = application_record.drop_duplicates(subset="ID", keep=False)

In [5]:
# Replace Y with 1 and N with 0
# Series.eq compares against a single value (its second positional parameter
# is `level`, not another value to match), so only 'Y' is passed here:
# 'Y' becomes 1 and every other value, including 'N', becomes 0.
application_record['FLAG_OWN_CAR'] = np.where(application_record['FLAG_OWN_CAR'].eq('Y'), 1, 0)
application_record['FLAG_OWN_REALTY'] = np.where(application_record['FLAG_OWN_REALTY'].eq('Y'), 1, 0)

In [6]:
# Calculate age and working years (in whole years, using 365 days per year).
# The day counts are negated before conversion — presumably they are stored as
# negative offsets counting back from the application date; verify against the
# dataset description.
application_record['APPLICANT_AGE'] = (-application_record['DAYS_BIRTH'] / 365).round()
# A positive DAYS_EMPLOYED (per the Kaggle data dictionary, the marker for an
# applicant who is currently not employed) would otherwise turn into a
# negative number of working years, so the result is clamped at 0.
application_record['YEARS_WORKING'] = (-application_record['DAYS_EMPLOYED'] / 365).round().clip(lower=0)
# The raw day counts are no longer needed once the year columns exist
application_record = application_record.drop(columns=['DAYS_BIRTH','DAYS_EMPLOYED'])

In [7]:
# Keep only applicants who are at least 21 years old
is_at_least_21 = application_record['APPLICANT_AGE'].ge(21)
application_record = application_record.loc[is_at_least_21]

In [8]:
# Discard every row that has a missing value in at least one column
application_record = application_record.dropna(axis=0, how='any')

---
### Classifying if an applicant has good or bad debt
We will treat credit card payments that are less than one month overdue as good; anything later than that will be considered bad.\
If an applicant has more bad debt than good debt, he/she will be classified as a "bad" applicant.

In [9]:
# Maps each raw STATUS code of a monthly credit record to a good/bad label
debt_map = {
    'C': 'Good Debt', # Loan paid on time
    'X': 'Good Debt', # No loan for the month
    '0': 'Good Debt', # Loan 1-29 days overdue
    '1': 'Bad Debt', # Loan 30-59 days overdue
    '2': 'Bad Debt', # Loan 60-89 days overdue
    '3': 'Bad Debt', # Loan 90-119 days overdue
    '4': 'Bad Debt', # Loan 120-149 days overdue
    '5': 'Bad Debt', # Loan > 150 days overdue
}
# Set a temporary variable to hold the calculated status of a credit payment
credit_record['STATUS_2'] = credit_record['STATUS'].map(debt_map)

# Create 0/1 indicator columns TOTAL_GOOD_DEBT and TOTAL_BAD_DEBT so that summing
# them per applicant yields the number of good and bad credit payments.
# Series.eq takes a single comparison value, so only 'Good Debt' is passed.
# A STATUS code missing from debt_map maps to NaN and is counted as bad debt.
is_good_debt = credit_record['STATUS_2'].eq('Good Debt')
credit_record['TOTAL_GOOD_DEBT'] = np.where(is_good_debt, 1, 0)
credit_record['TOTAL_BAD_DEBT'] = np.where(is_good_debt, 0, 1)

# Calculate the total number of good and bad debt for each applicant.
# Only the two indicator columns are summed: summing the whole frame would also
# aggregate MONTHS_BALANCE and, depending on the pandas version, concatenate the
# STATUS / STATUS_2 strings into the result.
df2 = credit_record.groupby('ID')[['TOTAL_GOOD_DEBT', 'TOTAL_BAD_DEBT']].sum()

# Decide if applicant is "good" (1) or "bad" (0); a tie counts as good
df2['STATUS'] = np.where(df2['TOTAL_GOOD_DEBT'] >= df2['TOTAL_BAD_DEBT'], 1, 0)

# Fraction (0-1) of good payments for each applicant.
# Every credit record counts as exactly one good or one bad payment, so the
# denominator is at least 1 and the ratio is exactly 1 when there is no bad debt.
df2['GOOD_PERCENT'] = df2['TOTAL_GOOD_DEBT'] / (df2['TOTAL_GOOD_DEBT'] + df2['TOTAL_BAD_DEBT'])

# Remove empty debt (defensive: the columns above are not expected to contain NaN)
df2 = df2.dropna()

---
### Merge the two different datasets together based on the applicant's ID

In [10]:
# Inner-join the applicant data with the per-applicant credit summary;
# df2 carries ID as its index (it comes from a groupby on ID)
new_df = application_record.merge(df2, how="inner", on="ID")
new_df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,APPLICANT_AGE,YEARS_WORKING,TOTAL_GOOD_DEBT,TOTAL_BAD_DEBT,STATUS,GOOD_PERCENT
0,5008806,M,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,0,0,Security staff,2.0,59.0,3.0,30,0,1,1.0
1,5008808,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,1,Sales staff,1.0,52.0,8.0,5,0,1,1.0
2,5008809,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,1,Sales staff,1.0,52.0,8.0,5,0,1,1.0
3,5008810,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,1,Sales staff,1.0,52.0,8.0,27,0,1,1.0
4,5008811,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,1,Sales staff,1.0,52.0,8.0,39,0,1,1.0


In [11]:
# Show the column dtypes and non-null counts of the merged frame
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25134 entries, 0 to 25133
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   25134 non-null  int64  
 1   CODE_GENDER          25134 non-null  object 
 2   FLAG_OWN_CAR         25134 non-null  int32  
 3   FLAG_OWN_REALTY      25134 non-null  int32  
 4   CNT_CHILDREN         25134 non-null  int64  
 5   AMT_INCOME_TOTAL     25134 non-null  float64
 6   NAME_INCOME_TYPE     25134 non-null  object 
 7   NAME_EDUCATION_TYPE  25134 non-null  object 
 8   NAME_FAMILY_STATUS   25134 non-null  object 
 9   NAME_HOUSING_TYPE    25134 non-null  object 
 10  FLAG_MOBIL           25134 non-null  int64  
 11  FLAG_WORK_PHONE      25134 non-null  int64  
 12  FLAG_PHONE           25134 non-null  int64  
 13  FLAG_EMAIL           25134 non-null  int64  
 14  OCCUPATION_TYPE      25134 non-null  object 
 15  CNT_FAM_MEMBERS      25134 non-null 

In [12]:
# Convert columns to int
# NOTE(review): the conversion below is commented out, so this cell currently
# only re-displays the frame and the columns keep their existing dtypes.
# Re-enabling it as written would also round GOOD_PERCENT (a 0-1 fraction)
# to 0 or 1 — confirm which columns should really be cast before using it.
#new_df = new_df.round(0).astype({"AMT_INCOME_TOTAL":"int","CNT_FAM_MEMBERS":"int","APPLICANT_AGE":"int","YEARS_WORKING":"int","GOOD_PERCENT":"int"})
new_df.info()
new_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25134 entries, 0 to 25133
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   25134 non-null  int64  
 1   CODE_GENDER          25134 non-null  object 
 2   FLAG_OWN_CAR         25134 non-null  int32  
 3   FLAG_OWN_REALTY      25134 non-null  int32  
 4   CNT_CHILDREN         25134 non-null  int64  
 5   AMT_INCOME_TOTAL     25134 non-null  float64
 6   NAME_INCOME_TYPE     25134 non-null  object 
 7   NAME_EDUCATION_TYPE  25134 non-null  object 
 8   NAME_FAMILY_STATUS   25134 non-null  object 
 9   NAME_HOUSING_TYPE    25134 non-null  object 
 10  FLAG_MOBIL           25134 non-null  int64  
 11  FLAG_WORK_PHONE      25134 non-null  int64  
 12  FLAG_PHONE           25134 non-null  int64  
 13  FLAG_EMAIL           25134 non-null  int64  
 14  OCCUPATION_TYPE      25134 non-null  object 
 15  CNT_FAM_MEMBERS      25134 non-null 

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,APPLICANT_AGE,YEARS_WORKING,TOTAL_GOOD_DEBT,TOTAL_BAD_DEBT,STATUS,GOOD_PERCENT
0,5008806,M,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,0,0,Security staff,2.0,59.0,3.0,30,0,1,1.0
1,5008808,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,1,Sales staff,1.0,52.0,8.0,5,0,1,1.0
2,5008809,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,1,Sales staff,1.0,52.0,8.0,5,0,1,1.0
3,5008810,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,1,Sales staff,1.0,52.0,8.0,27,0,1,1.0
4,5008811,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,1,1,Sales staff,1.0,52.0,8.0,39,0,1,1.0


---
### Save the dataframe to a csv file

In [13]:
# Write the merged dataset to disk; index=False leaves the DataFrame index out of the file
new_df.to_csv('data/processed_data.csv', index=False)