In [1]:
import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fairlearn.metrics import demographic_parity_difference

In [4]:
# Load the raw credit application records into a DataFrame.
raw_applications_path = '../data/raw_credit_applications.json'
df = pd.read_json(raw_applications_path)

In [9]:
# Pretty-print a single raw record to see its nested structure before
# flattening; `sort_dicts=False` keeps the keys in their original order.
# (`pprint` is imported in the notebook's import cell.)
pprint.pprint(df.loc[1].to_dict(), sort_dicts=False)

{'_id': 'app_037',
 'applicant_info': {'full_name': 'Brandon Walker',
                    'email': 'brandon.walker2@yahoo.com',
                    'ssn': '425-69-4784',
                    'ip_address': '10.1.102.112',
                    'gender': 'M',
                    'date_of_birth': '1992-03-31',
                    'zip_code': '10032'},
 'financials': {'annual_income': 78000,
                'credit_history_months': 51,
                'debt_to_income': 0.18,
                'savings_balance': 17915},
 'spending_behavior': [{'category': 'Rent', 'amount': 608},
                       {'category': 'Dining', 'amount': 96},
                       {'category': 'Healthcare', 'amount': 243}],
 'decision': {'loan_approved': False,
              'rejection_reason': 'algorithm_risk_score'},
 'processing_timestamp': nan,
 'loan_purpose': nan,
 'notes': nan}


In [10]:
# Expand each nested dict column into its own frame of prefixed flat columns.
applicant_df = df["applicant_info"].pipe(pd.json_normalize).add_prefix("applicant_")
financials_df = df["financials"].pipe(pd.json_normalize).add_prefix("financial_")
decision_df = df["decision"].pipe(pd.json_normalize).add_prefix("decision_")

# Swap the three nested columns for their expanded counterparts, side by side.
nested_columns = ["applicant_info", "financials", "decision"]
flat_df = pd.concat(
    [df.drop(columns=nested_columns), applicant_df, financials_df, decision_df],
    axis=1,
)
flat_df.head()

Unnamed: 0,_id,spending_behavior,processing_timestamp,loan_purpose,notes,applicant_full_name,applicant_email,applicant_ssn,applicant_ip_address,applicant_gender,...,applicant_zip_code,financial_annual_income,financial_credit_history_months,financial_debt_to_income,financial_savings_balance,financial_annual_salary,decision_loan_approved,decision_rejection_reason,decision_interest_rate,decision_approved_amount
0,app_200,"[{'category': 'Shopping', 'amount': 480}, {'ca...",2024-01-15T00:00:00Z,,,Jerry Smith,jerry.smith17@hotmail.com,596-64-4340,192.168.48.155,Male,...,10036,73000,23,0.2,31212,,False,algorithm_risk_score,,
1,app_037,"[{'category': 'Rent', 'amount': 608}, {'catego...",,,,Brandon Walker,brandon.walker2@yahoo.com,425-69-4784,10.1.102.112,M,...,10032,78000,51,0.18,17915,,False,algorithm_risk_score,,
2,app_215,"[{'category': 'Rent', 'amount': 109}]",,vacation,,Scott Moore,scott.moore94@mail.com,370-78-5178,10.240.193.250,Male,...,10075,61000,41,0.21,37909,,True,,3.7,59000.0
3,app_024,"[{'category': 'Fitness', 'amount': 575}]",,,,Thomas Lee,thomas.lee6@protonmail.com,194-35-1833,192.168.175.67,Male,...,10077,103000,70,0.35,0,,True,,4.3,34000.0
4,app_184,"[{'category': 'Entertainment', 'amount': 463}]",2024-01-15T00:00:00Z,,,Brian Rodriguez,brian.rodriguez86@aol.com,480-41-2475,172.29.125.105,M,...,10080,57000,14,0.23,31763,,False,algorithm_risk_score,,


In [11]:
def summarize_spending(spend_list):
    """Aggregate one applicant's spending records into per-category totals.

    Parameters
    ----------
    spend_list : list of dict, or any other value
        Records shaped like ``{"category": str, "amount": number}``. Any
        non-list value (e.g. NaN for a missing cell) is treated as
        "no spending" and yields all zeros.

    Returns
    -------
    pandas.Series
        Indexed by ``spend_total``, ``spend_rent``, ``spend_dining`` and
        ``spend_healthcare`` (in that order). ``spend_total`` sums the amount
        of every record; the other three sum only the records whose category
        matches, ignoring case and surrounding whitespace.

    Notes
    -----
    A single malformed record does not abort the surrounding ``.apply``:
    entries that are not dicts are skipped, a missing/``None`` amount
    contributes nothing, and a missing/``None`` category still counts
    towards ``spend_total`` only.
    """
    # Single source of truth: normalized category -> output column.
    column_for = {
        "rent": "spend_rent",
        "dining": "spend_dining",
        "healthcare": "spend_healthcare",
    }
    summary = {"spend_total": 0, **dict.fromkeys(column_for.values(), 0)}

    if not isinstance(spend_list, list):
        return pd.Series(summary)

    for item in spend_list:
        if not isinstance(item, dict):
            continue

        amt = item.get("amount")
        if amt is None:
            continue
        summary["spend_total"] += amt

        # `or ""` maps a None category to the empty string, which is untracked.
        cat = str(item.get("category") or "").strip().lower()
        column = column_for.get(cat)
        if column is not None:
            summary[column] += amt

    return pd.Series(summary)

# Collapse each applicant's list of spending records into per-category totals.
# This reads from the raw `df`, so it does not depend on whether `flat_df`
# still carries the nested column.
spend_features = df["spending_behavior"].apply(summarize_spending)

# Drop the nested column *and* any spend_* columns left over from a previous
# execution of this cell, so re-running it neither raises KeyError nor
# duplicates columns.
stale_columns = ["spending_behavior", *spend_features.columns]
flat_df = pd.concat(
    [flat_df.drop(columns=stale_columns, errors="ignore"), spend_features],
    axis=1,
)

flat_df.head()

Unnamed: 0,_id,processing_timestamp,loan_purpose,notes,applicant_full_name,applicant_email,applicant_ssn,applicant_ip_address,applicant_gender,applicant_date_of_birth,...,financial_savings_balance,financial_annual_salary,decision_loan_approved,decision_rejection_reason,decision_interest_rate,decision_approved_amount,spend_total,spend_rent,spend_dining,spend_healthcare
0,app_200,2024-01-15T00:00:00Z,,,Jerry Smith,jerry.smith17@hotmail.com,596-64-4340,192.168.48.155,Male,2001-03-09,...,31212,,False,algorithm_risk_score,,,1517,790,0,0
1,app_037,,,,Brandon Walker,brandon.walker2@yahoo.com,425-69-4784,10.1.102.112,M,1992-03-31,...,17915,,False,algorithm_risk_score,,,947,608,96,243
2,app_215,,vacation,,Scott Moore,scott.moore94@mail.com,370-78-5178,10.240.193.250,Male,1989-10-24,...,37909,,True,,3.7,59000.0,109,109,0,0
3,app_024,,,,Thomas Lee,thomas.lee6@protonmail.com,194-35-1833,192.168.175.67,Male,1983-04-25,...,0,,True,,4.3,34000.0,575,0,0,0
4,app_184,2024-01-15T00:00:00Z,,,Brian Rodriguez,brian.rodriguez86@aol.com,480-41-2475,172.29.125.105,M,1999-05-21,...,31763,,False,algorithm_risk_score,,,463,0,0,0


In [13]:
# Summarize the flattened frame: column names, non-null counts and dtypes.
flat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502 entries, 0 to 501
Data columns (total 24 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   _id                              502 non-null    object 
 1   processing_timestamp             62 non-null     object 
 2   loan_purpose                     50 non-null     object 
 3   notes                            2 non-null      object 
 4   applicant_full_name              502 non-null    object 
 5   applicant_email                  502 non-null    object 
 6   applicant_ssn                    497 non-null    object 
 7   applicant_ip_address             497 non-null    object 
 8   applicant_gender                 501 non-null    object 
 9   applicant_date_of_birth          501 non-null    object 
 10  applicant_zip_code               501 non-null    object 
 11  financial_annual_income          497 non-null    object 
 12  financial_credit_histo

In [18]:
# Peek at the raw annual income values and the dtype pandas inferred for them.
flat_df["financial_annual_income"]

0       73000
1       78000
2       61000
3      103000
4       57000
        ...  
497     22000
498     78000
499     96000
500    106000
501    104000
Name: financial_annual_income, Length: 502, dtype: object

In [19]:
# List the distinct gender encodings present in the data.
flat_df["applicant_gender"].unique()

array(['Male', 'M', 'F', 'Female', '', nan], dtype=object)