# Task 1: Germam Credict Risk Prediction Using Machine Learning

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer  # ← this is the missing one
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [3]:
df = pd.read_csv("GermanCredit.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_length,installment_rate,personal_status,...,age,installment_plan,housing,existing_credits,default,dependents,telephone,foreign_worker,job,gender
0,0,-43.0,6,critical,radio/tv,1169,,13 years,4,single,...,67,none,own,2,0,1,2349340000.0,yes,skilled employee,male
1,1,75.0,48,repaid,radio/tv,5951,89.0,2 years,2,,...,22,none,own,1,1,1,,yes,skilled employee,female
2,2,,12,critical,education,2096,24.0,5 years,2,single,...,49,none,own,1,0,2,,yes,unskilled resident,male
3,3,-32.0,42,repaid,furniture,7882,9.0,5 years,2,single,...,45,none,for free,1,0,2,,yes,skilled employee,male
4,4,-23.0,24,delayed,car (new),4870,43.0,3 years,3,single,...,53,none,for free,2,1,2,,yes,skilled employee,male


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            1000 non-null   int64  
 1   checking_balance      606 non-null    float64
 2   months_loan_duration  1000 non-null   int64  
 3   credit_history        1000 non-null   object 
 4   purpose               1000 non-null   object 
 5   amount                1000 non-null   int64  
 6   savings_balance       817 non-null    float64
 7   employment_length     938 non-null    object 
 8   installment_rate      1000 non-null   int64  
 9   personal_status       690 non-null    object 
 10  other_debtors         1000 non-null   object 
 11  residence_history     870 non-null    object 
 12  property              1000 non-null   object 
 13  age                   1000 non-null   int64  
 14  installment_plan      1000 non-null   object 
 15  housing               

In [5]:
# Clean column names (strip spaces, fix naming issues)
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Handle missing values (replace blank strings with NaN)
df.replace(r'^\s*$', pd.NA, regex=True, inplace=True)

In [6]:
df.isna().sum()

unnamed:_0                0
checking_balance        394
months_loan_duration      0
credit_history            0
purpose                   0
amount                    0
savings_balance         183
employment_length        62
installment_rate          0
personal_status         310
other_debtors             0
residence_history       130
property                  0
age                       0
installment_plan          0
housing                   0
existing_credits          0
default                   0
dependents                0
telephone               596
foreign_worker            0
job                       0
gender                    0
dtype: int64

## Convert columns to appropriate types

In [7]:
df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['months_loan_duration'] = pd.to_numeric(df['months_loan_duration'], errors='coerce')
df['default'] = df['default'].astype(int)
df['default'] = df['default'].apply(lambda x: 0 if x == 1 else 1)  # 1: good, 2: default → 0: good, 1: default

### Define features and target

In [8]:
target = 'default'
drop_cols = ['telephone', 'observation_id'] if 'observation_id' in df.columns else ['telephone']

X = df.drop(columns=[target] + drop_cols, errors='ignore')
y = df[target]


print('X shape: ' , X.shape)
print(X.head())

X shape:  (1000, 21)
   unnamed:_0  checking_balance  months_loan_duration credit_history  \
0           0             -43.0                     6       critical   
1           1              75.0                    48         repaid   
2           2               NaN                    12       critical   
3           3             -32.0                    42         repaid   
4           4             -23.0                    24        delayed   

     purpose  amount  savings_balance employment_length  installment_rate  \
0   radio/tv    1169              NaN          13 years                 4   
1   radio/tv    5951             89.0           2 years                 2   
2  education    2096             24.0           5 years                 2   
3  furniture    7882              9.0           5 years                 2   
4  car (new)    4870             43.0           3 years                 3   

  personal_status  ... residence_history                  property age  \
0        

In [9]:
# Identify feature types
numeric_features = ['amount', 'age', 'months_loan_duration']
categorical_features = X.select_dtypes(include='object').columns.tolist()

In [12]:
# Preprocessing pipeline that also handles missing values
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])



# Combine preprocessing + model
clf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])



In [13]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train model
clf_pipeline.fit(X_train, y_train)

# Evaluate
y_pred = clf_pipeline.predict(X_test)
y_proba = clf_pipeline.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save model
joblib.dump(clf_pipeline, "credit_risk_model.pkl")


              precision    recall  f1-score   support

           0       0.54      0.22      0.31        60
           1       0.73      0.92      0.82       140

    accuracy                           0.71       200
   macro avg       0.64      0.57      0.56       200
weighted avg       0.68      0.71      0.66       200

ROC-AUC: 0.6921428571428572
Confusion Matrix:
 [[ 13  47]
 [ 11 129]]


['credit_risk_model.pkl']

# Scoring New Customers


In [14]:
def score_new_customer(new_data: pd.DataFrame):
    model = joblib.load("credit_risk_model.pkl")
    new_data.replace(r'^\s*$', pd.NA, regex=True, inplace=True)
    return model.predict_proba(new_data)[:, 1]


In [15]:
new_customer = pd.DataFrame([{
    'checking_balance': '169',
    'months_loan_duration': 36,
    'credit_history': 'repaid',
    'purpose': 'car (used)',
    'amount': 6948,
    'savings_balance': '57',
    'employment_length': '2 years',
    'installment_rate': 2,
    'personal_status': 'single',
    'other_debtors': 'none',
    'residence_history': '3 months',
    'property': 'other',
    'age': 35,
    'installment_plan': 'none',
    'housing': 'rent',
    'existing_credits': 1,
    'dependents': 0,
    'foreign_worker': 'yes',
    'job': 'mangement self-employed',
    'gender': 'male'
}])

score = score_new_customer(new_customer)
print(f"Probability of default: {score[0]:.4f}")


Probability of default: 0.8800


# Task 2: Feature Engineering Challenge 

In [17]:
import json
import pandas as pd
import numpy as np


with open('Credit_bureau_sample_data.json', 'r') as f:
    data = json.load(f)


In [18]:
import pprint
pprint.pprint(data[0], depth=3)


{'application_id': 97,
 'data': {'consumerfullcredit': {'accountmonthlypaymenthistory': [...],
                                 'accountmonthlypaymenthistoryheader': {...},
                                 'accountrating': {...},
                                 'creditaccountsummary': {...},
                                 'creditagreementsummary': [...],
                                 'deliquencyinformation': {...},
                                 'employmenthistory': [...],
                                 'enquirydetails': {...},
                                 'enquiryhistorytop': [...],
                                 'guarantorcount': {...},
                                 'guarantordetails': {...},
                                 'personaldetailssummary': {...},
                                 'subjectlist': {...},
                                 'telephonehistory': [...]}}}


## Goal:
### To Write a Python class that:
Parses relevant sections (like creditaccountsummary, deliquencyinformation, etc.)

Extracts meaningful flat numeric or categorical features

Outputs a dictionary or pandas row per report

## 1

| Section                  | Features to Extract                                                               | Why They Matter                                      |
| ------------------------ | --------------------------------------------------------------------------------- | ---------------------------------------------------- |
| `creditaccountsummary`   | - total\_accounts<br>- total\_balance<br>- total\_overdue<br>- avg\_credit\_limit | Core credit risk indicators                          |
| `deliquencyinformation`  | - total\_defaults<br>- recent\_delinquency\_months<br>- worst\_status             | Past behavior predicts future risk                   |
| `employmenthistory`      | - current\_employer\_duration<br>- stable\_employment flag                        | Stable jobs = better repayment likelihood            |
| `enquirydetails`         | - total\_recent\_enquiries<br>- recent\_enquiry\_types                            | Too many recent inquiries may signal credit shopping |
| `creditagreementsummary` | - avg\_loan\_amount<br>- avg\_tenure<br>- open\_loan\_count                       | Indicates borrowing behavior and exposure            |
| `personaldetailssummary` | - age<br>- marital\_status<br>- home\_ownership                                   | Useful for segmenting risk profiles                  |


In [23]:

class CreditBureauFeatureExtractor:
    def __init__(self):
        pass

    def extract_features(self, record):
        features = {}
        report = record.get("data", {}).get("consumerfullcredit", {})
        
        # Basic info
        features["application_id"] = record.get("application_id")

        # --- CREDIT ACCOUNT SUMMARY ---
        acc_summary = report.get("creditaccountsummary", {})
        features["total_accounts"] = acc_summary.get("noofaccounts", np.nan)
        features["total_balance"] = acc_summary.get("totalbalanceamount", np.nan)
        features["total_overdue"] = acc_summary.get("totaloverdueamount", np.nan)

        # --- DELINQUENCY INFO ---
        delinquency = report.get("deliquencyinformation", {})
        features["worst_account_status"] = delinquency.get("worstaccountstatus", np.nan)
        features["months_since_last_deliquency"] = delinquency.get("monthsincelastdeliquency", np.nan)
        features["noofaccountswithhistory30dayspastdue"] = delinquency.get("noofaccountswithhistory30dayspastdue", np.nan)

        # --- ENQUIRY DETAILS ---
        enquiries = report.get("enquirydetails", {})
        features["total_enquiries"] = enquiries.get("totalenquiriescount", np.nan)
        features["recent_enquiries_30d"] = enquiries.get("noofenquiriesinlast30days", np.nan)

        # --- AGREEMENT SUMMARY ---
        agreements = report.get("creditagreementsummary", [])
        loan_amounts = [float(a.get("highcreditamount", 0)) for a in agreements if "highcreditamount" in a]
        tenures = [int(a.get("tenure", 0)) for a in agreements if "tenure" in a]
        statuses = [a.get("accountstatus") for a in agreements]

        features["avg_loan_amount"] = np.mean(loan_amounts) if loan_amounts else np.nan
        features["avg_loan_tenure"] = np.mean(tenures) if tenures else np.nan
        features["open_loans"] = sum(1 for s in statuses if s == "Open")

        # --- EMPLOYMENT HISTORY ---
        employment = report.get("employmenthistory", [])
        if employment:
            current_job = employment[0]  # Assume the first is most recent
            features["employment_status"] = current_job.get("employmenttype")
            features["months_in_current_job"] = current_job.get("durationinmonths")

        # --- PERSONAL DETAILS (if available) ---
        personal = report.get("personaldetailssummary", {})
        features["marital_status"] = personal.get("maritalstatus")
        features["residencetype"] = personal.get("residencetype")

        return features

    def transform(self, dataset):
        return pd.DataFrame([self.extract_features(record) for record in dataset])


In [25]:
import json

with open("Credit_bureau_sample_data.json", "r") as f:
    data = json.load(f)

extractor = CreditBureauFeatureExtractor()
df_features = extractor.transform(data)
df_features.head()


Unnamed: 0,application_id,total_accounts,total_balance,total_overdue,worst_account_status,months_since_last_deliquency,noofaccountswithhistory30dayspastdue,total_enquiries,recent_enquiries_30d,avg_loan_amount,avg_loan_tenure,open_loans,employment_status,months_in_current_job,marital_status,residencetype
0,97,,,,,,,,,,,3,,,,
1,9714953,,,,,,,,,,,4,,,,
2,9714978,,,,,,,,,,,3,,,,


## 2

| Feature                       | Description                             | Why it Matters for Risk                                                           |
| ----------------------------- | --------------------------------------- | --------------------------------------------------------------------------------- |
| `num_total_accounts`          | Total number of credit lines            | More accounts may mean experience with credit, but also higher potential exposure |
| `num_active_accounts`         | Accounts currently open                 | High number could indicate risk if utilization is also high                       |
| `total_credit_limit`          | Sum of all credit limits                | Indicates available credit                                                        |
| `credit_utilization_ratio`    | Total balance / Total limit             | A high ratio suggests over-leveraging                                             |
| `num_delinquent_accounts`     | Number of accounts with missed payments | Direct indicator of past risk                                                     |
| `months_since_oldest_account` | Age of credit history                   | Longer history usually means lower risk                                           |
| `num_recent_inquiries`        | Number of recent hard checks            | High recent checks may suggest financial stress                                   |


In [27]:
import json
from typing import List, Union
import pandas as pd
from datetime import datetime

class CreditBureauFeatureExtractor:
    def __init__(self, current_date: str = None):
        self.current_date = pd.to_datetime(current_date or datetime.today().strftime('%Y-%m-%d'))

    def extract_features(self, credit_reports: Union[str, List[dict]]) -> pd.DataFrame:
        if isinstance(credit_reports, str):
            with open(credit_reports, "r") as f:
                data = json.load(f)
        else:
            data = credit_reports

        feature_rows = []
        for record in data:
            app_id = record.get("application_id")
            report = record.get("data", {})

            # Parse key features (assuming structure)
            accounts = report.get("accounts", [])
            inquiries = report.get("inquiries", [])
            delinquencies = report.get("delinquencies", [])
            public_records = report.get("public_records", [])

            total_accounts = len(accounts)
            active_accounts = sum(1 for acc in accounts if acc.get("status") == "active")
            total_limit = sum(acc.get("credit_limit", 0) for acc in accounts if acc.get("credit_limit"))
            total_balance = sum(acc.get("balance", 0) for acc in accounts if acc.get("balance"))
            utilization_ratio = total_balance / total_limit if total_limit > 0 else 0

            # Time-based feature
            account_open_dates = [pd.to_datetime(acc.get("opened_date")) for acc in accounts if acc.get("opened_date")]
            if account_open_dates:
                oldest_account_age = (self.current_date - min(account_open_dates)).days // 30
            else:
                oldest_account_age = 0

            recent_inquiries = sum(1 for inq in inquiries if pd.to_datetime(inq.get("date")) > self.current_date - pd.DateOffset(months=6))
            num_delinquent_accounts = len(delinquencies)
            num_public_records = len(public_records)

            # Assemble feature dict
            features = {
                "application_id": app_id,
                "num_total_accounts": total_accounts,
                "num_active_accounts": active_accounts,
                "total_credit_limit": total_limit,
                "total_balance": total_balance,
                "credit_utilization_ratio": utilization_ratio,
                "months_since_oldest_account": oldest_account_age,
                "num_recent_inquiries_6m": recent_inquiries,
                "num_delinquent_accounts": num_delinquent_accounts,
                "num_public_records": num_public_records
            }

            feature_rows.append(features)

        return pd.DataFrame(feature_rows)


In [29]:
extractor = CreditBureauFeatureExtractor()
df_features = extractor.extract_features("Credit_bureau_sample_data.json")
df_features.head()


Unnamed: 0,application_id,num_total_accounts,num_active_accounts,total_credit_limit,total_balance,credit_utilization_ratio,months_since_oldest_account,num_recent_inquiries_6m,num_delinquent_accounts,num_public_records
0,97,0,0,0,0,0,0,0,0,0
1,9714953,0,0,0,0,0,0,0,0,0
2,9714978,0,0,0,0,0,0,0,0,0


## 3

| Feature Name                 | Description                       | Reason for Use                   |
| ---------------------------- | --------------------------------- | -------------------------------- |
| `num_open_accounts`          | Count of currently open accounts  | Reflects current credit exposure |
| `total_outstanding_balance`  | Sum of outstanding balances       | Measures total debt              |
| `average_account_age_months` | Avg. age of accounts in months    | Indicates credit experience      |
| `num_delinquent_accounts`    | Accounts with overdue payments    | High correlation with default    |
| `recent_inquiries`           | Credit inquiries in last 6 months | Measures credit-seeking behavior |
| `num_closed_accounts`        | Closed loan accounts              | Historical credit behavior       |
| `num_accounts_with_defaults` | Accounts flagged for default      | Direct risk indicator            |


In [30]:
import json
import pandas as pd
from datetime import datetime

class CreditReportFeatureExtractor:
    def __init__(self):
        pass

    def extract_features(self, credit_reports):
        features = []
        for entry in credit_reports:
            app_id = entry.get("application_id")
            data = entry.get("data", {})

            feature_dict = {"application_id": app_id}

            # Example features (adjust based on your actual structure)
            accounts = data.get("accounts", [])
            inquiries = data.get("inquiries", [])
            defaults = data.get("defaults", [])
            
            feature_dict["num_open_accounts"] = sum(1 for a in accounts if a.get("account_status") == "open")
            feature_dict["num_closed_accounts"] = sum(1 for a in accounts if a.get("account_status") == "closed")
            feature_dict["total_outstanding_balance"] = sum(float(a.get("outstanding_balance", 0)) for a in accounts)
            feature_dict["num_delinquent_accounts"] = sum(1 for a in accounts if a.get("delinquency_status") == "delinquent")
            feature_dict["num_accounts_with_defaults"] = len(defaults)
            feature_dict["recent_inquiries"] = sum(1 for i in inquiries if self._is_recent(i.get("inquiry_date")))

            # Account age
            account_ages = [
                self._months_between(a.get("opened_date")) for a in accounts if a.get("opened_date")
            ]
            feature_dict["average_account_age_months"] = sum(account_ages)/len(account_ages) if account_ages else 0

            features.append(feature_dict)

        return pd.DataFrame(features)

    def _is_recent(self, date_str, months=6):
        """Check if the inquiry date is within the last `months` months"""
        if not date_str:
            return False
        try:
            date = datetime.strptime(date_str, "%Y-%m-%d")
            delta = datetime.now() - date
            return delta.days <= months * 30
        except:
            return False

    def _months_between(self, start_date_str):
        """Calculate months between a date and today"""
        if not start_date_str:
            return 0
        try:
            start = datetime.strptime(start_date_str, "%Y-%m-%d")
            now = datetime.now()
            return (now.year - start.year) * 12 + now.month - start.month
        except:
            return 0


In [32]:
# Load JSON
with open("Credit_bureau_sample_data.json", "r") as f:
    data = json.load(f)

extractor = CreditReportFeatureExtractor()
df = extractor.extract_features(data)

df.head()


Unnamed: 0,application_id,num_open_accounts,num_closed_accounts,total_outstanding_balance,num_delinquent_accounts,num_accounts_with_defaults,recent_inquiries,average_account_age_months
0,97,0,0,0,0,0,0,0
1,9714953,0,0,0,0,0,0,0
2,9714978,0,0,0,0,0,0,0


## 4

| Feature Name              | Description                              | Why it's Relevant for Credit Risk                 |
| ------------------------- | ---------------------------------------- | ------------------------------------------------- |
| `num_accounts`            | Number of credit accounts                | More accounts may suggest experience or risk      |
| `num_open_accounts`       | Number of currently open/active accounts | High number may signal risk                       |
| `num_delinquent_accounts` | Number of accounts with late payments    | Strong indicator of financial distress            |
| `total_credit_limit`      | Total credit available across accounts   | Indicates creditworthiness and trust from lenders |
| `total_current_balance`   | Sum of outstanding balances              | High balance-to-limit ratio suggests higher risk  |
| `num_recent_inquiries`    | Number of recent hard credit checks      | Too many = possible financial strain              |
| `avg_account_age_months`  | Average age of accounts in months        | Longer histories typically signal lower risk      |


In [34]:
import json
import pandas as pd
from typing import List, Dict, Any
from datetime import datetime

class CreditReportFeatureExtractor:
    def __init__(self):
        pass

    def extract_features(self, reports: List[Dict[str, Any]]) -> pd.DataFrame:
        feature_rows = []

        for report in reports:
            app_id = report.get("application_id")
            data = report.get("data", {})

            accounts = data.get("accounts", [])
            inquiries = data.get("inquiries", [])
            delinquencies = [acc for acc in accounts if acc.get("status") == "delinquent"]
            open_accounts = [acc for acc in accounts if acc.get("status") == "open"]

            total_credit_limit = sum(acc.get("credit_limit", 0) or 0 for acc in accounts)
            total_current_balance = sum(acc.get("balance", 0) or 0 for acc in accounts)

            # Account age in months
            account_ages = []
            for acc in accounts:
                open_date_str = acc.get("open_date")
                if open_date_str:
                    try:
                        open_date = datetime.strptime(open_date_str, "%Y-%m-%d")
                        months = (datetime.today().year - open_date.year) * 12 + (datetime.today().month - open_date.month)
                        account_ages.append(months)
                    except:
                        pass

            avg_account_age_months = round(sum(account_ages) / len(account_ages), 2) if account_ages else 0

            # Recent inquiries (past 6 months)
            recent_inquiries = [
                iq for iq in inquiries if "date" in iq and self._is_recent(iq["date"])
            ]

            feature_rows.append({
                "application_id": app_id,
                "num_accounts": len(accounts),
                "num_open_accounts": len(open_accounts),
                "num_delinquent_accounts": len(delinquencies),
                "total_credit_limit": total_credit_limit,
                "total_current_balance": total_current_balance,
                "balance_to_limit_ratio": round(total_current_balance / total_credit_limit, 2) if total_credit_limit else 0,
                "num_recent_inquiries": len(recent_inquiries),
                "avg_account_age_months": avg_account_age_months,
            })

        return pd.DataFrame(feature_rows)

    def _is_recent(self, date_str: str, months: int = 6) -> bool:
        """Check if a date string is within the last `months` months."""
        try:
            date = datetime.strptime(date_str, "%Y-%m-%d")
            delta_months = (datetime.today().year - date.year) * 12 + (datetime.today().month - date.month)
            return delta_months <= months
        except:
            return False


In [35]:
with open("Credit_bureau_sample_data.json", "r") as f:
    reports = json.load(f)

extractor = CreditReportFeatureExtractor()
features_df = extractor.extract_features(reports)

features_df.head()

Unnamed: 0,application_id,num_accounts,num_open_accounts,num_delinquent_accounts,total_credit_limit,total_current_balance,balance_to_limit_ratio,num_recent_inquiries,avg_account_age_months
0,97,0,0,0,0,0,0,0,0
1,9714953,0,0,0,0,0,0,0,0
2,9714978,0,0,0,0,0,0,0,0


## 5

| **Feature**                                                          | **Description**                                                                                                  | **Relevance for Credit Risk Scoring**                                                                                                                                                              |
| -------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **No of bad accounts (e.g., noofotheraccountsbad)**                  | Total number of accounts with negative performance (e.g., overdue, defaulted).                                   | Indicates the borrower's past difficulties in managing credit accounts. A higher number of bad accounts suggests a higher likelihood of future default, which is crucial for assessing risk.       |
| **No of good accounts (e.g., noofotheraccountsgood)**                | Total number of accounts in good standing (e.g., no overdue payments).                                           | A higher number of good accounts suggests better credit management and a lower likelihood of default. This feature can improve the model by balancing out the effect of bad accounts.              |
| **Total outstanding debt (e.g., totaloutstandingdebt)**              | Total amount of outstanding debt across all accounts.                                                            | The larger the total debt, the higher the borrower’s financial burden, which increases the likelihood of default. Understanding the size of the borrower’s financial obligations is critical.      |
| **Total arrears (e.g., totalaccountarrear)**                         | Total overdue amount on accounts.                                                                                | Arrears are a strong indicator of financial distress. A larger arrears balance signals greater credit risk.                                                                                        |
| **Total monthly installment (e.g., totalmonthlyinstalment)**         | Total monthly payment commitment across all loans.                                                               | Higher monthly obligations can strain a borrower’s finances, especially if they are close to or exceed their income. This feature is key for assessing repayment capacity and default likelihood.  |
| **Employment status (e.g., currentemployer\_duration)**              | Duration of current employment or employer stability.                                                            | Stable employment is linked to higher income stability, reducing the likelihood of default. Longer tenure with an employer improves predictability of future financial behavior.                   |
| **Guarantor count (e.g., guarantorcount)**                           | Number of guarantors associated with the borrower’s loans.                                                       | A guarantor provides a safety net for lenders. If there are no guarantors, it may signal higher risk. This can influence creditworthiness by adding a layer of security or increasing risk.        |
| **Recent inquiries (e.g., total\_recent\_enquiries)**                | Number of recent credit inquiries made by the borrower.                                                          | Numerous recent inquiries might suggest credit shopping or financial distress, which increases the likelihood of default. High inquiry frequency is often associated with higher credit risk.      |
| **Age (e.g., birthdate)**                                            | Borrower's age.                                                                                                  | Age can correlate with financial stability. Younger borrowers may be riskier due to limited credit history, while older borrowers may have more established financial habits.                      |
| **Account performance (e.g., amountoverdue, currentbalanceamt)**     | Amount overdue and current balance in loan accounts.                                                             | Overdue amounts and large current balances suggest financial strain, directly influencing the risk of default. A low overdue amount or zero balance indicates better financial health.             |
| **Account status (e.g., accountstatus)**                             | Status of loan accounts (e.g., open, written off, performing, etc.).                                             | The status of accounts shows whether the borrower is actively managing their debt or whether they have written off accounts, signaling a higher risk of default in the latter case.                |
| **Type of loan (e.g., loan type like personal loan, overdraft)**     | Type of loan or credit agreement (e.g., personal loan, secured loan, overdraft).                                 | Different types of loans have varying risk profiles. Secured loans typically have lower risk, while unsecured loans or overdrafts carry higher risk, which can impact scoring.                     |
| **Guarantor information (e.g., guarantorgender, guarantoraddress1)** | Details about the guarantor (if available).                                                                      | The presence and quality of a guarantor can mitigate risk by providing additional security. Missing or inadequate guarantor information could increase risk.                                       |
| **Monthly payment history (e.g., accountmonthlypaymenthistory)**     | Payment history for the last 24 months.                                                                          | Payment history is one of the most powerful indicators of future behavior. Late or missed payments increase the likelihood of default. This feature can significantly improve prediction accuracy. |
| **Default frequency (e.g., monthsinarrears)**                        | Number of months an account has been in arrears.                                                                 | Prolonged arrears (e.g., 13 months in arrears) indicate severe payment issues and signal high credit risk, which should be strongly factored into the scoring model.                               |
| **Loan duration (e.g., loanduration)**                               | Duration of the loan account (e.g., the length of time since the loan was opened or is expected to be paid off). | Loan duration helps assess whether the borrower has a long-term financial commitment or has been carrying debt for an extended period, which may indicate either stability or distress.            |
| **Credit usage pattern (e.g., total\_number\_of\_judgements)**       | Total number of judgments made on the borrower's accounts.                                                       | Legal judgments are a critical indicator of financial and legal disputes, which directly affect credit risk. The higher the number of judgments, the higher the risk.                              |
| **Property ownership (e.g., propertyownedtype)**                     | Whether the borrower owns property.                                                                              | Homeownership typically indicates financial stability and reduces default risk. Borrowers who rent may be more financially unstable.                                                               |


In [54]:
import json
import pandas as pd
from datetime import datetime
import re

class CreditBureauFeatureExtractor:
    def __init__(self):
        pass

    def clean_numeric(self, value):
        """Convert string numbers with commas to float"""
        if value is None or isinstance(value, (int, float)):
            return value if value is not None else 0
        if isinstance(value, str) and value.strip() in ('', '-', 'null', 'None'):
            return 0
        try:
            return float(re.sub(r'[^\d.]', '', str(value)))
        except:
            return 0

    def calculate_age(self, birthdate_str):
        """Calculate age from birthdate string"""
        if not birthdate_str or birthdate_str.strip() in ('', '-'):
            return None
        try:
            birthdate = datetime.strptime(birthdate_str, "%d/%m/%Y")
            today = datetime.today()
            return today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))
        except:
            return None

    def process_account_ratings(self, account_rating):
        """Extract good/bad account counts"""
        features = {}
        features['no_of_bad_accounts'] = sum(
            self.clean_numeric(account_rating.get(field, 0))
            for field in [
                'noofotheraccountsbad', 'noofretailaccountsbad', 'nooftelecomaccountsbad',
                'noofautoloanaccountsbad', 'noofhomeloanaccountsbad', 'noofjointloanaccountsbad',
                'noofstudyloanaccountsbad', 'noofcreditcardaccountsbad', 'noofpersonalloanaccountsbad'
            ]
        )
        features['no_of_good_accounts'] = sum(
            self.clean_numeric(account_rating.get(field, 0))
            for field in [
                'noofotheraccountsgood', 'noofretailaccountsgood', 'nooftelecomaccountsgood',
                'noofautoloanccountsgood', 'noofhomeloanaccountsgood', 'noofjointloanaccountsgood',
                'noofstudyloanaccountsgood', 'noofcreditcardaccountsgood', 'noofpersonalloanaccountsgood'
            ]
        )
        return features

    def process_credit_summary(self, credit_summary):
        """Extract debt and arrears information"""
        features = {}
        features['total_outstanding_debt'] = self.clean_numeric(credit_summary.get('totaloutstandingdebt', 0))
        features['total_arrears'] = self.clean_numeric(credit_summary.get('amountarrear', 0))
        features['total_monthly_instalment'] = self.clean_numeric(credit_summary.get('totalmonthlyinstalment', 0))
        features['total_number_of_judgements'] = self.clean_numeric(credit_summary.get('totalnumberofjudgement', 0))
        return features

    def process_enquiry_history(self, enquiry_history):
        """Count recent credit inquiries"""
        features = {'total_recent_enquiries': 0}
        if not enquiry_history:
            return features
            
        recent_count = 0
        for enquiry in enquiry_history:
            try:
                enquiry_date = datetime.strptime(enquiry['daterequested'], "%d/%m/%Y %H:%M:%S")
                if (datetime.now() - enquiry_date).days <= 90:
                    recent_count += 1
            except:
                continue
        features['total_recent_enquiries'] = recent_count
        return features

    def process_credit_agreements(self, credit_agreements):
        """Analyze loan accounts"""
        features = {
            'personal_loan_count': 0,
            'overdraft_count': 0,
            'max_amount_overdue': 0,
            'avg_loan_duration_days': 0,
            'written_off_accounts': 0
        }
        if not credit_agreements:
            return features
            
        total_duration = 0
        valid_durations = 0
        max_overdue = 0
        personal_loans = 0
        overdrafts = 0
        written_off = 0
        
        for account in credit_agreements:
            # Loan type counts
            desc = str(account.get('indicatordescription', '')).lower()
            if 'personal' in desc:
                personal_loans += 1
            if 'overdraft' in desc:
                overdrafts += 1
            
            # Account status
            if account.get('accountstatus') == 'WrittenOff':
                written_off += 1
            
            # Amount overdue
            overdue = self.clean_numeric(account.get('amountoverdue', 0))
            if overdue > max_overdue:
                max_overdue = overdue
            
            # Loan duration
            duration = self.clean_numeric(account.get('loanduration', 0))
            if duration > 0:
                total_duration += duration
                valid_durations += 1
        
        features['personal_loan_count'] = personal_loans
        features['overdraft_count'] = overdrafts
        features['max_amount_overdue'] = max_overdue
        features['written_off_accounts'] = written_off
        if valid_durations > 0:
            features['avg_loan_duration_days'] = total_duration / valid_durations
        
        return features

    def process_delinquency(self, delinquency_info):
        """Extract months in arrears"""
        features = {'max_months_in_arrears': 0}
        if not delinquency_info:
            return features
            
        months = self.clean_numeric(delinquency_info.get('monthsinarrears', 0))
        features['max_months_in_arrears'] = months
        return features

    def process_personal_details(self, personal_details):
        """Extract demographic information"""
        features = {
            'age': self.calculate_age(personal_details.get('birthdate')),
            'property_owned': 1 if personal_details.get('propertyownedtype') else 0,
            'employment_status': 'Employed' if personal_details.get('employerdetail') else 'Unknown'
        }
        return features

    def process_guarantor_info(self, guarantor_details, guarantor_count):
        """Analyze guarantor information"""
        features = {
            'guarantor_count': self.clean_numeric(guarantor_count.get('accounts', 0)),
            'has_guarantor': 0
        }
        
        if guarantor_details:
            for k, v in guarantor_details.items():
                if k != 'guarantordateofbirth' and v not in (None, '', 'null', '1900-01-01T00:00:00+01:00'):
                    features['has_guarantor'] = 1
                    break
        return features

    def extract_features(self, credit_report):
        """Main feature extraction method"""
        features = {'application_id': credit_report.get('application_id')}
        
        if not credit_report or 'data' not in credit_report:
            return features
            
        data = credit_report['data']
        consumer_data = data.get('consumerfullcredit', {})
        
        # Process each data section
        features.update(self.process_account_ratings(consumer_data.get('accountrating', {})))
        features.update(self.process_credit_summary(consumer_data.get('creditaccountsummary', {})))
        features.update(self.process_enquiry_history(consumer_data.get('enquiryhistorytop', [])))
        features.update(self.process_credit_agreements(consumer_data.get('creditagreementsummary', [])))
        features.update(self.process_delinquency(consumer_data.get('deliquencyinformation', {})))
        features.update(self.process_personal_details(consumer_data.get('personaldetailssummary', {})))
        features.update(self.process_guarantor_info(
            consumer_data.get('guarantordetails', {}),
            consumer_data.get('guarantorcount', {})
        ))
        
        return features

    def process_reports(self, credit_reports):
        """Process multiple credit reports into a DataFrame"""
        features_list = []
        
        for report in credit_reports:
            if not isinstance(report, dict) or 'application_id' not in report:
                continue
            features_list.append(self.extract_features(report))
        
        return pd.DataFrame(features_list).set_index('application_id')



In [52]:
# Example Usage
if __name__ == "__main__":
    # Assuming 'json_data' is a valid JSON string (from a file or API)
# Example usage
    with open('Credit_bureau_sample_data.json', 'r') as f:
        json_data = f.read()
    
    parser = CreditReportParser(json_data)
    df = parser.to_dataframe()
    print(df)

Type of the loaded JSON data: <class 'list'>
Structure of the loaded JSON data: [{'application_id': 97, 'data': {'consumerfullcredit': {'subjectlist': {'reference': '12876566', 'consumerid': '17628566', 'searchoutput': 'XXX '}, 'accountrating': {'noofotheraccountsbad': '0', 'noofotheraccountsgood': '3', 'noofretailaccountsbad': '0', 'noofretailaccountsgood': '2', 'nooftelecomaccountsbad': '0', 'noofautoloanaccountsbad': '0', 'noofautoloanccountsgood': '0', 'noofhomeloanaccountsbad': '0', 'nooftelecomaccountsgood': '0', 'noofhomeloanaccountsgood': '0', 'noofjointloanaccountsbad': '0', 'noofstudyloanaccountsbad': '0', 'noofcreditcardaccountsbad': '0', 'noofjointloanaccountsgood': '0', 'noofstudyloanaccountsgood': '0', 'noofcreditcardaccountsgood': '1', 'noofpersonalloanaccountsbad': '0', 'noofpersonalloanaccountsgood': '1'}, 'enquirydetails': {'productid': '45', 'matchingrate': '90', 'subscriberenquiryengineid': '5012874225', 'subscriberenquiryresultid': '6381470'}, 'guarantorcount': {'a

In [57]:
# Example usage:
# if __name__ == "__main__":
    
# Load sample data
with open('Credit_bureau_sample_data.json') as f:
    credit_reports = json.load(f)

# Initialize and process
extractor = CreditBureauFeatureExtractor()
features_df = extractor.process_reports(credit_reports)

features_df.head()

Unnamed: 0_level_0,no_of_bad_accounts,no_of_good_accounts,total_outstanding_debt,total_arrears,total_monthly_instalment,total_number_of_judgements,total_recent_enquiries,personal_loan_count,overdraft_count,max_amount_overdue,avg_loan_duration_days,written_off_accounts,max_months_in_arrears,age,property_owned,employment_status,guarantor_count,has_guarantor
application_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
97,0.0,7.0,105435.0,24041.0,77404.0,0.0,0,3,2,22441.39,1775.75,1,13.0,34,0,Employed,0.0,0
9714953,0.0,17.0,294770.0,0.0,132176.0,0.0,0,13,1,0.0,414.4,0,2.0,39,0,Unknown,0.0,0
9714978,1.0,2.0,110919.0,12000.0,7000.0,0.0,0,3,1,12000.0,187.5,0,109.0,41,0,Unknown,0.0,0
