In [3]:
# Import the libraries and functions necessary for the pipeline

import joblib
import pandas as pd
from numpy import inf, nan, where
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

In [4]:
# Column groups consumed by the feature engineering pipeline: which raw
# columns are read, which are summed, which are encoded, and which are
# finally handed to the model.

# Income components; they are summed to build the total income
FEATURES_INCOME = [
    "income_from_employer", "income_from_pension",
    "income_from_family_allowance", "income_from_social_welfare",
    "income_from_leave_pay", "income_from_child_support", "income_other",
]

# The raw features needed to create all input features
FEATURES_ALL = [
    "new_customer", "application_date", "income_verification", "language",
    "date_of_birth", "gender", "country", "loan_amount", "county", "city",
    "use_of_loan", "education", "marital_status", "nr_dependants",
    "employment_status", "employment_duration", "employment_position",
    "work_experience", "occupation", "home_ownership",
    *FEATURES_INCOME,
    "nr_debt_items", "total_debt", "credit_score_1", "credit_score_2",
    "credit_score_3", "credit_score_4", "nr_previous_loans",
    "amount_previous_loans", "previous_repayments",
    "previous_early_repayments", "previous_early_repayments_count",
]

# Input features to the model (important: the order must match
# the order used when training the model)
FEATURES_INPUT = [
    "new_customer", "income_verification", "language", "gender", "country",
    "loan_amount", "use_of_loan", "education", "marital_status",
    "nr_dependants", "employment_status", "employment_duration",
    "work_experience", "occupation", "home_ownership",
    *FEATURES_INCOME,
    "nr_debt_items", "total_debt", "credit_score_1", "credit_score_2",
    "credit_score_3", "credit_score_4", "nr_previous_loans",
    "amount_previous_loans", "previous_repayments",
    "previous_early_repayments", "previous_early_repayments_count",
    # Features created inside the pipeline
    "total_income", "dti", "cash", "age", "dow", "dom", "month", "hour",
]

# Columns passed through the encoder
FEATURES_ENCODE = [
    "income_verification", "language", "gender", "country", "use_of_loan",
    "education", "marital_status", "employment_status",
    "employment_duration", "work_experience", "occupation",
    "home_ownership", "credit_score_1", "credit_score_2", "credit_score_3",
]

In [5]:
# Parameters learned during the development of the credit risk model,
# hard-coded here: the fill value for each column with missing data and
# the categories frequent enough to keep as they are.

# Fill values: -1 for the first group of columns, the label 'missing'
# for the second group
IMPUTATION_DICT = {
    **dict.fromkeys(
        [
            "nr_dependants",
            "credit_score_4",
            "previous_repayments",
            "previous_early_repayments",
        ],
        -1,
    ),
    **dict.fromkeys(
        [
            "gender",
            "education",
            "marital_status",
            "employment_status",
            "employment_duration",
            "work_experience",
            "occupation",
            "home_ownership",
            "credit_score_1",
            "credit_score_2",
            "credit_score_3",
        ],
        "missing",
    ),
}

# Frequent categories per variable
FREQUENT_CAT_DICT = {
    "language": ["estonian", "finnish", "spanish", "russian"],
    "use_of_loan": [
        "unknown", "other", "home_improvement", "loan_consolidation",
    ],
    "occupation": ["missing", "other", "retail"],
    "home_ownership": [
        "owner", "tenant_furnished", "living_with_parents", "mortgage",
        "tenant_unfurnished",
    ],
    "credit_score_1": ["missing", "M", "M1"],
    "credit_score_2": ["missing", "B"],
    "credit_score_3": ["missing", "RL2"],
}

In [6]:
# Create a function containing the complete sequence of feature transformations and creation:

def feature_engineering_pipe(df, encoder=None):
    """Turn raw application data into the feature matrix passed to the model.

    Steps, in order: select the FEATURES_ALL columns, impute missing values
    using IMPUTATION_DICT, create the income features (total_income, dti,
    cash), create age and the calendar features from application_date,
    replace categories not listed in FREQUENT_CAT_DICT with "Rare", encode
    the FEATURES_ENCODE columns, and return the FEATURES_INPUT columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns in FEATURES_ALL. The input
        is not modified; all work is done on a copy.
    encoder : object, optional
        Fitted transformer with a ``transform`` method, applied to the
        FEATURES_ENCODE columns. When omitted, the notebook-level ``enc``
        is used, so it must have been loaded before the first call.

    Returns
    -------
    pandas.DataFrame
        The columns listed in FEATURES_INPUT, in that order (the order the
        model was trained with).

    Raises
    ------
    KeyError
        If ``df`` lacks any of the columns in FEATURES_ALL.
    """
    if encoder is None:
        # Resolved at call time, so defining this function before the
        # encoder is loaded is fine.
        encoder = enc

    # Work on a copy restricted to the raw input features
    df = df[FEATURES_ALL].copy()

    # Impute missing data
    df = df.fillna(IMPUTATION_DICT)

    # Create income related variables
    df["total_income"] = df[FEATURES_INCOME].sum(axis=1)

    # Debt-to-income ratio. A zero income yields inf (or NaN for 0/0); both
    # are mapped to 0. The cleaned Series is assigned back explicitly:
    # calling replace(..., inplace=True) on df["dti"] is a chained in-place
    # update, which does not reach the DataFrame under pandas Copy-on-Write.
    df["dti"] = (
        df["total_debt"].div(df["total_income"]).replace([inf, nan], 0)
    )

    df["cash"] = df["total_income"].sub(df["total_debt"])

    # Create datetime related features (dates are parsed once and reused)
    application_ts = pd.to_datetime(df["application_date"])
    birth_ts = pd.to_datetime(df["date_of_birth"])

    # Age in whole years, using 365-day years. astype(int) raises if either
    # date is missing for any row.
    df["age"] = ((application_ts - birth_ts).dt.days / 365).astype(int)

    df["application_date"] = application_ts
    df["dow"] = application_ts.dt.day_of_week
    df["dom"] = application_ts.dt.day
    df["month"] = application_ts.dt.month
    df["hour"] = application_ts.dt.hour

    # Group infrequent labels
    for var, frequent_labels in FREQUENT_CAT_DICT.items():
        df[var] = where(df[var].isin(frequent_labels), df[var], "Rare")

    # Encode categorical variables
    df[FEATURES_ENCODE] = encoder.transform(df[FEATURES_ENCODE])

    # Return features in the order in which they were passed to the
    # model during training
    return df[FEATURES_INPUT]

In [None]:
# Restore the two persisted artifacts: the encoder applied to the
# categorical features and the lightGBM model.
# NOTE(review): joblib files are pickle-based; only load files from a
# trusted source.

enc = joblib.load(filename="encoder.pkl")
gbm = joblib.load(filename="lightGBM.pkl")

In [7]:
# Test the pipeline! Load the loan data and hold out 20% of the rows;
# the "default" column is the target, everything else is a feature.

df = pd.read_csv("loan_data.csv", low_memory=False)

# Fixed seed so the split is reproducible
seed = 10
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="default"), df["default"], test_size=0.20, random_state=seed
)

In [None]:
# Remove observations from customers who entered the date of birth wrongly and therefore seem to be under 18

# Index labels of the affected rows in each split
train_index = [
    30394,  8998,  2634, 20910,  2560,  2434,  1148,  2448,   889,
    11656, 18764,  1166, 11647,  2565, 18823,  2475,  2403,  8802,
    9081,  2479, 11557, 19035,  1132,  2439, 30369,  2423,  1174,
    20932, 20892,   919, 18765,  2589,  8770, 11386, 20941, 20895,
    956, 11555,   930, 20925,  2509,
]

test_index = [
    915, 30378, 18914, 922, 18842, 8939, 2486, 30412, 11676, 1137,
    2440, 2447
]

# errors="ignore" keeps this cell idempotent: on a second run the labels are
# already gone, and a plain drop() would raise KeyError.
X_train = X_train.drop(index=train_index, errors="ignore")
X_test = X_test.drop(index=test_index, errors="ignore")

y_train = y_train.drop(index=train_index, errors="ignore")
y_test = y_test.drop(index=test_index, errors="ignore")

# Features and target must still describe exactly the same rows
assert X_train.index.equals(y_train.index), "X_train / y_train are misaligned"
assert X_test.index.equals(y_test.index), "X_test / y_test are misaligned"

> Note that this step is not part of the pipeline: on the platform, customers who enter their date of birth incorrectly, or who are otherwise under 18, are rejected automatically and never passed on to the model.

In [None]:
# Run both splits through the feature engineering pipeline to obtain
# the input features for the lightGBM

X_train_t = X_train.pipe(feature_engineering_pipe)
X_test_t = X_test.pipe(feature_engineering_pipe)

In [None]:
# To corroborate the pipeline functionality, score each split with the
# second column of predict_proba and report the resulting ROC-AUC

pred_train = gbm.predict_proba(X_train_t)[:, 1]
roc_train = roc_auc_score(y_true=y_train, y_score=pred_train)

pred_test = gbm.predict_proba(X_test_t)[:, 1]
roc_test = roc_auc_score(y_true=y_test, y_score=pred_test)

print(
    f"Train set roc-auc: {roc_train}",
    f"Eval set roc-auc: {roc_test}",
    sep="\n",
)

In [None]:
# Now produce the full classification report for the train and test sets,
# this time from the predicted class labels

pred_train = gbm.predict(X_train_t)
cr_train = classification_report(y_true=y_train, y_pred=pred_train)

pred_test = gbm.predict(X_test_t)
cr_test = classification_report(y_true=y_test, y_pred=pred_test)

print(
    f"Train set:\n {cr_train}",
    f"Eval set:\n {cr_test}",
    sep="\n",
)

### Evaluate a cohort of recent customers

We've got another data set with information about our most recent customer applications. To be fully confident that the model continues to have good performance on recent customer cohorts, could you please obtain the predictions of credit risk for these customers as well?

We've placed the raw data, as it comes through our platform, in a file called latest_customers.csv at the root of your workspace. The dataset also contains the target variable default.

Please pass this data through your pipeline, obtain the model predictions, and then evaluate the model's performance.

One thing to note is that, because these customers have only just received their loans, we do not have enough history to determine accurately whether they will default in the near future. As a result, we have most likely labeled some customers as "no default" even though they may well default in the coming months. Hence, if the performance metrics are a bit lower than those observed previously, don't worry. As the loans mature, we will be able to better assess their performance in the coming months.

In [None]:
# Load the latest cohort of customers (note: this rebinds `df`, which
# previously held the contents of loan_data.csv)

df = pd.read_csv(
    filepath_or_buffer="latest_customers.csv",
    low_memory=False,
)

In [None]:
# Keep the target column in its own variable for the evaluation below

df_target = df.loc[:, "default"]

In [None]:
# Pass the new cohort through the pipeline to create the model inputs

df_t = df.pipe(feature_engineering_pipe)

In [None]:
# Score the new cohort with the second column of predict_proba and
# determine the ROC-AUC against the recorded target

pred_df = gbm.predict_proba(df_t)[:, 1]
roc_df = roc_auc_score(y_true=df_target, y_score=pred_df)

print(f"roc-auc: {roc_df}")

In [None]:
# Now, predict the class of each customer and obtain the full
# classification report

pred_df = gbm.predict(df_t)
cr_df = classification_report(y_true=df_target, y_pred=pred_df)

print(f"New data set:\n {cr_df}")

> As expected, the performance metrics are a bit lower than those observed in the previous task. This is probably because some customers who are flagged as no-default will likely default in the coming months. This is quite common when creating and assessing credit risk models.