In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
gps_fixes = pd.read_csv('gps_fixes_new.csv')
loan_outcomes = pd.read_csv('loan_outcomes_new.csv')
user_attributes = pd.read_csv('user_attributes.csv')

We are aggregating the GPS data to create meaningful features that represent each user's general location and the reliability of their GPS information. This will help improve the model's ability to predict loan outcomes by incorporating location-based insights and data consistency.

In [9]:
# Aggregate GPS data
gps_agg = gps_fixes.groupby('user_id').agg({
    'accuracy': ['mean', 'max'],
    'upload_delay': ['mean', 'max'],
    'longitude': 'first',
    'latitude': 'first'
}).reset_index()
gps_agg.columns = ['user_id', 'avg_accuracy', 'max_accuracy', 'avg_upload_delay', 'max_upload_delay', 'longitude', 'latitude']

Comprehensive dataset where all relevant features are available for training the loan outcome prediction model.

In [32]:
merged_data = loan_outcomes.merge(user_attributes, on='user_id', how='left')
merged_data = merged_data.merge(gps_agg, on='user_id', how='left')

In [33]:
merged_data.head()

Unnamed: 0,user_id,application_at,loan_outcome,application_year,application_month,application_day,is_weekend,age,cash_incoming_30days,avg_accuracy,max_accuracy,avg_upload_delay,max_upload_delay,longitude,latitude
0,1,2017-08-14 09:08:50.000000,0,2017,8,14,0,42,8988.12,1105.084571,4434.0,999.1,6735.0,36.84054,-1.294342
1,2,2016-05-17 10:10:12.447976,1,2016,5,17,0,36,9968.12,48.596,310.59,3810.1,37609.0,36.761261,-1.284719
2,3,2016-10-20 10:07:20.459081,0,2016,10,20,0,27,59.04,6.5,6.5,12.0,12.0,35.70755,-0.889673
3,4,2017-01-13 13:03:34.000000,0,2017,1,13,0,38,2129.03,2172.2,4292.0,73.5,83.0,36.069293,-0.299025
4,5,2016-11-03 15:41:39.124610,1,2016,11,3,0,33,2102.53,43.461111,96.0,178.333333,716.0,-73.903518,41.701694


#Feature Engineering

We only take the hour from the application time because the exact date and time aren’t as important as the time of day when the loan was applied.

In [12]:
merged_data['application_hour'] = pd.to_datetime(merged_data['application_at']).dt.hour

We group the ages into categories like "young", "middle-aged", and so on, to see if age affects loan repayment.

In [13]:
merged_data['age_group'] = pd.cut(merged_data['age'], bins=[0, 25, 35, 45, 60, 100], labels=['18-25', '26-35', '36-45', '46-60', '60+'])

We turn the cash incoming data into categories like "low", "medium", or "high" to understand how different income levels affect loan outcomes.

In [14]:
merged_data['cash_incoming_category'] = pd.qcut(merged_data['cash_incoming_30days'], q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])

Cleaning and Handling Null Values

In [34]:
merged_data.shape

(400, 15)

In [35]:
merged_data.isnull().sum()

Unnamed: 0,0
user_id,0
application_at,0
loan_outcome,0
application_year,0
application_month,0
application_day,0
is_weekend,0
age,0
cash_incoming_30days,0
avg_accuracy,28


In [None]:
merged_data[['avg_accuracy', 'max_accuracy', 'avg_upload_delay', 'max_upload_delay']] = merged_data[['avg_accuracy', 'max_accuracy', 'avg_upload_delay', 'max_upload_delay']].fillna(0)
merged_data['latitude'] = merged_data['latitude'].fillna(merged_data['latitude'].median())
merged_data['longitude'] = merged_data['longitude'].fillna(merged_data['longitude'].median())

In [38]:
merged_data.isnull().sum()

Unnamed: 0,0
user_id,0
application_at,0
loan_outcome,0
application_year,0
application_month,0
application_day,0
is_weekend,0
age,0
cash_incoming_30days,0
avg_accuracy,0


In [39]:
merged_data.head()

Unnamed: 0,user_id,application_at,loan_outcome,application_year,application_month,application_day,is_weekend,age,cash_incoming_30days,avg_accuracy,max_accuracy,avg_upload_delay,max_upload_delay,longitude,latitude
0,1,2017-08-14 09:08:50.000000,0,2017,8,14,0,42,8988.12,1105.084571,4434.0,999.1,6735.0,36.84054,-1.294342
1,2,2016-05-17 10:10:12.447976,1,2016,5,17,0,36,9968.12,48.596,310.59,3810.1,37609.0,36.761261,-1.284719
2,3,2016-10-20 10:07:20.459081,0,2016,10,20,0,27,59.04,6.5,6.5,12.0,12.0,35.70755,-0.889673
3,4,2017-01-13 13:03:34.000000,0,2017,1,13,0,38,2129.03,2172.2,4292.0,73.5,83.0,36.069293,-0.299025
4,5,2016-11-03 15:41:39.124610,1,2016,11,3,0,33,2102.53,43.461111,96.0,178.333333,716.0,-73.903518,41.701694


#Modelling

Helper fucntion to prepare the dataset

In [40]:
def prepare_data(merged_data):
    X = merged_data.drop(['loan_outcome', 'user_id', 'application_at','application_year','application_month','application_day','age'], axis=1)
    y = merged_data['loan_outcome']

    numeric_features = [
        'cash_incoming_30days',
        'avg_accuracy', 'max_accuracy',
        'avg_upload_delay', 'max_upload_delay',
        'longitude', 'latitude',
        'application_hour'
    ]
    categorical_features = [
        'age_group',
        'cash_incoming_category',
        'is_weekend'
    ]

    return X, y, numeric_features, categorical_features

Preprocessing Pipeline

In [41]:
def create_preprocessor(numeric_features, categorical_features):
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

Training Classical Machine Learning Models

In [45]:
def train_and_evaluate_models(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()

    preprocessor = create_preprocessor(numeric_features, categorical_features)

    models = {
        'log_reg': LogisticRegression(max_iter=1000),
        'rf': RandomForestClassifier(n_estimators=100, random_state=42),
        'xgb': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
        'lgb': lgb.LGBMClassifier(random_state=42)
    }

    results = {}

    for name, model in models.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

        cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

        results[name] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred),
            'ROC AUC': roc_auc_score(y_test, y_pred_proba)
        }

    results_df = pd.DataFrame.from_dict(results, orient='index')
    print("\nModel Performance Comparison:")
    print(results_df)

    return results_df

In [None]:
X, y, numeric_features, categorical_features = prepare_data(merged_data)
comparison_table = train_and_evaluate_models(X, y)

In [47]:
comparison_table

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
log_reg,0.5625,0.6,0.55814,0.578313,0.692018
rf,0.6875,0.75,0.627907,0.683544,0.743872
xgb,0.625,0.685714,0.55814,0.615385,0.709617
lgb,0.625,0.666667,0.604651,0.634146,0.683847


Hyperparamter Tuning for the Random Forest Classifier

In [48]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Random Search CV to find the best parameters

In [54]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
print("Best Parameters found: ", random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters found:  {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 30}


Retrain with the best parameters

In [61]:
best_params = random_search.best_params_

rf_best = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=42
)

rf_best.fit(X_train, y_train)
y_pred = rf_best.predict(X_test)
y_pred_proba = rf_best.predict_proba(X_test)[:, 1]

In [63]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))

Accuracy: 0.6923953486233
Precision: 0.7058823529411765
Recall: 0.5581395348837209
F1 Score: 0.6233766233766234
ROC AUC: 0.7071024512884977


In [65]:
cv_scores = cross_val_score(rf_best, X, y, cv=5)
print("CV Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

CV Scores: [0.625  0.8    0.5875 0.7625 0.7   ]
Mean CV Score: 0.6950000000000001


# Implementing Meta Modelling (Just to See the Performance)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBClassifier(random_state=42)),
    ('lgb', lgb.LGBMClassifier(random_state=42)),
    ('log_reg', LogisticRegression(max_iter=1000))
]

meta_model = LogisticRegression()
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_model.fit(X_train, y_train)

y_pred = stacking_model.predict(X_test)

In [71]:
y_pred = stacking_model.predict(X_test)
y_pred_proba = stacking_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

Accuracy: 0.6875
Precision: 0.75
Recall: 0.627906976744186
F1 Score: 0.6835443037974683
ROC AUC: 0.748585795097423


Saving the Models for future inference

In [72]:
import pickle

with open('rf_best.pkl', 'wb') as file:
    pickle.dump(rf_best, file)

with open('stacking_model.pkl', 'wb') as file:
    pickle.dump(stacking_model, file)