# Modelling

&copy; Sifiso Rimana | May, 2025 | Financial Inclusion - Zindi Competition

----

The training and testing sets were already split in the `feature_engineering.ipynb` notebook. Furthermore, `X_train` and `y_train` are already preprocessed, so they can be fitted to a model directly. The testing set is not transformed; for predictions, it needs to be passed through the preprocessor first.

##  Building models

---

In [7]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import arviz as az 

plt.style.use('arviz-darkgrid')

import sys
sys.path.append('..')

from scripts.utils import load_model
from scripts.utils import save_model

In [2]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier, RidgeClassifierCV, RidgeClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error as mae

In [8]:
# Pull the pickled train/test splits from disk in one pass. `map` is consumed
# left to right by the unpacking, so the files are read in the listed order.
# (File names suggest the training split was SMOTE-resampled — confirm in the
# feature-engineering notebook.)
X_train, y_train, X_test, y_test = map(load_model, (
    '../data/X_train_smote.pkl',
    '../data/y_train_smote.pkl',
    '../data/X_test.pkl',
    '../data/y_test.pkl',
))

# Persisted artifacts: the feature preprocessor and the encoder for the
# `bank_account` target.
preprocessor = load_model('../models/preprocessor.pkl')
y_encoder = load_model('../models/bank_account_encoder.pkl')

In [None]:
# Exhaustive search over kernel type and regularisation strength for an SVC,
# using GridSearchCV's default cross-validation and scoring.
svc_param_grid = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10],
}
svc_clf = GridSearchCV(estimator=SVC(), param_grid=svc_param_grid)

# Fits every parameter combination on the training data.
svc_clf.fit(X_train, y_train)

In [28]:
# Run the training features through the loaded preprocessor and display the
# transformed array.
# NOTE(review): the notebook intro states X_train is already preprocessed —
# confirm whether transforming it again here (and inside the pipelines) is
# intended.
preprocessor.transform(X_train)

array([[0, 0, 2, ..., 0, 0, 0],
       [3, 0, 3, ..., 0, 0, 0],
       [3, 1, 2, ..., 0, 0, 0],
       ...,
       [3, 2, 5, ..., 1, 0, 0],
       [3, 2, 5, ..., 0, 0, 0],
       [5, 1, 0, ..., 0, 0, 0]])

In [52]:
# Build one "preprocessor + classifier" pipeline per candidate model, fit each
# on the training split, and compare its test-set mean absolute error (MAE)
# against the baseline model's MAE.
from sklearn.base import clone

# One seed for every stochastic estimator so that rerunning this cell
# reproduces the same comparison table.
RANDOM_STATE = 42


def _build_pipeline(step_name, estimator):
    """Return a Pipeline of a fresh preprocessor copy followed by *estimator*.

    ``Pipeline`` does not copy the steps it is given and fits them in place,
    so passing the same ``preprocessor`` object to every pipeline would make
    all of them share (and repeatedly refit) a single transformer. ``clone``
    gives each pipeline its own unfitted copy with identical parameters; the
    copy is fitted when the pipeline itself is fitted.
    """
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        (step_name, estimator),
    ])


qda = _build_pipeline('QDA', QuadraticDiscriminantAnalysis(reg_param=0.1))
lda = _build_pipeline('LDA', LinearDiscriminantAnalysis())
rfc = _build_pipeline('RFC', RandomForestClassifier(random_state=RANDOM_STATE))
dtc = _build_pipeline('DTC', DecisionTreeClassifier(random_state=RANDOM_STATE))
svc = _build_pipeline('SVC', SVC())
lr_cv = _build_pipeline(
    'LogisticCV',
    LogisticRegressionCV(penalty='l1', solver='liblinear',
                         random_state=RANDOM_STATE),
)
sgd_clf = _build_pipeline(
    'SGD', SGDClassifier(penalty='l1', random_state=RANDOM_STATE)
)
ridge = _build_pipeline('RidgeClf', RidgeClassifier())
ridge_cv = _build_pipeline('RidgeClfCV', RidgeClassifierCV())

# Parallel lists: model_names[i] is the display label for models[i].
models = [qda, lda, rfc, dtc, svc, lr_cv, sgd_clf, ridge, ridge_cv]
model_names = ['QDA', 'LDA', 'RFC', 'DTC', 'SVC', 'LogisticCV', 'SGD', 'Ridge', 'RidgeCV']
metrics = []

# NOTE(review): `baseline_model` is not defined in any cell visible in this
# notebook export — it must already exist in the kernel before this cell runs.
baseline_model_metric = mae(y_test, baseline_model.predict(X_test))

for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    current_metric = mae(y_true=y_test, y_pred=model.predict(X_test))
    metrics.append({
        'model_name': model_name,
        'MAE': round(current_metric, 6),
        # Lower MAE is better, so "better" means strictly below the baseline.
        'Better than Baseline': current_metric < baseline_model_metric,
    })

# `metrics` is a list of row dicts, which the plain constructor accepts
# directly; as the cell's last expression the table is displayed.
pd.DataFrame(metrics)

Unnamed: 0,model_name,MAE,Better than Baseline
0,QDA,0.126036,False
1,LDA,0.117747,False
2,RFC,0.120723,False
3,DTC,0.122635,False
4,SVC,0.112646,True
5,LogisticCV,0.113496,False
6,SGD,0.12051,False
7,Ridge,0.120935,False
8,RidgeCV,0.12051,False


In [34]:
# Display the baseline model's test-set MAE, the reference value used for the
# "Better than Baseline" column.
baseline_model_metric

0.1132837407013815

In [44]:
def submission(model, filename: str, save_to='../submissions',
               test_path='../data/Test.csv'):
    """Predict on the competition test file and write a submission CSV.

    The test CSV is read with ``uniqueid`` as its index, an ``age_group``
    column is derived from ``age_of_respondent``, and the resulting frame is
    passed to ``model.predict``. The output file has two columns:
    ``unique_id`` (``"<uniqueid> x <country>"``) and ``bank_account`` (the
    model's predictions).

    Args:
        model: Any fitted object exposing ``predict(DataFrame)``; it receives
            the raw test columns plus ``age_group``.
        filename: Name of the CSV file to create inside ``save_to``.
        save_to: Output directory. Created (including parents) if missing.
        test_path: Location of the test CSV. Must contain ``uniqueid``,
            ``country`` and ``age_of_respondent`` columns.

    Returns:
        None. The submission is written to ``save_to/filename``.
    """
    from pathlib import Path

    df_test = pd.read_csv(test_path, index_col='uniqueid')

    # Right-closed bins with the lowest edge included: [0, 20], (20, 25], ...
    # Ages outside 0-100 fall in no bin and get a missing age_group.
    age_bins = [0, 20, 25, 30, 50, 60, 100]
    age_labels = ['<20', '20-25', '26-30', '31-50', '51-60', '60+']
    df_test['age_group'] = pd.cut(
        df_test['age_of_respondent'],
        bins=age_bins,
        labels=age_labels,
        right=True,
        include_lowest=True,
    )

    df_test['bank_account'] = model.predict(df_test)

    # Submission key combines the row id with the country.
    df_test['unique_id'] = df_test.index + ' x ' + df_test['country']

    # Ensure the target directory exists so a fresh checkout does not fail
    # only after the (possibly slow) prediction step has finished.
    out_dir = Path(save_to)
    out_dir.mkdir(parents=True, exist_ok=True)
    df_test[['unique_id', 'bank_account']].to_csv(out_dir / filename, index=False)

In [53]:
# Write a submission file from the LogisticRegressionCV pipeline (`lr_cv`).
submission(lr_cv, filename='lr_vc_submission_03_2025-05-10.csv')