In [4]:
!git clone https://github.com/Toavina00/Zindi-credit-scoring
!cp -r /content/Zindi-credit-scoring/dataset .
!rm -rf /content/Zindi-credit-scoring

Cloning into 'Zindi-credit-scoring'...
remote: Enumerating objects: 23, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 23 (delta 3), reused 20 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (23/23), 3.28 MiB | 11.83 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [2]:
import numpy as np
import pandas as pd
import warnings

# Install a global "ignore" filter so no warning is shown in any later cell output.
# NOTE(review): this also hides potentially useful warnings (e.g. convergence
# or deprecation notices) from every library used below.
warnings.filterwarnings('ignore')

In [3]:
# Read the three CSV files copied into ./dataset: the training loans, the
# test loans, and the economic indicator table.
train, test, indicator = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/Train.csv",
        "dataset/Test.csv",
        "dataset/economic_indicators.csv",
    )
)

## Data Preprocessing

In [5]:
def preprocess(df, indicator):
    """Turn raw loan records into a reduced numeric feature frame.

    NOTE(review): a later cell defines another ``preprocess``; whichever
    definition was executed last is the one the rest of the notebook calls.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan records. The function reads ``disbursement_date`` and
        ``due_date`` (strings split on ``-`` into year, month, day),
        ``New_versus_Repeat``, ``Total_Amount``, ``Total_Amount_to_Repay``
        and ``country_id``, and finally drops the identifier and raw columns
        listed at the bottom (all of them must be present).
    indicator : pandas.DataFrame
        Wide indicator table with the columns ``Country``, ``Indicator`` and
        ``YR2020`` ... ``YR2023``.

    Returns
    -------
    pandas.DataFrame
        A transformed copy; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If some (country, disbursement year) pair present in ``df`` has no
        complete indicator row. Countries that never occur in ``df`` are not
        required to have indicator data.
    """
    df = df.copy()

    # Discrete features and date features
    df[['disbursement_year', 'disbursement_month', 'disbursement_day']] = (
        df['disbursement_date'].str.split('-', expand=True).astype(np.int64)
    )
    df[['due_year', 'due_month', 'due_day']] = (
        df['due_date'].str.split('-', expand=True).astype(np.int64)
    )
    df['New_versus_Repeat'] = df['New_versus_Repeat'].map(lambda x: 1 if x == "New Loan" else 0)

    # Continuous features (the +1 on both sides keeps the ratio finite when
    # Total_Amount is 0)
    df["Repay_Rate"] = (df["Total_Amount_to_Repay"] + 1) / (df["Total_Amount"] + 1)

    # Economic indicators
    year_cols = ["YR2020", "YR2021", "YR2022", "YR2023"]
    feature_names = {
        "Inflation, consumer prices (annual %)": "Inflation",
        "Unemployment rate": "Unemployment",
        "Official exchange rate (LCU per US$, period average)": "Exchange_Rate",
    }

    # Rows with a gap in any of the four years are discarded as a whole.
    df_ind = indicator[["Country", "Indicator"] + year_cols].dropna()
    df_ind = df_ind[df_ind["Indicator"].isin(list(feature_names))]

    # (country, indicator, disbursement year) -> value, built in one pass.
    # Column YR<y> is filed under y + 1, i.e. a loan receives the figure of
    # the year before its disbursement. setdefault keeps the first row when a
    # (country, indicator) pair is duplicated.
    lookup = {}
    for rec in df_ind.itertuples(index=False, name=None):
        country_name, ind_name = rec[0], rec[1]
        for col, value in zip(year_cols, rec[2:]):
            lookup.setdefault((country_name, ind_name, int(col[2:]) + 1), value)

    row_keys = list(zip(df["country_id"], df["disbursement_year"]))
    distinct_keys = set(row_keys)

    for ind_name, short_name in feature_names.items():
        missing = sorted(
            (key for key in distinct_keys if (key[0], ind_name, key[1]) not in lookup),
            key=str,
        )
        if missing:
            raise KeyError(
                f"no '{ind_name}' value for (country, disbursement year): {missing}"
            )
        df[short_name] = [lookup[(c, ind_name, y)] for c, y in row_keys]

    # Drop columns
    df = df.drop(columns=[
        'disbursement_date', 'due_date', "customer_id",
        "country_id", "tbl_loan_id", "lender_id",
        "loan_type", "Total_Amount_to_Repay",
        "Lender_portion_to_be_repaid", "disbursement_year",
        "due_year", "Inflation", "due_day", "due_month"
    ])

    return df

In [63]:
def preprocess(df, indicator):
    """Turn raw loan records into the numeric feature frame used for modelling.

    NOTE(review): this cell redefines ``preprocess``; once it has run it
    replaces the definition from the earlier cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan records. The function reads ``disbursement_date`` and
        ``due_date`` (strings split on ``-`` into year, month, day),
        ``New_versus_Repeat``, ``loan_type``, ``Total_Amount``,
        ``Total_Amount_to_Repay``, ``Lender_portion_to_be_repaid``,
        ``Amount_Funded_By_Lender`` and ``country_id``, and finally drops the
        raw date and identifier columns (all of them must be present).
    indicator : pandas.DataFrame
        Wide indicator table with the columns ``Country``, ``Indicator`` and
        ``YR2020`` ... ``YR2023``.

    Returns
    -------
    pandas.DataFrame
        A transformed copy; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If some (country, disbursement year) pair present in ``df`` has no
        complete indicator row. Countries that never occur in ``df`` are not
        required to have indicator data.
    """
    df = df.copy()

    # Discrete features and date features
    df[['disbursement_year', 'disbursement_month', 'disbursement_day']] = (
        df['disbursement_date'].str.split('-', expand=True).astype(np.int64)
    )
    df[['due_year', 'due_month', 'due_day']] = (
        df['due_date'].str.split('-', expand=True).astype(np.int64)
    )
    df['New_versus_Repeat'] = df['New_versus_Repeat'].map(lambda x: 1 if x == "New Loan" else 0)
    # Binary flag: "type_1" versus every other loan type.
    df['loan_type'] = df['loan_type'].map(lambda x: 1 if x == "type_1" else 0)

    # Continuous features (the +1 on both sides keeps the ratios finite when
    # the denominator is 0)
    df["Repay_Rate"] = (df["Total_Amount_to_Repay"] + 1) / (df["Total_Amount"] + 1)
    df["Lender_Repay_Rate"] = (df["Lender_portion_to_be_repaid"] + 1) / (df["Amount_Funded_By_Lender"] + 1)

    # Economic indicators
    year_cols = ["YR2020", "YR2021", "YR2022", "YR2023"]
    feature_names = {
        "Inflation, consumer prices (annual %)": "Inflation",
        "Unemployment rate": "Unemployment",
        "Official exchange rate (LCU per US$, period average)": "Exchange_Rate",
    }

    # Rows with a gap in any of the four years are discarded as a whole.
    df_ind = indicator[["Country", "Indicator"] + year_cols].dropna()
    df_ind = df_ind[df_ind["Indicator"].isin(list(feature_names))]

    # (country, indicator, disbursement year) -> value, built in one pass.
    # Column YR<y> is filed under y + 1, i.e. a loan receives the figure of
    # the year before its disbursement. setdefault keeps the first row when a
    # (country, indicator) pair is duplicated.
    lookup = {}
    for rec in df_ind.itertuples(index=False, name=None):
        country_name, ind_name = rec[0], rec[1]
        for col, value in zip(year_cols, rec[2:]):
            lookup.setdefault((country_name, ind_name, int(col[2:]) + 1), value)

    row_keys = list(zip(df["country_id"], df["disbursement_year"]))
    distinct_keys = set(row_keys)

    for ind_name, short_name in feature_names.items():
        missing = sorted(
            (key for key in distinct_keys if (key[0], ind_name, key[1]) not in lookup),
            key=str,
        )
        if missing:
            raise KeyError(
                f"no '{ind_name}' value for (country, disbursement year): {missing}"
            )
        df[short_name] = [lookup[(c, ind_name, y)] for c, y in row_keys]

    # Drop columns
    df = df.drop(columns=[
        'disbursement_date', 'due_date', "customer_id",
        "country_id", "tbl_loan_id", "lender_id",
    ])

    return df

## Model

In [14]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, TargetEncoder, FunctionTransformer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from lightgbm import LGBMClassifier, plot_importance as lgb_plot_importance
from xgboost import XGBClassifier, plot_importance as xgb_plot_importance

from bayes_opt import BayesianOptimization

set_config(transform_output="pandas")

In [64]:
# Featurise the training loans.
df_train = preprocess(train, indicator)

# Separate the label from the features, then hold out 20% of the rows
# (stratified on the label, fixed seed) as a local test set.
X, y = df_train.drop(columns='target'), df_train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() appends a stray "None" line.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68654 entries, 0 to 68653
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           68654 non-null  object 
 1   loan_type                    68654 non-null  int64  
 2   Total_Amount                 68654 non-null  float64
 3   Total_Amount_to_Repay        68654 non-null  float64
 4   duration                     68654 non-null  int64  
 5   New_versus_Repeat            68654 non-null  int64  
 6   Amount_Funded_By_Lender      68654 non-null  float64
 7   Lender_portion_Funded        68654 non-null  float64
 8   Lender_portion_to_be_repaid  68654 non-null  float64
 9   target                       68654 non-null  int64  
 10  disbursement_year            68654 non-null  int64  
 11  disbursement_month           68654 non-null  int64  
 12  disbursement_day             68654 non-null  int64  
 13  due_year        

### Baseline

In [20]:
# Baseline: logistic regression on dtype-selected columns.
# float64 columns are mean-imputed and standardised; int64 columns are only
# mode-imputed; every other column is dropped.
float_steps = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
int_steps = make_pipeline(SimpleImputer(strategy='most_frequent'))

baseline_features = make_column_transformer(
    (float_steps, make_column_selector(dtype_include=np.float64)),
    (int_steps, make_column_selector(dtype_include=np.int64)),
    remainder='drop',
)
model = make_pipeline(baseline_features, LogisticRegression())

# Hold-out predictions from a fit on the training split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 5-fold stratified CV on the training split, scored with F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=cv,
    scoring='f1',
    return_train_score=True,
)

print(f"Train f1-score: {cv_results['train_score'].mean()}")
print(f"Validation f1-score: {cv_results['test_score'].mean()}")

print("\nTest report:")
print(classification_report(y_test, y_pred))

Train f1-score: 0.6157476911321593
Validation f1-score: 0.6109205494707757

Test report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     13479
           1       0.73      0.54      0.62       252

    accuracy                           0.99     13731
   macro avg       0.86      0.77      0.81     13731
weighted avg       0.99      0.99      0.99     13731



### Models

In [67]:

# Column groups by dtype. `.drop([])` removes nothing; it is the place to list
# columns to exclude from each group.
continuous_columns = X.select_dtypes(include=np.float64).columns.drop([])
discrete_columns = X.select_dtypes(include=np.int64).columns.drop([])

# Mean-impute the float columns, mode-impute the integer ones, drop the rest.
imputers = make_column_transformer(
    (SimpleImputer(strategy="mean"), continuous_columns),
    (SimpleImputer(strategy='most_frequent'), discrete_columns),
    remainder='drop',
)

# Random oversampling (sampling_strategy=0.3) followed by a shallow LightGBM.
model = make_pipeline(
    RandomOverSampler(sampling_strategy=0.3, random_state=42),
    imputers,
    LGBMClassifier(random_state=42, n_estimators=100, max_depth=5),
)

# Hold-out predictions from a fit on the training split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 5-fold stratified CV on the training split, scored with F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=cv,
    scoring='f1',
    return_train_score=True,
)

print(f"Train f1-score: {cv_results['train_score'].mean()}")
print(f"Validation f1-score: {cv_results['test_score'].mean()}")

print("\nTest report:")
print(classification_report(y_test, y_pred))

Train f1-score: 0.8784681903035885
Validation f1-score: 0.8249531754190537

Test report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     13479
           1       0.70      0.94      0.81       252

    accuracy                           0.99     13731
   macro avg       0.85      0.97      0.90     13731
weighted avg       0.99      0.99      0.99     13731



In [55]:
# Column groups by dtype. `.drop([])` removes nothing; it is the place to list
# columns to exclude from each group.
continuous_columns = X.select_dtypes(include=np.float64).columns.drop([])
discrete_columns = X.select_dtypes(include=np.int64).columns.drop([])


def evaluate(max_depth, learning_rate, lambda_l1, lambda_l2, n_estimators, thresh):
    """Objective for the Bayesian search: mean 5-fold F1 of an oversampled LightGBM pipeline.

    ``max_depth`` and ``n_estimators`` are truncated to ``int`` before being
    passed to LightGBM (the optimiser proposes fractional values).
    ``thresh`` is the cut-off on the predicted probability of class 1: a row
    is labelled 1 when that probability is strictly greater than ``thresh``.

    Reads the notebook globals ``X_train``, ``y_train``,
    ``continuous_columns`` and ``discrete_columns``.

    Returns the mean validation F1, or ``0.0`` when the mean train and
    validation F1 differ by 0.05 or more.
    """
    params = {
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'random_state': 42,
        'n_jobs': -1,
        "verbose": -1,
    }

    model = make_pipeline(
        RandomOverSampler(random_state=42),
        make_column_transformer(
            (SimpleImputer(strategy="mean"), continuous_columns),
            (SimpleImputer(strategy='most_frequent'), discrete_columns),
            remainder='drop'
        ),
        LGBMClassifier(**params)
    )

    def f1_at_threshold(estimator, X_fold, y_fold):
        # The threshold is applied inside the scorer. Overriding
        # `model.predict` on the instance has no effect here, because
        # cross_validate scores clones of the estimator and an attribute set
        # on the instance is not carried over to a clone.
        positive_proba = estimator.predict_proba(X_fold)[:, 1]
        return f1_score(y_fold, (positive_proba > thresh).astype(int))

    cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    cv_results = cross_validate(
        model, X_train, y_train, cv=cv, scoring=f1_at_threshold, return_train_score=True
    )

    val_score = cv_results['test_score'].mean()
    train_score = cv_results['train_score'].mean()

    # Candidates whose train/validation gap reaches this limit score 0 so the
    # search steers away from them.
    max_gap = 0.05
    return val_score if abs(val_score - train_score) < max_gap else 0.0

# Search bounds. `thresh` has equal lower and upper bounds, so it is pinned
# at 0.5 and not actually searched.
param_space = {
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.2),
    'n_estimators': (50, 500),
    'thresh': (0.5, 0.5),
    'lambda_l1': (0.1, 1.0),
    'lambda_l2': (0.1, 1.0),
}

optimizer = BayesianOptimization(f=evaluate, pbounds=param_space, random_state=42)
optimizer.maximize(init_points=10, n_iter=30)

best_params = optimizer.max['params']
best_thresh = best_params["thresh"]

# Rebuild the pipeline that `evaluate` scored, including the RandomOverSampler
# step, so the tuned hyper-parameters are used on the same kind of training
# data they were selected on.
model = make_pipeline(
    RandomOverSampler(random_state=42),
    make_column_transformer(
        (SimpleImputer(strategy="mean"), continuous_columns),
        (SimpleImputer(strategy='most_frequent'), discrete_columns),
        remainder='drop'
    ),
    LGBMClassifier(
        max_depth=int(best_params['max_depth']),
        learning_rate=best_params['learning_rate'],
        n_estimators=int(best_params['n_estimators']),
        lambda_l1=best_params['lambda_l1'],
        lambda_l2=best_params['lambda_l2'],
        random_state=42
    )
)


def _predict_at_best_threshold(X, _pipeline=model, _thresh=best_thresh):
    # Label 1 when P(class 1) exceeds the threshold. Comparing both
    # probability columns and taking argmax would pick class 0 whenever both
    # exceed the threshold, which is wrong for thresholds below 0.5.
    # The pipeline and threshold are bound as defaults so a later rebinding of
    # the `model` name does not change what this function predicts with.
    return (_pipeline.predict_proba(X)[:, 1] > _thresh).astype(int)


def _f1_at_best_threshold(estimator, X_fold, y_fold):
    # cross_validate scores clones of `model`, and the predict override set on
    # the instance below is not carried over to a clone, so the threshold is
    # applied here instead.
    return f1_score(y_fold, (estimator.predict_proba(X_fold)[:, 1] > best_thresh).astype(int))


# Keep `model.predict(...)` threshold-aware for direct calls on this instance.
model.predict = _predict_at_best_threshold

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
cv_results = cross_validate(
    model, X_train, y_train, cv=cv, scoring=_f1_at_best_threshold, return_train_score=True
)

print(f"Train f1-score: {cv_results['train_score'].mean()}")
print(f"Validation f1-score: {cv_results['test_score'].mean()}")

print("\nTest report:")
print(classification_report(y_test, y_pred))

|   iter    |  target   | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... |  thresh   |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.0      [39m | [39m0.4371   [39m | [39m0.9556   [39m | [39m0.1491   [39m | [39m7.191    [39m | [39m120.2    [39m | [39m0.5      [39m |
| [35m2        [39m | [35m0.7805   [39m | [35m0.1523   [39m | [35m0.8796   [39m | [35m0.1242   [39m | [35m7.957    [39m | [35m59.26    [39m | [35m0.5      [39m |
| [39m3        [39m | [39m0.6964   [39m | [39m0.8492   [39m | [39m0.2911   [39m | [39m0.04455  [39m | [39m4.284    [39m | [39m186.9    [39m | [39m0.5      [39m |
| [39m4        [39m | [39m0.7368   [39m | [39m0.4888   [39m | [39m0.3621   [39m | [39m0.1263   [39m | [39m3.976    [39m | [39m181.5    [39m | [39m0.5      [39m |
| [39m5        [39m | [39m0.0      [39m | [39m0.5105   [39m | [39m0.8067   [39m | [

### Submission

In [68]:
# Make submission

# Featurise the competition test set with the same function as the training set.
df_test = preprocess(test, indicator)

# Refit the current `model` on every labelled row before predicting.
model.fit(X, y)
y_pred = model.predict(df_test)

# One row per test ID with its predicted label.
submission = test[["ID"]].assign(target=y_pred)
submission.to_csv("submission.csv", index=False)

---