In [1]:
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score


import matplotlib.pyplot as plt
import seaborn as sns

import pickle

In [2]:
# Load the raw dataset into a DataFrame; the file uses ';' as its field separator
df = pd.read_csv("data/dataset.csv", delimiter=";")

In [3]:
# Keep only the features with at most 30,000 missing values, i.e. discard every
# column whose NaN count exceeds that absolute threshold.
# NOTE(review): this cut-off was previously described as "more than 40% missing
# data", but the code compares a raw count, not a fraction of rows — confirm intent.
df = df.loc[:, df.isna().sum() <= 30000]

In [4]:
# Rows with no target value are discarded: keep only the rows where 'default'
# is present (the original note puts the missing share at about 10% of the data)
df = df[df['default'].notna()]

In [5]:
# Separate the target column (y) from the feature matrix (X)
y = df['default']
X = df.drop(columns=['default'])

In [6]:
# Names of the columns handled as categorical features during preprocessing
cat_cols = [
    'merchant_category',
    'merchant_group',
    'name_in_email',
    'status_last_archived_0_24m',
    'status_2nd_last_archived_0_24m',
    'status_3rd_last_archived_0_24m',
    'status_max_archived_0_6_months',
    'status_max_archived_0_12_months',
    'status_max_archived_0_24_months',
]

In [7]:
# Names of the columns handled as numerical features during preprocessing
num_cols = [
    'account_amount_added_12_24m',
    'account_days_in_rem_12_24m',
    'account_days_in_term_12_24m',
    'age',
    'avg_payment_span_0_12m',
    'max_paid_inv_0_12m',
    'max_paid_inv_0_24m',
    'num_active_div_by_paid_inv_0_12m',
    'num_active_inv',
    'num_arch_dc_0_12m',
    'num_arch_dc_12_24m',
    'num_arch_ok_0_12m',
    'num_arch_ok_12_24m',
    'num_arch_rem_0_12m',
    'num_arch_written_off_0_12m',
    'num_arch_written_off_12_24m',
    'num_unpaid_bills',
    'recovery_debt',
    'sum_capital_paid_account_0_12m',
    'sum_capital_paid_account_12_24m',
    'sum_paid_inv_0_12m',
    'time_hours',
]

In [8]:
# Count the distinct values of every categorical feature, then keep the names of
# those with fewer than 7 categories (the comparison is strict: exactly 7 is excluded)
feat_categorical_nunique = X[cat_cols].nunique()
feat_categorical_small = [
    feature
    for feature, n_categories in feat_categorical_nunique.items()
    if n_categories < 7
]
len(feat_categorical_small)

6

In [9]:
# Categorical branch: fill missing values with each column's most frequent
# value, then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero block for a
# category that was not present when it was fitted, instead of raising at
# transform time. The fitted preprocessor is pickled and reused on rows it was
# not fitted on, so unseen categories are possible.
cat_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [10]:
# Numerical branch: fill gaps with the column mean, then scale with RobustScaler,
# chosen (per the original note) because these features are not normally distributed
robust_feat = make_pipeline(SimpleImputer(strategy='mean'), RobustScaler())

In [11]:
# Assemble the preprocessing step: each branch is applied to its own column list
preproc = make_column_transformer(
    (cat_transformer, feat_categorical_small),  # low-cardinality categorical features
    (robust_feat, num_cols),                    # numerical features
    remainder='drop',                           # every other column is discarded
)

In [12]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [13]:
# Fit the imputers, the scaler and the one-hot encoder on the training rows only
preproc = preproc.fit(X=X_train)

In [14]:
# Persist the fitted preprocessor to disk so it can be reloaded without refitting
with open('./preproc.pkl', mode='wb') as preproc_file:
    pickle.dump(preproc, preproc_file)

In [15]:
# Apply the fitted preprocessing to the training features
X_train_preprocessed = preproc.transform(X=X_train)

In [16]:
# Logistic regression classifier with an L2 penalty and an iteration cap of 3000
model = LogisticRegression(penalty='l2', max_iter=3000)

In [17]:
# Train the classifier on the preprocessed training features and their labels
log_model = model.fit(X=X_train_preprocessed, y=y_train)

In [18]:
# Persist the trained classifier to disk next to the preprocessor
with open('./log_model.pkl', mode='wb') as model_file:
    pickle.dump(log_model, model_file)

In [19]:
# Re-read the raw file and isolate the rows whose 'default' label is missing
X_df = pd.read_csv("data/dataset.csv", delimiter=";")
X_target = X_df.loc[X_df["default"].isna()]

# Preview the first unlabeled row
X_target.head(1)

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
89976,6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7,,0,0.0,0.0,0.0,0.009135,1.0,1.0,,...,1,1,1,1,0,8815,0,27157,19.895556,


In [20]:
# Take the first row of the raw data as a one-row DataFrame.
# Selecting with a list ([[0]]) returns a DataFrame directly and keeps each
# column's dtype; building it from a row Series and transposing would turn
# every column into object dtype.
first_row_df = X_df.iloc[[0]]
first_row_df

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
