# Monitoring ML models

### Testing endpoints

Loading the data from a list of records.

In [215]:
f = open("../monitoring/batch_records.json")
request = f.read()
request_sample = request.split('}')
print(request_sample[0])

[
    {
        "VAR2": "M",
        "IDADE": 43.893,
        "VAR5": "PR",
        "VAR6": -25.4955709,
        "VAR7": -49.2454987,
        "VAR8": "D",
        "VAR9": "E",
        "VAR10": "MEDIA",
        "VAR11": 1.0,
        "VAR12": 0.182,
        "VAR14": 0.597,
        "VAR15": 0.618,
        "VAR16": 0.25,
        "VAR18": 1.076712,
        "VAR19": 5.057534,
        "VAR22": 0.125,
        "VAR24": 0.069,
        "VAR25": 0.0969999999999999,
        "VAR32": "SALDO INEXISTENTE",
        "VAR39": 0.661039,
        "VAR40": 0.573539,
        "VAR41": 0.4793699999999999,
        "VAR42": 0.4440489999999999,
        "VAR47": 0.006,
        "VAR49": "S",
        "VAR50": "N",
        "VAR51": "N",
        "VAR52": "N",
        "VAR53": "N",
        "VAR54": "N",
        "VAR55": "N",
        "VAR56": "S",
        "VAR57": "S",
        "VAR58": "N",
        "VAR59": "N",
        "VAR60": "N",
        "VAR61": "N",
        "VAR62": "N",
        "VAR63": "N",
        "VAR64": "N",


Using the requests library to send the post requests to the endpoints.

In [179]:
import requests

The first endpoint receives a list of records and returns two things:
1. The volumetry (quantity of records) for each month
2. The value of the area under the ROC curve

In [180]:
response = requests.post(url="http://0.0.0.0:8001/v1/performance", data = request).text


volumetry = response.split("+")[0][2:-3].split(",")
volumetry[0] =  "Volumetry --> \n\n" + volumetry[0]
for i in range(len(volumetry)):
    print(volumetry[i])

print("\nAUC ROC --> \n\n"+response.split("+")[1][:-2])

Volumetry --> 

"2017-07":74
"2017-08":72
"2017-05":67
"2017-06":63
"2017-03":62
"2017-01":58
"2017-02":55
"2017-04":49

AUC ROC --> 

0.5751748251748252


The second endpoint uses the Kolmogorov-Smirnov distance to measure how far away the score distribution of a desired database is comparing to the Test Database (/datasets/credit_01/test.gz)
The path of the desired database is passed in the Post request.

Testing endpoint with train.gz

In [142]:
path = "/home/srctwd/challenge-data-scientist-ntech/datasets/credit_01/train.gz"
response = requests.post(url="http://0.0.0.0:8001/v1/aderencia", data = path).text
response = response.split(",")
print(response[0][1:13]+" -->\n")
print(response[0][14:])
print(response[1][1:])
print(response[2][1:])
print(response[3][1:-2])

KstestResult -->

statistic=0.002759858953621075
pvalue=0.9605978662359891
statistic_location=0.7445096329184175
statistic_sign=1


Testing endpoint with oot.gz

In [143]:
path = "/home/srctwd/challenge-data-scientist-ntech/datasets/credit_01/oot.gz"
response = requests.post(url="http://0.0.0.0:8001/v1/aderencia", data = path).text
response = response.split(",")
print(response[0][1:13]+" -->\n")
print(response[0][14:])
print(response[1][1:])
print(response[2][1:])
print(response[3][1:-2])

KstestResult -->

statistic=0.020915414151451373
pvalue=4.01624188958718e-12
statistic_location=0.7928217560872773
statistic_sign=1


From the two databases tested, we see that the score distribution of train.gz is the closest to test.gz.

### Training new model

In [182]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
import pickle

In [183]:
train_path = "../datasets/credit_01/train.gz"
test_path = "../datasets/credit_01/test.gz"

In [184]:
df_train = pd.read_csv(train_path, compression="gzip")
df_test = pd.read_csv(test_path, compression="gzip")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [185]:
df_train.fillna(value=np.nan,inplace=True)
print(df_train.head())

                    REF_DATE  TARGET VAR2   IDADE VAR4 VAR5       VAR6  \
0  2017-06-16 00:00:00+00:00       1    F  76.126  NaN   SP -23.568523   
1  2017-02-07 00:00:00+00:00       1  NaN     NaN  NaN   SP -23.189738   
2  2017-03-11 00:00:00+00:00       1  NaN     NaN  NaN   PE  -7.563015   
3  2017-04-28 00:00:00+00:00       1    F  65.786  NaN   AM  -3.119028   
4  2017-02-15 00:00:00+00:00       1    M  24.918  NaN   AC -10.014903   

        VAR7 VAR8 VAR9  ...       VAR141  VAR142  VAR143  VAR144  VAR145  \
0 -46.804297    C    E  ...  4094.377623       C     NaN     NaN     NaN   
1 -46.815943  NaN    E  ...  1347.882336       E     NaN     NaN     NaN   
2 -35.013143  NaN    E  ...  1428.485398       E     NaN     NaN     NaN   
3 -60.021731  NaN    E  ...  1478.879522       E     NaN     NaN     NaN   
4 -67.798491    E    E  ...  1560.669227       E     NaN     NaN     NaN   

   VAR146  VAR147                                             VAR148  VAR149  \
0     NaN     102 

In [186]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101128 entries, 0 to 101127
Columns: 151 entries, REF_DATE to ID
dtypes: float64(34), int64(3), object(114)
memory usage: 116.5+ MB


In [187]:
print("Number of unique values in each column:") 
for column in df_train.columns:
    print(column + " - #unique: " + str(df_train[column].nunique()) + "        #nan:" + str(df_train[column].isna().sum()))

Number of unique values in each column:
REF_DATE - #unique: 242        #nan:0
TARGET - #unique: 2        #nan:0
VAR2 - #unique: 3        #nan:12356
IDADE - #unique: 21359        #nan:11594
VAR4 - #unique: 1        #nan:100944
VAR5 - #unique: 27        #nan:2835
VAR6 - #unique: 45233        #nan:2835
VAR7 - #unique: 45207        #nan:2835
VAR8 - #unique: 5        #nan:44526
VAR9 - #unique: 5        #nan:2798
VAR10 - #unique: 5        #nan:2770
VAR11 - #unique: 9        #nan:38719
VAR12 - #unique: 221        #nan:46220
VAR13 - #unique: 916        #nan:88133
VAR14 - #unique: 789        #nan:21411
VAR15 - #unique: 891        #nan:52173
VAR16 - #unique: 192        #nan:63551
VAR17 - #unique: 2358        #nan:88133
VAR18 - #unique: 792        #nan:21411
VAR19 - #unique: 1886        #nan:52173
VAR20 - #unique: 85        #nan:91606
VAR21 - #unique: 5        #nan:88081
VAR22 - #unique: 9        #nan:52106
VAR23 - #unique: 91        #nan:88133
VAR24 - #unique: 59        #nan:19685
VAR25 - #uniqu

Features that most likely do not add any value to the classification task:

**REF_DATE** - date the registry was taken. Does not speak much about the individual entries

**ID** - unique for each entry

Missing for more than 80k entries: 
**VAR4, VAR13, VAR17, VAR20, VAR21, VAR23, VAR26, VAR27, VAR28, VAR29, VAR30, VAR31, VAR33
VAR34, VAR36, VAR37, VAR38, VAR43, VAR44, VAR45, VAR46, VAR48, VAR143, VAR144, VAR145, VAR146**

 

In [188]:
to_drop = []
for column in df_train.columns:
    if df_train[column].isna().sum() > 80000:
        to_drop.append(column)
print(to_drop)

['VAR4', 'VAR13', 'VAR17', 'VAR20', 'VAR21', 'VAR23', 'VAR26', 'VAR27', 'VAR28', 'VAR29', 'VAR30', 'VAR31', 'VAR33', 'VAR34', 'VAR36', 'VAR37', 'VAR38', 'VAR43', 'VAR44', 'VAR45', 'VAR46', 'VAR48', 'VAR143', 'VAR144', 'VAR145', 'VAR146']


In [189]:
df_train = df_train.drop(columns=to_drop)
df_train = df_train.drop(columns=["REF_DATE", "ID"])

In [190]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101128 entries, 0 to 101127
Columns: 123 entries, TARGET to VAR149
dtypes: float64(19), int64(2), object(102)
memory usage: 94.9+ MB


Checking if Dataset is balanced:

We have 4x more examples of the positive class than of the negative class

In [155]:
df_train["TARGET"].value_counts()

1    80130
0    20998
Name: TARGET, dtype: int64

Preprocessing categorical and numerical columns

Source: https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline_column_transformer.html (Scikit learn documentation)

In [191]:
X = df_train.drop(['TARGET'], axis=1)

y = df_train['TARGET']

In [192]:
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

In [193]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [194]:
for column in numerical_columns:
    X[column].fillna(0, inplace=True)
for column in categorical_columns:
    X[column].fillna("", inplace=True)

In [195]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101128 entries, 0 to 101127
Columns: 122 entries, VAR2 to VAR149
dtypes: float64(19), int64(1), object(102)
memory usage: 94.1+ MB


In [196]:
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [197]:
model = make_pipeline(preprocessor, DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0))
model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['VAR2', 'VAR5', 'VAR8',
                                                   'VAR9', 'VAR10', 'VAR32',
                                                   'VAR35', 'VAR49', 'VAR50',
                                                   'VAR51', 'VAR52', 'VAR53',
                                                   'VAR54', 'VAR55', 'VAR56',
                                                   'VAR57', 'VAR58', 'VAR59',
                                                   'VAR60', 'VAR61', 'VAR62',
                                                   'VAR63', 'VAR64', 'VAR65',
                                                   'VAR66', 'VAR67', 'VAR68',
                                                   'VAR69', 'VAR70', 'VAR71', ...]),
                    

Splitting into train and test

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [199]:
X_train.shape, X_test.shape

((70789, 122), (30339, 122))

In [200]:
_ = model.fit(X_train, y_train)

In [201]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70789 entries, 19537 to 15795
Columns: 122 entries, VAR2 to VAR149
dtypes: float64(19), int64(1), object(102)
memory usage: 66.4+ MB


In [202]:
y_pred = model.predict_proba(X_test)[:,1]

In [203]:
AUC = roc_auc_score(y_test, y_pred)

In [204]:
print(AUC)

0.6220028045603245


In [205]:
old_model_path = '../monitoring/model.pkl'
new_model_path = '../monitoring/new_model.pkl'
pickle.dump(model, open(new_model_path, 'wb'))

In [206]:
old_model = pickle.load(open(old_model_path, 'rb'))
new_model = pickle.load(open(new_model_path, 'rb'))

In [207]:
to_drop = ['VAR4', 'VAR13', 'VAR17', 'VAR20', 'VAR21', 'VAR23', 'VAR26', 'VAR27', 'VAR28', 'VAR29', 'VAR30', 
            'VAR31', 'VAR33', 'VAR34', 'VAR36', 'VAR37', 'VAR38', 'VAR43', 'VAR44', 'VAR45', 'VAR46', 'VAR48', 
            'VAR143', 'VAR144', 'VAR145', 'VAR146', 'REF_DATE', 'ID']
def new_process_dataset(df, to_drop):
    df.fillna(value=np.nan,inplace=True)
    df = df.drop(columns=to_drop)
    
    numerical_columns_selector = selector(dtype_exclude=object)
    categorical_columns_selector = selector(dtype_include=object)

    numerical_columns = numerical_columns_selector(df)
    categorical_columns = categorical_columns_selector(df)
    
    for column in numerical_columns:
        df[column].fillna(0, inplace=True)
    for column in categorical_columns:
        df[column].fillna("", inplace=True)
    
    X = df.drop(['TARGET'], axis=1)
    y = df['TARGET']
    return X, y

def old_process_dataset(df):
    df.fillna(value=np.nan,inplace=True)
    df.drop(["REF_DATE"], axis=1)
    
    X = df.drop(['TARGET'], axis=1)
    y = df['TARGET']
    return X,y

In [208]:
X_old_testdf, y_old_testdf = old_process_dataset(df_test)
X_new_testdf, y_new_testdf = new_process_dataset(df_test, to_drop)

In [209]:
pred_old_testdf = old_model.predict_proba(X_old_testdf)[:,1]
pred_new_testdf = new_model.predict_proba(X_new_testdf)[:,1]

In [210]:
old_AUC = roc_auc_score(y_old_testdf, pred_old_testdf)
new_AUC = roc_auc_score(y_new_testdf, pred_new_testdf)

In [211]:
print(old_AUC)
print(new_AUC)

0.6075332746684731
0.6192777660729909


With some minimal feature engineering, the Decision Tree Classifier was able to perform slightly better than the pre-trained model. 
To improve the performance further, we could use other models such as Random Forest or XGBoost. 
XGBoost model is mentioned in many sources as a really good model to use with tabular data. 