# Models and Evaluation

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

# Read the two pre-processed CSV files used throughout the notebook.
data_dev = pd.read_csv('data_processed/complete/data_selected.csv')
data_comp = pd.read_csv('data_processed/complete/enc_data_comp.csv')

# Column layout of data_dev, kept for aligning data_comp to it.
cols = data_dev.columns

# Preview the first rows of both frames.
display(data_dev.head(), data_comp.head())

FileNotFoundError: [Errno 2] No such file or directory: 'data_processed/data_selected.csv'

In [None]:
def get_features(df):
    """Return a new frame holding every column of ``df`` except ``status``.

    ``df`` itself is left untouched.
    """
    return df.drop(columns='status')
def get_target(df):
    """Return a new frame that keeps only the ``status`` column of ``df``.

    The result is a DataFrame (not a Series); ``df`` itself is left untouched.
    """
    non_target = df.columns.difference(['status'])
    return df.drop(columns=non_target)

# Registry of fitted models, keyed by short model name; filled by add_model().
results = dict()

In [None]:
# Restrict data_comp to the columns of data_dev, in the same order.
data_comp = data_comp.loc[:, cols]
data_comp.head()

### Split data

In [None]:
def split_data(data_dev, data_comp):
    """Split the two frames into model inputs, targets and loan ids.

    ``data_dev`` supplies the training set and ``data_comp`` the test set.
    Both must contain a ``loan_id`` and a ``status`` column.

    Returns:
        ``(X_train, X_test, y_train, y_test, ids_train, ids_test)`` where the
        ``X_*`` frames hold every column except ``loan_id`` and ``status``,
        the ``y_*`` frames hold only ``status``, and ``ids_*`` are the
        ``loan_id`` columns.

    Raises:
        KeyError: if ``loan_id`` (or ``status``) is missing from either frame.
    """
    ids_train = data_dev['loan_id']
    ids_test = data_comp['loan_id']

    # Drop the identifier on copies instead of in place: mutating the caller's
    # frames made this function fail with KeyError('loan_id') on a second call
    # (e.g. when the calling cell is re-run).
    dev = data_dev.drop(columns='loan_id')
    comp = data_comp.drop(columns='loan_id')

    X_train = get_features(dev)
    y_train = get_target(dev)
    X_test = get_features(comp)
    y_test = get_target(comp)

    return X_train, X_test, y_train, y_test, ids_train, ids_test

In [None]:
# Class balance of data_comp before it is split.
print('Data shape:', data_comp.shape)
print('Status  1:', data_comp[data_comp['status'] == 1].shape)
print('Status -1:', data_comp[data_comp['status'] == -1].shape)

_X_train, _X_test, _y_train, _y_test, _ids_train, _ids_test = split_data(data_dev, data_comp)

# Same report for both targets: shape, then count of status 1 | count of status -1.
for label, target in (('Train', _y_train), ('Test', _y_test)):
    n_pos = target[target['status'] == 1].shape[0]
    n_neg = target[target['status'] == -1].shape[0]
    print('\n' + label + ' shape:', target.shape)
    print('Status ratio:', n_pos, '|', n_neg)

display(_X_train.head(), _y_train.head(), _X_test.head(), _y_test.head())

### Resampling

In [None]:
def smote(X_train, y_train):
    """Oversample the training set with imblearn's SMOTE.

    Uses ``sampling_strategy=1.0`` and a fixed ``random_state=1``.
    Returns the resampled ``(X_train, y_train)`` pair.
    """
    from imblearn.over_sampling import SMOTE

    oversampler = SMOTE(sampling_strategy=1.0, random_state=1)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

In [None]:
_smote_X_train, _smote_y_train = smote(_X_train, _y_train)

# Class counts of the resampled target: status 1 | status -1.
smote_pos = _smote_y_train[_smote_y_train['status'] == 1]
smote_neg = _smote_y_train[_smote_y_train['status'] == -1]
print('\nTrain shape:', _smote_y_train.shape)
print('Status ratio:', len(smote_pos), '|', len(smote_neg))

### Scaling

In [None]:
def normalize_data(X_train, X_test, scaler):
    """Scale the train and test inputs with a single fitted ``scaler``.

    The scaler is fitted on ``X_train`` only and then applied unchanged to
    ``X_test``. Re-fitting on the test set would scale the two sets with
    different parameters and leak test statistics into preprocessing.

    Args:
        X_train: training inputs; used to fit the scaler.
        X_test: test inputs; only transformed.
        scaler: object exposing ``fit_transform`` and ``transform``.

    Returns:
        ``(X_train_scaled, X_test_scaled)``.
    """
    X_train = scaler.fit_transform(X_train)
    # transform, not fit_transform: reuse the parameters learned on X_train.
    X_test = scaler.transform(X_test)
    return X_train, X_test

def standardize_data(X_train, X_test):
    """Scale both sets via ``normalize_data`` using a ``StandardScaler``."""
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    return normalize_data(X_train, X_test, scaler)
def min_max_scaling(X_train, X_test):
    """Scale both sets via ``normalize_data`` using a ``MinMaxScaler``."""
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler()
    return normalize_data(X_train, X_test, scaler)

### Algorithms

In [None]:
def add_model(name, model, use_smote=False):
    """Fit ``model`` and register it, with its evaluation data, in ``results``.

    Args:
        name: key under which the entry is stored in ``results``.
        model: estimator exposing ``fit(X, y)``.
        use_smote: fit on the SMOTE-resampled training set
            (``_smote_X_train`` / ``_smote_y_train``) instead of the original
            one (``_X_train`` / ``_y_train``).

    The stored ``X_train`` / ``y_train`` are always the original (not
    resampled) training data, whichever set the model was fitted on.
    ``fit_time`` is the wall-clock duration of ``fit`` in milliseconds.
    """
    import time

    X_train = _X_train.copy()
    X_test = _X_test.copy()
    y_train = np.ravel(_y_train.values)

    # Scaling Normalizations
    # NOTE: if one of these is enabled, the SMOTE training set below is not
    # scaled by it and would need the same treatment.
    # X_train, X_test = standardize_data(X_train, X_test)
    # X_train, X_test = min_max_scaling(X_train, X_test)

    if use_smote:
        fit_X = _smote_X_train.copy()
        fit_y = np.ravel(_smote_y_train.values)
    else:
        fit_X, fit_y = X_train, y_train

    # Measure the whole elapsed time. timedelta.microseconds only holds the
    # sub-second part, so it under-reported any fit longer than one second.
    start = time.perf_counter()
    model.fit(fit_X, fit_y)
    fit_time_ms = (time.perf_counter() - start) * 1000

    results[name] = {'model': model,
                     'X_train': X_train,
                     'X_test': X_test,
                     'y_train': y_train,
                     'y_test': np.ravel(_y_test.values),
                     'fit_time': fit_time_ms  # milliseconds
                     }

In [None]:
# Switch passed as `use_smote` to every add_model() call below: when True the
# models are fitted on the SMOTE-resampled training set.
# NOTE(review): shares its name with imblearn's SMOTE class; smote() imports
# that class locally, so the two do not clash.
SMOTE = True

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree; seeded so the fitted tree is reproducible across runs
# (same seed value as the one used in smote()).
add_model('dtc',
    DecisionTreeClassifier(random_state=1),
    use_smote=SMOTE
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest neighbours with scikit-learn's default settings.
add_model(name='knn', model=KNeighborsClassifier(), use_smote=SMOTE)

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron; seeded so weight initialisation and shuffling are
# reproducible across runs (same seed value as the one used in smote()).
add_model('mlp',
    MLPClassifier(random_state=1),
    use_smote=SMOTE
)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with scikit-learn's default settings.
add_model(name='gnb', model=GaussianNB(), use_smote=SMOTE)

In [None]:
from sklearn.svm import SVC
# Support vector classifier. probability=True is required for predict_proba;
# the probability calibration shuffles data, so seed it for reproducibility
# (same seed value as the one used in smote()).
add_model('svc',
    SVC(probability=True, random_state=1),
    use_smote=SMOTE
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings.
add_model(name='lr', model=LogisticRegression(), use_smote=SMOTE)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest; seeded so the forest (and the submission built from it) is
# reproducible across runs (same seed value as the one used in smote()).
add_model('rf',
    RandomForestClassifier(random_state=1),
    use_smote=SMOTE
)

## Prediction

In [None]:
def predict_proba(name, isTrain=False):
    """Store the class probabilities of model ``name`` in its ``results`` entry.

    Predicts on the entry's ``X_train`` when ``isTrain`` is true, otherwise on
    ``X_test``, and saves the output under ``train_pred_prob`` or
    ``test_pred_prob`` respectively. Returns nothing.
    """
    split = 'train' if isTrain else 'test'
    entry = results[name]
    entry[split + '_pred_prob'] = entry['model'].predict_proba(entry['X_' + split])

# Score the test split for every registered model first, then the train split.
for is_train in (False, True):
    for name in results:
        predict_proba(name, isTrain=is_train)


## Evaluation

In [None]:
expected = pd.read_csv('kaggle/expected.csv')

# For every model, pair column 0 of its test probabilities with the matching
# row of `expected` (joined on loan id) and keep the result as 'mix'.
for name, result in results.items():
    df = pd.DataFrame(data={
        'Id': _ids_test.values,
        'Predicted': result['test_pred_prob'][:, 0],
    })
    mix = pd.merge(df, expected, left_on='Id', right_on='loan_id', how='left')
    result['mix'] = mix[['Id', 'loan_id', 'Predicted', 'status']]

In [None]:
def roc_and_auc(name, isTrain=False):
    """Plot the ROC curve of model ``name`` and print its AUC.

    Args:
        name: key of the model in ``results``.
        isTrain: evaluate on the training split (``y_train`` against column 0
            of ``train_pred_prob``) instead of the test split (``status``
            against ``Predicted`` in the entry's ``mix`` frame).

    The score used is the probability in column 0 of ``predict_proba``, so the
    positive label passed to ``roc_curve`` is ``model.classes_[0]``.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, RocCurveDisplay, auc

    result = results[name]
    split = 'train' if isTrain else 'test'

    # Previously isTrain was ignored and the test split was always evaluated.
    if isTrain:
        y_true = result['y_train']
        y_score = result['train_pred_prob'][:, 0]
    else:
        y_true = result['mix']['status']
        y_score = result['mix']['Predicted']

    fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=result['model'].classes_[0])

    fig, ax = plt.subplots()
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax, name=(name.upper()))
    ax.set_title(name.upper() + ' ROC (' + split + ')')
    print(name.upper(), split + ":\t", auc(fpr, tpr))

# One ROC figure and one printed AUC line per registered model.
for name in results:
    roc_and_auc(name)

## Save submission

In [None]:
# Submission frame: loan ids with column 0 of the 'rf' model's test probabilities.
df = pd.DataFrame(data={
    'Id': _ids_test.values,
    'Predicted': results['rf']['test_pred_prob'][:, 0],
})
df.to_csv('kaggle/submission.csv', index=False)

display(df)