# Models and Evaluation

In [160]:
import pandas as pd
import numpy as np
from IPython.display import display

data = pd.read_csv('data_processed/kaggle_data.csv')

display(data.head())

Unnamed: 0,loan_id,account_id,loan_amount,loan_duration,loan_payments,status,account_district_id,account_frequency,disp_id,client_id,has_disponent,client_district_id,gender,last_balance,monthly_diff,age_on_loan_request,age_on_loan_request_disc
0,5314,1787,96396,12,8033,-1.0,30,weekly issuance,2166,2166,False,30,Female,20100.0,6700,46,40-49
1,5316,1801,165960,36,4610,1.0,46,monthly issuance,2181,2181,False,46,Male,52208.9,14733,25,21-29
2,6863,9188,127080,60,2118,1.0,45,monthly issuance,11006,11314,False,45,Male,20272.8,-6859,57,50-59
3,5325,1843,105804,36,2939,1.0,12,monthly issuance,2235,2235,False,14,Female,34292.7,-11054,53,50-59
4,7240,11013,274740,60,4579,1.0,1,weekly issuance,13231,13539,False,63,Male,41142.9,-30531,15,0-19


In [161]:
data = data[['loan_id','loan_payments','has_disponent','gender','monthly_diff','status']]
data.head()

Unnamed: 0,loan_id,loan_payments,has_disponent,gender,monthly_diff,status
0,5314,8033,False,Female,6700,-1.0
1,5316,4610,False,Male,14733,1.0
2,6863,2118,False,Male,-6859,1.0
3,5325,2939,False,Female,-11054,1.0
4,7240,4579,False,Male,-30531,1.0


In [162]:
def get_features(df):
    return df.drop('status', axis=1)
def get_target(df):
    return df.drop(df.columns.difference(['status']), axis=1)

results = {}

In [163]:
def encode_data(df, columns):
    from sklearn.preprocessing import OrdinalEncoder
    le = OrdinalEncoder()
    cols = [col for col in columns if col in df.keys()]
    df[cols] = le.set_params(encoded_missing_value=-1).fit_transform(df[cols])
    return df

In [164]:
categorical_columns = list(data.select_dtypes("object").columns)
data = encode_data(data, categorical_columns)

data

Unnamed: 0,loan_id,loan_payments,has_disponent,gender,monthly_diff,status
0,5314,8033,False,0.0,6700,-1.0
1,5316,4610,False,1.0,14733,1.0
2,6863,2118,False,1.0,-6859,1.0
3,5325,2939,False,0.0,-11054,1.0
4,7240,4579,False,1.0,-30531,1.0
...,...,...,...,...,...,...
677,4989,7348,False,0.0,1775,
678,5221,4376,False,1.0,2413,
679,6402,5812,False,1.0,-33263,
680,5346,2318,False,1.0,-13067,


### Split data

In [165]:
def split_data(data):
    train = data[data['status'].notnull()]
    test = data[data['status'].isnull()]

    ids_train = train['loan_id']
    ids_test = test['loan_id']
    train.drop(['loan_id'], axis=1, inplace=True)
    test.drop(['loan_id'], axis=1, inplace=True)
    X_train = get_features(train)
    y_train = get_target(train)
    X_test = get_features(test)
    y_test = get_target(test)

    return X_train, X_test, y_train, y_test, ids_train, ids_test

In [166]:
print('Data shape:',data.shape)
print('Status  1:',data[data['status']==1].shape)
print('Status -1:',data[data['status']==-1].shape)

_X_train, _X_test, _y_train, _y_test, _ids_train, _ids_test = split_data(data)

print('\nTrain shape:',_y_train.shape)
print('Status ratio:',_y_train[_y_train['status']==1].shape[0],'|',_y_train[_y_train['status']==-1].shape[0])

print('\nTest shape:',_y_test.shape)
print('Status ration:',_y_test[_y_test['status']==1].shape[0],'|',_y_test[_y_test['status']==-1].shape[0])

display(_X_train.head())
display(_y_train.head())
display(_X_test.head())
display(_y_test.head())

Data shape: (682, 6)
Status  1: (282, 6)
Status -1: (46, 6)

Train shape: (328, 1)
Status ratio: 282 | 46

Test shape: (354, 1)
Status ration: 0 | 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(['loan_id'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(['loan_id'], axis=1, inplace=True)


Unnamed: 0,loan_payments,has_disponent,gender,monthly_diff
0,8033,False,0.0,6700
1,4610,False,1.0,14733
2,2118,False,1.0,-6859
3,2939,False,0.0,-11054
4,4579,False,1.0,-30531


Unnamed: 0,status
0,-1.0
1,1.0
2,1.0
3,1.0
4,1.0


Unnamed: 0,loan_payments,has_disponent,gender,monthly_diff
328,1566,False,1.0,8970
329,7240,False,0.0,4830
330,4845,False,0.0,-24751
331,3698,True,0.0,-18389
332,3210,False,1.0,3191


Unnamed: 0,status
328,
329,
330,
331,
332,


### Resampling

In [167]:
def smote(X_train, y_train):
    from imblearn.over_sampling import SMOTE
    sm = SMOTE(random_state=1, sampling_strategy=1.0)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    return X_train, y_train

In [168]:
_smote_X_train, _smote_y_train = smote(_X_train, _y_train)

print('\nTrain shape:',_smote_y_train.shape)
print('Status ratio:',_smote_y_train[_smote_y_train['status']==1].shape[0],'|',_smote_y_train[_smote_y_train['status']==-1].shape[0])


Train shape: (564, 1)
Status ratio: 282 | 282


### Scaling

In [169]:
def normalize_data(X_train, X_test, scaler):
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    return X_train, X_test

def standardize_data(X_train, X_test):
    from sklearn.preprocessing import StandardScaler
    return normalize_data(X_train, X_test, StandardScaler())
def min_max_scaling(X_train, X_test):
    from sklearn.preprocessing import MinMaxScaler
    return normalize_data(X_train, X_test, MinMaxScaler())

### Algorithms

In [170]:
def add_model(name, model, use_smote=False):
    import datetime
    X_train = _X_train.copy()
    X_test = _X_test.copy()
    smote_X_train = _smote_X_train.copy()
    """
    if (name == 'dtc'):
        X_train.drop(['age_on_loan_request_disc'], inplace=True, axis=1)
        X_test.drop(['age_on_loan_request_disc'], inplace=True, axis=1)
        smote_X_train.drop(['age_on_loan_request_disc'], inplace=True, axis=1)
    else:
        X_train.drop(['age_on_loan_request'], inplace=True, axis=1)
        X_test.drop(['age_on_loan_request'], inplace=True, axis=1)
        smote_X_train.drop(['age_on_loan_request'], inplace=True, axis=1)
    """ 
    # Scaling Normalizations
    # X_train, X_test = standardize_data(X_train, X_test)
    # X_train, X_test = min_max_scaling(X_train, X_test)

    start = datetime.datetime.now()
    if use_smote: model.fit(smote_X_train, np.ravel(_smote_y_train.values))
    else: model.fit(X_train, np.ravel(_y_train.values))
    end = datetime.datetime.now()
    
    results[name] = {'model': model, 
                    'X_train': X_train, 
                    'X_test': X_test, 
                    'y_train': np.ravel(_y_train.values), 
                    'y_test': np.ravel(_y_test.values),
                    'fit_time': (end - start).microseconds / 1000 # time difference in milliseconds
                    }

In [171]:
SMOTE = False

In [172]:
from sklearn.tree import DecisionTreeClassifier
add_model('dtc',
    DecisionTreeClassifier(),
    use_smote=SMOTE
)

In [173]:
from sklearn.neighbors import KNeighborsClassifier
add_model('knn',
    KNeighborsClassifier(),
    use_smote=SMOTE
)

In [174]:
from sklearn.neural_network import MLPClassifier
add_model('mlp',
    MLPClassifier(),
    use_smote=SMOTE
)

In [175]:
from sklearn.naive_bayes import GaussianNB
add_model('gnb',
    GaussianNB(),
    use_smote=SMOTE
)

In [176]:
from sklearn.svm import SVC
add_model('svc',
    SVC(probability=True),
    use_smote=SMOTE
)

In [177]:
from sklearn.ensemble import RandomForestClassifier
add_model('rf',
    RandomForestClassifier(),
    use_smote=SMOTE
)

## Prediction

In [178]:
def predict_proba(name, isTrain=False):
    prefix = 'train' if isTrain else 'test'
    result = results[name]
    proba = result['model'].predict_proba(result['X_'+prefix])
    result[prefix+'pred_prob'] = proba

for name in results.keys():
    predict_proba(name)
for name in results.keys():
    predict_proba(name, isTrain=True)


In [180]:
d = {'Id': _ids_test.values, 'Predicted': results['rf']['testpred_prob'][:,0]}
df = pd.DataFrame(data=d)
df.to_csv('kaggle/submission.csv', index=False)

display(df)

Unnamed: 0,Id,Predicted
0,5895,0.02
1,7122,0.31
2,6173,0.01
3,6142,0.00
4,5358,0.40
...,...,...
349,4989,0.81
350,5221,0.09
351,6402,0.03
352,5346,0.58
