In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import RandomOverSampler

In [161]:
file = 'train_preprocessed_type_of_loan.csv'

df = pd.read_csv("../Data/" + file, low_memory=False)

In [162]:
df

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,January,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,Auto Loan Credit-Builder Loan Home Equity Loan...,...,Good,809.98,26.822620,265.0,No,49.574949,80.415295,HighspentSmallvaluepayments,312.494089,2
1,February,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,Auto Loan Credit-Builder Loan Home Equity Loan...,...,Good,809.98,31.944960,266.0,No,49.574949,118.280222,LowspentLargevaluepayments,284.629162,2
2,March,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,Auto Loan Credit-Builder Loan Home Equity Loan...,...,Good,809.98,28.609352,267.0,No,49.574949,81.699521,LowspentMediumvaluepayments,331.209863,2
3,April,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,Auto Loan Credit-Builder Loan Home Equity Loan...,...,Good,809.98,31.377862,268.0,No,49.574949,199.458074,LowspentSmallvaluepayments,223.451310,2
4,May,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,Auto Loan Credit-Builder Loan Home Equity Loan...,...,Good,809.98,24.797347,269.0,No,49.574949,41.420153,HighspentMediumvaluepayments,341.489231,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,April,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,Auto Loan Student Loan,...,,502.38,34.663572,378.0,No,35.104023,60.971333,HighspentLargevaluepayments,479.866228,0
99996,May,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,Auto Loan Student Loan,...,,502.38,40.565631,379.0,No,35.104023,54.185950,HighspentMediumvaluepayments,496.651610,0
99997,June,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,Auto Loan Student Loan,...,,502.38,41.255522,380.0,No,35.104023,24.028477,HighspentLargevaluepayments,516.809083,0
99998,July,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,Auto Loan Student Loan,...,,502.38,33.638208,381.0,No,35.104023,251.672582,LowspentLargevaluepayments,319.164979,1


In [3]:
def features(df, Not_specified, A_OH):
    tmp = df.copy()
    if Not_specified:
        if A_OH:
            uniq_types = tmp['Type_of_Loan'].value_counts()[:10].index
            for uniq_type in uniq_types:
                tmp[uniq_type] = tmp['Type_of_Loan'].str.count(uniq_type)
            
            features_num = list(tmp.select_dtypes(include=["number"]).columns[-10:])
            features_obj = list(tmp.select_dtypes(include=["object"]).columns)
            
            features_cat = features_num + features_obj
            for cat in features_cat:
                tmp[cat] = tmp[cat].astype('category')
            
            tmp.drop(columns='Type_of_Loan', inplace=True)
            features_cat.remove('Type_of_Loan')
        else:
            features_cat = list(tmp.select_dtypes(include=["object"]).columns)
            
            for cat in features_cat:
                tmp[cat] = tmp[cat].astype('category')
    else:
        tmp.loc[tmp['Type_of_Loan'].str.contains('Not Specified'),
                'Type_of_Loan'] = None
        
        if A_OH:
            uniq_types = tmp['Type_of_Loan'].value_counts()[:9].index
            for uniq_type in uniq_types:
                tmp[uniq_type] = tmp['Type_of_Loan'].str.count(uniq_type)
            
            features_num = list(tmp.select_dtypes(include=["number"]).columns[-9:])
            features_obj = list(tmp.select_dtypes(include=["object"]).columns)
            
            features_cat = features_num + features_obj
            for cat in features_cat:
                tmp[cat] = tmp[cat].astype('category')
                
            tmp.drop(columns='Type_of_Loan', inplace=True)
            features_cat.remove('Type_of_Loan')
        else:
            features_cat = list(tmp.select_dtypes(include=["object"]).columns)
            
            for cat in features_cat:
                tmp[cat] = tmp[cat].astype('category')
    
    features_num = list(set(tmp.columns) - set(features_cat))
    return tmp, features_num, features_cat   

In [147]:
def data_splitting(df, cat, num, nan=False):
    tmp = (nan*[df] + (1-nan)*[df.dropna().reset_index(drop=True)])[0].copy()        
    return train_test_split(tmp[num + cat],
                            tmp['Credit_Score'], test_size=0.2, shuffle=True)


def data_resampler(X_train, y_train, enable=False):
    sampler = RandomOverSampler()
    X_train, y_train = enable*sampler.fit_resample(X_train, y_train) + \
                       (1-enable)*(X_train, y_train)
    return X_train, y_train   

In [159]:
def table(df, model='SGD'):
    """m: miss
       s: sampling
       a: aug_OH
       n: not_spec
    """
    
    from itertools import product
    from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
    
    conditions = [[0, 1]]*4
    products = list(product(*conditions))
    n, m = len(products), 12
    result = np.zeros((n, m))
    
    for i, conds in tqdm(enumerate(products)):
        m, s, a, n = conds
        tmp, num, cat = features(df, n, a)
        X_train, X_test, y_train, y_test = data_splitting(tmp, cat, num, m)
        X_train, y_train = data_resampler(X_train, y_train, s)
        
        cat_preprocessor = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                                    ("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        num_preprocessor = Pipeline([
                                    ("imputer", SimpleImputer(strategy="median"))])

        preprocessor = ColumnTransformer([
            ("numerical", num_preprocessor, num),
            ("categorical", cat_preprocessor, cat)
        ])

        ## Transforming
        X_train = preprocessor.fit_transform(X_train)
        X_test = preprocessor.transform(X_test)

        scaler = MaxAbsScaler()
        try:
            X_train = scaler.fit_transform(X_train).toarray()
            X_test = scaler.transform(X_test).toarray()
        except:
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)     
            
        svc = SGDClassifier(loss="hinge", n_jobs=10, validation_fraction=0.25)
        svc.fit(X_train, y_train)       
                    
        f1_train = f1_score(svc.predict(X_train), y_train, average=None)
        acc_train = accuracy_score(svc.predict(X_train), y_train)
        f1_test = f1_score(svc.predict(X_test), y_test, average=None)
        acc_test = accuracy_score(svc.predict(X_test), y_test)        
        
        result[i,:] = *conds, *f1_train, acc_train, *f1_test, acc_test


    cols = ['m', 's', 'a', 'n', 'F1_1_train', 'F1_2_train', 'F1_3_train', 'Train',
            'F1_1_test', 'F1_2_test', 'F1_3_test', 'Test']            
    return pd.DataFrame(result, columns=cols)

In [160]:
table(df)

16it [02:36,  9.79s/it]


Unnamed: 0,m,s,a,n,F1_1_train,F1_2_train,F1_3_train,Train,F1_1_test,F1_2_test,F1_3_test,Test
0,0.0,0.0,0.0,0.0,0.912056,0.94572,0.974008,0.942266,0.900506,0.938712,0.973855,0.935219
1,0.0,0.0,0.0,1.0,0.853254,0.864553,0.895376,0.865783,0.845164,0.857993,0.892752,0.859477
2,0.0,0.0,1.0,0.0,0.814861,0.874771,0.971123,0.875384,0.813212,0.871918,0.975547,0.874334
3,0.0,0.0,1.0,1.0,0.875451,0.862648,0.847974,0.864293,0.871784,0.857689,0.838154,0.859056
4,0.0,1.0,0.0,0.0,0.90947,0.839622,0.949218,0.900773,0.821393,0.859565,0.920011,0.859061
5,0.0,1.0,0.0,1.0,0.907167,0.819251,0.921169,0.883053,0.847777,0.868882,0.91329,0.868873
6,0.0,1.0,1.0,0.0,0.898497,0.806505,0.922888,0.876783,0.788588,0.835809,0.908237,0.834563
7,0.0,1.0,1.0,1.0,0.915755,0.814119,0.902426,0.878392,0.853656,0.861607,0.890787,0.863965
8,1.0,0.0,0.0,0.0,0.851136,0.861408,0.88682,0.862,0.850995,0.861684,0.888821,0.8625
9,1.0,0.0,0.0,1.0,0.848865,0.861024,0.890678,0.861637,0.850406,0.862406,0.893249,0.8633


In [165]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

m, s, a, n = (0, 0, 0, 0)
tmp, num, cat = features(df, n, a)
X_train, X_test, y_train, y_test = data_splitting(tmp, cat, num, m)
X_train, y_train = data_resampler(X_train, y_train, s)

cat_preprocessor = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                            ("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))])

num_preprocessor = Pipeline([
                            ("imputer", SimpleImputer(strategy="median"))])

preprocessor = ColumnTransformer([
    ("numerical", num_preprocessor, num),
    ("categorical", cat_preprocessor, cat)
])

## Transforming
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

scaler = MaxAbsScaler()
try:
    X_train = scaler.fit_transform(X_train).toarray()
    X_test = scaler.transform(X_test).toarray()
except:
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)     
    
svc = SGDClassifier(loss="hinge", n_jobs=10, validation_fraction=0.25)
svc.fit(X_train, y_train)       
            
f1_train = f1_score(svc.predict(X_train), y_train, average=None)
acc_train = accuracy_score(svc.predict(X_train), y_train)
f1_test = f1_score(svc.predict(X_test), y_test, average=None)
acc_test = accuracy_score(svc.predict(X_test), y_test)

In [166]:
f1_train, acc_train

(array([0.91794676, 0.93758602, 0.94479546]), 0.9337587125871258)

In [167]:
f1_test, acc_test

(array([0.91759295, 0.93777864, 0.94668008]), 0.9341943419434194)

In [44]:
type(svc).__name__

'SGDClassifier'

In [45]:
import xgboost as xgb

In [49]:
xgb_class = xgb.XGBClassifier()
type(xgb_class).__name__

'XGBClassifier'

In [54]:
import tensorflow
from tensorflow.keras import Sequential, initializers
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy

Arch = [Dense(units=299, activation='relu')]
model = Sequential(Arch)

model.compile(loss=SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

model

<keras.engine.sequential.Sequential at 0x1f55b3dc670>

In [56]:
type(model).__name__

'Sequential'