In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import RandomOverSampler

from tqdm import tqdm

from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the preprocessed training table, expected under ../Data/.
file = 'train_preprocessed_type_of_loan.csv'

# low_memory=False makes pandas read the file in one pass instead of chunked
# parsing, which avoids mixed-dtype guesses on large columns.
df = pd.read_csv(f"../Data/{file}", low_memory=False)

In [3]:
def features(df, Not_specified, A_OH):
    """Build the modelling table and split its columns into numeric / categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. Must contain a string column ``Type_of_Loan`` and a
        ``Credit_Score`` column. The frame is copied, never modified.
    Not_specified : bool-like
        Truthy: 'Not Specified' is kept as an ordinary loan type.
        Falsy: every ``Type_of_Loan`` entry containing 'Not Specified' is
        replaced with a missing value.
    A_OH : bool-like
        Truthy: ``Type_of_Loan`` is replaced by one column per most frequent
        ``Type_of_Loan`` value (top 10, or top 9 when 'Not Specified' is
        removed); each column holds how many times that value occurs in the
        row's ``Type_of_Loan`` string. Falsy: ``Type_of_Loan`` is kept as a
        single categorical column.

    Returns
    -------
    tmp : pandas.DataFrame
        Copy of ``df`` with every categorical feature cast to ``category``.
    features_num : list
        Remaining feature columns (everything except ``Credit_Score`` and the
        categorical ones), in the frame's column order.
    features_cat : list
        Categorical columns: the generated count columns (if any) followed by
        the object-dtype columns.

    Raises
    ------
    KeyError
        If ``Type_of_Loan`` or ``Credit_Score`` is missing from ``df``.
    """
    import re

    tmp = df.copy()

    if not Not_specified:
        # na=False: rows whose Type_of_Loan is already missing must not turn
        # the boolean mask into an NA-containing one (which .loc rejects).
        unspecified = tmp['Type_of_Loan'].str.contains('Not Specified', na=False)
        tmp.loc[unspecified, 'Type_of_Loan'] = None

    # Object columns are categorical in every configuration.
    features_cat = list(tmp.select_dtypes(include=["object"]).columns)

    if A_OH:
        n_types = 10 if Not_specified else 9
        uniq_types = list(tmp['Type_of_Loan'].value_counts().index[:n_types])
        for uniq_type in uniq_types:
            # str.count interprets its argument as a regex, so escape the
            # label to count literal occurrences only.
            tmp[uniq_type] = tmp['Type_of_Loan'].str.count(re.escape(uniq_type))

        tmp = tmp.drop(columns='Type_of_Loan')
        # Track the generated columns by name instead of "the last N numeric
        # columns": with fewer than N distinct types the positional slice
        # would also grab unrelated numeric columns (even the target).
        features_cat = uniq_types + [col for col in features_cat
                                     if col != 'Type_of_Loan']

    for cat in features_cat:
        tmp[cat] = tmp[cat].astype('category')

    # Keep the frame's column order rather than a set difference, whose
    # ordering changes from run to run.
    categorical = set(features_cat)
    features_num = [col for col in tmp.drop(columns='Credit_Score').columns
                    if col not in categorical]
    return tmp, features_num, features_cat

In [4]:
def data_splitting(df, cat, num, nan=False, random_state=None):
    """Split the feature table into train and test parts (80 % / 20 %, shuffled).

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns and ``Credit_Score``.
    cat, num : list
        Categorical and numeric feature column names. The returned feature
        frames hold the ``num`` columns first, then the ``cat`` columns.
    nan : bool-like, default False
        Truthy: rows with missing values are kept.
        Falsy: rows with any missing value are dropped and the index is reset.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    X_train, X_test, y_train, y_test
        As returned by ``train_test_split``.
    """
    # Only pay for dropna() when it is actually requested.
    if nan:
        tmp = df
    else:
        tmp = df.dropna().reset_index(drop=True)

    return train_test_split(tmp[num + cat],
                            tmp['Credit_Score'],
                            test_size=0.2,
                            shuffle=True,
                            random_state=random_state)


def data_resampler(X_train, y_train, enable=False, random_state=None):
    """Optionally balance the training split by random oversampling.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    enable : bool-like, default False
        Truthy: return the output of ``RandomOverSampler.fit_resample``.
        Falsy: return the inputs untouched.
    random_state : int or None, default None
        Seed forwarded to ``RandomOverSampler``; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, y_train)``, resampled when ``enable`` is truthy.
    """
    # Skip the (potentially expensive) resampling entirely when disabled.
    if not enable:
        return X_train, y_train

    sampler = RandomOverSampler(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

In [5]:
def table(df, model='SGD'):
    """Score every preprocessing configuration by the purity of its UMAP embedding.

    All 16 combinations of four 0/1 flags are evaluated:

       m: miss     - 1 keeps rows with missing values, 0 drops them
       s: sampling - 1 randomly oversamples the training split, 0 does not
       a: aug_OH   - 1 expands Type_of_Loan into per-type count columns
       n: not_spec - 1 keeps 'Not Specified' loan types, 0 blanks them out

    For each combination the training split is imputed, one-hot encoded,
    scaled with MaxAbsScaler and embedded with UMAP. An XGBoost classifier is
    then fitted on that embedding and its per-class precision *on the same
    training embedding* is recorded as ``Purity_0 .. Purity_2``. A scatter
    plot of the first two embedding dimensions is saved to
    ``../Results/Clusters/{m}_{s}_{a}_{n}.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table, passed to ``features`` for every combination.
    model : str, default 'SGD'
        Unused; kept only so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns
        ``m, s, a, n, Purity_0, Purity_1, Purity_2``.

    Notes
    -----
    The split, the oversampling, UMAP and XGBoost are all unseeded here, so
    the numbers differ from run to run. Exactly three classes are expected,
    otherwise the assignment into the 7-column result array fails.
    """
    import os
    from itertools import product
    from sklearn.metrics import precision_score
    import xgboost as xgb

    # Create the figure directory up front so a long run cannot fail at the
    # first savefig call.
    os.makedirs("../Results/Clusters", exist_ok=True)

    products = list(product([0, 1], repeat=4))
    result = np.zeros((len(products), 7))

    params = {'subsample': 0.5,
              'reg_lambda': 2,
              'reg_alpha': 1,
              'objective': 'multi:softmax',
              'n_estimators': 200,
              'min_child_weight': 1,
              'max_depth': 5,
              'learning_rate': 0.06,
              'colsample_bytree': 0.5}

    for i, conds in tqdm(enumerate(products), total=len(products)):
        m, s, a, n = conds
        tmp, num, cat = features(df, n, a)
        # Only the training split is used below; the test split is discarded.
        X_train, _, y_train, _ = data_splitting(tmp, cat, num, m)
        X_train, y_train = data_resampler(X_train, y_train, s)

        # OneHotEncoder returns a sparse matrix by default. The explicit
        # `sparse=` keyword was renamed to `sparse_output` in newer
        # scikit-learn, so it is left at its default for compatibility.
        cat_preprocessor = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ])
        num_preprocessor = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
        ])
        preprocessor = ColumnTransformer([
            ("numerical", num_preprocessor, num),
            ("categorical", cat_preprocessor, cat),
        ])

        X_train = preprocessor.fit_transform(X_train)
        X_train = MaxAbsScaler().fit_transform(X_train)
        # ColumnTransformer may return either a sparse or a dense matrix;
        # densify explicitly instead of relying on a bare except.
        if hasattr(X_train, "toarray"):
            X_train = X_train.toarray()

        reducer = UMAP(n_neighbors=15)
        embedding = reducer.fit_transform(X_train)

        clf = xgb.XGBClassifier(**params)
        clf.fit(X=embedding, y=y_train)
        y_hat = clf.predict(embedding)

        # average=None yields one precision value per class.
        res = precision_score(y_true=y_train, y_pred=y_hat, average=None)

        fig, ax = plt.subplots(figsize=(7, 7))
        scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=1, c=y_train)
        ax.set_xlabel('UMAP feature 0')
        ax.set_ylabel('UMAP feature 1')
        ax.set_title('UMAP projection')
        ax.legend(*scatter.legend_elements(), title="Classes")
        fig.savefig(f"../Results/Clusters/{m}_{s}_{a}_{n}.png", transparent=True, dpi=600)
        # Close each figure so 16 high-dpi figures do not pile up in memory.
        plt.close(fig)

        result[i, :] = *conds, *res

    cols = ['m', 's', 'a', 'n', 'Purity_0', 'Purity_1', 'Purity_2']
    return pd.DataFrame(result, columns=cols)

In [6]:
# Evaluate all 16 flag combinations. The recorded run took about 21.5 minutes
# (roughly 81 s per combination), so avoid re-running this cell needlessly.
res = table(df)

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
16it [21:33, 80.87s/it] 


In [16]:
from scipy.stats import hmean

# Overall purity = harmonic mean of the three per-class purities.
# The class columns are selected by name, not as "the last three columns":
# once 'Purity' has been added, a positional slice would pick up
# Purity_1, Purity_2 and Purity itself, so re-running the cell would
# silently change the result.
purity_cols = ['Purity_0', 'Purity_1', 'Purity_2']
res['Purity'] = hmean(res[purity_cols].to_numpy(), axis=1)
res.to_csv("../Results/Purity.csv", index=False)

In [17]:
# Display the purity table: one row per m/s/a/n configuration.
res

Unnamed: 0,m,s,a,n,Purity_0,Purity_1,Purity_2,Purity
0,0.0,0.0,0.0,0.0,0.630898,0.726288,0.500338,0.596537
1,0.0,0.0,0.0,1.0,0.614708,0.699983,0.497452,0.585092
2,0.0,0.0,1.0,0.0,0.623813,0.698375,0.530984,0.605498
3,0.0,0.0,1.0,1.0,0.607401,0.696689,0.521291,0.597567
4,0.0,1.0,0.0,0.0,0.581735,0.557866,0.604271,0.580319
5,0.0,1.0,0.0,1.0,0.609919,0.551367,0.655711,0.60022
6,0.0,1.0,1.0,0.0,0.581416,0.560859,0.579371,0.571215
7,0.0,1.0,1.0,1.0,0.538345,0.571086,0.597691,0.578623
8,1.0,0.0,0.0,0.0,0.612471,0.686547,0.500166,0.582285
9,1.0,0.0,0.0,1.0,0.621269,0.688488,0.503048,0.585519
