In [20]:
import os.path
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
np.set_printoptions(precision=3, suppress=True)
import seaborn as sns
sns.set(style='whitegrid')
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from keras import regularizers
from sklearn.model_selection import train_test_split
import zipfile
import uuid

In [38]:
class Data():
    """Access layer for the South German Credit data files.

    The German-named source file is translated once into a CSV with English
    column names; every other method reads that English CSV.
    """

    def __init__(self, data_path = 'Data/SouthGermanCredit/', german = 'SouthGermanCredit.asc', english = 'SouthGermanCredit_english.csv'):
        """Remember the file locations and create the English CSV if it is missing.

        Args:
            data_path: Directory that holds both files.
            german: File name of the space-separated source file.
            english: File name of the translated CSV.
        """
        self.data_path = data_path
        self.german = german
        self.english = english
        if not os.path.isfile(self._path(self.english)):
            self.translate()

    def _path(self, filename):
        """Return ``filename`` located inside ``self.data_path``."""
        # os.path.join works whether or not data_path ends with a separator.
        return os.path.join(self.data_path, filename)

    def translate(self):
        """Rename the German columns to English and write the result as CSV."""
        german_df = pd.read_csv(self._path(self.german), sep = ' ')
        english_df = german_df.rename(columns = {
            'laufkont': 'status',
            'laufzeit': 'duration',
            'moral': 'credit_history',
            'verw': 'purpose',
            'hoehe': 'amount',
            'sparkont': 'savings',
            'beszeit': 'employment_duration',
            'rate': 'installment_rate',
            'famges': 'personal_status_sex',
            'buerge': 'other_debtors',
            'wohnzeit': 'present_residence',
            'verm': 'property',
            'alter': 'age',
            'weitkred': 'other_installment_plans',
            'wohn': 'housing',
            'bishkred': 'number_credits',
            'beruf': 'job',
            'pers': 'people_liable',
            'telef': 'telephone',
            'gastarb': 'foreign_worker',
            'kredit': 'credit_risk'
        })
        english_df.to_csv(self._path(self.english), index = False)

    def view_data(self):
        """Print the column names, summary statistics and first rows of the English CSV."""
        df = pd.read_csv(self._path(self.english))
        print(df.columns)
        print(df.describe())
        print(df.head())

    def get_data(self):
        """Return the English CSV as a DataFrame with every column cast to float32."""
        return pd.read_csv(self._path(self.english)).astype(np.float32)

    def gen_uuid(self, df, uuid_list = None):
        """Return a copy of ``df`` indexed by freshly generated UUID strings.

        Args:
            df: Frame to copy; it is not modified.
            uuid_list: Optional list. When given, one new UUID per row is
                appended to it and the ids are assigned to rows by label
                alignment against ``pd.Series(uuid_list)`` (the historical
                behaviour). When omitted, ids are assigned positionally.

        Returns:
            A copy of ``df`` whose index is named ``'id'``.
        """
        fresh_ids = [str(uuid.uuid4()) for _ in range(df.shape[0])]
        df_uuid = df.copy()
        if uuid_list is None:
            # A new list per call: a shared default list would make every
            # later call reuse the ids generated by the first one.
            # Assigning a plain list is positional, so it also works for
            # frames whose index is not 0..n-1.
            df_uuid['id'] = fresh_ids
        else:
            uuid_list.extend(fresh_ids)
            df_uuid['id'] = pd.Series(uuid_list)
        return df_uuid.set_index('id')

    def get_data_vfl(self, df, display = False):
        """Split ``df`` column-wise into two frames that share the same index.

        Args:
            df: Frame containing all English-named columns.
            display: When true, print the head of both frames.

        Returns:
            ``(df[data_1], df[data_2])``; the second frame carries the
            ``credit_risk`` label column.
        """
        data_1 = ['credit_history', 'purpose', 'employment_duration', 'personal_status_sex',
                  'present_residence', 'age', 'housing', 'job', 'telephone', 'foreign_worker']

        # has label [credit_risk]
        data_2 = ['status', 'duration', 'amount', 'savings', 'installment_rate', 'other_debtors',
                  'property', 'other_installment_plans', 'number_credits', 'people_liable', 'credit_risk']
        if (display):
            print(df[data_1].head())
            print('----------------------')
            print(df[data_2].head())
        return df[data_1], df[data_2]

In [46]:
class Helpers():
    """Plotting and metric helpers for a two-class classifier."""

    def __init__(self):
        pass

    def plot_loss(self, loss, accuracy):
        """Draw ``loss`` and ``accuracy`` as two labelled lines on the current axes.

        Args:
            loss: Sequence of loss values, plotted against its position.
            accuracy: Sequence of accuracy values, plotted the same way.
        """
        plt.plot(loss, label='loss')
        plt.plot(accuracy, label='accuracy')
        plt.xlabel('Epoch')
        plt.legend()
        plt.grid(True)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 when the ratio is undefined."""
        return numerator / denominator if denominator else 0.0

    def plot_accuracy(self, predictions, answers):
        """Print accuracy, precision, recall and F-measure for class ``1``.

        Args:
            predictions: Indexable of per-sample score vectors; the predicted
                class of a sample is the ``np.argmax`` of its vector.
            answers: True labels, one per prediction. May be a list, a 1-D
                array, an ``(n, 1)`` array or a pandas Series (read by
                position). Entries beyond ``len(predictions)`` are ignored.

        Raises:
            ValueError: If ``answers`` is not one label per sample or holds
                fewer labels than there are predictions.

        A metric whose denominator is zero (for example precision when class
        ``1`` is never predicted) is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
        """
        n_samples = len(predictions)
        predicted = np.array(
            [np.argmax(predictions[i]) for i in range(n_samples)], dtype=np.int64
        )

        # np.asarray gives positional access, so labelled containers such as
        # a pandas Series with a non-integer index are handled correctly.
        actual = np.asarray(answers)
        if actual.ndim == 2 and actual.shape[1] == 1:
            actual = actual[:, 0]
        if actual.ndim != 1 or actual.shape[0] < n_samples:
            raise ValueError(
                "answers must provide one label per prediction "
                "(got shape %s for %d predictions)" % (actual.shape, n_samples)
            )
        actual = actual[:n_samples]

        is_positive = actual == 1
        tp = int(np.count_nonzero(is_positive & (predicted == 1)))
        fn = int(np.count_nonzero(is_positive & (predicted != 1)))
        tn = int(np.count_nonzero(~is_positive & (predicted == 0)))
        fp = int(np.count_nonzero(~is_positive & (predicted != 0)))

        accuracy = self._ratio(tp + tn, tp + fp + fn + tn)
        precision = self._ratio(tp, tp + fp)
        recall = self._ratio(tp, tp + fn)
        f_measure = self._ratio(2 * (recall * precision), recall + precision)
        print("Accuracy: " + str(accuracy))
        print("Precision: " + str(precision))
        print("Recall: " + str(recall))
        print("F-Measure: " + str(f_measure))

    def convert_to_non_sparse(self, sparse):
        """Expand 0/1 labels into two-column rows ``[1 - label, label]``.

        Args:
            sparse: Sequence of numeric labels (list, array or pandas Series).

        Returns:
            Float array of shape ``(len(sparse), 2)``.
        """
        labels = np.asarray(sparse, dtype=float)
        return np.column_stack((1 - labels, labels))

In [45]:
# Load the data, give every row a UUID index, and split the columns between
# the two parties (df_2 is the one holding the credit_risk label).
data_class = Data()
original_df = data_class.get_data()
df_id = data_class.gen_uuid(original_df)
df_1, df_2 = data_class.get_data_vfl(df_id)

# Split party 1 by row, then select the same row ids from party 2 so both
# parties' train/test sets describe the same samples.
df_1_train, df_1_test = train_test_split(df_1, test_size = 0.2, random_state = 69)
df_2_train, df_2_test = df_2.loc[df_1_train.index], df_2.loc[df_1_test.index]
assert df_2_train.index.equals(df_1_train.index) and df_2_test.index.equals(df_1_test.index)