In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import  train_test_split


import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

from ccnets.config import get_parser
from ccnets.ccnets import CCNets
from ccnets.resnets import ResNets
from nn.custom_deepfm import DeepFM
from nn.custom_dnn import ResMLP, MLP 
from ccnets.utils.log import create_log_details, create_log_name
from ccnets.utils.setting import set_random_seed
import torch
import os
from torch.utils.tensorboard import SummaryWriter


In [2]:
df = pd.read_csv('./data/FinanceCompanyLoanData.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
df.shape

(614, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
df['ApplicantIncome'] = df['ApplicantIncome'].astype('float64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    float64
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(5), object(8)
memory usage: 62.5+ KB


In [6]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
df.fillna(0, inplace = True)
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [8]:
df['Loan_Status'].value_counts()

Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [9]:
del_cols = ['Loan_ID']

df.drop(columns=del_cols, axis = 1, inplace = True)

In [10]:
target = 'Loan_Status'

X = df.drop(columns = target, axis = 1)
y = df[target]

In [11]:
col_names = X.columns

In [12]:
X.shape

(614, 11)

In [13]:
sc = RobustScaler()
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

X[cols] = sc.fit_transform(X[cols])
# X = pd.DataFrame(X, columns = cols)
type(X)


pandas.core.frame.DataFrame

In [14]:
# oh_encoder = OneHotEncoder()
# X = oh_encoder.fit_transform(X).toarray()

# X

In [15]:
dumm_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

X = pd.get_dummies(X, columns= dumm_cols)

X

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_0,Gender_Female,Gender_Male,Married_0,Married_No,...,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_0,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.698029,-0.517358,-1.872659,0.0,0.0,False,False,True,False,True,...,False,False,True,False,False,True,False,False,False,True
1,0.264096,0.139079,0.044944,0.0,0.0,False,False,True,False,False,...,False,False,True,False,False,True,False,True,False,False
2,-0.278492,-0.517358,-0.883895,0.0,0.0,False,False,True,False,False,...,False,False,True,False,False,False,True,False,False,True
3,-0.421422,0.509087,-0.074906,0.0,0.0,False,False,True,False,False,...,False,False,False,True,False,True,False,False,False,True
4,0.749786,-0.517358,0.239700,0.0,0.0,False,False,True,False,True,...,False,False,True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.312768,-0.517358,-0.808989,0.0,0.0,False,True,False,False,True,...,False,False,True,False,False,True,False,True,False,False
610,0.100600,-0.517358,-1.273408,-180.0,0.0,False,False,True,False,False,...,False,True,True,False,False,True,False,True,False,False
611,1.459983,-0.412885,1.917603,0.0,0.0,False,False,True,False,False,...,False,False,True,False,False,True,False,False,False,True
612,1.292374,-0.517358,0.928839,0.0,0.0,False,False,True,False,False,...,True,False,True,False,False,True,False,False,False,True


In [16]:
X = X.applymap(lambda x: int(x) if isinstance(x, bool) else x)


In [17]:
X.shape

(614, 24)

In [18]:
y = pd.get_dummies(y)
y = y.astype(int)

In [19]:
print(X.shape)
print(y.shape)
print(X.info())

(614, 24)
(614, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ApplicantIncome          614 non-null    float64
 1   CoapplicantIncome        614 non-null    float64
 2   LoanAmount               614 non-null    float64
 3   Loan_Amount_Term         614 non-null    float64
 4   Credit_History           614 non-null    float64
 5   Gender_0                 614 non-null    int64  
 6   Gender_Female            614 non-null    int64  
 7   Gender_Male              614 non-null    int64  
 8   Married_0                614 non-null    int64  
 9   Married_No               614 non-null    int64  
 10  Married_Yes              614 non-null    int64  
 11  Dependents_0             614 non-null    int64  
 12  Dependents_0             614 non-null    int64  
 13  Dependents_1             614 non-null    int64  
 14  Depende

In [20]:
X

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_0,Gender_Female,Gender_Male,Married_0,Married_No,...,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_0,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.698029,-0.517358,-1.872659,0.0,0.0,0,0,1,0,1,...,0,0,1,0,0,1,0,0,0,1
1,0.264096,0.139079,0.044944,0.0,0.0,0,0,1,0,0,...,0,0,1,0,0,1,0,1,0,0
2,-0.278492,-0.517358,-0.883895,0.0,0.0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
3,-0.421422,0.509087,-0.074906,0.0,0.0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
4,0.749786,-0.517358,0.239700,0.0,0.0,0,0,1,0,1,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.312768,-0.517358,-0.808989,0.0,0.0,0,1,0,0,1,...,0,0,1,0,0,1,0,1,0,0
610,0.100600,-0.517358,-1.273408,-180.0,0.0,0,0,1,0,0,...,0,1,1,0,0,1,0,1,0,0
611,1.459983,-0.412885,1.917603,0.0,0.0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
612,1.292374,-0.517358,0.928839,0.0,0.0,0,0,1,0,0,...,1,0,1,0,0,1,0,0,0,1


In [21]:

class Dataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        vals = torch.tensor(self.x[index], dtype = torch.float32)
        label = torch.tensor(self.y[index], dtype= torch.float32).unsqueeze(-1)
        return vals, label


In [22]:
args = get_parser()
args.device = torch.device('cuda:0' if (torch.cuda.is_available() and args.ngpu > 0) else "cpu")

In [23]:
import IPython ; file_path = IPython.extract_module_locals()[1]['__vsc_ipynb_file__']
from pathlib import Path
file_name = Path(file_path).stem
model_path = path_append + f"models/{file_name}/"
temp_path = path_append + f"models/{'temp_'}{file_name}/"
log_path = path_append + f"log/{file_name}/"


if Path(temp_path).exists() is False: 
    os.mkdir(temp_path)

if Path(model_path).exists() is False: 
    os.mkdir(model_path)

if Path(log_path).exists() is False: 
    os.mkdir(log_path)  

args.model_path = model_path
args.temp_path = temp_path

In [28]:
args.num_epoch = 2000
args.lr = 2e-4
args.batch_size = 64
args.step_size = 10

args.num_layer = 3
args.hidden_size = 128

args.obs_size = 24
args.label_size = 1
args.explain_size = 6
args.seq_len = 0

args.num_checkpoints = 100
args.use_one_hot = False

args.reasoner_joint_type = "add"
args.producer_joint_type = "add"
args.label_type = "UC" 

args.obs_fn = "none"
args.label_fn = "none"
test_size = 0.2

X = X[:]
# args.num_epoch = int(round(3.2/(1 - test_size)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle= False)


X_train = X_train.iloc[:, :].values 
X_test = X_test.iloc[:, :].values 
y_train = y_train.iloc[:, -1].values
y_test = y_test.iloc[:, -1].values

trainset = Dataset(X_train, y_train)
testset = Dataset(X_test, y_test)


In [29]:

args.loss_type = "L1"
args.error_type = "Sub"
args.loss_reduction = "all"
args.error_reduction = "none"
log_details = create_log_details(args)
args.log = SummaryWriter(log_dir=create_log_name(log_path, log_details))
set_random_seed(0)

ccnets = CCNets(args, MLP, DeepFM, ResMLP)
ccnets.train(trainset, testset)

# args.loss_type = "MSE"
# args.loss_reduction = "all"
# log_details = create_log_details(args)
# args.log = SummaryWriter(log_dir=create_log_name(log_path, log_details))
# set_random_seed(0)
# resnets = ResNets(args, MLP, DeepFM)
# resnets.train(trainset, testset)

  0%|          | 0/2000 [00:00<?, ?it/s]

[14/2000][1/8][Time 1.86]
Opt-Adam lr: 0.00019908190206581545
Inf: 0.0094	Gen: 1.6442	Rec: 1.6437	E: 0.0099	R: 0.0089	P: 3.2784
precision: 0.6917	recall: 0.9881	f1: 0.8137
[28/2000][3/8][Time 1.64]
Opt-Adam lr: 0.0001981680186507147
Inf: 0.0023	Gen: 1.6153	Rec: 1.6151	E: 0.0024	R: 0.0022	P: 3.2281
precision: 0.6829	recall: 1.0000	f1: 0.8116
[42/2000][5/8][Time 1.65]
Opt-Adam lr: 0.0001972583304079914
Inf: 0.0029	Gen: 1.5742	Rec: 1.5739	E: 0.0032	R: 0.0026	P: 3.1451
precision: 0.6829	recall: 1.0000	f1: 0.8116
[57/2000][0/8][Time 1.71]
Opt-Adam lr: 0.00019635281807975005
Inf: 0.0047	Gen: 1.5606	Rec: 1.5597	E: 0.0056	R: 0.0039	P: 3.1155
precision: 0.6829	recall: 1.0000	f1: 0.8116
[71/2000][2/8][Time 1.67]
Opt-Adam lr: 0.0001954514624964984
Inf: 0.0252	Gen: 1.2726	Rec: 1.2667	E: 0.0310	R: 0.0193	P: 2.5141
precision: 0.6833	recall: 0.9762	f1: 0.8039
[85/2000][4/8][Time 1.62]
Opt-Adam lr: 0.00019455424457674148
Inf: 0.0606	Gen: 0.6838	Rec: 0.6578	E: 0.0867	R: 0.0346	P: 1.2810
precision: 0.68

In [26]:
X.shape

(614, 24)

In [27]:
y.shape

(614, 2)