In [173]:
import numpy as np 
import pandas as pd 
from sklearn.utils import shuffle

application = pd.read_csv('/kaggle/input/credit-card-approval-prediction/application_record.csv')
credit = pd.read_csv('/kaggle/input/credit-card-approval-prediction/credit_record.csv')

In [174]:
ids = set(application['ID']).intersection(set(credit['ID']))
application = application[application['ID'].isin(ids)]
credit = credit[credit['ID'].isin(ids)]

# **Split Train Test**

In [175]:
train_size = len(application)*80//100
test_size = len(application) - train_size
print(f'Train size: {train_size}, Test size: {test_size}')

Train size: 29165, Test size: 7292


In [176]:
application = shuffle(application)

x_train = application[:train_size]
x_test = application[train_size:]

y_train = credit[credit['ID'].isin(x_train['ID'])]
y_test = credit[credit['ID'].isin(x_test['ID'])]

# **Encode Application Dataset**

In [177]:
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

In [178]:
x_test

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,...,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff
16571,5047808,0,103500.0,-14276,-2561,1,1,1,0,2.0,...,0,0,0,0,0,0,1,0,0,0
22615,5067016,0,225000.0,-9926,-461,1,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
29190,5090479,0,94500.0,-16161,-2897,1,1,1,0,2.0,...,1,0,0,0,0,0,0,0,0,0
8592,5029721,1,135000.0,-9778,-2910,1,1,0,1,3.0,...,0,0,0,0,0,0,0,0,0,0
18033,5052931,0,54000.0,-24044,365243,1,0,1,0,2.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21271,5062251,0,315000.0,-19441,-4064,1,0,0,0,2.0,...,0,0,1,0,0,0,0,0,0,0
21876,5065580,2,144000.0,-11147,-262,1,0,0,0,4.0,...,0,0,0,0,0,0,0,0,0,0
434660,5096068,0,180000.0,-14943,-1650,1,1,0,1,2.0,...,0,0,0,0,0,0,0,0,0,0
49339,5149989,0,315000.0,-16805,-806,1,0,0,0,2.0,...,0,0,0,0,0,0,0,0,0,0


# **Encode Credit Dataset**

In [179]:
def get_credit_status(credit):
    group=credit.groupby('ID')
    pivot_tb = credit.pivot(index = 'ID', columns = 'MONTHS_BALANCE', values = 'STATUS')
    pivot_tb['open_month'] = group['MONTHS_BALANCE'].min()
    pivot_tb['end_month'] = group['MONTHS_BALANCE'].max() 
    pivot_tb['ID'] = pivot_tb.index
    pivot_tb = pivot_tb[['ID', 'open_month', 'end_month']]
    pivot_tb['window'] = pivot_tb['end_month'] - pivot_tb['open_month'] 
    pivot_tb.reset_index(drop = True, inplace = True)
    credit0 = credit.copy()
    credit0 = pd.merge(credit0, pivot_tb, on = 'ID', how = 'left') 
    credit0=credit0[credit0['window']>20]
    credit0['status']=np.where((credit0['STATUS']=='2')| (credit0['STATUS']=='3')|(credit0['STATUS']=='4')|(credit0['STATUS']=='5'),1,0)
    
    return credit0

In [180]:
y_train = get_credit_status(y_train)[['ID','status']]
y_test = get_credit_status(y_test)[['ID','status']]

In [181]:
y_train = y_train.groupby('ID').any().reset_index()
y_test = y_test.groupby('ID').any().reset_index()

In [182]:
y_test

Unnamed: 0,ID,status
0,5008832,False
1,5008838,False
2,5008952,False
3,5008967,False
4,5008979,False
...,...,...
3052,5150386,False
3053,5150400,False
3054,5150405,False
3055,5150411,False


# **Merge x and y together to make sure the ids matches**

In [183]:
merged_train = x_train.merge(y_train, on='ID')
merged_test = x_test.merge(y_test, on='ID')

In [184]:
x_train = merged_train.drop(['ID', 'status'],axis=1)
x_test = merged_test.drop(['ID', 'status'],axis=1)
y_train = merged_train['status']
y_test = merged_test['status']

# **Models**

In [191]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

In [189]:
lr_model = LogisticRegression()
lr_model = lr_model.fit(x_train, y_train)

In [193]:
predictions = lr_model.predict(x_test)

print(f'Accuracy: {accuracy_score(y_test, predictions)}')
print(f'F1 score: {f1_score(y_test, predictions)}')

Accuracy: 0.9653254824991822
F1 score: 0.0
