In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Data cleansing v1
# Load the raw sample and discard columns that carry no usable signal.
# The reason for each removal is noted next to the column name; the counts
# quoted below are the author's observations on this sample file.
df = pd.read_csv("lending_club_sample.csv").drop(
    columns=[
        "id",                        # plain identifier
        "url",                       # plain URL
        "title",                     # already described, in categorized form, by `purpose`
        "application_type",          # 980 rows are "individual"
        "next_pymnt_d",              # column is empty
        "policy_code",               # every value is 1
        "chargeoff_within_12_mths",  # 995 rows are 0
        "delinq_amnt",               # 997 rows are 0
        "num_tl_120dpd_2m",          # exactly 1 non-zero value
        "num_tl_30dpd",              # exactly 2 non-zero values
        "hardship_flag",             # every value is N
        "disbursement_method",       # 997 rows are cash
    ]
)

# Persist the trimmed frame (without the index) as the input of the next step.
df.to_csv("data_preprocessed_v1.csv", index=False)

In [19]:
# Data cleansing v2
# Starts from the file written by the v1 step, so this cell can be re-run on
# its own without depending on the in-memory `df` of an earlier cell.
df = pd.read_csv("./data_preprocessed_v1.csv")

# --- Integer-encode the ordinal categorical columns `grade` / `sub_grade` ---
# References to the original, un-encoded columns, captured before the
# columns are overwritten below.
og_grade = df["grade"]
og_sub_grade = df["sub_grade"]

grade = np.array(df["grade"])
sub_grade = np.array(df["sub_grade"])

# Use one encoder per column. Re-fitting a single LabelEncoder on `sub_grade`
# would overwrite the `classes_` learned for `grade`, making it impossible to
# decode (inverse_transform) or re-apply the grade mapping afterwards.
# LabelEncoder numbers labels by their sorted order, so the codes depend on
# which labels are present in this sample.
grade_encoder = LabelEncoder()
sub_grade_encoder = LabelEncoder()
grade_encoded = grade_encoder.fit_transform(grade)
sub_grade_encoded = sub_grade_encoder.fit_transform(sub_grade)

# Backward compatibility: `encoder` used to end up fitted on `sub_grade`;
# keep that name bound to the equivalent object for any later cell using it.
encoder = sub_grade_encoder

df["grade"] = grade_encoded
df["sub_grade"] = sub_grade_encoded

# --- Drop sparse columns ---
# Keep only columns that have at least MIN_NON_NULL non-missing values.
MIN_NON_NULL = 50
df = df.loc[:, df.count() >= MIN_NON_NULL]

# --- Replace the FICO range with its midpoint ---
# Insert `fico_avg` at the position currently held by `fico_range_low`, so the
# new column sits where the range columns were once they are dropped.
insert_loc = df.columns.get_loc('fico_range_low')
df.insert(insert_loc, 'fico_avg', (df['fico_range_low'] + df['fico_range_high']) / 2)

# --- Drop columns that are no longer needed ---
# The two FICO bounds are now summarized by `fico_avg`; `zip_code` is dropped
# as well.
df = df.drop(columns=["zip_code",
                      "fico_range_low",
                      "fico_range_high"])

# Display the cleaned frame as the cell output.
df

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,debt_settlement_flag
0,20000.0,20000.0,20000.0,36 months,13.99,683.46,2,12,Accounts Payable Mgr,10+ years,...,2.0,100.0,66.7,0.0,0.0,759411.0,27732.0,17500.0,23245.0,N
1,7000.0,7000.0,7000.0,36 months,9.16,223.12,1,6,,,...,4.0,96.6,25.0,0.0,0.0,53672.0,28344.0,19200.0,23072.0,N
2,20000.0,20000.0,20000.0,36 months,8.67,632.93,1,5,dental technician,10+ years,...,1.0,89.7,57.1,0.0,0.0,434045.0,47232.0,28300.0,32219.0,N
3,16000.0,16000.0,16000.0,36 months,14.46,550.43,2,13,Veterinarian,5 years,...,0.0,84.0,33.3,0.0,0.0,155587.0,81483.0,77000.0,73587.0,N
4,4000.0,4000.0,4000.0,36 months,11.53,131.97,1,9,Program Manager,2 years,...,2.0,73.7,,0.0,0.0,329177.0,91411.0,0.0,0.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,21000.0,21000.0,21000.0,60 months,25.82,626.52,4,23,Assistant principal,2 years,...,6.0,100.0,66.7,0.0,0.0,377464.0,157456.0,12000.0,117101.0,N
996,10625.0,10625.0,10625.0,60 months,17.57,267.33,3,18,Team Member,10+ years,...,5.0,100.0,0.0,1.0,0.0,89666.0,70977.0,6000.0,77266.0,N
997,7000.0,7000.0,7000.0,36 months,8.18,219.94,1,5,Learning and Developing Specialist,3 years,...,4.0,92.3,50.0,0.0,0.0,71752.0,50074.0,10600.0,49652.0,N
998,8000.0,8000.0,8000.0,36 months,13.35,270.91,2,11,Supervisor II,10+ years,...,2.0,100.0,81.8,0.0,0.0,59391.0,42018.0,19300.0,27423.0,N
