In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
%matplotlib inline

In [2]:
loan_raw = pd.read_csv('LoanStats3d.csv', low_memory=False)

In [3]:
#drop columns with data that don't really help
loan_raw = loan_raw.drop(['id', 'member_id', 'emp_title', 'emp_length', 'issue_d', 'url', 'purpose', 'title',
                         'zip_code', 'addr_state', 'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d',
                         'last_credit_pull_d', 'pymnt_plan'], axis=1)

#drop columns with very little data in them
loan_raw = loan_raw.drop(['mths_since_last_record', 'mths_since_last_major_derog', 'annual_inc_joint',
                         'dti_joint', 'verification_status_joint', 'mths_since_recent_bc_dlq',
                         'mths_since_recent_revol_delinq', 'desc'], axis=1)

#fill the NaN in these columns with 0's
loan_raw['mths_since_last_delinq'] = loan_raw['mths_since_last_delinq'].fillna(0)

#convert percentages to floats
loan_raw['int_rate'] = loan_raw['int_rate'].str.strip('%').astype('float') / 100
loan_raw['revol_util'] = loan_raw['revol_util'].str.strip('%').astype('float') / 100

#convert remaining categorical variables
le = LabelEncoder()
for col in loan_raw.select_dtypes(object):
    loan_raw[str(col)] = le.fit_transform(loan_raw[str(col)])

In [4]:
#drop NaN values
loan_raw = loan_raw.dropna()

#create a new data frame to start testing what can be dropped
loans = loan_raw

In [5]:
#start dropping features
loans = loans.drop(['funded_amnt', 'funded_amnt_inv', 'sub_grade', 'pct_tl_nvr_dlq', 'installment',
                   'policy_code', 'grade', 'total_bal_ex_mort', 'out_prncp', 'out_prncp_inv', 'total_rec_int',
                   'total_pymnt', 'total_pymnt_inv', 'total_rec_int', 'last_pymnt_amnt'], axis=1)

In [6]:
X = loans.drop('loan_status', axis=1)
y = loans.loan_status
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [10]:
rfc = ensemble.RandomForestClassifier(50)
rfc.fit(X, y)

cross_val_score(rfc, X, y, cv=10)

array([0.89829438, 0.87934302, 0.8912081 , 0.88614801, 0.88804554,
       0.88045541, 0.88425047, 0.8961368 , 0.88220393, 0.88522511])

In [8]:
print(sorted(zip(map(lambda x: round(x, 4), rfc.feature_importances_), X.columns), 
             reverse=True))

[(0.3505, 'total_rec_prncp'), (0.0476, 'loan_amnt'), (0.0195, 'collection_recovery_fee'), (0.0186, 'int_rate'), (0.0185, 'revol_bal'), (0.0162, 'revol_util'), (0.016, 'annual_inc'), (0.0157, 'max_bal_bc'), (0.0156, 'dti'), (0.0151, 'mo_sin_old_il_acct'), (0.0146, 'total_rev_hi_lim'), (0.0141, 'total_bc_limit'), (0.0141, 'bc_util'), (0.0139, 'tot_hi_cred_lim'), (0.0137, 'bc_open_to_buy'), (0.0135, 'total_il_high_credit_limit'), (0.0133, 'mo_sin_old_rev_tl_op'), (0.0128, 'il_util'), (0.0128, 'avg_cur_bal'), (0.0124, 'tot_cur_bal'), (0.012, 'total_acc'), (0.0118, 'all_util'), (0.0115, 'mths_since_rcnt_il'), (0.0111, 'total_bal_il'), (0.0109, 'num_rev_accts'), (0.0109, 'mths_since_recent_inq'), (0.0105, 'num_il_tl'), (0.0105, 'mo_sin_rcnt_rev_tl_op'), (0.0098, 'num_sats'), (0.0097, 'recoveries'), (0.0097, 'num_bc_tl'), (0.0097, 'mths_since_recent_bc'), (0.0095, 'num_op_rev_tl'), (0.0092, 'acc_open_past_24mths'), (0.0091, 'mo_sin_rcnt_tl'), (0.009, 'mths_since_last_delinq'), (0.009, 'inq_la

In [11]:
pca = PCA(n_components=74)
pca.fit(loans)
ratio = pca.explained_variance_ratio_

In [12]:
a = 0
for i in range(73):
    a=a+ratio[i]
    if a > 0.99:
        print(i)
        break

4


In [13]:
pca = PCA(n_components=4)
pca.fit(X_train)
train_features = pca.transform(X_train)
train_target = pca.transform(y_train.values.reshape(-1,1))
print(pca.explained_variance_ratio_)

[0.87866965 0.05356834 0.03378491 0.02100961]
