First, we import the basic packages.

In [2]:
import numpy as np
import pandas as pd                                                                   
import matplotlib.pyplot as plt
import pickle

The next set of packages are the scikit-learn packages that we will need.

In [3]:
from sklearn import neighbors, datasets, metrics, preprocessing, tree
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Introduction

### Reading in The Dataset

To read in a previously pickled object, we use the following code block. The dataset loaded here is a filtered subset of the original data; it contains only the data from 2007-01-01 to 2011-12-31.

To pickle a new object, we use the template code in comments:

In [4]:
# Load the previously pickled loans table.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
with open('../data/loans_0711.pkl', mode='rb') as pkl_in:
    loans_0711 = pickle.load(pkl_in)

# Template for pickling a new object:
# with open('../data/loans_no_miss.pkl', 'wb') as f:
#    pickle.dump((X,y), f)

In [6]:
# (number of rows, number of columns) of the loaded table.
loans_0711.shape

(42535, 74)

The data dictionary, available from the [Kaggle website](https://www.kaggle.com/wendykan/lending-club-loan-data/data) where the original data can be obtained, contains a full description of each of the 74 columns in the dataset.

In [7]:
# Preview the first two rows of the table.
loans_0711.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,


As we can see, there are missing values in the columns. We shall perform some data cleaning to create a new dataset before proceeding.

In [9]:
# Print the non-null count and dtype of every column to spot missing values.
loans_0711.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42535 entries, 0 to 42534
Data columns (total 74 columns):
id                             42535 non-null int64
member_id                      42535 non-null int64
loan_amnt                      42535 non-null float64
funded_amnt                    42535 non-null float64
funded_amnt_inv                42535 non-null float64
term                           42535 non-null object
int_rate                       42535 non-null float64
installment                    42535 non-null float64
grade                          42535 non-null object
sub_grade                      42535 non-null object
emp_title                      39909 non-null object
emp_length                     41423 non-null object
home_ownership                 42535 non-null object
annual_inc                     42531 non-null float64
verification_status            42535 non-null object
issue_d                        42535 non-null datetime64[ns]
loan_status                    

In [10]:
# Columns retained for modelling; the rest of the table is not used below.
cols_to_keep = [
    'annual_inc',
    'delinq_2yrs',
    'dti',
    'emp_length',
    'int_rate',
    'loan_amnt',
    'installment',
    'inq_last_6mths',
    'total_acc',
    'revol_util',
    'revol_bal',
    'funded_amnt_inv',
]

# All rows, restricted to the selected columns (in the order listed above).
loans_0711_sub = loans_0711.loc[:, cols_to_keep]

Next, we create a variable that contains the labels that we wish to predict.

In [21]:
# Build the labels to predict from the requested and investor-funded amounts.
loan = loans_0711_sub['loan_amnt'].values
funded = loans_0711_sub['funded_amnt_inv'].values
# Relative gap between the loan amount and the amount funded by investors.
targets = np.abs(loan - funded) / loan

# Quantitative version of the target.
loans_0711_sub['target_q'] = targets
# Categorical version: 1 when the relative gap is at least 0.05, otherwise 0.
gap_at_least_5pct = loans_0711_sub['target_q'] >= 0.05
loans_0711_sub['target_c'] = np.where(gap_at_least_5pct, 1, 0)

Next, we drop the rows with missing values.

In [22]:
# Keep only the rows in which every column is present (no NA in any column).
no_miss = loans_0711_sub.dropna(axis=0, how='any')

In [23]:
# emp_length is stored as text (e.g. '10+ years'); only the number part is needed.
no_miss.emp_length.head()

0    10+ years
1     < 1 year
2    10+ years
3    10+ years
4       1 year
Name: emp_length, dtype: object

In [25]:
# Convert the emp_length strings into integers.
# The pattern is a raw string so that \d reaches the regex engine unchanged
# instead of being read as an (invalid) Python string escape.
# ^ anchors the match at the start and \d matches a digit, so only values that
# begin with a digit match; the greedy '.* ' keeps everything up to the last space.
emp2 = no_miss.emp_length.str.extract(pat=r'(^[\d].* )', expand=False).values
# Values that do not start with a digit (e.g. '< 1 year') did not match: use 0.
emp2 = np.where(pd.isna(emp2), '0', emp2)
# '10+ years' is extracted as '10+ ' and is recoded as 11.
emp2 = np.where(emp2 == '10+ ', '11', emp2)
emp2 = [int(x.strip()) for x in emp2]
# NOTE: after this cell emp_length is numeric, so the .str accessor above will
# fail if the cell is run a second time without rebuilding no_miss.
no_miss = no_miss.assign(emp_length = emp2)

In [26]:
# Check the recoded emp_length values.
no_miss.emp_length.head()

0    11
1     0
2    11
3    11
4     1
Name: emp_length, dtype: int64

Finally, we store them as numpy arrays and pickle them so that we will not have to do the above every time.

In [30]:
# Features: the first twelve columns of no_miss, as a plain numpy array.
# NOTE(review): these columns include loan_amnt and funded_amnt_inv, the two
# columns the target is computed from - confirm that this is intended.
feature_frame = no_miss.iloc[:, :12]
X = feature_frame.values
# Labels: the categorical (0/1) target.
y = no_miss['target_c'].values
# Persist both arrays together so the cleaning steps need not be repeated.
with open('../data/loans_no_miss.pkl', mode='wb') as pkl_out:
    pickle.dump((X, y), pkl_out)

In [31]:
# First two rows of the feature matrix.
X[0:2,:]

array([[2.4000e+04, 0.0000e+00, 2.7650e+01, 1.1000e+01, 1.0650e+01,
        5.0000e+03, 1.6287e+02, 1.0000e+00, 9.0000e+00, 8.3700e+01,
        1.3648e+04, 4.9750e+03],
       [3.0000e+04, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.5270e+01,
        2.5000e+03, 5.9830e+01, 5.0000e+00, 4.0000e+00, 9.4000e+00,
        1.6870e+03, 2.5000e+03]])

# Supervised Learning Methods and Concepts

## k-Nearest Neighbours (KNN)

Nearest neighbours can be defined in terms of the **number** of nearest neighbours, or they can be defined in terms of a **radius** - all observations within a particular radius are considered neighbours.

In [32]:
from sklearn import datasets, neighbors

# k-nearest-neighbour classifier that votes over the 11 closest observations;
# n_jobs=-1 asks scikit-learn to use all available processors.
nn11 = neighbors.KNeighborsClassifier(n_jobs=-1, n_neighbors=11)

We train the classifier using the input samples and their corresponding output values (the desired outcomes).

In [33]:
# Train the classifier on the full feature matrix X and its labels y.
nn11.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=11, p=2,
           weights='uniform')

The `nn11` object now contains the fitted model. In this case, it simply remembers the neighbours of each observation. We can extract the neighbours of a particular point.

Let us find the neighbours of the first point in the dataset.

In [35]:
nn11.kneighbors(X[0,:].reshape(1,-1)) # naturally, the first row itself is among the neighbours found

(array([[   0.        ,  631.24367728, 1164.28217426, 1240.161276  ,
         1633.47947064, 1645.70780772, 1864.11841805, 1914.1180129 ,
         1937.14303398, 1957.47561076, 1958.97612494]]),
 array([[    0, 25371, 15846, 25507, 25530,  2355, 25342, 13048, 17716,
         28406, 14959]], dtype=int64))

In [34]:
dist,ind = nn11.kneighbors(X[0,:].reshape(1,-1)) # use the first row as the query point whose neighbours are sought

One of the indices is of course 0. A point is always neighbours with itself!

In [36]:
nn11.score(X,y) # accuracy: the proportion of classifications that are correct

0.8960228372363074

In [17]:
# Predict on the same data the model was fitted to, then tabulate true labels
# (rows) against predicted labels (columns).
yhat = nn11.predict(X)
metrics.confusion_matrix(y, yhat)

array([[33536,    36],
       [ 4262,  3502]])

In [18]:
# Baseline for comparison: the confusion matrix obtained when every
# observation is predicted as class 0.
metrics.confusion_matrix(y, np.zeros(len(y)))

array([[33572,     0],
       [ 7764,     0]])

In [22]:
# y holds 0/1 labels, so this is the share of class 0, i.e. the accuracy of
# always predicting 0.
1- np.mean(y)

0.812173408167215

What would happen if we used a nearest neighbour model with $k=1$? Try it out.

In [None]:
# Answer: the accuracy on the fitted data is 100%, because with k=1 every point
# is its own nearest neighbour (barring duplicate points with different labels).

## Training and Testing Sets

### Training and Testing Set

We would like to put aside 25% of the data for testing. There are several 
functions in `sklearn` to perform this split. What we use here is referred to as a [Stratified Shuffle Split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html).

In [37]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified shuffle split that holds out 25% of the data for
# testing; the seed is fixed so the split is the same on every run.
sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.25,
                             random_state = 42)
# n_splits is 1, so take the only (train, test) index pair directly instead of
# looping over the generator.
train_index, test_index = next(sss.split(X, y))
print("TRAIN:", train_index, "TEST:", test_index)
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

TRAIN: [34573 12153 36328 ... 32345 33407 28294] TEST: [21430 20972 17430 ... 14416 36885  1107]


### Choosing Hyperparameters

First, we use the simplest form of cross-validation. The result will always be the same, as opposed to using a shuffle split.

In [38]:
# 5-fold cross-validation of a fresh 11-nearest-neighbour classifier, using
# only the training set; the output is one score per fold.
nn11 = neighbors.KNeighborsClassifier(n_neighbors=11, n_jobs=-1)
cross_val_score(nn11, X_train, y_train, cv=5)

array([0.87534269, 0.87276246, 0.88324464, 0.87790323, 0.87820616])

Here is the performance on the test set, after fitting the model to the full training set.

In [39]:
# Refit on the whole training set, then score on the held-out test set.
nn11.fit(X_train, y_train)
nn11.score(X_test, y_test)

0.8835881556028643

Next, we perform a grid search for the best value of $k$ to use. We search over the values 5 to 10.

In [40]:
# arange excludes the stop value, so this yields 5, 6, ..., 10.
np.arange(5, 11)

array([ 5,  6,  7,  8,  9, 10])

In [41]:
# Candidate values of k: 5 through 10.
n_n = np.arange(5, 11)
nn_gs = neighbors.KNeighborsClassifier()
# Grid search over n_neighbors with 5-fold cross-validation on the training
# set; training scores are kept so they can be compared with validation scores.
clf = GridSearchCV(
    estimator=nn_gs,
    param_grid={'n_neighbors': n_n},
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': array([ 5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [None]:
# In an interactive session, typing "clf." and pressing Tab lists the available
# attributes. A bare "clf." is not valid Python and stops the notebook from
# running top to bottom, so list the public attributes programmatically.
[name for name in dir(clf) if not name.startswith('_')]

In [23]:
# Parameter setting that ranked first in the grid search.
clf.best_params_

{'n_neighbors': 5}

In [42]:
# Full cross-validation record: timings and per-split / mean train and test
# scores for every candidate value of n_neighbors.
clf.cv_results_

{'mean_fit_time': array([0.10478039, 0.10137134, 0.08736148, 0.08996325, 0.09296551,
        0.11528144]),
 'mean_score_time': array([0.26979156, 0.28360162, 0.27709703, 0.29561   , 0.32453036,
        0.33854017]),
 'mean_test_score': array([0.8892007 , 0.88352364, 0.88542675, 0.87920134, 0.88107219,
        0.87594349]),
 'mean_train_score': array([0.90829625, 0.89452295, 0.89826464, 0.8882411 , 0.89141026,
        0.88332205]),
 'param_n_neighbors': masked_array(data=[5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 5},
  {'n_neighbors': 6},
  {'n_neighbors': 7},
  {'n_neighbors': 8},
  {'n_neighbors': 9},
  {'n_neighbors': 10}],
 'rank_test_score': array([1, 3, 2, 5, 4, 6]),
 'split0_test_score': array([0.88743751, 0.88098694, 0.88276085, 0.87727786, 0.87937429,
        0.87405257]),
 'split0_train_score': array([0.90927785, 0.89552841, 0.89907665, 0.88972219, 0.8931091

## Random Forest

In [43]:
# IPython help lookup: a trailing "?" shows the docstring of RandomForestClassifier.
RandomForestClassifier?

In [46]:
# Candidate forest sizes: 10, 30, ..., 190 trees (the stop value 200 is excluded).
n_trees = np.arange(start=10, stop=200, step=20)

In [47]:
# Create a Random Forest classifier.
# class_weight='balanced' reweights the classes to offset their unequal sizes.
# random_state is fixed (same seed as the train/test split) so that the
# forests, and therefore the grid-search results, are reproducible.
forest_gs = RandomForestClassifier(class_weight='balanced', random_state=42)

In [48]:
# Grid search over the number of trees with 5-fold cross-validation on the
# training set, using all available processors.
clf2 = GridSearchCV(
    estimator=forest_gs,
    param_grid={'n_estimators': n_trees},
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
clf2.fit(X_train, y_train)

KeyboardInterrupt: 

In [40]:
# Number of trees that ranked first in the grid search.
clf2.best_params_

{'n_estimators': 190}

In [41]:
# Full cross-validation record (timings and scores) for every candidate
# number of trees.
clf2.cv_results_

{'mean_fit_time': array([ 0.79266024,  2.61465702,  4.22016525,  6.04094701,  7.17489071,
         9.27405567,  9.80590858, 11.7384182 , 14.89979043, 19.9608151 ]),
 'std_fit_time': array([0.02584526, 0.11284201, 0.16828965, 0.24134844, 0.24272632,
        0.58488471, 0.1494    , 0.27435659, 0.61085688, 2.14595261]),
 'mean_score_time': array([0.01794701, 0.06662512, 0.0794838 , 0.1197957 , 0.14829288,
        0.17516379, 0.19906864, 0.24536271, 0.32465467, 0.36069613]),
 'std_score_time': array([0.00114071, 0.01146537, 0.00193123, 0.01620355, 0.02235753,
        0.01721246, 0.00603319, 0.02469827, 0.05408652, 0.22018896]),
 'param_n_estimators': masked_array(data=[10, 30, 50, 70, 90, 110, 130, 150, 170, 190],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 10},
  {'n_estimators': 30},
  {'n_estimators': 50},
  {'n_estimators': 70},
  {'n_estim