# Import 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
% matplotlib inline

In [None]:
path = '../input/'
! ls {path}

In [None]:
%time data = pd.read_table('../input/XYZCorp_LendingData.txt',parse_dates=['issue_d'],low_memory=False)

It's good practice not to look at the test set, so I am going to separate the test data from the full dataset.

In [None]:
train_df = data[data['issue_d'] < '2015-6-01']
test_df = data[data['issue_d'] >= '2015-6-01']

# Preprocessing

In [None]:
train = train_df.copy()
test = test_df.copy()

In [None]:
train.dtypes.value_counts()

In [None]:
train['issue_d'].describe()

In [None]:
test['issue_d'].describe()

In [None]:
print(train.shape)
print(test.shape)

**Distribution of dependent variable**

In [None]:
train['default_ind'].value_counts().plot.bar()

**Description of independent variables**

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
train.describe(exclude=np.number)

In [None]:
train.dtypes.value_counts()

## Missing Value Treatment

In [None]:
def missing_values_table(df):
    """Summarise missing data per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, ordered by descending number of missing
        values, with columns ``'missing values'`` (count of nulls) and
        ``'% missing'`` (count as a percentage of ``len(df)``).
    """
    # Count nulls once and sort once; the percentage is derived from the sorted
    # counts so both columns share the same index and the descending order is
    # preserved when the table is assembled.
    total_missing = df.isnull().sum().sort_values(ascending=False)
    percentage_missing = 100 * total_missing / len(df)
    missing_table = pd.DataFrame({'missing values': total_missing,
                                  '% missing': percentage_missing})
    return missing_table

In [None]:
missing_values = missing_values_table(train)
missing_values.head(20)

In [None]:
train.dtypes.value_counts()

In [None]:
def to_datepart(df,fldname,drop=False):
    """Expand a datetime column of ``df`` into calendar feature columns, in place.

    For the column ``fldname`` this adds Year, Month, Week, Day, Dayofweek,
    Dayofyear, the Is_month/quarter/year start/end flags and an ``Elapsed``
    column holding the datetime cast to int64. New column names are prefixed
    with ``fldname`` after stripping a trailing ``date``/``Date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of a datetime64 column in ``df``.
    drop : bool, default False
        If True, remove the original ``fldname`` column afterwards.

    Returns
    -------
    None
    """
    # Local import so the function does not depend on a later cell having run.
    import re

    fld = df[fldname]
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
            # replacement. It returns a nullable UInt32, so cast back to a plain
            # numpy dtype (float when NaT produced missing weeks).
            iso_week = fld.dt.isocalendar().week
            if iso_week.isna().any():
                value = iso_week.astype('float64')
            else:
                value = iso_week.astype(np.int64)
        else:
            value = getattr(fld.dt, n.lower())
        df[targ_pre + n] = value
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64)
    if drop: df.drop(fldname, axis=1, inplace=True)

In [None]:
import re
to_datepart(train,'issue_d',drop=True)
to_datepart(test,'issue_d',drop=True)

In [None]:
def treat_missing(df):
    """Impute missing values column by column, in place.

    Numeric columns are filled with their own median; all other columns
    (e.g. object/string columns) are filled with their own most frequent value.
    Columns that are entirely missing are left untouched because no fill
    value can be derived for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    for c in df.columns:
        col = df[c]
        if not col.isnull().any():
            continue  # nothing to impute
        if pd.api.types.is_numeric_dtype(col):
            fill_value = col.median()
        else:
            modes = col.mode()
            if modes.empty:
                continue  # every value is missing: no mode exists
            fill_value = modes.iloc[0]
        # Fill only this column. Calling df.fillna(value) would overwrite the
        # missing cells of every column with this column's statistic.
        df[c] = col.fillna(fill_value)

In [None]:
treat_missing(train)
treat_missing(test)

## Treating Categorical Values

In [None]:
def train_cat(df):
    """Turn every object-dtype column of ``df`` into an ordered categorical, in place."""
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.as_ordered()

In [None]:
train_cat(train)
train_cat(test)

In [None]:
train.select_dtypes('category').apply(pd.Series.nunique, axis = 0)

In [None]:
to_drop = ['sub_grade','emp_title','desc','title','zip_code',
           'addr_state','earliest_cr_line','last_pymnt_d','last_credit_pull_d']

In [None]:
train.drop(to_drop,axis=1,inplace=True)
test.drop(to_drop,axis=1,inplace=True)

## EDA

In [None]:
# DataFrame.corr() no longer silently drops non-numeric columns (pandas >= 2.0
# raises on the categorical columns), so select numeric/boolean columns explicitly.
numeric_train = train.select_dtypes(include=['number', 'bool'])
correlation = numeric_train.corr()['default_ind'].sort_values(ascending=False)
# Drop undefined (NaN) correlations by reassignment rather than in place, so the
# cell gives the same result when re-run.
correlation = correlation.dropna()

In [None]:
fig,array = plt.subplots(1,2,figsize=(12,3))
correlation.head(10).plot.bar(ax=array[0])
correlation.tail(10).plot.bar(ax=array[1])

In [None]:
plt.figure(figsize=(12,8))
sns.kdeplot(train.loc[train['default_ind']==0,'issue_dYear'],label='default_ind = 0')
sns.kdeplot(train.loc[train['default_ind']==1,'issue_dYear'],label='default_ind = 1')
plt.xlabel('Year'); plt.ylabel('Density'); plt.title('Distribution of defaulter');

### Loan Purpose

In [None]:
train.purpose.value_counts(ascending=False).plot.bar(figsize=(8,6))
# value_counts() yields absolute loan counts, so the y-axis is a count, not a density.
plt.xlabel('purpose'); plt.ylabel('Count'); plt.title('Purpose of loan');

**Debt Consolidation** stands as the clear winner for loan purpose, with more than 350K loans — or 58% of the total.<br/>

Other highlights include:

**Credit Card**  — more than 130K (~20%)<br>
**Home Improvement** — more than 135K (~6%)<br>
**Other Purposes** — less than 30K (~4%)

## Delinquent Loans

In [None]:
train.grade.value_counts().plot.bar()

In [None]:
fig,array=plt.subplots(1,2,figsize=(12,5))
train.loc[train['default_ind']==0,'grade'].value_counts().plot.bar(ax=array[0])
train.loc[train['default_ind']==1,'grade'].value_counts().plot.bar(ax=array[1])
array[0].set_title('default_ind=0 vs grade'),array[1].set_title('default_ind=1 vs grade')

In [None]:
train.application_type.value_counts()

## Label Encoding

In [None]:
# Label-encode object-dtype columns of `train` that have at most two distinct
# values: the encoder is fitted on train and the same mapping is applied to test.
# NOTE(review): when the cells are run in order, train_cat() has already converted
# every object column of train/test to 'category', so the dtype == 'object' check
# below looks like it never matches and this loop is a no-op — confirm whether
# 'category' columns were meant to be encoded here.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for c in train.columns:
    if train[c].dtype == 'object':
        # unique() also counts a missing value as a distinct entry
        if len(list(train[c].unique())) <= 2:
            train[c] = le.fit_transform(train[c])
            test[c] = le.transform(test[c])

## One Hot Encoding

In [None]:
print(train.shape)
print(test.shape)
train = pd.get_dummies(train)
test = pd.get_dummies(test)
print(train.shape)
print(test.shape)

## Aligning training and test data

In [None]:
# train_label = train['default_ind']
# # Align the training and testing data, keep only columns present in both dataframes
# train, test = train.align(test, join = 'inner', axis = 1)

# # Add the target back in
# train['default_ind'] = train_label

# print(train.shape)
# print(test.shape)

## Dependent and Independent Variable

In [None]:
X = train.copy()
y = X.pop('default_ind')

In [None]:
def split_vals(a,n):
    """Split ``a`` at position ``n`` and return copies of ``a[:n]`` and ``a[n:]``."""
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

In [None]:
n_valid = len(test_df)  # same as test set size
n_trn = len(X)-n_valid
raw_train,raw_valid = split_vals(train_df,n_trn)
X_train, X_valid = split_vals(X, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape

## Fitting the model

In [None]:
from sklearn.ensemble import RandomForestClassifier
m = RandomForestClassifier(n_jobs=-1,oob_score=True,n_estimators=100)
%time m.fit(X_train,y_train),m.oob_score_

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score
# Hard class predictions drive the confusion matrix, report and accuracy.
y_pred = m.predict(X_valid)
# ROC AUC measures ranking quality, so it must be computed from a continuous score
# (the predicted probability of the positive class), not from the 0/1 predictions.
# Column 1 of predict_proba is the second entry of m.classes_ (class 1 for a 0/1 target).
y_score = m.predict_proba(X_valid)[:, 1]
cm = confusion_matrix(y_valid,y_pred)
print(cm)
print(classification_report(y_valid,y_pred))
print(accuracy_score(y_valid,y_pred))
print(roc_auc_score(y_valid,y_score))

## Single Tree

In [None]:
m = RandomForestClassifier(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)

In [None]:
# Hard class predictions drive the confusion matrix, report and accuracy.
y_pred = m.predict(X_valid)
# ROC AUC measures ranking quality, so it must be computed from a continuous score
# (the predicted probability of the positive class), not from the 0/1 predictions.
# Column 1 of predict_proba is the second entry of m.classes_ (class 1 for a 0/1 target).
y_score = m.predict_proba(X_valid)[:, 1]
cm = confusion_matrix(y_valid,y_pred)
print(cm)
print(classification_report(y_valid,y_pred))
print(accuracy_score(y_valid,y_pred))
print(roc_auc_score(y_valid,y_score))

In [None]:
m = RandomForestClassifier(n_estimators=1, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)

In [None]:
# `draw_tree` is neither defined nor imported anywhere in this notebook, so the
# original call fails with NameError; use scikit-learn's own tree plotter instead.
from sklearn.tree import plot_tree
fig, ax = plt.subplots(figsize=(20, 8))
# The tree fitted above has no max_depth, so draw only its top levels to keep
# the figure readable.
plot_tree(m.estimators_[0], feature_names=list(X.columns), max_depth=3,
          precision=3, filled=True, ax=ax);

In [None]:
# Hard class predictions drive the confusion matrix, report and accuracy.
y_pred = m.predict(X_valid)
# ROC AUC measures ranking quality, so it must be computed from a continuous score
# (the predicted probability of the positive class), not from the 0/1 predictions.
# Column 1 of predict_proba is the second entry of m.classes_ (class 1 for a 0/1 target).
y_score = m.predict_proba(X_valid)[:, 1]
cm = confusion_matrix(y_valid,y_pred)
print(cm)
print(classification_report(y_valid,y_pred))
print(accuracy_score(y_valid,y_pred))
print(roc_auc_score(y_valid,y_score))

## Feature Importance

In [None]:
def feat_importance(m, df):
    """Pair each column of ``df`` with ``m.feature_importances_``, most important first.

    Returns a DataFrame with columns ``'cols'`` and ``'imp'`` sorted by ``'imp'``
    in descending order.
    """
    importance_table = pd.DataFrame({'cols': df.columns,
                                     'imp': m.feature_importances_})
    ranked = importance_table.sort_values(by='imp', ascending=False)
    return ranked

In [None]:
fi = feat_importance(m, X_train); fi[:10]

In [None]:
fi.plot('cols', 'imp', figsize=(10,6), legend=False)

In [None]:
def plot_fi(fi):
    """Draw a horizontal bar chart of importance ('imp') per feature ('cols')."""
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12,7), legend=False)

In [None]:
plot_fi(fi[:20])