In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# Lift pandas' display truncation: None means "no limit" for both columns and rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
pwd

In [None]:
data_info = pd.read_csv("../input/lendingclub-data-sets/lending_club_info.csv",index_col='LoanStatNew')

In [None]:
data_info.head()

In [None]:
print(data_info.loc['revol_util']['Description'])

In [None]:
def feat_info(col_name):
    """Print the 'Description' entry that ``data_info`` holds for ``col_name``.

    Parameters
    ----------
    col_name : label in the index of the module-level ``data_info`` frame.

    Returns nothing; the description is written to stdout.
    Raises KeyError when ``col_name`` is not a label in ``data_info``.
    """
    description = data_info.loc[col_name, 'Description']
    print(description)

In [None]:
feat_info('mort_acc')

In [None]:
df = pd.read_csv('../input/lendingclub-data-sets/lending_club_loan_two.csv')

In [None]:
df.head()

### EDA

In [None]:
# Class balance of the target column.
sns.countplot(data=df, x="loan_status");

In [None]:
plt.figure(figsize=(10,6))
# sns.distplot is deprecated and scheduled for removal; histplot is its replacement.
# stat="density" keeps the normalised y-axis that distplot drew when a KDE was overlaid.
sns.histplot(df["loan_amnt"],kde=True,bins=50,stat="density");

In [None]:
df.corr()

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(),annot=True,cmap ="viridis");
plt.ylim(10,0)

In [None]:
feat_info("installment")

In [None]:
feat_info("loan_amnt")

In [None]:
plt.figure(figsize=(10,6))
sns.scatterplot(x="installment",y="loan_amnt",data=df);

In [None]:
sns.boxplot(x="loan_status",y="loan_amnt",data=df)

In [None]:
df.groupby("loan_status")["loan_amnt"].describe()

In [None]:
df["grade"].unique()

In [None]:
df["sub_grade"].unique()

In [None]:
sns.countplot(x="grade",data=df,hue="loan_status");

In [None]:
plt.figure(figsize=(15,5))
subgrade_order = sorted(df["sub_grade"].unique())
sns.countplot(x="sub_grade",data=df,order=subgrade_order,palette="coolwarm");

In [None]:
plt.figure(figsize=(15,5))
subgrade_order = sorted(df["sub_grade"].unique())
sns.countplot(x="sub_grade",data=df,order=subgrade_order,palette="coolwarm",hue="loan_status");

In [None]:
# Keep only the loans graded F or G and plot their sub-grades by loan status.
f_and_g = df[df["grade"].isin(["G", "F"])]

subgrade_order = sorted(f_and_g["sub_grade"].unique())
plt.figure(figsize=(15,5))
sns.countplot(data=f_and_g, x="sub_grade", hue="loan_status", order=subgrade_order);

In [None]:
df["loan_repaid"] = df["loan_status"].map({"Fully Paid":1,"Charged Off":0})

In [None]:
df.corr()["loan_repaid"].sort_values().drop("loan_repaid").plot(kind="bar");

### Data Manipulation

In [None]:
df.describe().T

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
100 * df.isnull().sum() / len(df)

In [None]:
df["emp_title"].nunique()

In [None]:
# Show only the most frequent titles: display.max_rows is set to None at the top of the
# notebook, so an unbounded value_counts() would print one line per distinct job title.
df["emp_title"].value_counts().head(20)

In [None]:
df.drop("emp_title",axis=1,inplace=True)

In [None]:
sorted(df["emp_length"].dropna().unique())

In [None]:
# Employment-length labels from shortest to longest, used as the x-axis order of the plots.
emp_length_order = (
    ['< 1 year', '1 year']
    + [f'{years} years' for years in range(2, 10)]
    + ['10+ years']
)

In [None]:
plt.figure(figsize=(15,5))
sns.countplot(x="emp_length",data=df,order=emp_length_order);

In [None]:
plt.figure(figsize=(15,5))
sns.countplot(x="emp_length",data=df,order=emp_length_order,hue="loan_status");

In [None]:
emp_co = df[df["loan_status"] == "Charged Off"].groupby("emp_length").count()["loan_status"]

In [None]:
emp_fp = df[df["loan_status"] == "Fully Paid"].groupby("emp_length").count()["loan_status"]

In [None]:
emp_len = emp_co/(emp_co+emp_fp)

In [None]:
emp_len.plot(kind="bar");

* Charge-off rates are extremely similar across all employment lengths, so we drop the `emp_length` column.

In [None]:
df.drop("emp_length",axis=1,inplace=True)

In [None]:
df.head()

In [None]:
df["purpose"].head(10)

In [None]:
df["title"].head(10)

In [None]:
df.drop("title",axis=1,inplace=True)

In [None]:
feat_info("mort_acc")

In [None]:
df.corr()["mort_acc"].sort_values().drop("mort_acc").plot(kind="bar");

In [None]:
total_acc_avg = df.groupby("total_acc").mean()["mort_acc"]

In [None]:
def fill_mort_acc(total_acc,mort_acc):
    """Return ``mort_acc``, or a looked-up replacement when it is NaN.

    Parameters
    ----------
    total_acc : key used to index the module-level ``total_acc_avg`` lookup.
    mort_acc : value to keep when it is not NaN.

    Returns
    -------
    ``total_acc_avg[total_acc]`` if ``mort_acc`` is NaN, otherwise ``mort_acc`` unchanged.
    """
    return total_acc_avg[total_acc] if np.isnan(mort_acc) else mort_acc

In [None]:
df["mort_acc"] = df.apply(lambda x: fill_mort_acc(x["total_acc"],x["mort_acc"]),axis=1)

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)

### Categorical Variables and Dummy Variables

### term feature

In [None]:
df.select_dtypes(["object"]).columns

In [None]:
feat_info("term")

In [None]:
df["term"].value_counts()

In [None]:
df["term"] = df["term"].apply(lambda term: int(term[:3]))

### grade feature

In [None]:
feat_info("grade")

In [None]:
df["grade"].value_counts().sort_values(ascending=False)

In [None]:
df.drop("grade",axis=1,inplace=True)

In [None]:
dummies = pd.get_dummies(df["sub_grade"],drop_first=True)

In [None]:
df = pd.concat([df.drop("sub_grade",axis=1),dummies],axis=1)

In [None]:
df.columns

### verification_status, application_type, initial_list_status, purpose

In [None]:
dummies = pd.get_dummies(df[["verification_status","application_type","initial_list_status","purpose"]],drop_first=True)
df = pd.concat([df.drop(["verification_status","application_type","initial_list_status","purpose"],axis=1),dummies],axis=1)

In [None]:
df.head()

### home_ownership

In [None]:
df["home_ownership"].value_counts()

In [None]:
df["home_ownership"] = df["home_ownership"].replace(["NONE","ANY"],"OTHER")

In [None]:
df["home_ownership"].value_counts()

In [None]:
dummies = pd.get_dummies(df["home_ownership"],drop_first=True)
df = pd.concat([df.drop("home_ownership",axis=1),dummies],axis=1)

### address

In [None]:
df["zip_code"] = df["address"].apply(lambda x:x[-5:])

In [None]:
df["zip_code"].value_counts()

In [None]:
dummies = pd.get_dummies(df["zip_code"],drop_first=True)
df = pd.concat([df.drop("zip_code",axis=1),dummies],axis=1)

In [None]:
df.drop("address",axis=1,inplace=True)

### issue_d

* This would be data leakage: when using our model we wouldn't know beforehand whether or not a loan would be issued, so we drop this feature.

In [None]:
feat_info("issue_d")

In [None]:
df.drop("issue_d",axis=1,inplace=True)

### earliest_cr_line

* extract the year from this feature

In [None]:
df["earliest_cr_year"] = df["earliest_cr_line"].apply(lambda x: int(x[-4:]))

In [None]:
df.drop("earliest_cr_line",axis=1,inplace=True)

### Data Preprocessing

In [None]:
df = df.drop("loan_status",axis=1)

In [None]:
X = df.drop("loan_repaid",axis=1).values
y = df["loan_repaid"].values

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train.shape

### Creating Model

In [None]:
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout

In [None]:
model = Sequential()

model.add(Dense(78,activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(39,activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(19,activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(1,activation="sigmoid"))

model.compile(loss="binary_crossentropy",optimizer="adam")

In [None]:
model.fit(x=X_train, y=y_train, epochs=10, batch_size=256, validation_data=(X_test,y_test))

### Save the Model

In [None]:
from tensorflow.keras.models import load_model

In [None]:
model.save("lending_club_model.h5")

### Evaluating Model Performance

In [None]:
losses = pd.DataFrame(model.history.history)

In [None]:
losses.plot();

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
predictions = model.predict_classes(X_test)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
confusion_matrix(y_test,predictions)

In [None]:
import random
random.seed(101)
# random.randint(0, len(df)) can return len(df) itself, which is one past the last valid
# iloc position; randrange excludes the upper bound, so the index is always in range.
random_ind = random.randrange(len(df))

# Feature row of the sampled customer, without the target column.
new_customer = df.drop("loan_repaid",axis=1).iloc[random_ind]
new_customer

In [None]:
new_customer = scaler.transform(new_customer.values.reshape(1,78))

In [None]:
model.predict_classes(new_customer)

In [None]:
df.iloc[random_ind]["loan_repaid"]