##### Load and clean LoanStats3a.csv data.

Loans issued by lendingclub.com from 2007-2011 with performance data.
LendingClub is a US peer-to-peer lending company.
This is a typical credit risk problem.
The overall objective is to first clean the data, and then use the various features to create
a model that predicts the target variable "loan_status" (loan fully paid or charged off).

In [43]:
import pandas as pd
import numpy as np

# --- Load ---------------------------------------------------------------------
# skiprows=1 skips the first line of the file, so the second line is read as
# the header row.
loans = pd.read_csv('LoanStats3a.csv', skiprows=1, low_memory=False)

# --- Drop sparse and unused columns -------------------------------------------
# Keep only columns that are non-null in at least half of the rows. `thresh`
# is documented as an integer, so the half-count is rounded up; for an integer
# non-null count this is equivalent to comparing against len(loans) / 2.
half_count = (len(loans) + 1) // 2
loans = loans.dropna(thresh=half_count, axis=1)

# Columns that are removed before modelling and never used as features.
colsToDrop = [
    "desc", "url",
    "id", "member_id", "funded_amnt", "funded_amnt_inv", "grade", "sub_grade",
    "emp_title", "issue_d",
    "zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv",
    "total_rec_prncp",
    "total_rec_int", "total_rec_late_fee", "recoveries",
    "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt",
]
loans = loans.drop(columns=colsToDrop)

# --- Target variable ----------------------------------------------------------
# Keep only the two outcomes being modelled. `.copy()` makes the filtered frame
# independent, so the column assignments below do not write into a slice of
# the unfiltered frame.
loans = loans[loans["loan_status"].isin(["Fully Paid", "Charged Off"])].copy()

loan_status_codes = {
    "Fully Paid": 0,  # NOTE: the myAdaboost classifier accepts any data type as label
    "Charged Off": 1,
}
# Every remaining row holds one of the two keys above, so `map` yields a plain
# integer column without relying on `replace` to infer a new dtype.
loans["loan_status"] = loans["loan_status"].map(loan_status_codes)

# --- Remove columns that carry no information ---------------------------------
# A column with a single distinct non-null value cannot help a model.
drop_columns = [col for col in loans.columns if loans[col].nunique(dropna=True) == 1]
loans = loans.drop(columns=drop_columns)

loans = loans.drop(columns="pub_rec_bankruptcies")
loans = loans.dropna(subset=["title", "revol_util", "last_credit_pull_d"])

# --- Employment length -> number of years -------------------------------------
emp_length_years = {
    "10+ years": 10,
    "9 years": 9,
    "8 years": 8,
    "7 years": 7,
    "6 years": 6,
    "5 years": 5,
    "4 years": 4,
    "3 years": 3,
    "2 years": 2,
    "1 year": 1,
    "< 1 year": 0,
}

# Fail early on labels the mapping does not know about instead of letting a
# stray string reach the classifiers.
unexpected = set(loans["emp_length"].dropna().unique()) - set(emp_length_years)
if unexpected:
    raise ValueError("Unmapped emp_length values: %r" % (unexpected,))

# Missing employment length is treated as 0 years. The result is assigned back
# to the column: a chained `loans["emp_length"].fillna(0, inplace=True)` acts
# on a temporary object and does not update `loans` under pandas Copy-on-Write.
loans["emp_length"] = loans["emp_length"].map(emp_length_years).fillna(0)

loans = loans.drop(columns=["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"])

# --- Percentage strings -> floats ---------------------------------------------
# Strip the trailing "%" and parse the remainder as a float.
for pct_col in ("revol_util", "int_rate"):
    loans[pct_col] = loans[pct_col].str.rstrip("%").astype("float")

# --- One-hot encode the categorical columns -----------------------------------
cat_columns = ["home_ownership", "verification_status", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
# The remaining columns come first, followed by the generated dummy columns.
loans = pd.concat([loans.drop(columns=cat_columns), dummy_df], axis=1)

loans = loans.reset_index(drop=True)

print(loans.shape)

(39707, 38)


##### Divide the Data in train and test

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from myAdaboost import *

# Separate the label column from the predictors; both are deep copies so later
# changes to them do not touch `loans`.
target = loans["loan_status"].copy(deep=True)
features_cols = [col for col in loans.columns if col != "loan_status"]
features = loans[features_cols].copy(deep=True)

# Hold out 33% of the rows for testing; the fixed random_state makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=1,
)


##### Create a prediction model using myAdaboost classifier and test it

In [45]:

from sklearn.tree import DecisionTreeClassifier
from myAdaboost import *

# Value passed to myAdaboost as its `threshold` argument.
thresh = 0.5

# Boost 50 rounds of depth-1 decision trees with the project-local
# myAdaboost implementation.
stump = DecisionTreeClassifier(max_depth=1)
myAdaB = myAdaboost(threshold=thresh, number_of_iterations=50, classifier=stump)
myAdaB.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for the AUC
# (presumably the probability of label 1 -- confirm against myAdaboost).
predictions_prob = myAdaB.predict_proba(X_test)[:, 1]

print("\n")

predictions = myAdaB.predict(X_test)

test_confusion = confusion_matrix(y_test, predictions)
print(test_confusion)

test_auc = roc_auc_score(y_test, predictions_prob)
print("\nAUC Score (Test): %f" % test_auc)


Iteration:49 | error:0.489088061549 | alpha:0.0218273426628

[[7294 3961]
 [ 715 1134]]

AUC Score (Test): 0.686664


##### Create a prediction model using sklearn AdaBoostClassifier

In [33]:
from sklearn.ensemble import AdaBoostClassifier

# Boost 50 depth-1 decision trees with scikit-learn's AdaBoost implementation.
#
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases (the old name
# was later removed); the first positional argument works with both.
#
# `algorithm` is left at the library default: "SAMME.R" was the default in the
# releases that support it, and releases that removed it reject the explicit
# value. On those newer releases the default is "SAMME", so the numbers may
# differ slightly from a SAMME.R run.
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=50, learning_rate=1.0,
                         random_state=None)

adb.fit(X_train, y_train)

# Column 1 of predict_proba corresponds to the second entry of adb.classes_,
# i.e. label 1 for 0/1 labels; it is used as the score for the AUC.
predictions_prob = adb.predict_proba(X_test)[:, 1]

print("\n")

predictions = adb.predict(X_test)

print(confusion_matrix(y_test, predictions))

print("\nAUC Score (Test): %f" % roc_auc_score(y_test, predictions_prob))



[[11211    44]
 [ 1797    52]]

AUC Score (Test): 0.693145


##### Create a prediction model using sklearn LogisticRegression classifier

In [34]:
from sklearn.linear_model import LogisticRegression

# Class weights: label 1 ("Charged Off") counts five times as much as
# label 0 ("Fully Paid") when fitting.
penalty = {0: 1, 1: 5}

# fit() returns the estimator itself, so construction and fitting are chained.
lr = LogisticRegression(class_weight=penalty).fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for the AUC.
predictions_prob = lr.predict_proba(X_test)[:, 1]

print("\n")

predictions = lr.predict(X_test)

test_confusion = confusion_matrix(y_test, predictions)
print(test_confusion)

test_auc = roc_auc_score(y_test, predictions_prob)
print("\nAUC Score (Test): %f" % test_auc)



[[8317 2938]
 [ 883  966]]

AUC Score (Test): 0.690932


#### Conclusion

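The various algorithms did not differ too much in the final result, although the
Logistic Regression with penalty had the best performance.
Note that, judged by test AUC alone, the sklearn AdaBoostClassifier scored marginally
highest (0.693, versus 0.691 for Logistic Regression and 0.687 for myAdaboost).
It's also noticeable that the myAdaboost algorithm had only a slightly worse
performance compared with the AdaBoostClassifier from the sklearn library.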