In [124]:
import pandas as pd
import numpy as np

loans = pd.read_csv('LoanStats3a.csv', skiprows=1, low_memory=False)

half_count = len(loans) / 2
loans = loans.dropna(thresh=half_count, axis=1)
loans = loans.drop(['desc', 'url'],axis=1)

colsToDrop = ["id", "member_id", "funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d"]
    
loans = loans.drop(colsToDrop, axis=1)
        
colsToDrop = ["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"]
    
loans = loans.drop(colsToDrop, axis=1) 

colsToDrop = ["total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt"]

loans = loans.drop(colsToDrop, axis=1)
    
loans.loan_status.value_counts()

loans = loans[ (loans.loan_status == "Fully Paid") | (loans.loan_status == "Charged Off") ]

mapping_dict = {
    "loan_status": {
        "Fully Paid": 0, # NOTA: o classificador myAdaboost aceita qualquer tipo de dados como label
        "Charged Off": 1
    }
}

loans.replace(mapping_dict, inplace=True)

drop_columns = []

cols = list(loans.columns)

for col in cols:
    non_null = loans[col].dropna()
    unique_non_null = non_null.unique()
    if len(unique_non_null) == 1:
        drop_columns.append(col)
        
loans = loans.drop(drop_columns, axis = 1)

loans = loans.drop("pub_rec_bankruptcies", axis=1)
loans = loans.dropna(subset=["title", "revol_util", "last_credit_pull_d"])

mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
    }
}

loans.replace(mapping_dict, inplace=True)

loans["emp_length"].fillna(0, inplace=True)

loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)

loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")


cat_columns = ["home_ownership", "verification_status", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns, axis=1)

loans.reset_index(inplace=True, drop=True)
    
print(loans.shape)

(39707, 38)


In [52]:
def calc_metrics(predictions, target):
    
    tn = sum( (predictions == 0) & (target == 0) )
    #tn = sum( ~(new_predictions | target) )
    #print("tn: {}".format(tn))
    
    fp = sum( (predictions == 1) & (target == 0) )
    #fp = sum( new_predictions & ~target )
    #print("fp: {}".format(fp))
    #print("tn + fp: {}".format(tn + fp))

    tp = sum( (predictions == 1) & (target == 1) )
    #tp = sum( new_predictions  & target )
    #print("tp: {}".format(tp))
    
    fn = sum( (predictions == 0) & (target == 1) )
    #fn = sum( ~new_predictions & target )
    #print("fn: {}".format(fn))
    #print("tp + fn: {}".format(tp + fn))

    tpr = tp / (tp + fn)
    tnr = tn / (tn + fp)
    fpr = fp / (fp + tn)
    npv = tn / (tn + fn)

    #print("tpr: {}".format(tpr))
    #print("fpr: {}".format(fpr))
    #print("accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)) )
    
    #precision = tn /(tn + fp)
    #recall = tn/(tn + fn)
    
    precision = tp /(tp + fp)
    recall = tp/(tp + fn)
    acc = (tp + tn)/(tp + tn + fp +fn)
    
    F1 = 2*(precision*recall)/(precision + recall)
    
    return tp, fp, tn, fn, tpr, tnr, fpr, precision, acc, F1, npv

In [374]:
##############################################################
################### My Adaboost Class - Definition
##############

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_array
from sklearn.utils import check_X_y
from sklearn.preprocessing import MinMaxScaler
import copy
import numpy as np
import sys

class myAdaboost(BaseEstimator, ClassifierMixin):  


    
    def __init__(self, threshold=0.5, number_of_iterations=50, classifier=None):
        
        self.threshold = threshold
        self.T = number_of_iterations
        self.weak_learners_list = []
        self.classifier = classifier

    def check_y(self, y):
        
        if type(y) == type(pd.Series([])):
            
            if pd.isnull(y).sum() > 0:
                raise NameError("There are None ou Nan values in y")
    
            if not ( np.array_equal( y.unique(), np.array([1, 0]) )  or  np.array_equal( y.unique(), np.array([0, 1]) ) ):
                raise NameError( "y must be a pandas Series or an 1d numpy array with numeric binary values: (0,1)")
            
            
        if type(y) == type(np.array([])):       

            if pd.isnull(pd.Series( y ) ).sum() > 0:
                raise NameError("There are None ou Nan values in y")
            
            if y.dtype == "object":
                raise NameError("y is an array of type objcet. y must be a pandas Series or an 1d numpy array with numeric binary values: (0,1) ")
            
            if not ( np.array_equal(np.unique(y), np.array([0, 1])) or  np.array_equal(np.unique(y), np.array([1, 0])) ):
                raise NameError("y must be a pandas Series or an 1d numpy array with numeric binary values: (0,1)")       
    
    
    def fit(self, X=None, y=None):
        
        
        # calling a method to ensure that "y is a pandas Series or an 1d numpy array with numeric binary values: (0,1)"
        self.check_y(y)
        
        X_, y_ = check_X_y(X, y)
        y_[ y_==0 ] = -1
        
        weights = np.array( [1/len(y_)]*len(y_) )
        
        np.random.seed(seed=123)
        
        for t in range(self.T):
            
            dic = {}
                  
            train_set_index = np.random.choice(a=len(y_), size=len(y_), p=weights, replace=True)
                 
            self.classifier.fit( X_[train_set_index], y_[train_set_index] )
            new_predictions = self.classifier.predict( X_ )
            
            new_error = sum( weights*( new_predictions != y_ ) )
        
            new_cl = copy.deepcopy( self.classifier )
            
            alpha = 0.5*np.log( ( 1-new_error )/new_error )
        
            sys.stdout.write('\rIteration:' + str(t) + " | error:" + str(new_error) + " | alpha:" + str(alpha)  )
        
            dic["weak_learner"] = new_cl
            dic["alpha"] = alpha
        
            preNormWeights = weights * np.exp( -1*dic["alpha"]*(y_*new_predictions) )
        
            weights = preNormWeights/preNormWeights.sum()
            weights = np.array(weights)
            
            self.weak_learners_list.append(dic)
        
        return self
        
    
    def predict_proba(self, X):
        
        sum_predictions = np.matrix( [0.0]*len(X) )
        
        for weak_learner in self.weak_learners_list:
            predictions = weak_learner["weak_learner"].predict( X )
            sum_predictions += weak_learner["alpha"]*predictions
    
        mMscaler = MinMaxScaler()

        mMscaler.fit(sum_predictions.T)

        sum_predictions_norm = mMscaler.transform(sum_predictions.T)
        
        sum_predictions_norm = np.concatenate( ( (1-sum_predictions_norm), sum_predictions_norm), axis=1 )
        
        return np.array(sum_predictions_norm)
  
    
    def predict(self, X):
        
        predictions = self.predict_proba(X)[:,1]
        
        predictions[ predictions >= self.threshold ] = 1
        predictions[ predictions < self.threshold ] = 0

        return predictions
    
    def score(self, X=None, y=None):
        
        predictions = self.predict(X)
        acc = accuracy_score(y, predictions, normalize=True, sample_weight=None)
        return  acc
    
    

In [373]:
##############################################################
################### My Adaboost Class
##############

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

features_cols = list(loans.columns)
features_cols.remove("loan_status")
features = loans[features_cols].copy(deep=True)
target = loans["loan_status"].copy(deep=True)

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=1)
X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

thresh = 0.5

myAdaB = myAdaboost(threshold = thresh, number_of_iterations=50, classifier = DecisionTreeClassifier(max_depth=1))
myAdaB.fit( X_train, y_train )
predictions_prob = myAdaB.predict_proba(X_test)[:,1]

print("\n\n")

predictions = pd.Series( myAdaB.predict(X_test) )

print(confusion_matrix( y_test, predictions) ) 

print("\nAUC Score (Test): %f" % metrics.roc_auc_score(y_test, predictions_prob) )




Iteration:49 | error:0.489088061549 | alpha:0.0218273426628


[[7294 3961]
 [ 715 1134]]

AUC Score (Test): 0.686664


In [361]:
print(10203+1052+1639+210)

print( sum( (y_test==0) &(predictions==1) ) )

13104
1052


In [75]:
import numpy as np

from sklearn.utils import check_array

from sklearn.utils.multiclass import type_of_target

from sklearn.utils.multiclass import unique_labels

#a = np.array([[0, 0, 1, 1],
#              [0, 1, 1, 1]])

#a = np.array([["a", "a", "a", "a"],
#              ["a", "a", "a", "a"],
#              ["a", "a", "a", "a"]])

a = np.array([1, 0, 1, 1, 0, 1])
#              [0, 1, 1, 0])

#a = np.array(["a", "a", "a", "a"])

#‘multiclass’: y contains more than two discrete values, is not a sequence of sequences, and is 1d or a column vector.
#a = np.array(["a", "b", "b", 1.2, "c"])

def check_y_integrity(y):

    if type(y) != type(pd.Series([])):
        print( "y must be a pandas Series with binary values (0,1) ")
        return 0
    
    if pd.isnull(y).sum() > 0:
        print("there are None ou Nan values in y series")
        return 0
    
    y_ = y.values
    
    type_of_target_ =  type_of_target(y_) 
    print(type_of_target_)

    if type_of_target_ == "multilabel-indicator":
        print( "y must be a pandas Series with binary values (0,1) ")
        return 0

    if type_of_target_ == "continuous":
        print( "y must be a pandas Series with binary values (0,1) ")
        return 0

    if type_of_target_ == "multiclass":
        print( "y must be a pandas Series with binary values (0,1) ")
        return 0

    unique_labels_ = unique_labels(y_)

    if not np.array_equal( unique_labels_, np.array([0, 1]) ):
        print( "y must be a pandas Series with binary values (0,1) ")
        return 0
        
    y_[ y_ == 0] = -1
    
    return pd.Serie(y_, index = y.index)
        

check_y_integrity(a)
    
#if type_of_target_ != "binary" and type_of_target_ != "multiclass":
    
#    print("stop")
#    print(type_of_target_)
    

#print(unique_labels(a))



y must be a pandas Series with binary values (0,1) 


0

In [146]:
import pandas as pd

from sklearn.utils.multiclass import type_of_target

y = y_train.values

print(y[0:10])
print(y.shape)
y = np.array(["hj", 5, "hj"]) 
print(y)
print(y.shape)

print(type(y))

print(unique_labels(y))

#print(type_of_target(y))

['hj' 'hj' 'hj' 'hj' 'hj' 5 'hj' 'hj' 'hj' 'hj']
(26603,)
['hj' '5' 'hj']
(3,)
<class 'numpy.ndarray'>
['5' 'hj']


In [189]:
if not ( np.array_equal( y_train.unique(), np.array([1, 0]) )  or  np.array_equal( y_train.unique(), np.array([0, 1]) ) ):
    print( "y must be a pandas Series with numeric binary values (0,1) ")
    

y must be a pandas Series with numeric binary values (0,1) 


In [192]:
y_train.unique()

array(['d', 1], dtype=object)

In [50]:

from sklearn.utils import check_X_y

X = pd.DataFrame({'A' : ["b", 2, 3, 4],
                  'B' : [5, 6, 7, 8],
                  'C' : [9, 1, 2, 3],
                  'D' : [4, 6, 8, 1] }, index=['a', 'c', 'e', 'f'])

y = pd.Series([1, 2, 3, 4])

print(X)
print(y)
                  
print( check_X_y(X, y) )



   A  B  C  D
a  b  5  9  4
c  2  6  1  6
e  3  7  2  8
f  4  8  3  1
0    1
1    2
2    3
3    4
dtype: int64


ValueError: could not convert string to float: 'b'

In [210]:
from sklearn.preprocessing import normalize

X = pd.DataFrame({'A' : [1, 2, 3, 4],
                  'B' : [5, 6, 7, 8],
                  'C' : [9, 1, 2, 3],
                  'D' : [4, 6, 8, 1] }, index=['a', 'c', 'e', 'f'])

print(X)

y = pd.Series([1, 2, 3, 4])

print(normalize(X, norm = "l2") )


   A  B  C  D
a  1  5  9  4
c  2  6  1  6
e  3  7  2  8
f  4  8  3  1
[[ 0.09016696  0.45083482  0.81150267  0.36066785]
 [ 0.22792115  0.68376346  0.11396058  0.68376346]
 [ 0.26726124  0.62360956  0.17817416  0.71269665]
 [ 0.42163702  0.84327404  0.31622777  0.10540926]]


In [219]:
print( 2-(1+2+3+4)/4)

-0.5


In [214]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(copy=True, with_mean=True, with_std=False)
scaler.fit(X)

print(scaler.transform(X))


[[-1.5  -1.5   5.25 -0.75]
 [-0.5  -0.5  -2.75  1.25]
 [ 0.5   0.5  -1.75  3.25]
 [ 1.5   1.5  -0.75 -3.75]]


In [263]:
from sklearn.preprocessing import MinMaxScaler


X = np.matrix([[1, 3, 4]])
#X = np.matrix([[1],
#               [3], 
#               [4]])

#X = X.reshape(-1, 1) 

print(X.T)


scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(X.T),
X = scaler.transform(X.T)

print(X)

mMscaler = MinMaxScaler()

mMscaler.fit(X)

print(mMscaler.transform(X))

[[1]
 [3]
 [4]]
[[-1.33630621]
 [ 0.26726124]
 [ 1.06904497]]
[[ 0.        ]
 [ 0.66666667]
 [ 1.        ]]




In [292]:
m = np.matrix([[1, 3],
               [3, 4],
               [5, 5]])

print(m)

a = np.array(m)

type(a)

print(a)

[[1 3]
 [3 4]
 [5 5]]
[[1 3]
 [3 4]
 [5 5]]
