In [81]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [82]:
pathfile1 = '../data/loans_ind_clean.csv'

In [83]:
loans_ind = pd.read_csv(pathfile1, engine = 'python')
loans_ind.head()

Unnamed: 0,term,installment,grade,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,title,...,num_sats,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,fico_avg
0,36.0,359.26,A,6.0,OWN,153000.0,Not Verified,1.0,credit_card,Credit card refinancing,...,20,2,95.7,11.1,1,528172,100865,28100,120572,722.0
1,36.0,285.7,E,2.0,RENT,50000.0,Source Verified,1.0,debt_consolidation,Debt consolidation,...,4,0,80.0,100.0,0,7600,5588,3600,4000,687.0
2,36.0,232.79,A,7.0,MORTGAGE,110000.0,Not Verified,1.0,debt_consolidation,Debt consolidation,...,19,3,100.0,8.3,0,350617,45955,83700,32239,712.0
3,60.0,243.29,C,7.0,RENT,51979.0,Source Verified,1.0,debt_consolidation,Debt consolidation,...,15,3,100.0,0.0,2,34200,10956,18800,5500,692.0
4,36.0,492.34,C,7.0,MORTGAGE,75000.0,Verified,1.0,debt_consolidation,Debt consolidation,...,4,3,90.0,100.0,0,170591,27684,3000,30321,687.0


In [84]:
loans_ind['loan_status'].value_counts()

1.0    344607
0.0    115934
Name: loan_status, dtype: int64

## Preparación del modelo

### Reducir la dimensión del dataset
Reducimos la dimensión para poder trabajar los modelos de forma más simplificada en un principio. Posteriormente se entrenaran los modelos con la totalidad del dataset.

In [85]:
loans_500 = loans_ind.head(500)
loans_500.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 48 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   term                        500 non-null    float64
 1   installment                 500 non-null    float64
 2   grade                       500 non-null    object 
 3   emp_length                  475 non-null    float64
 4   home_ownership              500 non-null    object 
 5   annual_inc                  500 non-null    float64
 6   verification_status         500 non-null    object 
 7   loan_status                 500 non-null    float64
 8   purpose                     500 non-null    object 
 9   title                       500 non-null    object 
 10  addr_state                  500 non-null    object 
 11  dti                         500 non-null    float64
 12  earliest_cr_line            500 non-null    object 
 13  inq_last_6mths              500 non

Separamos la variable a predecir del dataset

In [86]:
y = loans_500['loan_status']
x = loans_500.drop('loan_status', axis = 1)
x_list = list(x.columns)

One Hot Encoding y escalar las variables

In [87]:
numeric_transformer_0 = Pipeline(steps = [
    ('imputer_0', SimpleImputer(strategy = 'constant')),
    ('escalar', StandardScaler())
])
numeric_transformer_median = Pipeline(steps = [
    ('imputer_median', SimpleImputer(strategy = 'median')),
    ('escalar1', StandardScaler())
])

In [88]:
categoric_transformer  = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'sin_info')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore')) # mirar esto de handle_unknowns
])

In [89]:
numeric_features_0 = x.loc[:, ['bc_open_to_buy', 'mths_since_recent_bc', 'revol_util', 'emp_length', 'term', 'installment', 'annual_inc', 'inq_last_6mths',
                                      'mths_since_last_delinq', 'open_acc', 'revol_bal', 'total_acc', 'tot_cur_bal', 'acc_open_past_24mths',
                                       'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op',
                                      'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_inq', 'num_actv_rev_tl',
                                       'num_sats', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'pub_rec_bankruptcies', 'tot_hi_cred_lim',
                                      'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'fico_avg']].columns

numeric_features_median = x.loc[:, ['percent_bc_gt_75',
                                            'bc_util',
                                            'avg_cur_bal',
                                            'num_rev_accts',
                                            'dti']].columns


categoric_features = x.select_dtypes(include = ['object']).columns

#Mirar las variables

In [90]:
preprocessor = ColumnTransformer(
        transformers = [
            ('num_0', numeric_transformer_0, numeric_features_0),
            ('num_median', numeric_transformer_median, numeric_features_median),
            ('cat', categoric_transformer, categoric_features)
        ]
)

Se seleccionan 500 observaciones aleatorias del dataset para ver que el modelo funciona correctamente.

Train y Test

In [91]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x,
                                                    y,
                                                    test_size = 0.25,
                                                    random_state = 42)

#Mirar bien el % que va a ir al test y lo de random_state

In [92]:
test_y.value_counts()

1.0    92
0.0    33
Name: loan_status, dtype: int64

In [30]:
#Para ver que se ha hecho correctamente
print('Training Features Shape:', train_x.shape)
print('Training Labels Shape:', train_y.shape)
print('Testing Features Shape:', test_x.shape)
print('Testing Labels Shape:', test_y.shape)

Training Features Shape: (375, 47)
Training Labels Shape: (375,)
Testing Features Shape: (125, 47)
Testing Labels Shape: (125,)


Establecemos el modelo base. Este modelo puede referirse a seleccionar la mitad de las personas como que se les entrega el prestamo y la otra mitad como que no se les entrega el prestamo.

Después estableceremos un modelo regresivo, un Random Forest, Support Vector Machine, K Neighbors Classifier, Ada Boosting Classifier y Gradient Boosting Classifier

In [None]:
baseline_preds = test_features[:, feature_list.index('')]

In [93]:
classifiers = [
    SVC(kernel="rbf", C=0.025, probability=True),
    RandomForestClassifier(),
    LogisticRegression()]


In [94]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(train_x, train_y)
    print(classifier) 
    print("model score: %.3f" % pipe.score(test_x, test_y))

SVC(C=0.025, probability=True)
model score: 0.736
RandomForestClassifier()
model score: 0.744
LogisticRegression()
model score: 0.744


In [95]:
from sklearn.metrics import confusion_matrix
confusion_matrix1 = confusion_matrix(test_y, pred_y)
print(confusion_matrix1)

[[ 3 30]
 [ 0 92]]


In [98]:
from sklearn.metrics import classification_report
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

         0.0       1.00      0.09      0.17        33
         1.0       0.75      1.00      0.86        92

    accuracy                           0.76       125
   macro avg       0.88      0.55      0.51       125
weighted avg       0.82      0.76      0.68       125

