### 1. Importamos los paquetes

Vamos a importar todo lo que necesitemos. 

In [54]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import bootcampviztools as bt 

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNetCV
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, auc, log_loss, cohen_kappa_score

### 2. Carga y primer vistazo de los datos.

In [55]:
df_credit = pd.read_csv("./data/credit_risk_dataset.csv")

df_credit.info()
df_credit.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
6,26,77100,RENT,8.0,EDUCATION,B,35000,12.42,1,0.45,N,3
7,24,78956,RENT,5.0,MEDICAL,B,35000,11.11,1,0.44,N,4
8,24,83000,RENT,8.0,PERSONAL,A,35000,8.9,1,0.42,N,2
9,21,10000,OWN,6.0,VENTURE,D,1600,14.74,1,0.16,N,3


### 3. FeatureEngineering

In [56]:
#Vemos que hay casi 3000 entradas con nulos. Así que vamos a eliminar esas entradas. Tenemos de sobra para hacer el proyecto. 

df_credit.dropna(inplace=True)

In [57]:
df_credit.reset_index().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28638 entries, 0 to 28637
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       28638 non-null  int64  
 1   person_age                  28638 non-null  int64  
 2   person_income               28638 non-null  int64  
 3   person_home_ownership       28638 non-null  object 
 4   person_emp_length           28638 non-null  float64
 5   loan_intent                 28638 non-null  object 
 6   loan_grade                  28638 non-null  object 
 7   loan_amnt                   28638 non-null  int64  
 8   loan_int_rate               28638 non-null  float64
 9   loan_status                 28638 non-null  int64  
 10  loan_percent_income         28638 non-null  float64
 11  cb_person_default_on_file   28638 non-null  object 
 12  cb_person_cred_hist_length  28638 non-null  int64  
dtypes: float64(3), int64(6), object

In [58]:
df_credit.loan_intent.value_counts()

loan_intent
EDUCATION            5704
MEDICAL              5293
VENTURE              5001
PERSONAL             4877
DEBTCONSOLIDATION    4565
HOMEIMPROVEMENT      3198
Name: count, dtype: int64

In [59]:
df_credit.person_home_ownership.value_counts()

person_home_ownership
RENT        14551
MORTGAGE    11801
OWN          2192
OTHER          94
Name: count, dtype: int64

In [60]:
df_credit.person_age.value_counts(sort=True)

person_age
23     3413
22     3163
24     3130
25     2700
26     2201
27     1887
28     1624
29     1493
30     1126
21     1061
31     1008
32      860
33      758
34      625
35      559
36      482
37      409
38      321
39      270
40      236
41      218
42      165
43      142
44      116
45       91
46       86
47       76
48       65
50       45
49       41
52       33
51       33
53       26
54       21
55       18
58       17
57       15
56       14
20       14
60       13
61        8
66        8
64        7
62        6
69        5
70        5
59        5
65        5
144       3
63        3
73        2
78        1
123       1
80        1
84        1
76        1
67        1
Name: count, dtype: int64

In [61]:
#Eliminar los valores anomalos en edad
df_credit = df_credit[df_credit["person_age"] <= 90]

#Eliminar los valores anomalos en antigüedad empresarial
df_credit = df_credit[df_credit["person_emp_length"] <= 90]


df_credit.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [65]:
df_credit.reset_index(inplace= True)
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28632 entries, 0 to 28631
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       28632 non-null  int64  
 1   person_age                  28632 non-null  int64  
 2   person_income               28632 non-null  int64  
 3   person_home_ownership       28632 non-null  object 
 4   person_emp_length           28632 non-null  float64
 5   loan_intent                 28632 non-null  object 
 6   loan_grade                  28632 non-null  object 
 7   loan_amnt                   28632 non-null  int64  
 8   loan_int_rate               28632 non-null  float64
 9   loan_status                 28632 non-null  int64  
 10  loan_percent_income         28632 non-null  float64
 11  cb_person_default_on_file   28632 non-null  object 
 12  cb_person_cred_hist_length  28632 non-null  int64  
dtypes: float64(3), int64(6), object

In [68]:
df_credit["age_group"] = pd.cut(df_credit["person_age"], 
                                bins=[20,26,36,46,56,66],
                                labels = ["20-25", "26-35", "36-45", "46-55", "56-65"])

In [70]:
df_credit.age_group.dtype

CategoricalDtype(categories=['20-25', '26-35', '36-45', '46-55', '56-65'], ordered=True, categories_dtype=object)

In [71]:
df_credit.head(10)

Unnamed: 0,index,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group
0,1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25
1,2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-25
2,3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,20-25
3,4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,20-25
4,5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,20-25
5,6,26,77100,RENT,8.0,EDUCATION,B,35000,12.42,1,0.45,N,3,20-25
6,7,24,78956,RENT,5.0,MEDICAL,B,35000,11.11,1,0.44,N,4,20-25
7,8,24,83000,RENT,8.0,PERSONAL,A,35000,8.9,1,0.42,N,2,20-25
8,9,21,10000,OWN,6.0,VENTURE,D,1600,14.74,1,0.16,N,3,20-25
9,10,22,85000,RENT,6.0,VENTURE,B,35000,10.37,1,0.41,N,4,20-25


### 4. Distribución del target

El target de este proyecto es loan_status, ya que indica si pagaron exitosamente el prestamo (0) o no (1), de modo que será nuestro target de riesgo.

In [66]:
target = "loan_status"