In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sb, warnings
from sklearn.linear_model import LogisticRegression # Regresor Logistico
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold # Validación, validación cruzada
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_error, precision_score, recall_score, f1_score # Metricas y coeficientes de eficiencia
%matplotlib inline
# Load the dataset into a DataFrame.
# NOTE(review): this is an absolute path into one user's Downloads folder, so the
# notebook will not run on another machine as-is — consider a configurable,
# project-relative data directory.
practicas_pV2 = pd.read_csv('C:/Users/SANTY/Downloads/practicas_pV2.csv')
# Print the column names, non-null counts and dtypes as a quick sanity check.
practicas_pV2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9864 entries, 0 to 9863
Data columns (total 25 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   status_generado                             9864 non-null   int64
 1   scorecc                                     9864 non-null   int64
 2   acreedores                                  9864 non-null   int64
 3   cuentas_activas                             9864 non-null   int64
 4   emprende_activos                            9864 non-null   int64
 5   emprende_cerrados                           9864 non-null   int64
 6   quebrantos                                  9864 non-null   int64
 7   atrasos                                     9864 non-null   int64
 8   cobranza                                    9864 non-null   int64
 9   creditosFraudulentos                        9864 non-null   int64
 10  creditosIntegranteCausanteDeMora    

In [2]:
# Split the frame into the target (y) and the predictor matrix (X).

y = practicas_pV2['status_generado']

# Columns kept out of the predictor matrix: the target itself plus the
# fields that are not used as inputs to this model.
columnas_excluidas = [
    'status_generado',
    'creditosAbiertos',
    'creditosAbiertosQuebrantosNoCerrados',
    'otrosCreditosParaAnalizar',
    'scorecc',
    'creditosFraudulentos',
    'creditosIntegranteCausanteDeMora',
    'creditosIntegranteSubsidiadoParaEvitarMora',
    'creditosEnLocalizacion',
    'creditosCedidoAUnTercero',
    'creditosConvenioDeFiniquito',
    'creditosOtros',
    'domicilios',
    'cerrados',
]
X = practicas_pV2.drop(columns=columnas_excluidas)

In [3]:
# Set up the model and fit it on the full dataset, using every column of X
# (no hold-out split is made in this cell).

model = LogisticRegression(solver = 'newton-cg')
model.fit(X,y)

# NOTE: for a classifier, score() returns the mean accuracy on the given data,
# not the coefficient of determination R^2. It is evaluated here on the same
# rows that were used for fitting, so the value is a training accuracy.
model.score(X,y)

0.8524939172749392

In [4]:
# Hold-out split for validation (fixed seed so the partition is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Cross-validated accuracy on the training partition, using shuffled
# stratified folds; reported as mean (standard deviation).
name = 'Logistic Regression'
skfold = StratifiedKFold(n_splits=100, shuffle=True, random_state=1)
cv_results = cross_val_score(
    model, X_train, y_train, cv=skfold, scoring='accuracy'
)
msg = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
print(msg)

# Refit on the training partition and report accuracy, precision, recall and
# F1 on the held-out test partition.
model.fit(X_train, y_train)
preds = model.predict(X_test)
for etiqueta, metrica in (
    ('Exactitud', accuracy_score),
    ('Precision', precision_score),
    ('Sensibilidad', recall_score),
    ('F1-Score', f1_score),
):
    print(f'{etiqueta} del modelo {metrica(y_test, preds)}')


Logistic Regression: 0.847264 (0.035854)
Exactitud del modelo 0.8596918085969181
Precision del modelo 0.8557548579970105
Sensibilidad del modelo 0.8821263482280431
F1-Score del modelo 0.8687405159332322


In [5]:
 # Determine the decision threshold used to flip the predicted status.
# The sweep prints the training accuracy obtained at each cut-off 0.0, 0.1, ..., 0.9.

# Positive-class probabilities on the training partition. Only the
# probabilities are needed: the hard labels are rebuilt for every threshold,
# so no separate model.predict() call is made.
both = pd.DataFrame({'probs': model.predict_proba(X_train)[:, 1]})

for i in np.arange(0, 1, 0.1):
    # ">=" matches the comparison used when the chosen threshold is applied
    # to the train/test probabilities later in the notebook.
    both['y_pred'] = (both['probs'] >= i).astype(int)
    # accuracy_score compares by position, so the differing index of y_train
    # and `both` does not matter.
    print("Threshold", round(i, 3), "Train Accuracy:",
          round(accuracy_score(y_train, both['y_pred']), 4))

Threshold 0.0 Train Accuracy: 0.527
Threshold 0.1 Train Accuracy: 0.7416
Threshold 0.2 Train Accuracy: 0.8049
Threshold 0.3 Train Accuracy: 0.8393
Threshold 0.4 Train Accuracy: 0.8546
Threshold 0.5 Train Accuracy: 0.8475
Threshold 0.6 Train Accuracy: 0.8317
Threshold 0.7 Train Accuracy: 0.8008
Threshold 0.8 Train Accuracy: 0.7374
Threshold 0.9 Train Accuracy: 0.6231


In [6]:
# Re-evaluate the model with a custom decision threshold instead of the
# default 0.5. 0.4 is the cut-off with the highest training accuracy in the
# threshold sweep above.
THRESHOLD = 0.4

# Cast to int (not bool) so the predictions use the same 0/1 labels as the
# target; the confusion matrices then show 0/1 on both axes.
preds = (model.predict_proba(X_test)[:, 1] >= THRESHOLD).astype(int)
preds0 = (model.predict_proba(X_train)[:, 1] >= THRESHOLD).astype(int)

# Metrics on the held-out test partition.
print(f'Exactitud del modelo {accuracy_score(y_test, preds)}')
print(f'Precision del modelo {precision_score(y_test, preds)}')
print(f'Sensibilidad del modelo {recall_score(y_test, preds)}')
print(f'F1-Score del modelo {f1_score(y_test, preds)}')

# Confusion matrices: rows are the actual class, columns the predicted class.
print ("\n\nTrain Confusion Matrix\n\n",pd.crosstab(y_train, preds0,rownames = ["Actual"],colnames =["Predicted"]))
print ("\n\nTest Confusion Matrix\n\n",pd.crosstab(y_test, preds,rownames = ["Actual"],colnames =["Predicted"]))
    


Exactitud del modelo 0.8633414436334145
Precision del modelo 0.8306951135581555
Sensibilidad del modelo 0.9298921417565486
F1-Score del modelo 0.8774990912395493


Train Confusion Matrix

 Predicted  False  True 
Actual                 
0           2687    812
1            264   3635


Test Confusion Matrix

 Predicted  False  True 
Actual                 
0            922    246
1             91   1207


In [7]:
# Build a human-readable log-odds equation from the fitted model.

# For this binary target coef_ holds a single row of weights, one per
# predictor column; take that row so positions line up with X.columns.
coeficientes = model.coef_[0]
intercepto = model.intercept_
variables = X.columns

# zip() would silently truncate on a length mismatch, so check explicitly.
assert len(variables) == len(coeficientes), "coefficients and column names are misaligned"

ecuacion = f"Log(odds) = {intercepto[0]:.3f}"

# Pair each coefficient with its own column name BEFORE dropping zero terms.
# Filtering the array first and indexing the names by the filtered position
# would attach the wrong name to every term after a zero coefficient.
for nombre, coef in zip(variables, coeficientes):
    if coef == 0:
        continue  # omit variables that do not contribute to the equation
    # "+.3f" always prints the sign, avoiding output such as "+ -1.552".
    ecuacion += f" {coef:+.3f} * {nombre}"

print(ecuacion)

Log(odds) = 2.227+ -1.552 * acreedores+ 0.038 * cuentas_activas+ 0.322 * emprende_activos+ 0.005 * emprende_cerrados+ -0.530 * quebrantos+ -0.107 * atrasos+ 0.054 * cobranza+ -0.850 * posiblesCreditosCerrados+ 0.058 * consultas+ 0.024 * empleos+ 0.009 * edad
