# Librerías

In [1]:
# cálculos
import numpy as np
import pandas as pd
import math

# fechas
from datetime import datetime

# gráficos
import seaborn as sns
import matplotlib.pyplot as plt

# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# modelos
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# metricas
import sklearn.metrics as metrics

# hiperparametrizado
from sklearn.model_selection import GridSearchCV

# quitar alertas
import warnings
warnings.filterwarnings("ignore")



In [2]:
# Ruta al archivo 
ruta_archivo = "C:/Users/Herre/Desktop/NUCLIO/Entrega_TFM/modelo_financing.csv"
# Leer el archivo 
df_modelo = pd.read_csv(ruta_archivo)

In [3]:
df_modelo

Unnamed: 0,pk_cid,active_customer,segment,region_code,age,salary,entry_date_year,entry_date_month,entry_date_day,entry_date_weekday,...,entry_channel_KFC,entry_channel_KHE,entry_channel_KHQ,entry_channel_Otros,country_id_ES,gender_V,deceased_S,category_investment,category_financing,category_accounts
0,1045874,1.0,2,47.0,40,169608.12,2015,8,7,4,...,0,0,0,1,1,1,0,1,1,1
1,1114395,1.0,1,28.0,33,66377.22,2015,12,24,3,...,0,1,0,0,1,1,0,1,1,1
2,1438930,0.0,2,15.0,51,80298.30,2018,8,26,6,...,0,0,0,1,1,0,0,0,0,0
3,1220939,0.0,2,30.0,57,88299.00,2016,11,15,1,...,0,1,0,0,1,0,0,0,0,1
4,1134829,1.0,2,28.0,55,111948.27,2016,5,22,6,...,1,0,0,0,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9613,1271675,0.0,1,43.0,21,120893.73,2017,7,18,1,...,0,1,0,0,1,0,0,0,0,1
9614,1475143,0.0,1,13.0,20,213073.95,2018,10,9,1,...,0,0,1,0,1,0,0,0,0,1
9615,1404507,1.0,2,33.0,61,43041.48,2018,7,3,1,...,0,0,0,1,1,0,0,0,0,1
9616,1256178,1.0,2,8.0,48,42453.78,2018,7,22,6,...,1,0,0,0,1,0,0,0,1,0


In [4]:
df_modelo.head()

Unnamed: 0,pk_cid,active_customer,segment,region_code,age,salary,entry_date_year,entry_date_month,entry_date_day,entry_date_weekday,...,entry_channel_KFC,entry_channel_KHE,entry_channel_KHQ,entry_channel_Otros,country_id_ES,gender_V,deceased_S,category_investment,category_financing,category_accounts
0,1045874,1.0,2,47.0,40,169608.12,2015,8,7,4,...,0,0,0,1,1,1,0,1,1,1
1,1114395,1.0,1,28.0,33,66377.22,2015,12,24,3,...,0,1,0,0,1,1,0,1,1,1
2,1438930,0.0,2,15.0,51,80298.3,2018,8,26,6,...,0,0,0,1,1,0,0,0,0,0
3,1220939,0.0,2,30.0,57,88299.0,2016,11,15,1,...,0,1,0,0,1,0,0,0,0,1
4,1134829,1.0,2,28.0,55,111948.27,2016,5,22,6,...,1,0,0,0,1,1,0,0,1,1


En CatBoost no hace falta estandarizar

In [5]:
# 1. Dividir los datos en variables predictoras y target
# Asumamos que la columna 'category_financing' es la que queremos predecir
X = df_modelo.drop('category_financing', axis=1)  # Eliminar la columna target del conjunto de datos
y = df_modelo['category_financing']  # Definir la columna objetivo

# 2. Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Toreno de modelos

In [6]:
f1s = pd.DataFrame(columns=['modelo', 'f1_score'])
f1s

Unnamed: 0,modelo,f1_score


In [7]:
# añadimos en una lista los modelos que queremos poner a competir

modelos = []
modelos.append(('SVC', SVC()))
modelos.append(('LogisticRegression', LogisticRegression()))
modelos.append(('RidgeClassifier', RidgeClassifier()))
modelos.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
modelos.append(('RandomForestClassifier', RandomForestClassifier()))
modelos.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
modelos.append(('XGBClassifier', XGBClassifier()))
modelos.append(('CatBoostClassifier', CatBoostClassifier(verbose=False)))

In [8]:
for nombre, modelo in modelos:
  # entrenamiento
  modelo.fit(X_train, y_train)

  # predicción del test
  y_pred = modelo.predict(X_test)

  # evalua la metrica
  f1_score = metrics.f1_score(y_test, y_pred, average='macro')
  #el average='macro' se usa cuando estamos haciendo modelos multiclases. Sino lo pones el modelo peta.

  # añade la métrica al dataframe de métricas
  metrica = pd.DataFrame({'modelo':[nombre], 'f1_score':[f1_score]})

  print(nombre, np.round(f1_score,5))

  f1s = pd.concat([f1s,metrica], ignore_index=True)

SVC 0.62658
LogisticRegression 0.76487
RidgeClassifier 0.87499
DecisionTreeClassifier 0.81332
RandomForestClassifier 0.87929
GradientBoostingClassifier 0.88334
XGBClassifier 0.88029
CatBoostClassifier 0.88389


In [9]:
f1s.sort_values('f1_score', ascending= False)

Unnamed: 0,modelo,f1_score
7,CatBoostClassifier,0.88389
5,GradientBoostingClassifier,0.883343
6,XGBClassifier,0.88029
4,RandomForestClassifier,0.879293
2,RidgeClassifier,0.874989
3,DecisionTreeClassifier,0.813325
1,LogisticRegression,0.764874
0,SVC,0.626575


# Modelo

In [10]:
# Definir el modelo de CatBoost
# No necesitamos especificar las características categóricas manualmente
model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=100)

# Entrenar el modelo
model.fit(X_train, y_train)

# Hacer predicciones
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluar el modelo
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Accuracy: {accuracy:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

0:	learn: 0.5785579	total: 2.46ms	remaining: 2.46s
100:	learn: 0.2519224	total: 243ms	remaining: 2.16s
200:	learn: 0.2102494	total: 506ms	remaining: 2.01s
300:	learn: 0.1782267	total: 751ms	remaining: 1.74s
400:	learn: 0.1541858	total: 1000ms	remaining: 1.49s
500:	learn: 0.1343012	total: 1.24s	remaining: 1.24s
600:	learn: 0.1166373	total: 1.49s	remaining: 989ms
700:	learn: 0.1034731	total: 1.77s	remaining: 757ms
800:	learn: 0.0918508	total: 2.04s	remaining: 507ms
900:	learn: 0.0808719	total: 2.28s	remaining: 251ms
999:	learn: 0.0724187	total: 2.52s	remaining: 0us
Accuracy: 0.8773
ROC AUC: 0.9361


In [11]:
# Calcular la precisión
accuracy = accuracy_score(y_test, y_pred)
print(f"Precisión: {accuracy * 100:.2f}%")

Precisión: 87.73%


In [12]:
# Nos aseguramos de que la columna 'pk_cid' esté en X_test

# Crear un nuevo DataFrame con pk_cid, las predicciones y las probabilidades
df_resultado = X_test[['pk_cid']].copy()  # Copiamos la columna pk_cid de X_test
df_resultado['prediccion'] = y_pred  # Agregamos la columna de predicciones
df_resultado['probabilidad_compra'] = y_pred_proba  # Agregamos la columna de probabilidades

# Mostrar el nuevo DataFrame directamente en pantalla
print(df_resultado.head())  # Muestra las primeras filas del DataFrame

       pk_cid  prediccion  probabilidad_compra
1791  1151275           0             0.031676
4217  1201055           1             0.947194
9025  1224705           1             0.974331
8394  1507317           1             0.689066
4102  1015234           1             0.922753


In [13]:
# Revisar la distribución de la variable objetivo
df_resultado['prediccion'].value_counts(normalize=True) 

prediccion
1    0.534823
0    0.465177
Name: proportion, dtype: float64

Necesitamos a los clientes con una buena probabilidad de compra, es por eso que escogemos a aquellos con una probabilidad superior al 70%.

In [14]:
# Filtrar filas donde la probabilidad de compra sea mayor al 60%
df_filtrado = df_resultado[df_resultado['probabilidad_compra'] > 0.70]

In [15]:
df_filtrado.info()

<class 'pandas.core.frame.DataFrame'>
Index: 933 entries, 4217 to 7553
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pk_cid               933 non-null    int64  
 1   prediccion           933 non-null    int64  
 2   probabilidad_compra  933 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 29.2 KB


In [17]:
df_filtrado

Unnamed: 0,pk_cid,prediccion,probabilidad_compra
4217,1201055,1,0.947194
9025,1224705,1,0.974331
4102,1015234,1,0.922753
487,1392254,1,0.863274
9369,1015823,1,0.986804
...,...,...,...
8166,1250774,1,0.862379
8778,599565,1,0.944484
2423,1013650,1,0.984607
1369,1017394,1,0.973659


Exportamos la prediccion porque la necesitaremos más adelante para conocer los ingresos que tendrá la empresa. Nos quedamos con todos los clientes con una probabilidad de compra superior al 70% porque son solo 933.

In [18]:
# Guardar el DataFrame en un archivo CSV para revisarlo luego
df_filtrado.to_csv('financing_resultado_predicciones.csv', index=False)