In [1]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sagemaker.session import Session


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import metrics, tree
from sklearn.tree import plot_tree
import joblib

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# S3 bucket and object key of the customer dataset, plus the execution role
role = get_execution_role()
bucket = "api-bucket-cp"
data_key = "clientes-dir/synthetic_customer_data.csv"
data_location = f"s3://{bucket}/{data_key}"

# Read the CSV straight from its s3:// URI and preview the first rows
df = pd.read_csv(data_location)
df.head()

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Unnamed: 0,customer_id,age,gender,annual_income,total_spent,num_purchases,avg_purchase_value,online_activity_score,loyalty_program,days_since_last_purchase,num_site_visits,customer_segment
0,CUST_0001,56,Male,65648,49624.926758,17,467.729972,52.525842,0,235,61,medium_value
1,CUST_0002,69,Male,123537,46357.348299,196,874.799829,11.120875,0,182,48,medium_value
2,CUST_0003,46,Male,145991,27043.860801,175,207.419293,86.81693,0,199,50,medium_value
3,CUST_0004,32,Female,29516,42117.461389,25,131.749484,29.736153,0,356,32,medium_value
4,CUST_0005,60,Male,132863,26095.803066,94,289.081236,16.253799,1,245,29,high_value


## Análisis y preprocesamiento de datos
---

In [3]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

(1000, 12)

In [4]:
# Summary of every column: non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               1000 non-null   object 
 1   age                       1000 non-null   int64  
 2   gender                    1000 non-null   object 
 3   annual_income             1000 non-null   int64  
 4   total_spent               1000 non-null   float64
 5   num_purchases             1000 non-null   int64  
 6   avg_purchase_value        1000 non-null   float64
 7   online_activity_score     1000 non-null   float64
 8   loyalty_program           1000 non-null   int64  
 9   days_since_last_purchase  1000 non-null   int64  
 10  num_site_visits           1000 non-null   int64  
 11  customer_segment          1000 non-null   object 
dtypes: float64(3), int64(6), object(3)
memory usage: 93.9+ KB


In [5]:
# Missing values: number of null entries in each column
print("Valores nulos en el conjunto de datos:")
null_counts = df.isna().sum()
null_counts

Valores nulos en el conjunto de datos:


customer_id                 0
age                         0
gender                      0
annual_income               0
total_spent                 0
num_purchases               0
avg_purchase_value          0
online_activity_score       0
loyalty_program             0
days_since_last_purchase    0
num_site_visits             0
customer_segment            0
dtype: int64

In [6]:
# Duplicated rows: count rows that repeat an earlier row exactly
duplicados = int(df.duplicated().sum())
print(f'Registros duplicados: {duplicados}')

Registros duplicados: 0


In [7]:
# Distinct values present in the loyalty-program column
df['loyalty_program'].unique()

array([0, 1])

In [8]:
# Distinct values present in the customer-segment column
df['customer_segment'].unique()

array(['medium_value', 'high_value', 'low_value'], dtype=object)

In [9]:
# Number of rows per customer segment, shown to gauge class balance
segment_counts = df['customer_segment'].value_counts()
segment_counts

medium_value    858
high_value      136
low_value         6
Name: customer_segment, dtype: int64

In [10]:
# Distinct values present in the gender column
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [11]:
# Rows whose segment label is 'low_value'
low_value_data = df.loc[df['customer_segment'].eq('low_value')]
low_value_data

Unnamed: 0,customer_id,age,gender,annual_income,total_spent,num_purchases,avg_purchase_value,online_activity_score,loyalty_program,days_since_last_purchase,num_site_visits,customer_segment
331,CUST_0332,33,Male,46015,1407.215756,10,409.524,76.519383,0,326,12,low_value
480,CUST_0481,36,Female,55046,4264.857841,15,442.857756,77.31835,1,329,36,low_value
703,CUST_0704,62,Male,123896,3643.28246,7,198.475406,42.876429,0,224,58,low_value
775,CUST_0776,67,Male,77716,1474.082321,19,28.414832,72.801226,0,286,20,low_value
916,CUST_0917,60,Female,111302,3708.592115,18,805.075191,21.515562,0,135,71,low_value
971,CUST_0972,18,Female,133092,3228.212119,13,552.719418,53.615539,0,57,55,low_value


In [12]:
# Rows whose segment label is 'medium_value'
medium_value_data = df.loc[df['customer_segment'].eq('medium_value')]
medium_value_data

Unnamed: 0,customer_id,age,gender,annual_income,total_spent,num_purchases,avg_purchase_value,online_activity_score,loyalty_program,days_since_last_purchase,num_site_visits,customer_segment
0,CUST_0001,56,Male,65648,49624.926758,17,467.729972,52.525842,0,235,61,medium_value
1,CUST_0002,69,Male,123537,46357.348299,196,874.799829,11.120875,0,182,48,medium_value
2,CUST_0003,46,Male,145991,27043.860801,175,207.419293,86.816930,0,199,50,medium_value
3,CUST_0004,32,Female,29516,42117.461389,25,131.749484,29.736153,0,356,32,medium_value
5,CUST_0006,25,Male,135005,31216.925831,21,533.919510,67.562131,0,352,58,medium_value
...,...,...,...,...,...,...,...,...,...,...,...,...
994,CUST_0995,63,Female,118540,317.727156,54,417.885634,65.083322,1,83,16,medium_value
995,CUST_0996,60,Female,92982,8430.959128,156,731.840015,61.506463,1,15,43,medium_value
996,CUST_0997,64,Male,122747,35680.997927,73,139.678658,17.567210,0,333,44,medium_value
997,CUST_0998,62,Female,75922,33352.632138,120,201.651024,51.318989,0,58,7,medium_value


In [13]:
# Rows whose segment label is 'high_value'
high_value_data = df.loc[df['customer_segment'].eq('high_value')]
high_value_data

Unnamed: 0,customer_id,age,gender,annual_income,total_spent,num_purchases,avg_purchase_value,online_activity_score,loyalty_program,days_since_last_purchase,num_site_visits,customer_segment
4,CUST_0005,60,Male,132863,26095.803066,94,289.081236,16.253799,1,245,29,high_value
23,CUST_0024,19,Female,87675,25832.403442,141,453.226179,86.993783,1,97,43,high_value
31,CUST_0032,44,Female,71756,40890.740454,54,730.588569,31.501037,1,280,87,high_value
34,CUST_0035,33,Female,131785,43416.028836,109,377.607998,35.979409,1,189,65,high_value
35,CUST_0036,32,Female,147331,40315.677466,87,372.634524,30.697228,1,151,21,high_value
...,...,...,...,...,...,...,...,...,...,...,...,...
952,CUST_0953,43,Female,125730,41144.486240,157,79.265239,57.721607,1,328,98,high_value
982,CUST_0983,40,Female,118936,39467.730342,62,110.805611,11.115088,1,250,42,high_value
984,CUST_0985,41,Male,145839,21747.489237,171,941.343019,72.687691,1,111,58,high_value
992,CUST_0993,64,Female,119983,31672.322750,144,399.963233,50.275019,1,124,72,high_value


In [14]:
# Preview the first rows of the dataset.
# NOTE(review): the original comment said this confirms that the gender column
# values were modified, but no earlier cell in view changes `gender` and the
# recorded output still shows Male/Female — TODO confirm whether an encoding
# step was removed or is still missing.
df.head()

Unnamed: 0,customer_id,age,gender,annual_income,total_spent,num_purchases,avg_purchase_value,online_activity_score,loyalty_program,days_since_last_purchase,num_site_visits,customer_segment
0,CUST_0001,56,Male,65648,49624.926758,17,467.729972,52.525842,0,235,61,medium_value
1,CUST_0002,69,Male,123537,46357.348299,196,874.799829,11.120875,0,182,48,medium_value
2,CUST_0003,46,Male,145991,27043.860801,175,207.419293,86.81693,0,199,50,medium_value
3,CUST_0004,32,Female,29516,42117.461389,25,131.749484,29.736153,0,356,32,medium_value
4,CUST_0005,60,Male,132863,26095.803066,94,289.081236,16.253799,1,245,29,high_value


## Entrenamiento del modelo
---

In [15]:
# Integer class ids for the three segments (the training cell sets num_class=3)
label_mapping = {'low_value': 0, 'medium_value': 1, 'high_value': 2}

# Encode the target column only while it still holds the string labels.
# Mapping an already-encoded column would turn every value into NaN, so this
# guard keeps the cell safe to re-run without reloading the dataset.
if not pd.api.types.is_numeric_dtype(df['customer_segment']):
    encoded_segment = df['customer_segment'].map(label_mapping)
    unmapped_labels = df.loc[encoded_segment.isna(), 'customer_segment'].unique()
    if len(unmapped_labels) > 0:
        # Fail loudly instead of silently training on NaN labels
        raise ValueError(f"Unmapped customer_segment labels: {list(unmapped_labels)}")
    df['customer_segment'] = encoded_segment.astype('int64')

# Split features (X) from the target (y); the identifier, age and gender
# columns are excluded from the feature set
X = df.drop(columns=['customer_segment', 'customer_id', 'age', 'gender'])
y = df['customer_segment']

# 80% train / 20% test. The split is stratified because the segments are very
# imbalanced (the rarest one has only a handful of rows), so an unstratified
# split could leave a class out of one of the two partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [16]:
import numpy as np
from sklearn.datasets import dump_svmlight_file

# Write each split to disk in LIBSVM format: (features, labels, output file)
libsvm_exports = (
    (X_train, y_train, "train.libsvm"),
    (X_test, y_test, "test.libsvm"),
)
for split_features, split_labels, split_path in libsvm_exports:
    dump_svmlight_file(split_features, split_labels, split_path)

print("Archivos train.libsvm y test.libsvm creados.")


Archivos train.libsvm y test.libsvm creados.


In [17]:
# Destination bucket and key prefix for the training artefacts
bucket = "api-bucket-cp"
s3_prefix = 'customer-segmentation-xgboost'
session = sagemaker.Session()

# Both files go to the same bucket/prefix; upload_data returns the S3 URI
upload_target = {"bucket": bucket, "key_prefix": s3_prefix}
train_data_s3 = session.upload_data("train.libsvm", **upload_target)
test_data_s3 = session.upload_data("test.libsvm", **upload_target)

print(f"Datos cargados a S3:\nTrain: {train_data_s3}\nTest: {test_data_s3}")

Datos cargados a S3:
Train: s3://api-bucket-cp/customer-segmentation-xgboost/train.libsvm
Test: s3://api-bucket-cp/customer-segmentation-xgboost/test.libsvm


In [18]:
# Resolve the built-in XGBoost container image for the region of the session
# already in use (no need to create a second Session just to read the region).
xgboost_container = sagemaker.image_uris.retrieve(
    "xgboost",  # framework
    session.boto_region_name,  # region
    version="1.5-1",  # container version
)

# Training job definition. The execution role resolved in the data-loading
# cell is reused instead of being looked up again.
xgb = Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    max_run=300,
    output_path=f's3://{bucket}/{s3_prefix}/',
    sagemaker_session=session
)

# Hyperparameters for multiclass classification
xgb.set_hyperparameters(
    objective="multi:softmax",
    num_class=3,  # three classes: low, medium, high
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    num_round=100
)

# Declare the channel format explicitly rather than relying on the
# algorithm's default content type for bare S3 URIs.
# NOTE(review): the held-out test split doubles as the validation channel, so
# the validation metrics are not an independent estimate — confirm this is intended.
train_input = TrainingInput(train_data_s3, content_type="text/libsvm")
validation_input = TrainingInput(test_data_s3, content_type="text/libsvm")

# Launch the training job
xgb.fit({"train": train_input, "validation": validation_input})


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-11-19-23-25-26-270


2024-11-19 23:25:28 Starting - Starting the training job...
2024-11-19 23:25:43 Starting - Preparing the instances for training...
2024-11-19 23:26:08 Downloading - Downloading input data...
2024-11-19 23:26:39 Downloading - Downloading the training image......
2024-11-19 23:27:55 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2024-11-19 23:27:50.317 ip-10-2-199-24.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-11-19 23:27:50.340 ip-10-2-199-24.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-11-19:23:27:50:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-11-19:23:27:50:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2024-11-19:23:27:50:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-11-19:23:27:50:INFO] Running

In [19]:
# Host the trained model behind a real-time endpoint on a single instance.
# NOTE(review): no cell in view deletes this endpoint — confirm it is cleaned
# up elsewhere once it is no longer needed.
endpoint_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.t2.medium",
}
xgb_predictor = xgb.deploy(**endpoint_settings)

print(f"Endpoint name: {xgb_predictor.endpoint_name}")


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-11-19-23-29-19-731
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-11-19-23-29-19-731
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-11-19-23-29-19-731


-------------!Endpoint name: sagemaker-xgboost-2024-11-19-23-29-19-731
