### Fase 3 - Punto 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [3]:
# Load the CSV file into a DataFrame and display it
csv_path = 'dataset.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,embalaje,largo,ancho,alto,peso,procedencia,temperatura,manipulacion,protocolo
0,A,34.687708,51.459835,42.095164,23.857571,D,refrigerado,fragil,protocolo_1
1,A,49.464008,50.378068,38.858520,32.119336,A,ambiente,normal,protocolo_2
2,B,66.617850,52.215517,27.181471,23.359145,C,ambiente,fragil,protocolo_1
3,C,24.215506,41.945443,51.965421,20.012881,C,ambiente,normal,protocolo_2
4,C,56.804813,60.880069,56.895350,23.560809,C,ambiente,normal,protocolo_2
...,...,...,...,...,...,...,...,...,...
9995,A,60.906016,52.911421,24.009152,20.668354,C,ambiente,normal,protocolo_2
9996,C,50.839682,53.362869,49.981335,19.933259,C,ambiente,normal,protocolo_2
9997,B,64.661100,67.924710,37.296223,18.554790,D,ambiente,fragil,protocolo_1
9998,B,42.017064,72.635800,61.344982,31.943001,D,ambiente,normal,protocolo_2


In [4]:
# Integer codes for each categorical column. KMeanProductOrganizer filters rows
# on the 'embalaje' and 'protocolo' codes and uses 'procedencia', 'temperatura'
# and 'manipulacion' as numeric clustering features.
category_codes = {
    'embalaje': {'A': 0, 'B': 1, 'C': 2},
    'procedencia': {'A': 0, 'B': 1, 'C': 2, 'D': 3},
    'temperatura': {'ambiente': 0, 'refrigerado': 1},
    'manipulacion': {'normal': 0, 'fragil': 1},
    'protocolo': {'protocolo_1': 1, 'protocolo_2': 2},
}

# Assign each encoded column back explicitly instead of calling
# `data[col].replace(..., inplace=True)`: that form operates on a temporary
# column selection and does not update `data` under pandas Copy-on-Write.
# Values that are not in the mapping (including already-encoded integers) are
# left untouched, so re-running this cell is harmless.
for column, codes in category_codes.items():
    data[column] = data[column].map(lambda value, codes=codes: codes.get(value, value))
data

Unnamed: 0,embalaje,largo,ancho,alto,peso,procedencia,temperatura,manipulacion,protocolo
0,0,34.687708,51.459835,42.095164,23.857571,3,1,1,1
1,0,49.464008,50.378068,38.858520,32.119336,0,0,0,2
2,1,66.617850,52.215517,27.181471,23.359145,2,0,1,1
3,2,24.215506,41.945443,51.965421,20.012881,2,0,0,2
4,2,56.804813,60.880069,56.895350,23.560809,2,0,0,2
...,...,...,...,...,...,...,...,...,...
9995,0,60.906016,52.911421,24.009152,20.668354,2,0,0,2
9996,2,50.839682,53.362869,49.981335,19.933259,2,0,0,2
9997,1,64.661100,67.924710,37.296223,18.554790,3,0,1,1
9998,1,42.017064,72.635800,61.344982,31.943001,3,0,0,2


In [5]:
class KMeanProductOrganizer:
    """Train one K-Means model per (area, protocol) subset and assign products to clusters.

    The input dataset must already have its categorical columns encoded as
    numbers: 'embalaje' as 0/1/2 (reported as areas 'A'/'B'/'C') and
    'protocolo' as 1/2. Each subset is standardized with its own
    ``StandardScaler`` before clustering, and that fitted scaler is kept next
    to the model so new samples can be projected into the same feature space
    at prediction time.

    Attributes:
        clustering_datasets: dict mapping "<area letter>_<protocol>" to the
            rows of the original dataset belonging to that subset.
        clustering_models_trained: dict mapping the same keys to
            ``{'model': KMeans, 'scaler': StandardScaler, 'silhouette_score': float}``.
    """

    def __init__(self, original_dataset, n_clusters=4, n_init=25, random_state=123):
        """Split ``original_dataset`` by area/protocol and train a model on each subset.

        Args:
            original_dataset: DataFrame holding the 'embalaje' and 'protocolo'
                columns plus every column listed in ``self.features``.
            n_clusters: number of clusters for every K-Means model.
            n_init: number of K-Means initializations per model.
            random_state: seed that makes the clustering reproducible.
        """
        self.area_num_to_letter = {0: 'A', 1: 'B', 2: 'C'}
        self.features = ['largo', 'ancho', 'alto', 'peso', 'procedencia', 'temperatura', 'manipulacion']
        self.area_list = range(0, 3)
        self.protocol_list = range(1, 3)
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.random_state = random_state

        self.clustering_datasets = self._get_dataset_for_clustering(original_dataset)
        self.clustering_models_trained = self._get_trained_models()

    def _get_dataset_for_clustering(self, data):
        """Split the dataset into one sub-DataFrame per area+protocol pair.

        Returns:
            dict keyed by "<area letter>_<protocol>" (e.g. "A_1"); a pair with
            no matching rows maps to an empty DataFrame.
        """
        clustering_datasets = {}
        for area in self.area_list:
            for protocol in self.protocol_list:
                key = f"{self.area_num_to_letter[area]}_{protocol}"
                in_subset = (data['embalaje'] == area) & (data['protocolo'] == protocol)
                clustering_datasets[key] = data[in_subset]
        return clustering_datasets

    def _get_trained_models(self):
        """Train a model on the feature columns of every area+protocol subset."""
        return {
            key: self._kmeans_model_train(dataset[self.features])
            for key, dataset in self.clustering_datasets.items()
        }

    def _kmeans_model_train(self, dataset):
        """Standardize ``dataset``, fit K-Means on it and compute the silhouette score.

        Returns:
            dict with the fitted 'model', the fitted 'scaler' used to
            standardize the training data, and the 'silhouette_score' of the
            resulting labels.
        """
        scaler = StandardScaler()
        # Fit on a plain float array so that later transforming a NumPy sample
        # does not trigger scikit-learn's feature-name mismatch warning.
        data_scaled = scaler.fit_transform(np.asarray(dataset, dtype=float))
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=self.n_init, random_state=self.random_state)
        kmeans.fit(data_scaled)
        return {
            'model': kmeans,
            # Kept so predictions can apply the exact same standardization.
            'scaler': scaler,
            'silhouette_score': silhouette_score(data_scaled, kmeans.labels_),
        }

    def predict_product_sample_cluster(self, product_data, product_area, product_protocol):
        """Predict the cluster of a single product.

        Args:
            product_data: 1-D array-like with the feature values in the order
                of ``self.features``.
            product_area: area letter used in the model key ('A', 'B' or 'C').
            product_protocol: protocol number used in the model key (1 or 2).

        Returns:
            The cluster label predicted by the matching area+protocol model,
            or None when no model exists for that combination.
        """
        key = f"{product_area}_{product_protocol}"
        trained = self.clustering_models_trained.get(key)
        if trained is None:
            return None

        sample = np.asarray(product_data, dtype=float).reshape(1, -1)
        # The model was fitted on standardized features, so the raw sample has
        # to go through the same scaler before it is compared to the centroids.
        sample_scaled = trained['scaler'].transform(sample)
        return trained['model'].predict(sample_scaled)[0]

In [6]:
# Split the encoded dataset by area/protocol and train one K-Means model per subset
kmean_obj = KMeanProductOrganizer(original_dataset=data)



#### Predicción del clúster de un producto

In [7]:
# Feature values in the order: largo, ancho, alto, peso, procedencia, temperatura, manipulacion
sample = np.array([34.687708, 51.459835, 42.095164, 23.857571, 3, 1, 1])
product_cluster = kmean_obj.predict_product_sample_cluster(
    product_data=sample,
    product_area='A',
    product_protocol=1,
)
print(f"Product Cluster: {product_cluster}")

Product Cluster: 0
