
<div class="jumbotron">
  <h1><i class="fa fa-bar-chart" aria-hidden="true"></i> Modelado de datos</h1>
  <p></p>
</div>

In [1]:
# ! pip install --upgrade setuptools pip

In [26]:
import os
import pickle
from datetime import datetime
from warnings import filterwarnings

import mlflow
import mlflow.sklearn
from pandas import DataFrame
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

filterwarnings('ignore')

In [3]:
from os import chdir

# Move up to the main project directory (the parent of the current working
# directory). The flag makes the cell idempotent: re-running it in the same
# kernel session must not climb one more directory each time.
if not globals().get("_MOVED_TO_PROJECT_ROOT", False):
    chdir("..")
    _MOVED_TO_PROJECT_ROOT = True

In [4]:
def get_timestamp(now=None):
    """Return a minute-resolution timestamp such as '2023-09-11-09-54'.

    Args:
        now: Optional ``datetime`` to format. When omitted, ``datetime.now()``
            is used, so existing zero-argument calls behave as before.

    Returns:
        str: year, month, day, hour and minute joined by dashes, each
        zero-padded to at least two digits.
    """
    if now is None:
        now = datetime.now()
    # A single format call replaces the five intermediate strings.
    return '{:02d}-{:02d}-{:02d}-{:02d}-{:02d}'.format(
        now.year, now.month, now.day, now.hour, now.minute)
    

In [5]:
## Tenga cuidado con sklearn y pandas en el entorno de producción.

In [6]:
# Tracking server URI: taken from the MLFLOW_TRACKING_URI environment variable
# when it is set, otherwise the local server this notebook was written against.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:8002"))

# Cargar datos

In [7]:
# Load the transformed dataset: deserialize the pickled object into `df`.
# The path is relative to the current working directory.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('data/interim/transform.pickle','rb') as handle:
    df = pickle.load(handle)

In [8]:
# Preview the first rows of the loaded frame as a sanity check.
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,class
0,5.1,3.5,1.4,0.2,Setosa,0
1,4.9,3.0,1.4,0.2,Setosa,0
2,4.7,3.2,1.3,0.2,Setosa,0
3,4.6,3.1,1.5,0.2,Setosa,0
4,5.0,3.6,1.4,0.2,Setosa,0


In [9]:
# Column names grouped by dtype: int64/float64 columns vs. pandas 'category' columns.
# NOTE(review): if the target column is stored as int64 it is selected here too —
# confirm before using `variables_numericas` as the feature set.
variables_numericas = df.select_dtypes(include=['int64', 'float64']).columns
variables_categoricas = df.select_dtypes(include=['category']).columns


## Label

In [10]:
# Name of the target column; used below for the per-class summary and as the model target.
label = 'class'

# Resumen de datos

In [11]:
# Number of rows per target class. `size()` returns a Series, so
# `reset_index(name=...)` names the count column directly instead of renaming
# the default `0` column afterwards. Parentheses replace the backslash
# continuations, which previously ran into a commented-out line.
df_summary = (
    df.groupby([label])
    .size()
    .reset_index(name='count_')
)

In [12]:
# Show the per-class row counts.
df_summary.head()

Unnamed: 0,class,count_
0,0,50
1,1,50
2,2,49


### Eliminar variables superfluas

In [13]:
# NA

# Modeling

Podemos ver los resultados de MLflow ejecutando en la terminal:



 _mlflow ui -p 1234_
 

## Hiperparámetros

## Definir el conjunto de datos de entrenamiento

In [14]:
# Target vector: the column named by `label`.
Y = df[label]

In [15]:
# Feature matrix: the numeric columns as a NumPy array. The target column is
# dropped explicitly so it can never leak into the features if it happens to
# have an int64/float64 dtype; errors='ignore' makes this a no-op otherwise.
X = df[variables_numericas.drop(label, errors='ignore')].values

In [16]:
# Parámetros para MLFlow

In [64]:
# Columns of the search's `cv_results_` table that are displayed and logged to
# MLflow for the top-ranked candidates. The split0..split4 entries assume the
# 5-fold cross-validation (cv=5) configured in the search helpers.
columns = [
'param_n_neighbors',
'param_p',
'param_weights',
'params',
'mean_fit_time',
'split0_test_f1',
'split1_test_f1',
'split2_test_f1',
'split3_test_f1',
'split4_test_f1',
'mean_test_precision',
'mean_test_recall',
'mean_test_accuracy',
'mean_test_f1',
'std_test_f1']

# Evaluation metrics for the hyper-parameter searches, given as a dictionary.
# The keys are the names that appear in the `cv_results_` columns
# (e.g. 'mean_test_f1') and the one passed as `refit`.
# NOTE(review): with average='micro' the logged precision, recall and F1 come out
# identical to accuracy in this notebook's results — consider average='macro'
# if distinct metrics are wanted.
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score,average='micro'),
    'recall': make_scorer(recall_score,average='micro'),
    'f1': make_scorer(f1_score,average='micro')
}

# Crear funciones para Grid Search y Randomized Search
def grid_search_knn(X_train, y_train):
    """Run an exhaustive hyper-parameter search for a kNN classifier.

    Every combination of `n_neighbors`, `weights` and `p` listed below is
    evaluated with 5-fold cross-validation using the module-level `scoring`
    dictionary; the best candidate by 'f1' is refit.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search_space = dict(
        n_neighbors=[3, 5, 7, 9],
        weights=['uniform', 'distance'],
        p=[1, 2],
    )
    tuner = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=search_space,
        scoring=scoring,
        refit='f1',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    tuner.fit(X_train, y_train)
    return tuner

def randomized_search_knn(X_train, y_train, random_state=42):
    """Run a randomized hyper-parameter search for a kNN classifier.

    Ten candidates are sampled from the combinations of `n_neighbors`,
    `weights` and `p` listed below and evaluated with 5-fold cross-validation
    using the module-level `scoring` dictionary; the best candidate by 'f1'
    is refit.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed for the candidate sampling. It defaults to a fixed
            value so that re-running the notebook evaluates the same
            candidates; pass ``None`` for a different sample on every call.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    }
    knn = KNeighborsClassifier()
    randomized_search = RandomizedSearchCV(
        estimator=knn,
        param_distributions=param_grid,
        n_iter=10,
        cv=5,
        n_jobs=-1,
        scoring=scoring,
        verbose=1,
        refit='f1',
        random_state=random_state,  # without a seed the sampled candidates change between runs
    )
    randomized_search.fit(X_train, y_train)
    return randomized_search

In [75]:
# Compact timestamp without dashes. `str.replace` returns a new string, so the
# result has to be assigned; previously it was only displayed and
# `name_experiment` kept its dashes.
name_experiment = get_timestamp().replace('-', '')
name_experiment

'202309110954'

In [78]:
# Split the data into training and test sets (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Search strategy for this execution: "Grid Search" or "Randomized Search".
# Any other value falls back to the randomized search.
run_name = "Randomized Search"

# Experiment name: <project>_<algorithm>_<strategy>_<timestamp without dashes>
name_project = 'reto'
name_algo = 'kNN'
name_experiment = name_project + '_' + name_algo + '_' + run_name + '_' + get_timestamp().replace('-', '')
mlflow.set_experiment(name_experiment)

# Both strategies share the same logging steps; only these three values differ.
if run_name == "Grid Search":
    parent_run_name, search_fn, model_artifact = "Grid Search", grid_search_knn, "knn_model_grid"
else:
    parent_run_name, search_fn, model_artifact = "Randomized Search", randomized_search_knn, "knn_model_random"

results = None

# The context manager ends the parent run on exit, so no explicit
# mlflow.end_run() is needed afterwards.
with mlflow.start_run(run_name=parent_run_name):
    search = search_fn(X_train, y_train)

    # Best candidate: parameters, refit model and its mean cross-validated F1
    mlflow.log_params(search.best_params_)
    mlflow.sklearn.log_model(search.best_estimator_, model_artifact)
    mlflow.log_metric("f1", search.best_score_)

    # Full cross-validation table, best-ranked candidates first
    results = DataFrame(search.cv_results_).sort_values(by='rank_test_f1')
    top_results = results[columns].head()

    # One nested run per top-ranked candidate. The nested run name is built
    # inline so that `run_name` keeps holding the selected strategy.
    for index, row in top_results.iterrows():
        with mlflow.start_run(run_name="ranking " + str(index), nested=True):
            for column in columns:
                mlflow.log_param(column, row[column])

    display(search.best_score_)
    display(search.best_params_)
    display(top_results)

# Keep the per-strategy variable names this cell has always defined.
if run_name == "Grid Search":
    best_params_grid = search
else:
    best_params_random = search


2023/09/11 10:08:23 INFO mlflow.tracking.fluent: Experiment with name 'reto_kNN_Randomized Search_202309111008' does not exist. Creating a new experiment.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


0.9614285714285714

{'weights': 'uniform', 'p': 2, 'n_neighbors': 9}

Unnamed: 0,param_n_neighbors,param_p,param_weights,params,mean_fit_time,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_precision,mean_test_recall,mean_test_accuracy,mean_test_f1,std_test_f1
3,9,2,uniform,"{'weights': 'uniform', 'p': 2, 'n_neighbors': 9}",0.00062,1.0,0.952381,0.904762,1.0,0.95,0.961429,0.961429,0.961429,0.961429,0.035775
5,9,1,uniform,"{'weights': 'uniform', 'p': 1, 'n_neighbors': 9}",0.000707,0.952381,0.904762,0.904762,1.0,0.95,0.942381,0.942381,0.942381,0.942381,0.03552
7,9,2,distance,"{'weights': 'distance', 'p': 2, 'n_neighbors': 9}",0.000519,1.0,0.857143,0.904762,1.0,0.95,0.942381,0.942381,0.942381,0.942381,0.055459
2,3,2,distance,"{'weights': 'distance', 'p': 2, 'n_neighbors': 3}",0.000627,1.0,0.857143,0.904762,1.0,0.9,0.932381,0.932381,0.932381,0.932381,0.057649
0,7,1,distance,"{'weights': 'distance', 'p': 1, 'n_neighbors': 7}",0.000652,0.952381,0.809524,0.904762,1.0,0.95,0.923333,0.923333,0.923333,0.923333,0.06439
