https://github.com/ricardovroma/eng_ml

# Config

In [36]:
import os

import joblib
import numpy as np
import pandas as pd
import pycaret
import requests
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from pycaret import classification
from pycaret.classification import add_metric
from sklearn import model_selection, metrics
from sklearn.metrics import log_loss

os.environ['MLFLOW_TRACKING_URI'] = 'sqlite:///mlruns.db'  # Criando uma variável de ambiente
import mlflow

In [25]:
# --- Reproducibility ---------------------------------------------------------
# Single seed for the notebook; NumPy's global generator is seeded with it.
SEED = 10
np.random.seed(SEED)

# --- Model names -------------------------------------------------------------
regression_model_name = 'kobe-regression'
classification_model_name = 'kobe-classification'

# --- Training / registration knobs -------------------------------------------
samples_qt = 5        # number of rows used to build the MLflow input example
model_version = -1    # -1 means "look up the latest registered version"
min_precision = 0.5   # a trained model is saved/registered only above this precision

# Definindo Features e colunas

In [26]:
# Fetch the MLflow experiment by name, creating it on first use, and keep its
# id around for the `mlflow.start_run(...)` calls further down.
experiment_name = 'Kobe Classifier'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # create_experiment returns only the new id, so load the full object back.
    experiment = mlflow.get_experiment(mlflow.create_experiment(experiment_name))
experiment_id = experiment.experiment_id

# Fraction of the filtered data held out as the test set.
percentual_test = 0.2

# Column the classifier has to predict.
target_col = 'shot_made_flag'

# Columns read from the raw CSV, mapped to the converter applied to each one.
# `shot_made_flag` is read as text so that missing labels show up as empty
# strings and can be filtered out; `shot_type` is only used for filtering.
selected_features = dict(
    lat=float,
    lon=float,
    minutes_remaining=int,
    period=int,
    playoffs=int,
    shot_distance=int,
    shot_made_flag=str,
    shot_type=str,
)

## Carregando a base, limpando os nulos do campo shot_made_flag, filtrando shot_type por 2PT Field Goal

In [27]:
# Data-preparation run: load the raw shots, drop rows without a label, separate
# 2PT and 3PT shots, split the 2PT data into train/test, persist every frame as
# parquet and log sizes and artifacts to MLflow.

# `shot_type` is needed only to filter rows. Build the read schema locally so
# the cell can be re-executed: the shared dict loses the key below, and reading
# from it directly would then fail on a second run.
read_schema = {**selected_features, 'shot_type': str}

filtered_path = '../Data/processed/data_filtered.parquet'
train_path = '../Data/operalization/base_train.parquet'
test_path = '../Data/operalization/base_test.parquet'
three_pts_path = '../Data/operalization/base_3pts.parquet'

# Make sure the output folders exist before anything is written.
for out_path in (filtered_path, train_path, test_path, three_pts_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

with mlflow.start_run(experiment_id=experiment_id, run_name='PreparacaoDados', description="PreparacaoDados"):
    df_naive = pd.read_csv('../Data/dataset_kobe.csv', sep=',',
                     usecols=list(read_schema.keys()),
                     converters=read_schema,
                     na_filter=True,
                     )

    # Rows whose label is an empty string have no known outcome.
    labelled = df_naive.loc[df_naive['shot_made_flag'] != ""]

    # Cast the target to int on BOTH subsets so they share the same dtype
    # (previously only the 2PT frame was converted).
    df = (labelled.loc[labelled['shot_type'] == '2PT Field Goal']
          .drop('shot_type', axis=1)
          .astype({target_col: int}))
    df_3pts = (labelled.loc[labelled['shot_type'] == '3PT Field Goal']
               .drop('shot_type', axis=1)
               .astype({target_col: int}))

    # From here on `selected_features` describes only the model columns.
    selected_features.pop('shot_type', None)

    # The target column stays inside df_train/df_test (the whole frame is
    # split), so there is no need to re-attach y_train/y_test afterwards.
    # random_state=SEED keeps the split identical across re-runs of this cell.
    df_train, df_test, y_train, y_test = model_selection.train_test_split(
        df,
        df[target_col],
        test_size=percentual_test,
        random_state=SEED,
        shuffle=True,
        stratify=None)

    df.to_parquet(filtered_path)
    df_train.to_parquet(train_path)
    df_test.to_parquet(test_path)
    df_3pts.to_parquet(three_pts_path)

    mlflow.log_param("selected_features", selected_features)

    mlflow.log_metric("Filtered data", df.shape[0])
    mlflow.log_metric("Train data", df_train.shape[0])
    mlflow.log_metric("Test data", df_test.shape[0])
    mlflow.log_metric("3pts data", df_3pts.shape[0])

    mlflow.log_artifact(filtered_path)
    mlflow.log_artifact(train_path)
    mlflow.log_artifact(test_path)
    mlflow.log_artifact(three_pts_path)

# The `with` block has already closed the run; kept as a harmless safeguard.
mlflow.end_run()

print('==================== Data ======================================')
print(f'Filtered data: {df.shape}')
print(f'Train data: {df_train.shape}')
print(f'test data: {df_test.shape}')
print(f'3pts data: {df_3pts.shape}')
print(f'Columns: {df_train.columns}')

Filtered data: (20285, 7)
Train data: (16228, 7)
test data: (4057, 7)
3pts data: (5412, 7)
Columns: Index(['lat', 'lon', 'minutes_remaining', 'period', 'playoffs',
       'shot_distance', 'shot_made_flag'],
      dtype='object')


In [28]:
# mlflow ui --backend-store-uri sqlite:///mlruns.db -p 5002

# Treinamento do modelo

In [29]:
# Names of the features whose declared converter is numeric (float or int).
# Iterating over items() avoids rebuilding the key list for every element.
numeric_features = [
    name for name, converter in selected_features.items()
    if converter in (float, int)
]
numeric_features

['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance']

In [45]:
# Training run: let PyCaret compare classifiers on the train/test frames, keep
# the best one and, if its hold-out precision beats `min_precision`, save it,
# register it in MLflow, move that version to "Staging" and log its metrics.
min_precision=0.5
with mlflow.start_run(experiment_id=experiment_id, run_name='Treinamento', description='Treinamento'):
    pycaret.classification.setup(data=df_train,
                 test_data=df_test,
                 target=target_col,
                 silent=True,
                 log_experiment=True,
                 experiment_name=experiment_name,
                 log_plots=True,
                 normalize=True,
                 )

    add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)

    bestmodel = pycaret.classification.compare_models(n_select = 1)
    # pycaret.classification.evaluate_model(bestmodel)
    # pycaret.classification.interpret_model(bestmodel, plot='correlation', feature=target_col)


    # Plot generation is currently disabled: the loop only announces each plot
    # type. The try/except is kept so the calls can be re-enabled safely.
    classification_plots = ['auc', 'pr', 'confusion_matrix',
                            'threshold', 'learning', 'vc', 'feature', 'class_report']
    for plot_type in classification_plots:
        print('=> Aplicando plot ', plot_type)
        try:
            pass
            # artifact = pycaret.classification.plot_model(bestmodel, plot=plot_type, save=True, use_train_data=False)
            # mlflow.log_artifact(artifact)
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
            print('=> Nao possivel plotar: ', plot_type)

    pred_holdout = pycaret.classification.predict_model(bestmodel)
    pr = metrics.precision_score(pred_holdout[target_col], pred_holdout['Label'],)

    if pr > min_precision:

        # Round-trip through disk so the object logged to MLflow is exactly the
        # pipeline PyCaret persisted.
        pycaret.classification.save_model(bestmodel, f'./{classification_model_name}')
        model_pipe = pycaret.classification.load_model(f'./{classification_model_name}')

        model_features = list(df.drop(target_col, axis=1).columns)
        inf_signature = infer_signature(df[model_features], model_pipe.predict(df))
        input_example = {x: df[x].values[:samples_qt] for x in model_features}

        mlflow.sklearn.log_model(
                sk_model=model_pipe,
                artifact_path="sklearn-model",
                registered_model_name=classification_model_name,
                signature=inf_signature,
                input_example=input_example
            )

        client = MlflowClient()
        # Resolve the version to stage into a LOCAL name. Overwriting the
        # `model_version` sentinel made every later execution of this cell
        # re-stage the old version instead of the one just registered.
        if model_version == -1:
            # get_latest_versions yields one entry per stage; rather than rely
            # on the list order, pick the highest version number explicitly.
            staged_version = max(
                client.get_latest_versions(classification_model_name),
                key=lambda registered: int(registered.version),
            ).version
        else:
            staged_version = model_version

        # Promote the chosen version to Staging.
        client.transition_model_version_stage(
            name=classification_model_name,
            version=staged_version,
            stage="Staging"
        )

        # `pr` was already computed above for the acceptance check.
        accuracy_score = metrics.accuracy_score(pred_holdout[target_col], pred_holdout['Label'], )
        # NOTE(review): log loss is computed on hard labels, not on predicted
        # probabilities, so it only reflects the error rate - confirm intent.
        logloss = metrics.log_loss(pred_holdout[target_col], pred_holdout['Label'])
        f1_score = metrics.f1_score(pred_holdout[target_col], pred_holdout['Label'])


        # The registry reports versions as strings; metrics must be numeric.
        mlflow.log_metric("new_version", int(staged_version))
        mlflow.log_metric("precisao", pr)
        mlflow.log_metric("logloss", logloss)
        mlflow.log_metric("f1_score", f1_score)
        mlflow.log_metric("accuracy_score", accuracy_score)

        ############### WIP ####################
        # results = {
        #     'model': bestmodel,
        #     'data': df,
        #     'features': list(selected_features),
        #     'target_col': target_col,
        #     'threshold': 0.5
        # }
        # joblib.dump(results, '../Data/model_kobe.pkl', compress=9)
        ###################################

        print(pycaret.classification.pull())
    else:
        print(f'=> Modelo rejeitado precisão: {pr} (min: {min_precision})')
# Safeguard: closes any run that is still active after the block above.
mlflow.end_run()

Unnamed: 0,Description,Value
0,session_id,578
1,Target,shot_made_flag
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(5412, 7)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,3
8,Ordinal Features,False
9,High Cardinality Features,False


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.6546,0.524,0.0658,0.3036,0.1081,-0.0059,-0.0086


In [46]:
# Adherence check: reload the saved classifier and score it on a PyCaret
# hold-out built from the 2PT data and then from the 3PT data, logging
# precision and log loss for each to MLflow.
with mlflow.start_run(experiment_id=experiment_id, run_name='Testando aderencia', description='Testando aderencia'):

    bestmodel = classification.load_model(f'./{classification_model_name}')

    # Options shared by both PyCaret sessions; only `data` differs.
    _setup_options = dict(
        test_data=None,
        target=target_col,
        silent=True,
        log_experiment=True,
        experiment_name=experiment_name,
        log_plots=True,
        normalize=True,
    )

    # ---- 2PT shots ----------------------------------------------------------
    classification.setup(data=df, **_setup_options)
    add_metric('logloss', 'Log Loss', log_loss, greater_is_better=False)

    pred_holdout = classification.predict_model(bestmodel)
    pr_2pts = metrics.precision_score(pred_holdout[target_col], pred_holdout['Label'])
    logloss_2pts = metrics.log_loss(pred_holdout[target_col], pred_holdout['Label'])

    # ---- 3PT shots ----------------------------------------------------------
    classification.setup(data=df_3pts, **_setup_options)

    pred_holdout = classification.predict_model(bestmodel)
    pr_3pts = metrics.precision_score(pred_holdout[target_col], pred_holdout['Label'])
    logloss_3pts = metrics.log_loss(pred_holdout[target_col], pred_holdout['Label'])

    # Logged in the same order as before: 3PT metrics first, then 2PT.
    for metric_key, metric_value in (
        ("3pts - precisao", pr_3pts),
        ("3pts - logloss", logloss_3pts),
        ("2pts - precisao", pr_2pts),
        ("2pts - logloss", logloss_2pts),
    ):
        mlflow.log_metric(metric_key, metric_value)

mlflow.end_run()

Unnamed: 0,Description,Value
0,session_id,3700
1,Target,shot_made_flag
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(5412, 7)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,3
8,Ordinal Features,False
9,High Cardinality Features,False


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.6656,0.5442,0.0671,0.3269,0.1113,0.0056,0.0083


# Serviço do Modelo

In [None]:
# executar no terminal
# export MLFLOW_TRACKING_URI='sqlite:///mlruns.db'

# mlflow models serve -m "models:/kobe-classification/Staging" --no-conda -p 5001
# ou
# mlflow models serve -m "models:/kobe-regression/Staging" --no-conda -p 5001

# Chamada à API

In [None]:
def call_api(df, host='localhost', port='5001',
             output_path='../Data/operalization/base_3pts.parquet',
             timeout=60):
    """Score ``df`` against the served model and store the predictions.

    The feature columns of ``df`` are posted (pandas "split" JSON) to the
    ``/invocations`` endpoint at ``host:port``. The first column of the
    response is written into ``df`` as ``operation_label`` (``df`` is modified
    in place), the frame is saved to ``output_path`` and the 50 rows predicted
    as 1 with the largest ``shot_distance`` are printed.

    Args:
        df: Frame holding the feature columns plus the module-level
            ``target_col`` column.
        host: Host name of the model server.
        port: Port of the model server.
        output_path: Parquet file that receives the scored frame.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        KeyError: If ``df`` has no ``target_col`` column.
    """
    from io import StringIO

    # Request configuration.
    url = f'http://{host}:{port}/invocations'
    headers = {'Content-Type': 'application/json',}

    # Send features only. A frame scored earlier already carries
    # `operation_label`; drop it too so a second call sends the same columns.
    features = df.drop(target_col, axis=1).drop(columns='operation_label', errors='ignore')
    http_data = features.to_json(orient='split')

    r = requests.post(url=url, headers=headers, data=http_data, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error body.
    r.raise_for_status()

    # Wrap the body in StringIO: passing literal JSON text to read_json is
    # deprecated in recent pandas versions.
    df.loc[:, 'operation_label'] = pd.read_json(StringIO(r.text)).values[:,0]

    df.to_parquet(output_path)

    print(df[df.operation_label == 1].sort_values('shot_distance', ascending=False).head(50))

In [None]:
call_api(df_3pts)
