# CASE: BOMBOM - Advanced Analytics IBM + Vale

## Realizado por: Sabrina Otoni da Silva

## Etapa de treinamento, predição e otimização (redução do custo)

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor

from hyperopt import hp, fmin, tpe

import joblib

import warnings
warnings.simplefilter("ignore", UserWarning)

In [2]:
datapath = Path('../data')

In [3]:
data_model = pd.read_csv(f'{datapath}/02_intermediate/data_hot_processed.csv')
data_predictive = pd.read_csv(f'{datapath}/02_intermediate/data_predictive_hot_encoded.csv')
data_prescriptive = pd.read_csv(f'{datapath}/02_intermediate/data_prescriptive_hot_encoded.csv')

In [4]:
X_train = data_model.drop(['VAR_2_A', 'PESO_BOMBOM'], axis=1)
y_train = data_model['PESO_BOMBOM']

In [5]:
rf_model = RandomForestRegressor(max_depth = 10, n_estimators = 50)
rf_model.fit(X_train.values, y_train.values)
modelpath = Path('../model')
joblib.dump(rf_model, modelpath / 'rf_model.pkl')

['..\\model\\rf_model.pkl']

In [6]:
rf_model = joblib.load(modelpath / 'rf_model.pkl')

In [7]:
data_predictive['PESO_BOMBOM'] = rf_model.predict(data_predictive[['QTD_CHOC', 'VAR_1', 'VAR_2_B', 'VAR_2_C']])

In [8]:
def cost_function(p):
    return 200 / (1 + np.exp(10 * (p - 9.5))) + 200 / (1 + np.exp(-0.8 * (p - 12)))

def optimize_qtd_choc(row):
    space = hp.uniform('QTD_CHOC', data_model['QTD_CHOC'].min(), data_model['QTD_CHOC'].max())

    def objective(qtd_choc):
        predicted_weight = rf_model.predict(np.array([[qtd_choc, row['VAR_1'], row['VAR_2_B'], row['VAR_2_C']]]))[0]
        cost = cost_function(predicted_weight)
        return cost, predicted_weight

    def objective_wrapper(qtd_choc):
        return objective(qtd_choc)[0]

    best = fmin(
        fn=objective_wrapper,
        space=space,
        algo=tpe.suggest,
        max_evals=100,
        rstate=np.random.default_rng(0)
    )
    _, predicted_weight = objective(best['QTD_CHOC'])
    return best['QTD_CHOC'], predicted_weight

In [9]:
results = data_prescriptive.apply(lambda row: optimize_qtd_choc(row) if pd.isnull(row['QTD_CHOC']) else (row['QTD_CHOC'], np.nan), axis=1)
data_prescriptive['QTD_CHOC'] = results.apply(lambda x: x[0])
data_prescriptive['PESO_BOMBOM_PREV'] = results.apply(lambda x: x[1])

100%|██████████| 100/100 [00:00<00:00, 197.36trial/s, best loss: 34.69387669594737]
100%|██████████| 100/100 [00:00<00:00, 192.29trial/s, best loss: 34.69080655301293]
100%|██████████| 100/100 [00:00<00:00, 190.93trial/s, best loss: 34.819270475480415]
100%|██████████| 100/100 [00:00<00:00, 174.98trial/s, best loss: 34.69554243285974]
100%|██████████| 100/100 [00:00<00:00, 217.69trial/s, best loss: 34.691739559393056]
100%|██████████| 100/100 [00:00<00:00, 229.72trial/s, best loss: 48.56348487600369]
100%|██████████| 100/100 [00:00<00:00, 195.17trial/s, best loss: 37.97024415395306]
100%|██████████| 100/100 [00:00<00:00, 222.87trial/s, best loss: 34.69042288485706]
100%|██████████| 100/100 [00:00<00:00, 234.93trial/s, best loss: 34.691044933449206]
100%|██████████| 100/100 [00:00<00:00, 181.96trial/s, best loss: 34.694752500922334]
100%|██████████| 100/100 [00:00<00:00, 220.91trial/s, best loss: 34.69130415864887]
100%|██████████| 100/100 [00:00<00:00, 211.51trial/s, best loss: 40.6216

In [10]:
data_predictive.to_csv(f'{datapath}/03_processed/data_predictive_processed.csv', index=False)
data_prescriptive.to_csv(f'{datapath}/03_processed/data_prescriptive_processed.csv', index=False)

In [11]:
data_predictive.head(10)

Unnamed: 0,QTD_CHOC,VAR_1,PESO_BOMBOM,VAR_2_B,VAR_2_C
0,286.63,2.72,11.167999,0.0,0.0
1,257.27,1.97,8.509156,1.0,0.0
2,353.16,2.64,12.693101,0.0,0.0
3,317.78,2.76,11.817574,0.0,0.0
4,248.1,2.63,10.055342,0.0,1.0
5,301.79,0.93,10.605924,0.0,0.0
6,246.52,2.98,10.638673,1.0,0.0
7,356.48,2.38,12.612862,0.0,0.0
8,349.41,1.59,9.578502,1.0,0.0
9,282.74,2.11,10.826691,0.0,1.0


In [12]:
data_predictive.describe()

Unnamed: 0,QTD_CHOC,VAR_1,PESO_BOMBOM,VAR_2_B,VAR_2_C
count,50.0,50.0,50.0,50.0,50.0
mean,294.6134,2.1912,10.236129,0.44,0.3
std,42.792363,0.632225,1.130256,0.501427,0.46291
min,208.0,0.63,7.932927,0.0,0.0
25%,265.4575,1.9025,9.477331,0.0,0.0
50%,295.12,2.32,10.090145,0.0,0.0
75%,320.1425,2.7275,10.790412,1.0,1.0
max,387.76,2.98,12.807545,1.0,1.0


In [13]:
data_prescriptive.head(10)

Unnamed: 0,QTD_CHOC,VAR_1,VAR_2_B,VAR_2_C,PESO_BOMBOM_PREV
0,329.143682,2.07,0.0,1.0,9.944308
1,215.058905,2.94,1.0,0.0,9.947925
2,303.089887,2.53,1.0,0.0,9.985518
3,317.608186,2.19,1.0,0.0,9.956665
4,329.520638,2.02,0.0,1.0,9.953302
5,435.537231,0.98,1.0,0.0,9.714989
6,290.244992,1.04,0.0,0.0,10.176934
7,287.870792,1.32,0.0,0.0,9.950301
8,386.776642,1.97,1.0,0.0,9.952236
9,137.271563,1.82,0.0,0.0,9.956113


In [14]:
data_prescriptive.describe()

Unnamed: 0,QTD_CHOC,VAR_1,VAR_2_B,VAR_2_C,PESO_BOMBOM_PREV
count,50.0,50.0,50.0,50.0,50.0
mean,287.455464,2.0536,0.34,0.32,9.956043
std,72.588759,0.763041,0.478518,0.471212,0.069714
min,137.271563,0.03,0.0,0.0,9.714989
25%,243.086477,1.59,0.0,0.0,9.946482
50%,283.118989,2.295,0.0,0.0,9.953302
75%,326.362246,2.6425,1.0,1.0,9.967941
max,435.537231,2.97,1.0,1.0,10.176934
