In [1]:
# Data wrangling:
import pandas as pd

# Pipelines:
from sklearn.pipeline import (
    Pipeline,
    make_pipeline
)
from sklearn.compose import (
    make_column_selector as selector,
    ColumnTransformer,
)

# Seleção e validação dos modelos:
from sklearn import metrics
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV
)

# Pre-processing:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder,
)

# Modelos:
from sklearn.tree import DecisionTreeRegressor

# Deployment:
import pickle
from pathlib import Path

# Config:
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames from
# transform / fit_transform instead of NumPy arrays.
set_config(transform_output='pandas')

## DataFrame:

In [2]:
# Raw workbook, given as a path relative to the current working directory.
path = r'../data/Dados Solução Casting.xlsx'
df = pd.read_excel(path)

In [3]:
# Preview the first three rows of the loaded frame.
display(df.head(n=3))

Unnamed: 0,VUF_CODIGO,VUF_CODIGO_BOLETO,UND_CODIGO,FUN_CODIGO,VUF_DT,PRO_CODIGO,CAT_CODIGO,VUF_QTBOLETO,VUF_QTPRODUTO,VUF_VLRBRUTOVENDA,VUF_VLRDESCONTO,VUF_VLRLIQFINAL,VUF_VLRTROCA,CLV_BANCO
0,B28DDB4B-8A49-4658-8539-5DA3F20AA39F,1000,D4758B39-E7EA-4D97-B6B1-6236CE2C05A2,BBEF56E1-66E0-47A6-95AC-5479DE79577B,2022-06-18,6768760F-758A-4B31-AE33-CEB127A956CB,7B6AD9C1-F745-4137-B62D-4A67E865E781,1,1.0,15.0,0.0,15.0,0.0,CASTING_DB108
1,8E22E8FC-65DC-44BA-AFFA-4D661108F0B4,1001,D4758B39-E7EA-4D97-B6B1-6236CE2C05A2,AF858E1C-9C72-4B63-AC51-A8185ECCCC35,2022-06-18,0136ECF4-9F4D-4DBE-A78D-B2ED565CE60C,7B6AD9C1-F745-4137-B62D-4A67E865E781,1,1.0,32.0,0.0,32.0,0.0,CASTING_DB108
2,E36DEDC5-78CB-4452-9E04-AD174E32747C,1002,D4758B39-E7EA-4D97-B6B1-6236CE2C05A2,BBEF56E1-66E0-47A6-95AC-5479DE79577B,2022-06-20,B0974005-D231-438C-A0C1-F6A211B00697,7B6AD9C1-F745-4137-B62D-4A67E865E781,1,1.0,15.0,0.0,15.0,0.0,CASTING_DB108


## Código:

In [4]:
class Casting:
    """Empty placeholder class; no attributes or methods are defined yet."""


if __name__ == '__main__':
    # Instantiate the placeholder when this code runs as the main module.
    casting = Casting()

## Model:

In [5]:
# List every column name of the loaded frame.
df.columns.tolist()

['VUF_CODIGO',
 'VUF_CODIGO_BOLETO',
 'UND_CODIGO',
 'FUN_CODIGO',
 'VUF_DT',
 'PRO_CODIGO',
 'CAT_CODIGO',
 'VUF_QTBOLETO',
 'VUF_QTPRODUTO',
 'VUF_VLRBRUTOVENDA',
 'VUF_VLRDESCONTO',
 'VUF_VLRLIQFINAL',
 'VUF_VLRTROCA',
 'CLV_BANCO']

### Variáveis:

In [6]:
# Columns sent through the numeric branch of the pre-processing.
NUMERICAL_FEATURES = [
    'VUF_QTBOLETO',
    'VUF_QTPRODUTO',
    'VUF_VLRBRUTOVENDA',
    'VUF_VLRDESCONTO',
    'VUF_VLRTROCA',
]

# Columns sent through the categorical branch. The remaining columns
# (VUF_CODIGO, VUF_CODIGO_BOLETO, UND_CODIGO, FUN_CODIGO, VUF_DT and
# PRO_CODIGO) are currently not used as model inputs.
CATEGORICAL_FEATURES = ['CAT_CODIGO', 'CLV_BANCO']

# Model inputs: numeric columns first, then the categorical ones.
FEATURES = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]

# Column the regressor is trained to predict.
TARGET = 'VUF_VLRLIQFINAL'

### Seed:

In [7]:
# Seed handed to train_test_split as random_state so the split is repeatable.
seed = 69

### Separating x and y:

In [8]:
# Split the frame into the model inputs and the column to be predicted.
x, y = df[FEATURES], df[TARGET]

### Train-test Split:

In [9]:
# Hold out part of the rows for evaluation; the fixed seed keeps the
# partition identical between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=seed)

### Tratando Nulos:

In [10]:
# Numeric columns: fill missing values with the column median, then
# standardise each column.
numeric_processor = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Dense output is requested because the notebook
# configures scikit-learn to return pandas DataFrames.
categorical_processor = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(
        drop='if_binary',
        handle_unknown='ignore',
        sparse_output=False,
    ),
)

### Pipeline de pré-processamento:

In [11]:
# Route each group of columns to its own processor. Every entry is
# (step name, transformer, columns it receives).
pre_processing = ColumnTransformer(
    transformers=[
        ('one', categorical_processor, CATEGORICAL_FEATURES),
        ('Scaler', numeric_processor, NUMERICAL_FEATURES),
    ],
)

### Pipeline:

In [12]:
# Full modelling pipeline: pre-processing followed by a shallow decision tree.
# random_state is set on the tree because scikit-learn randomly permutes the
# features at each split, so splits with identical improvement could be
# chosen differently from one run to the next without a fixed seed.
pipeline = Pipeline(steps=[
    ('pre_processing', pre_processing),
    ('model', DecisionTreeRegressor(max_depth=3, random_state=seed)),
])

In [13]:
# Fit the pre-processing steps and the regressor on the training split.
pipeline.fit(x_train, y_train)

### Deployment:

In [14]:
path = r'../models/model.pkl'

# Create the destination folder on demand so that saving does not raise
# FileNotFoundError when ../models does not exist yet.
model_path = Path(path)
model_path.parent.mkdir(parents=True, exist_ok=True)

# Serialise the whole pipeline (pre-processing + model) into a single file.
with model_path.open('wb') as file:
    pickle.dump(pipeline, file)

O código acima não está pronto; o que você vê é apenas um esqueleto. Antes de criar o modelo, é necessário ter um bom entendimento do negócio e dos dados e fazer a preparação dos dados. Além disso, é necessário testar vários modelos para ver qual desempenha melhor no cenário que temos e, depois disso, otimizar o modelo escolhido.

Fique à vontade para fazer um `pull request` para adicionar melhorias ao projeto.

:)