In [0]:
#!pip install python-slugify
#!pip install loguru
#!pip install openpyxl

!pip install --upgrade pip
!pip install python-slugify loguru openpyxl scikit-learn joblib shap imbalanced-learn

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.8 MB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m36.4 MB/s[0m eta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-25.1.1
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[33mDEPRECATION: Using the pkg_resources me

In [0]:
from uuid import uuid4
from abc import ABC, abstractmethod
import json
import io
import os

import pandas as pd
import numpy as np
import pyspark.pandas as ps
import requests
from requests.structures import CaseInsensitiveDict
from slugify import slugify
from functools import partial
from loguru import logger
from pyspark.sql import SparkSession
from pyspark.dbutils import DBUtils
from pyspark.sql.functions import split, trim, col, substring
from pyspark.sql.types import StringType, DateType, FloatType, IntegerType
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import joblib

from pyspark.sql import SparkSession
from pyspark.dbutils import DBUtils


In [0]:
## Permite a conexão com Sharepoint
spark = SparkSession.builder.getOrCreate()
delta = SparkSession.builder.appName("").getOrCreate()
dbutils = DBUtils(spark)

secret_dict = {
'Databricks':'key-trdg-bricks-token',
'Sharepoint':'key-trdg-client-secret',
'FTP':'key-trdg-ftp',
'SAS':'key-trdg-sas',
}

def get_secret(which: str):
    try:
        return dbutils.secrets.get(scope="keyvault", key=secret_dict[which])
    except:
        print('The secrets available are: Databricks[Token], Sharepoint[Token], FTP[Password] and SAS[User and Password].')


URL_TOKEN = 'https://login.microsoftonline.com/24090322-b104-494d-a1d3-662da14cddd4/oauth2/v2.0/token'
URL_MSG = 'https://graph.microsoft.com/v1.0/sites/'

BLOB_PATH = 'abfss://sandbox@adltrdgwestus.dfs.core.windows.net/'

#spark.conf.set(
#    "fs.azure.account.key.adltrdgwestus.dfs.core.windows.net",
#    dbutils.secrets.get(scope="scope-keyvault-prd", key="secret-databricks-trdg-prd"))


class Sharepoint(ABC):
    """Class that implements the logic to extract and load
    datasets from the Sharepoint to the Databricks.
    """
    def __init__(self, path_file: str, name_site: str = 'DadosBI', host: str = 'achelaboratorios.sharepoint.com'):
        self.path_file = path_file 
        self.name_site = name_site 
        self.host = host

        self.headers = self.get_bearer_token()
        self.data = self.download_file()

    
    def get_bearer_token(self) -> dict:
        body = {'grant_type': 'client_credentials', 
                'client_id': '01c78346-6928-48c4-8cd6-d0ef71ec7021', 
                'client_secret': get_secret(which='Sharepoint'),
                'scope': 'https://graph.microsoft.com/.default'
                }
        
        headers = CaseInsensitiveDict()
        headers["Accept"] = "application/json"
        headers["Authorization"] = f"Bearer {json.loads(requests.post(URL_TOKEN, data=body).text)['access_token']}"

        return headers


    def get_ids(self, is_file=True) -> str:
        id_site = (requests
                   .get(f'{URL_MSG}/{self.host}:/sites/'+self.name_site+'?$select=id', 
                        headers=self.headers)
                   .json()['id']
                   .split(',')
                   [1]
                   )
        id_drives = (requests
                    .get(URL_MSG+id_site+f"/drive", 
                                 headers=self.headers)
                    .json()
                    ['id']
                    )
        if is_file:
            url_content = URL_MSG+id_site+'/drives/'+id_drives+f'/root:/{self.path_file}:/content'
        else: 
            url_content = URL_MSG+id_site+'/drives/'+id_drives+f'/root:/{self.path_file}:/children'
        return url_content
    

    def download_file(self) -> dict:
        if '.' in self.path_file:
            url_content = self.get_ids()
        else:
            url_content = self.get_ids(is_file=False)
        return (requests.get(url_content, headers=self.headers)).content
    
    def ls(self) -> list:
        data_dict = json.loads(self.data)
        list_files = []
        for files in data_dict['value']:
            list_files.append(files['name'])
        return list_files
    
    def read_file(self, enconding:str = 'utf-8', **params) -> pd.DataFrame:
        type_file = self.path_file.split('.')[1]
        if type_file in ['txt', 'csv']:
            self.read_data = pd.read_csv(io.StringIO(self.data.decode(enconding)), **params)
            return self
        elif type_file in ['xls', 'xlsx']:
            self.read_data = pd.read_excel(self.data, **params)
            return self
        else:
            print('Este tipo de arquivo não está implementado!')
    

    def to_dataframe(self) -> pd.DataFrame:
        return self.read_data
    

    def export_blob_storage_csv(self, path:str, delimiter:str = ';', encoding:str = "UTF-8") -> str:
        ps.from_pandas(self.read_data).to_spark() \
        .coalesce(1) \
            .write \
            .mode("overwrite") \
            .option("header", "true") \
            .option("delimiter", delimiter) \
            .option("encoding",encoding) \
            .csv(BLOB_PATH + path)


    def slugify_columns(self, columns):
        slug = partial(slugify, separator="_")
        return [slug(column.replace("%", "percent").replace("+", "_")) for column in columns]


    def export_blob_storage_parquet(self, path:str, compression:str = 'snappy') -> str:
        df = self.read_data
        df.columns = self.slugify_columns(df.columns)
        ps.from_pandas(df).to_spark() \
        .coalesce(1) \
            .write \
            .mode("overwrite") \
            .option("compression", compression) \
            .parquet(BLOB_PATH + path)
    
    
    def export_datalake(self, path) -> str:
        ps.from_pandas(self.read_data).to_spark() \
         .write \
         .mode("overwrite") \
         .option("overwriteSchema", "true") \
         .saveAsTable(path)


In [0]:
# Função para pré-processar os dados
def preprocess_data(df):
    logger.info("Iniciando pré-processamento...")
    df = df.dropna()
    features = ['SEGMENTO', 'PRIMEIRA_COMPRA', 'PRIMEIRA_COMPRA_CRM_LISTA_DE_BLOQUEIO_perc',
                'PRIMEIRA_COMPRA_CPF_INATIVO_perc', 'QTD_MARCA_AVG', 'QTD_MARCA_qt50']
    X = df[features].copy()
    Y = df['Desvio'].copy()

    # Transforma a coluna target em numérico
    Y = Y.map({'N': 0, 'S': 1}).astype(int)

    X['SEGMENTO'] = LabelEncoder().fit_transform(X['SEGMENTO'])
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    logger.success("Pré-processamento finalizado.")
    return X_scaled, Y.values, scaler

# Função para treinar o modelo
def train_model(X, Y):
    logger.info("Treinando modelo...")
    rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
    smote = SMOTE(random_state=42)
    X_resampled, Y_resampled = smote.fit_resample(X,Y)
    rf.fit(X_resampled, Y_resampled)
    logger.success("Modelo treinado.")
    return rf

# Função para validação cruzada
def validate_model(model, X, Y):
    logger.info("Validando modelo com cross-validation...")
    skf = StratifiedKFold(n_splits=5)
    scores = cross_val_score(model, X, Y, cv=skf, scoring='accuracy')
    logger.success(f"Validação finalizada. Acurácia média: {scores.mean():.4f}")
    return scores

# Função para avaliação no teste
def evaluate_model(model, X_test, Y_teste):
    logger.info("Avaliando modelo...")
    Y_pred = model.predict(X_test)
    print(classification_report(Y_teste, Y_pred))
    print(f"Accuracy: {accuracy_score(Y_teste, Y_pred):.4f}")
    print(f"ROC AUC: {roc_auc_score(Y_teste, Y_pred):.4f}")
    cm = confusion_matrix(Y_teste, Y_pred)
    print("Matriz de Confusão:\n", cm)

# Função para salvar o modelo e o scaler
def save_model(model, scaler, model_path='modelo_rf.pkl', scaler_path='scaler.pkl'):
    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)
    logger.success("Modelo e scaler salvos.")

# Função para predição em novos dados
def predict_new_data(df_new, model, scaler):
    logger.info("Realizando predição em novos dados...")
    features = ['SEGMENTO', 'PRIMEIRA_COMPRA', 'PRIMEIRA_COMPRA_CRM_LISTA_DE_BLOQUEIO_perc',
                'PRIMEIRA_COMPRA_CPF_INATIVO_perc', 'QTD_MARCA_AVG', 'QTD_MARCA_qt50']
    df = df_new[features].copy()
    df['SEGMENTO'] = LabelEncoder().fit_transform(df['SEGMENTO'])
    X_scaled = scaler.transform(df)
    preds = model.predict(X_scaled)
    df_new['Desvio_predito'] = preds
    return df_new


In [0]:
# Pipeline de execução: Leitura, treino, validação, avaliação e salvamento

# 1. Ler os dados
aprendizagem = Sharepoint('General/Arquivo_Databricks/Aprendizagem_fraude_teste.xlsx').read_file(engine = 'openpyxl').to_dataframe()


# 2. Pré-processar os dados
X, Y, scaler = preprocess_data(aprendizagem)  

# 3. Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

# 4. Treinar modelo
modelo_rf = train_model(X_train, y_train)

# 5. Validar modelo
validate_model(modelo_rf, X, Y)

# 6. Avaliar modelo no teste
evaluate_model(modelo_rf, X_test, y_test)

# 7. Salvar modelo e scaler
save_model(modelo_rf, scaler)

# 8 Teste de melhores hiperparâmetros



[32m2025-05-26 11:01:48.501[0m | [1mINFO    [0m | [36m__main__[0m:[36mpreprocess_data[0m:[36m3[0m - [1mIniciando pré-processamento...[0m
[32m2025-05-26 11:01:48.511[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mpreprocess_data[0m:[36m16[0m - [32m[1mPré-processamento finalizado.[0m
[32m2025-05-26 11:01:48.514[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_model[0m:[36m21[0m - [1mTreinando modelo...[0m


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/3 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

[32m2025-05-26 11:02:16.729[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtrain_model[0m:[36m26[0m - [32m[1mModelo treinado.[0m
[32m2025-05-26 11:02:16.731[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_model[0m:[36m31[0m - [1mValidando modelo com cross-validation...[0m
[32m2025-05-26 11:02:19.042[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mvalidate_model[0m:[36m34[0m - [32m[1mValidação finalizada. Acurácia média: 0.7100[0m
[32m2025-05-26 11:02:19.044[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_model[0m:[36m39[0m - [1mAvaliando modelo...[0m


              precision    recall  f1-score   support

           0       0.91      0.94      0.92        31
           1       0.75      0.67      0.71         9

    accuracy                           0.88        40
   macro avg       0.83      0.80      0.81        40
weighted avg       0.87      0.88      0.87        40

Accuracy: 0.8750
ROC AUC: 0.8011
Matriz de Confusão:
 [[29  2]
 [ 3  6]]


[32m2025-05-26 11:02:20.579[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36msave_model[0m:[36m51[0m - [32m[1mModelo e scaler salvos.[0m


[0;31m---------------------------------------------------------------------------[0m
[0;31mKeyError[0m                                  Traceback (most recent call last)
File [0;32m/databricks/python/lib/python3.10/site-packages/pandas/core/indexes/base.py:3629[0m, in [0;36mIndex.get_loc[0;34m(self, key, method, tolerance)[0m
[1;32m   3628[0m [38;5;28;01mtry[39;00m:
[0;32m-> 3629[0m     [38;5;28;01mreturn[39;00m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43m_engine[49m[38;5;241;43m.[39;49m[43mget_loc[49m[43m([49m[43mcasted_key[49m[43m)[49m
[1;32m   3630[0m [38;5;28;01mexcept[39;00m [38;5;167;01mKeyError[39;00m [38;5;28;01mas[39;00m err:

File [0;32m/databricks/python/lib/python3.10/site-packages/pandas/_libs/index.pyx:136[0m, in [0;36mpandas._libs.index.IndexEngine.get_loc[0;34m()[0m

File [0;32m/databricks/python/lib/python3.10/site-packages/pandas/_libs/index.pyx:163[0m, in [0;36mpandas._libs.index.IndexEngine.get_loc[0;34m()[0m

F