## Criando diretório Bronze

### Foi criado um volume Delta, já que no free tier o acesso a `/mnt/` é negado

In [0]:
%py
# Base paths used by this notebook: one per data layer (Bronze, Silver, Gold)
# for the credit-fraud data, plus the root of the source datasets.
# The earlier Bronze location under /mnt is kept below for reference only; the
# notebook notes that access to mnt/ is denied on the free tier.
# BRONZE_PATH = "/mnt/delta/bronze"

BRONZE_PATH = "/Volumes/workspace/default/delta/bronze/credit-fraud"
SILVER_PATH = "/Volumes/workspace/default/delta/silver/credit-fraud"
GOLD_PATH = "/Volumes/workspace/default/delta/gold/credit-fraud"
SOURCE_PATH = "dbfs:/databricks-datasets/"


## Verificando os dados de teste disponíveis

### Dataset de fraudes em cartão de crédito


In [0]:
# Print the description file that ships with the credit-card-fraud dataset.
description_file = "/dbfs/databricks-datasets/credit-card-fraud/description.txt"
with open(description_file, "r") as handle:
    description_text = handle.read()
print(description_text)


In [0]:
# Read the credit-card-fraud Delta table from the Bronze layer.
bronze_table_path = f"{BRONZE_PATH}/credit_card_fraud"
df = spark.read.format("delta").load(bronze_table_path)
print(df)

In [0]:
# Preview the first rows (the explicit arguments are show()'s defaults).
df.show(n=20, truncate=True)

In [0]:
# Print the DataFrame's string representation (same call as right after the load).
print(df)

In [0]:
# Rename the vector column to "features", the attribute name some ML models expect.
df_ml = df.withColumnRenamed(existing="pcaVector", new="features")


In [0]:
# Train/test split: 80% / 20%, with a fixed seed.
split_weights = [0.8, 0.2]
train_df, test_df = df_ml.randomSplit(weights=split_weights, seed=42)


In [0]:
from pyspark.ml.classification import LogisticRegression

# Fit a logistic regression on the training split, reading the input vector
# from "features" and the target from "label".
lr_params = {"featuresCol": "features", "labelCol": "label"}
lr = LogisticRegression(**lr_params)
model = lr.fit(train_df)


In [0]:
# Score the held-out split and show the label next to the model's output.
predictions = model.transform(test_df)
shown_columns = ["label", "prediction", "probability"]
display(predictions.select(*shown_columns))


In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


In [0]:
from pyspark.sql import functions as F

# Count rows for each (actual label, predicted label) pair, sorted by both.
outcome_columns = ["label", "prediction"]
pair_counts = predictions.groupBy(*outcome_columns).count()
confusion_df = pair_counts.orderBy(*outcome_columns)

display(confusion_df)


In [0]:
# Confusion-matrix counts, computed in ONE aggregation over `predictions`
# instead of four separate filter/count actions (each action re-evaluates the
# model transform over the test split). The SQL predicates are the same ones
# used by the original filters; count_if only counts rows where the predicate
# evaluates to true.
confusion_counts = predictions.selectExpr(
    "count_if(label = 1 AND prediction = 1) AS tp",
    "count_if(label = 0 AND prediction = 0) AS tn",
    "count_if(label = 0 AND prediction = 1) AS fp",
    "count_if(label = 1 AND prediction = 0) AS fn",
).first()

tp = confusion_counts["tp"]  # true positives
tn = confusion_counts["tn"]  # true negatives
fp = confusion_counts["fp"]  # false positives
fn = confusion_counts["fn"]  # false negatives

total = tp + tn + fp + fn

# Every ratio falls back to 0 when its denominator is 0. Accuracy was the only
# metric without this guard, so an empty test split raised ZeroDivisionError.
accuracy = (tp + tn) / total if total > 0 else 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f"Acurácia: {accuracy:.4f}")
print(f"Precisão: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


In [0]:
import pandas as pd

# Confusion matrix as a small pandas table: one row per predicted class,
# one column per actual class.
conf_matrix = pd.DataFrame(
    data=[
        [tn, fn],  # predicted 0: actual 0, actual 1
        [fp, tp],  # predicted 1: actual 0, actual 1
    ],
    index=["Previsto 0", "Previsto 1"],
    columns=["Real 0", "Real 1"],
)

display(conf_matrix)
