# Demonstração de fluxo completo de Ciência de Dados
Este notebook acompanha as etapas descritas no README: EDA, benchmarking de modelos, construção de features, uso de uma feature store simples, integração com MLflow e uma simulação de CI/CD com deploy canário.

## 1. EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.datasets import load_wine

In [None]:
# Load the scikit-learn wine dataset and assemble a single DataFrame holding
# the feature columns plus the class label in a trailing 'target' column.
data = load_wine()
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(
    target=data.target
)
df.head(5)

In [None]:
# Display summary statistics for the columns of df (features and target).
df.describe()

In [None]:
# Histogram of the 'alcohol' column with a kernel density estimate overlaid.
sns.histplot(df['alcohol'], kde=True);
plt.show()

In [None]:
# Pairwise scatter plots of a row subsample, coloured by class label.
# The sample is seeded so the figure is reproducible across runs, and its
# size is capped at the number of rows so smaller frames do not raise.
sns.pairplot(df.sample(n=min(100, len(df)), random_state=42), hue='target');

In [None]:
# Open a 10x8 figure, then draw the pairwise correlation matrix of every
# column of df (features and target) as a heatmap on it.
plt.figure(figsize=(10, 8))
corr = df.corr()
sns.heatmap(data=corr, cmap='coolwarm');

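Verificando a aderência da feature `alcohol` à distribuição Normal com o teste de D'Agostino–Pearson (`scipy.stats.normaltest`), ao nível de significância de 5%.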

In [None]:
# Run scipy's normaltest on the 'alcohol' column and report the p-value.
stat, p = stats.normaltest(df['alcohol'])
print('p-value', p)

# Below the 0.05 cut-off the normality hypothesis is reported as unlikely.
print(
    'Distribuição provavelmente não é Normal'
    if p < 0.05
    else 'Distribuição parece Normal'
)

## 2. Benchmark e escolha de modelo

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Separate the class label (y) from the feature matrix (X).
y = df['target']
X = df.drop(columns='target')

In [None]:
from sklearn.pipeline import make_pipeline

# Globally scaled copy of the features; kept because later cells reference
# `scaler` and `X_scaled`. It is NOT used for cross-validation below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(random_state=42)

# Cross-validate logistic regression inside a pipeline so the scaler is
# re-fit on each training fold only. Scaling the full dataset up front would
# leak statistics of the held-out fold into training.
# cross_val_score clones the estimator, so `lr` itself remains unfitted here.
lr_cv_model = make_pipeline(StandardScaler(), lr)
lr_scores = cross_val_score(lr_cv_model, X, y, cv=5)

# The random forest is evaluated directly on the raw features.
rf_scores = cross_val_score(rf, X, y, cv=5)

print('LR mean accuracy:', lr_scores.mean())
print('RF mean accuracy:', rf_scores.mean())

## 3. Pipeline de features e Feature Store

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
# Two-stage feature pipeline: standardize every column, then keep the 8
# features ranked highest by f_classif against the target.
feature_pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('select', SelectKBest(f_classif, k=8)),
])
X_features = feature_pipeline.fit_transform(X, y)

# Persist the selected features to a flat CSV that plays the role of a
# minimal feature store (no index column is written).
pd.DataFrame(data=X_features).to_csv(path_or_buf='feature_store.csv', index=False)

# Preview the first five transformed rows.
X_features[:5]

## 4. Integração com MLflow

In [None]:
import mlflow
mlflow.set_experiment('wine_demo')

# Hold out a stratified test split so the logged accuracy measures
# generalization rather than training fit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Bundle scaling with the classifier so the logged artifact accepts raw
# features; logging the bare classifier would drop the scaler it depends on.
# Pipeline does not clone its steps, so `lr` itself ends up fitted.
model = Pipeline([
    ('scale', StandardScaler()),
    ('clf', lr),
])

with mlflow.start_run():
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    mlflow.log_metric('accuracy', acc)
    mlflow.sklearn.log_model(model, 'model')
print('accuracy logged:', acc)

## 5. CI/CD e Deploy Canário (simulado)

In [None]:
import random
def simulate_canary(old_metric, new_metric, threshold=0.02):
    """Decide whether a candidate model survives a simulated canary rollout.

    The candidate is rejected only when its metric, even after adding
    ``threshold`` as tolerance, is still strictly below the current metric.

    Args:
        old_metric: Metric of the model currently in production.
        new_metric: Metric of the candidate model.
        threshold: Tolerated drop before the candidate is rejected.

    Returns:
        A ``(metric, approved)`` tuple: the metric of whichever model stays
        in production, and ``True`` if the candidate was approved.
    """
    regressed = new_metric + threshold < old_metric
    if regressed:
        print('Canário falhou, iniciando rollback...')
        return old_metric, False

    print('Novo modelo aprovado!')
    return new_metric, True
# Baseline: mean cross-validated accuracy of the logistic regression.
current_metric = lr_scores.mean()

# First candidate simulates a regression of 0.05, which exceeds the default
# 0.02 tolerance, so the canary rejects it.
candidate_metric = current_metric - 0.05
current_metric, ok = simulate_canary(
    old_metric=current_metric, new_metric=candidate_metric
)

# After a rejected rollout, try a second candidate that improves on the
# baseline by 0.03.
if not ok:
    candidate_metric = current_metric + 0.03
    current_metric, _ = simulate_canary(
        old_metric=current_metric, new_metric=candidate_metric
    )