In [1]:
# ==========================================================
# PROJETO: Previs√£o de Movimentos do Ibovespa
# P√≥s-gradua√ß√£o em Data Analytics ‚Äî FIAP
# ==========================================================

# üîπ Importa√ß√£o das bibliotecas principais
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    roc_curve,
    auc,
    ConfusionMatrixDisplay,
)
from sklearn.ensemble import RandomForestClassifier

# ==========================================================
# 1Ô∏è‚É£ LEITURA E LIMPEZA DOS DADOS
# ==========================================================

# L√™ o CSV exportado do site de cota√ß√µes
file_path = r"C:\Users\vinic\OneDrive\√Årea de Trabalho\Vinicius\Data Analytics\Tech Challenge\Fase 02\Dados Hist√≥ricos - Ibovespa.csv"

# Renomeia as colunas para facilitar o manuseio
df.columns = ["Date", "Price", "Open", "High", "Low", "Vol", "Change"]

# Converte a coluna "Date" para o formato datetime
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)

# Ordena o dataset da data mais antiga para a mais recente
df = df.sort_values("Date").reset_index(drop=True)

# ==========================================================
# 2Ô∏è‚É£ TRATAMENTO DAS COLUNAS NUM√âRICAS
# ==========================================================

# Fun√ß√£o para converter volumes no formato 9.48M, 2.1B etc. para n√∫meros
def parse_volume(v):
    v = str(v).replace(",", "").strip()
    if v.endswith("M"):  # milh√µes
        return float(v[:-1]) * 1e6
    elif v.endswith("B"):  # bilh√µes
        return float(v[:-1]) * 1e9
    elif v.isdigit():  # n√∫mero puro
        return float(v)
    else:
        return np.nan

# Aplica a convers√£o √† coluna "Vol"
df["Vol"] = df["Vol"].apply(parse_volume)

# Converte ‚ÄúChange‚Äù (varia√ß√£o percentual) removendo o s√≠mbolo ‚Äú%‚Äù
df["Change"] = (
    df["Change"].astype(str).str.replace("%", "").str.replace(",", ".").astype(float)
)

# Converte colunas de pre√ßos em float
for col in ["Price", "Open", "High", "Low"]:
    df[col] = df[col].astype(str).str.replace(".", "").str.replace(",", ".").astype(float)

# ==========================================================
# 3Ô∏è‚É£ FEATURE ENGINEERING (CRIA√á√ÉO DE VARI√ÅVEIS)
# ==========================================================

# Cria a vari√°vel ‚ÄúTarget‚Äù (1 = pre√ßo subiu, 0 = pre√ßo caiu)
df["Target"] = (df["Price"].shift(-1) > df["Price"]).astype(int)

# Calcula retornos di√°rios
df["Return"] = df["Price"].pct_change()

# M√©dias m√≥veis de curto e longo prazo
df["MA5"] = df["Price"].rolling(window=5).mean()
df["MA20"] = df["Price"].rolling(window=20).mean()

# Remove as linhas com valores NaN (geradas pelas m√©dias m√≥veis)
df = df.dropna()

# ==========================================================
# 4Ô∏è‚É£ SEPARA√á√ÉO ENTRE TREINO E TESTE
# ==========================================================

X = df[["Open", "High", "Low", "Vol", "Return", "MA5", "MA20"]]
y = df["Target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# ==========================================================
# 5Ô∏è‚É£ PIPELINES COM DIFERENTES MODELOS
# ==========================================================

# üîπ Modelo 1: KNN
pipe_knn = Pipeline(
    [
        ("scaler", StandardScaler()),  # Normaliza os dados
        ("model", KNeighborsClassifier(n_neighbors=5)),
    ]
)

# üîπ Modelo 2: SVM
pipe_svm = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("model", SVC(kernel="rbf", probability=True, random_state=42)),
    ]
)

# üîπ Modelo 3: Random Forest
pipe_rf = Pipeline([("model", RandomForestClassifier(random_state=42))])

# ==========================================================
# 6Ô∏è‚É£ AVALIA√á√ÉO COM VALIDA√á√ÉO CRUZADA
# ==========================================================

def evaluate_model(pipe, name):
    """Executa valida√ß√£o cruzada e mostra m√©tricas."""
    scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="accuracy")
    print(f"\nüîπ {name}")
    print(f"Acur√°cia m√©dia (CV): {scores.mean():.3f} ¬± {scores.std():.3f}")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print("Classification Report (Teste):")
    print(classification_report(y_test, y_pred))
    return pipe

# Treina e avalia cada modelo
pipe_knn = evaluate_model(pipe_knn, "KNN")
pipe_svm = evaluate_model(pipe_svm, "SVM")
pipe_rf = evaluate_model(pipe_rf, "Random Forest")

# ==========================================================
# 7Ô∏è‚É£ CURVA ROC E AUC SCORE
# ==========================================================

plt.figure(figsize=(8, 6))
for name, model in {
    "KNN": pipe_knn,
    "SVM": pipe_svm,
    "Random Forest": pipe_rf,
}.items():
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], "k--")  # Linha de refer√™ncia (azar)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve ‚Äî Modelos de Classifica√ß√£o")
plt.legend()
plt.show()

# ==========================================================
# 8Ô∏è‚É£ FORECASTING (PREVIS√ÉO FUTURA SIMPLES)
# ==========================================================

# Neste caso, fazemos uma previs√£o do pr√≥ximo dia usando o melhor modelo (Random Forest)
next_day = df.iloc[-1:].copy()
next_day_features = next_day[["Open", "High", "Low", "Vol", "Return", "MA5", "MA20"]]

# Predi√ß√£o do movimento do pr√≥ximo preg√£o
future_pred = pipe_rf.predict(next_day_features)[0]
print("\nüîÆ Previs√£o para o pr√≥ximo dia:")
print("O Ibovespa deve subir üìà" if future_pred == 1 else "O Ibovespa deve cair üìâ")


NameError: name 'df' is not defined