In [None]:
import os
import warnings
from datetime import datetime, timedelta

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
import yfinance as yf
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
warnings.filterwarnings('ignore')

# --- Funções Auxiliares para Engenharia de Atributos Avançada ---

def calculate_rsi(prices, period=14):
    """Return the Relative Strength Index of *prices*.

    Average gains and losses are plain rolling means over ``period``
    observations (no exponential smoothing), so the first ``period - 1``
    entries of the result are NaN.

    Args:
        prices: pandas Series of prices.
        period: Rolling window length.

    Returns:
        pandas Series aligned with ``prices``, with values in [0, 100]
        wherever the RSI is defined.
    """
    change = prices.diff()

    # Keep only the upward (resp. downward) moves; everything else,
    # including the NaN produced by the first diff, is replaced by 0.
    ups = change.mask(~(change > 0), 0)
    downs = -change.mask(~(change < 0), 0)

    avg_up, avg_down = (
        leg.rolling(window=period).mean() for leg in (ups, downs)
    )

    relative_strength = avg_up / avg_down
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Return the MACD line: fast EMA minus slow EMA of *prices*.

    Args:
        prices: pandas Series of prices.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Accepted for signature compatibility; it is not used
            because only the MACD line (not the signal line) is returned.

    Returns:
        pandas Series aligned with ``prices``.
    """
    fast_ema, slow_ema = (
        prices.ewm(span=span).mean() for span in (fast, slow)
    )
    return fast_ema - slow_ema

def calculate_adx(df_ohlc, period=14):
    """Return a simple-moving-average variant of the ADX indicator.

    Args:
        df_ohlc: DataFrame with at least 'High', 'Low' and 'Close' columns.
        period: Window used for every rolling mean in the computation.

    Returns:
        pandas Series aligned with ``df_ohlc``; leading entries are NaN
        until both rolling stages have a full window.
    """
    highs, lows, closes = (df_ohlc[col] for col in ('High', 'Low', 'Close'))
    prev_close = closes.shift()

    def smooth(series):
        # Every averaging step in this indicator uses the same plain window.
        return series.rolling(window=period).mean()

    # Directional movement: negative moves are zeroed, NaN is left untouched.
    up_move = highs.diff().mask(lambda s: s < 0, 0)
    down_move = (-lows.diff()).mask(lambda s: s < 0, 0)

    # True range is the row-wise maximum of the three candidate ranges.
    true_range = pd.DataFrame({
        'high_low': highs - lows,
        'high_prev_close': (highs - prev_close).abs(),
        'low_prev_close': (lows - prev_close).abs(),
    }).max(axis=1)

    avg_true_range = smooth(true_range)
    di_plus = 100 * (smooth(up_move) / avg_true_range)
    di_minus = 100 * (smooth(down_move) / avg_true_range)

    directional_index = 100 * (di_plus - di_minus).abs() / (di_plus + di_minus)
    return smooth(directional_index)

# --- Classe Principal da Pipeline ---

class FinancialModelPipeline:
    """Pipeline that predicts whether the next close is above today's close.

    The stages are: download OHLCV data with yfinance, engineer features,
    split chronologically (the last ``test_size`` rows form the hold-out),
    train several classifiers, optionally tune some of them, and write a
    markdown report.

    Attributes:
        ticker: Symbol passed to ``yf.download``.
        years_of_data: How many years back (365 days each) to download.
        test_size: Number of most recent rows used as the hold-out set.
        advanced_features: Enables the extended feature set and model zoo.
        df: Downloaded data; replaced by the engineered frame after
            preprocessing succeeds.
        X_train, X_test, y_train, y_test: Scaled features and targets, or
            None until preprocessing succeeds.
        scaler: ``StandardScaler`` fitted on the training rows only.
        models: Fitted estimators keyed by model name.
        predictions: Hold-out predictions keyed by model name.
        results: Metric dictionaries keyed by model name.
    """

    # Raw columns the feature engineering relies on.
    _OHLCV_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Volume']
    # Columns never handed to the models (raw prices and the target itself).
    _NON_FEATURE_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Next_Close', 'Target']
    # Columns shown in the markdown report, in display order.
    _REPORT_COLUMNS = ['Modelo', 'Acurácia', 'Precisão (Alta)', 'Recall (Alta)', 'F1-Score (Alta)',
                       'Precisão (Baixa)', 'Recall (Baixa)', 'F1-Score (Baixa)']

    def __init__(self, ticker="^BVSP", years_of_data=2, test_size=30, advanced_features=False):
        self.ticker = ticker
        self.years_of_data = years_of_data
        self.test_size = test_size
        self.advanced_features = advanced_features
        self.df = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.scaler = StandardScaler()
        self.models = {}
        self.predictions = {}
        self.results = {}

    # --- Data download ---
    def download_data(self):
        """Download the price history for ``self.ticker`` into ``self.df``.

        Raises:
            SystemExit: If the download raises or returns no rows. The
                original code called ``exit()``, which only exists when the
                ``site`` module is loaded and does not interrupt execution
                inside IPython; raising explicitly stops the run everywhere
                and reports a non-zero status.
        """
        end_date = datetime.now().strftime('%Y-%m-%d')
        start_date = (datetime.now() - timedelta(days=self.years_of_data * 365)).strftime('%Y-%m-%d')

        print(f"Baixando dados do {self.ticker} de {start_date} até {end_date} ({self.years_of_data} anos)...")
        error = None
        try:
            data = yf.download(self.ticker, start=start_date, end=end_date)
        except Exception as e:  # network/library boundary: report any failure
            data, error = None, e

        # yfinance usually reports failures by returning an empty frame
        # instead of raising, so that case is treated as an error as well.
        if error is None and (data is None or data.empty):
            error = f"nenhum dado retornado para {self.ticker}"

        if error is not None:
            self.df = None
            print(f"Ocorreu um erro ao baixar os dados: {error}")
            raise SystemExit(1)

        self.df = data
        print(f"Dados baixados com sucesso. Total de {len(self.df)} linhas.")

    def _normalize_ohlcv_columns(self, raw):
        """Return a copy of *raw* holding exactly the flat OHLCV columns, or None."""
        df = raw.copy()

        # Recent yfinance versions return MultiIndex columns (field, ticker);
        # keep the level that carries the field names.
        if isinstance(df.columns, pd.MultiIndex):
            for level in range(df.columns.nlevels):
                labels = df.columns.get_level_values(level)
                if 'Close' in labels:
                    df.columns = labels
                    break

        missing = [col for col in self._OHLCV_COLUMNS if col not in df.columns]
        if missing:
            print(f"Colunas ausentes nos dados baixados: {missing}")
            return None

        # Selecting by name (rather than renaming positionally) stays correct
        # whatever column order yfinance uses, and drops extras such as
        # 'Adj Close'.
        return df[self._OHLCV_COLUMNS].copy()

    # --- Preprocessing and feature engineering ---
    def preprocess_and_engineer_features(self):
        """Build features and target, then create the scaled train/test split.

        On success ``self.df`` holds the engineered (unscaled) frame and
        ``X_train``/``X_test``/``y_train``/``y_test`` are populated. On any
        problem a message is printed and the method returns without touching
        the split attributes.
        """
        if self.df is None:
            print("Dados não foram baixados. Execute download_data() primeiro.")
            return

        if self.test_size < 1:
            # iloc[:-0] would silently produce an empty training set.
            print(f"test_size inválido: {self.test_size}. Use um inteiro positivo.")
            return

        df = self._normalize_ohlcv_columns(self.df)
        if df is None:
            return

        # Target variable: 1 when the next close is above the current close.
        df['Next_Close'] = df['Close'].shift(-1)
        df['Target'] = (df['Next_Close'] > df['Close']).astype(int)
        # Drops the last row, whose next close is unknown.
        df = df.dropna()

        # Basic features
        df['Daily_Return'] = df['Close'].pct_change() * 100
        df['SMA_5'] = df['Close'].rolling(window=5).mean()
        df['SMA_20'] = df['Close'].rolling(window=20).mean()
        df['Vol_20'] = df['Close'].rolling(window=20).std()

        for i in range(1, 6):
            df[f'Close_Lag_{i}'] = df['Close'].shift(i)

        if self.advanced_features:
            print("Aplicando engenharia de atributos avançada...")
            df['SMA_50'] = df['Close'].rolling(window=50).mean()
            df['Vol_50'] = df['Close'].rolling(window=50).std()

            for i in range(6, 11):
                df[f'Close_Lag_{i}'] = df['Close'].shift(i)
            for i in range(1, 11):
                df[f'Return_Lag_{i}'] = df['Daily_Return'].shift(i)

            df['RSI_14'] = calculate_rsi(df['Close'], 14)
            df['MACD'] = calculate_macd(df['Close'])
            df['Momentum_10'] = df['Close'].diff(10)
            df['High_Low_Ratio'] = (df['High'] - df['Low']) / df['Close']
            df['Close_Open_Ratio'] = (df['Close'] - df['Open']) / df['Open']
            df['Volume_MA_Ratio'] = df['Volume'] / df['Volume'].rolling(window=20).mean()
            df['Trend_5'] = (df['Close'] - df['SMA_5']) / df['SMA_5'] * 100
            df['Trend_20'] = (df['Close'] - df['SMA_20']) / df['SMA_20'] * 100
            df['Trend_50'] = (df['Close'] - df['SMA_50']) / df['SMA_50'] * 100
            df['HV_5'] = df['Daily_Return'].rolling(window=5).std()
            df['HV_20'] = df['Daily_Return'].rolling(window=20).std()
            df['HV_50'] = df['Daily_Return'].rolling(window=50).std()
            df['ADX_14'] = calculate_adx(df.copy(), 14)

        # Ratios over a zero denominator (e.g. zero volume) yield +/-inf,
        # which StandardScaler rejects; treat them like missing values.
        df = df.replace([np.inf, -np.inf], np.nan).dropna()

        features = [col for col in df.columns if col not in self._NON_FEATURE_COLUMNS]
        X = df[features]
        y = df['Target']

        if len(X) <= self.test_size:
            print(f"Dados insuficientes: {len(X)} linhas após o pré-processamento "
                  f"para test_size={self.test_size}.")
            return

        # Chronological split: the most recent rows are the hold-out set.
        X_train_raw = X.iloc[:-self.test_size]
        X_test_raw = X.iloc[-self.test_size:]

        # Fit the scaler on the training rows only so that hold-out
        # statistics never leak into training.
        self.scaler.fit(X_train_raw)
        self.X_train = pd.DataFrame(self.scaler.transform(X_train_raw),
                                    index=X_train_raw.index, columns=X_train_raw.columns)
        self.X_test = pd.DataFrame(self.scaler.transform(X_test_raw),
                                   index=X_test_raw.index, columns=X_test_raw.columns)
        self.y_train = y.iloc[:-self.test_size]
        self.y_test = y.iloc[-self.test_size:]
        self.df = df

        print("Pré-processamento e engenharia de atributos concluídos.")
        print(f"Tamanho do conjunto de treino: {len(self.X_train)}, teste: {len(self.X_test)}")
        print(f"Features utilizadas: {len(features)}")

    def _fit_and_register(self, name, model, **fit_kwargs):
        """Fit *model* on the training split and store it with its hold-out predictions."""
        model.fit(self.X_train, self.y_train, **fit_kwargs)
        self.models[name] = model
        self.predictions[name] = model.predict(self.X_test)
        return model

    # --- Model training ---
    def train_models(self):
        """Train the baseline models and, if enabled, the advanced model zoo.

        Every fitted model is stored in ``self.models`` and its hold-out
        predictions in ``self.predictions``.
        """
        if self.X_train is None:
            print("Dados não processados. Execute preprocess_and_engineer_features() primeiro.")
            return

        print("\nTreinando modelos...")

        rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
        self._fit_and_register('RandomForest', rf_model)

        lr_model = LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear')
        self._fit_and_register('LogisticRegression', lr_model)

        if self.advanced_features:
            # Advanced models
            xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                                          subsample=0.8, colsample_bytree=0.8, random_state=42,
                                          scale_pos_weight=1, verbosity=0)
            lgb_model = lgb.LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                                           num_leaves=31, subsample=0.8, colsample_bytree=0.8,
                                           random_state=42, verbose=-1)
            cat_model = CatBoostClassifier(iterations=200, depth=5, learning_rate=0.1,
                                           subsample=0.8, verbose=0, random_state=42)
            gb_model = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                                                  subsample=0.8, random_state=42)
            ada_model = AdaBoostClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
            svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42, probability=True)
            knn_model = KNeighborsClassifier(n_neighbors=5)

            # (name, estimator, extra fit() keyword arguments)
            advanced_models = [
                ('XGBoost', xgb_model, {}),
                ('LightGBM', lgb_model, {}),
                ('CatBoost', cat_model, {'verbose': False}),
                ('GradientBoosting', gb_model, {}),
                ('AdaBoost', ada_model, {}),
                ('SVM', svm_model, {}),
                ('KNN', knn_model, {}),
            ]
            for name, model, fit_kwargs in advanced_models:
                self._fit_and_register(name, model, **fit_kwargs)

            estimators = [('xgb', xgb_model), ('lgb', lgb_model), ('cat', cat_model),
                          ('gb', gb_model), ('rf', rf_model)]
            voting_model = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
            self._fit_and_register('Ensemble Voting', voting_model)

        print("Modelos treinados.")

    # --- Hyperparameter optimisation ---
    def optimize_model(self, model_name='RandomForest'):
        """Grid-search hyperparameters for an already trained model.

        The tuned estimator is stored under ``f'{model_name}_Optimized'``
        together with its hold-out predictions and metrics.

        Args:
            model_name: 'RandomForest' or 'XGBoost'. The model must already
                be present in ``self.models``.
        """
        if model_name not in self.models:
            print(f"Modelo {model_name} não encontrado para otimização.")
            return

        print(f"\nOtimização de hiperparâmetros para {model_name}...")

        # Estimator factory and search grid per supported model. Factories
        # keep unused estimators from being instantiated.
        search_spaces = {
            'RandomForest': (
                lambda: RandomForestClassifier(random_state=42),
                {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15],
                 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4],
                 'class_weight': ['balanced']},
            ),
            'XGBoost': (
                lambda: xgb.XGBClassifier(random_state=42, scale_pos_weight=1, verbosity=0),
                {'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7],
                 'learning_rate': [0.01, 0.1, 0.2], 'subsample': [0.7, 0.8, 0.9],
                 'colsample_bytree': [0.7, 0.8, 0.9]},
            ),
        }
        if model_name not in search_spaces:
            print(f"Otimização para {model_name} não implementada.")
            return
        make_estimator, param_grid = search_spaces[model_name]

        # Rows are ordered in time, so each validation fold must come after
        # its training fold; plain K-fold would validate on the past using
        # models trained on the future.
        grid_search = GridSearchCV(estimator=make_estimator(), param_grid=param_grid,
                                   scoring='accuracy', cv=TimeSeriesSplit(n_splits=3),
                                   verbose=0, n_jobs=-1)
        grid_search.fit(self.X_train, self.y_train)

        optimized_name = f'{model_name}_Optimized'
        best_model = grid_search.best_estimator_
        self.models[optimized_name] = best_model
        self.predictions[optimized_name] = best_model.predict(self.X_test)
        self.results[optimized_name] = self.evaluate_model(optimized_name)

        print(f"Otimização concluída para {model_name}.")
        print(f"Melhores hiperparâmetros: {grid_search.best_params_}")
        print(f"Acurácia no teste: {self.results[optimized_name]['Acurácia']:.4f}")

    # --- Model evaluation ---
    def evaluate_model(self, model_name):
        """Compute hold-out metrics for a trained model.

        Args:
            model_name: Key of the model in ``self.models``.

        Returns:
            Dict with accuracy, per-class precision/recall/F1 and the 2x2
            confusion matrix (as nested lists), or None when the model or
            its predictions are unknown.
        """
        if model_name not in self.models or model_name not in self.predictions:
            print(f"Modelo {model_name} não encontrado para avaliação.")
            return None

        y_pred = self.predictions[model_name]
        # Pin both class labels: a short hold-out window may contain a single
        # class, which would otherwise mismatch the two target names and
        # shrink the confusion matrix.
        class_labels = [0, 1]
        accuracy = accuracy_score(self.y_test, y_pred)
        report = classification_report(self.y_test, y_pred, labels=class_labels,
                                       target_names=['Baixa (0)', 'Alta (1)'],
                                       output_dict=True, zero_division=0)
        conf_matrix = confusion_matrix(self.y_test, y_pred, labels=class_labels)

        return {
            "Modelo": model_name,
            "Acurácia": accuracy,
            "Precisão (Alta)": report['Alta (1)']['precision'],
            "Recall (Alta)": report['Alta (1)']['recall'],
            "F1-Score (Alta)": report['Alta (1)']['f1-score'],
            "Precisão (Baixa)": report['Baixa (0)']['precision'],
            "Recall (Baixa)": report['Baixa (0)']['recall'],
            "F1-Score (Baixa)": report['Baixa (0)']['f1-score'],
            "Matriz de Confusão": conf_matrix.tolist()
        }

    @staticmethod
    def _render_markdown_table(frame):
        """Render *frame* as a markdown table; floats use four decimals."""
        def fmt(value):
            return f"{value:.4f}" if isinstance(value, float) else str(value)

        header = "| " + " | ".join(str(col) for col in frame.columns) + " |"
        divider = "| " + " | ".join("---" for _ in frame.columns) + " |"
        rows = ["| " + " | ".join(fmt(value) for value in row) + " |"
                for row in frame.itertuples(index=False, name=None)]
        return "\n".join([header, divider, *rows])

    # --- Report generation ---
    def generate_evaluation_report(self, filename='model_evaluation_report.md'):
        """Write a markdown table with the metrics of every trained model.

        Models without cached metrics are evaluated first. Nothing is
        written when there are no results to report.

        Args:
            filename: Destination path of the markdown report.
        """
        print("\nGerando relatório de avaliação...")
        all_results = []
        for model_name in self.models:
            if model_name not in self.results:
                self.results[model_name] = self.evaluate_model(model_name)
            # evaluate_model returns None when a model cannot be evaluated.
            if self.results[model_name] is not None:
                all_results.append(self.results[model_name])

        if not all_results:
            # An empty frame has none of the report columns, so selecting
            # them would raise KeyError.
            print("Nenhum modelo treinado; relatório não gerado.")
            return

        report_frame = pd.DataFrame(all_results)[self._REPORT_COLUMNS]
        try:
            markdown_report = report_frame.to_markdown(index=False, floatfmt=".4f")
        except ImportError:
            # DataFrame.to_markdown needs the optional 'tabulate' package.
            markdown_report = self._render_markdown_table(report_frame)

        # Explicit encoding: the report contains accented characters.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("# Relatório de Avaliação dos Modelos\n\n")
            f.write(markdown_report)
        print(f"Relatório salvo em {filename}")

    # --- Full pipeline ---
    def run_pipeline(self, optimize_rf=False, optimize_xgb=False):
        """Run download, preprocessing, training, optional tuning and reporting.

        Args:
            optimize_rf: Grid-search the random forest after training.
            optimize_xgb: Grid-search XGBoost (only with advanced features).
        """
        self.download_data()
        self.preprocess_and_engineer_features()
        if self.X_train is None:
            # Preprocessing already explained the problem; later stages
            # would only print follow-up errors.
            print("Pipeline interrompida: pré-processamento não concluído.")
            return

        self.train_models()

        if optimize_rf:
            self.optimize_model('RandomForest')
        if optimize_xgb and self.advanced_features:
            self.optimize_model('XGBoost')

        self.generate_evaluation_report()

    # --- Model persistence ---
    def save_models(self, path='.'):
        """Dump every trained model to ``<path>/<name>_model.pkl`` with joblib.

        Args:
            path: Target directory; created when missing.
        """
        os.makedirs(path, exist_ok=True)
        for name, model in self.models.items():
            file_name = f'{name.replace(" ", "_").lower()}_model.pkl'
            joblib.dump(model, os.path.join(path, file_name))
        print(f"Modelos salvos em {path}")


# --- Execução ---
if __name__ == "__main__":
    # Baseline experiment: two years of history, basic features, tuned random forest.
    print("\n--- Pipeline: 2 Anos, Features Básicas ---")
    pipeline_basic = FinancialModelPipeline(advanced_features=False, years_of_data=2)
    pipeline_basic.run_pipeline(optimize_rf=True, optimize_xgb=False)
    pipeline_basic.save_models(path='models_basic_2years')

    # Extended experiment: five years of history, advanced features, tuned RF and XGBoost.
    print("\n--- Pipeline: 5 Anos, Features Avançadas ---")
    pipeline_advanced = FinancialModelPipeline(advanced_features=True, years_of_data=5)
    pipeline_advanced.run_pipeline(optimize_xgb=True, optimize_rf=True)
    pipeline_advanced.save_models(path='models_advanced_5years')

    print("\n--- Execução concluída ---")



--- Pipeline: 2 Anos, Features Básicas ---
Dados não foram baixados. Execute download_data() primeiro.
Dados não processados. Execute preprocess_and_engineer_features() primeiro.
Modelo RandomForest não encontrado para otimização.

Gerando relatório de avaliação...


KeyError: "None of [Index(['Modelo', 'Acurácia', 'Precisão (Alta)', 'Recall (Alta)',\n       'F1-Score (Alta)', 'Precisão (Baixa)', 'Recall (Baixa)',\n       'F1-Score (Baixa)'],\n      dtype='object')] are in the [columns]"