# Import Libs

In [19]:
import pandas as pd
import numpy as np
import yfinance as yf
import pandas_ta as ta
import os

# Define the parameters for data collection

In [27]:
raw_data_path = '../data/raw'
dfs = {}
tickers_no_suffix = ['PETR4', 'VALE3']

for ticker in tickers_no_suffix:
    file_path = f"{raw_data_path}/{ticker}_raw.csv"

    try:
        df = pd.read_csv(file_path, index_col='Date', parse_dates=True)
        # df.iloc[1:] now removes the line with the ticker name that yfinance added.
        df = df.iloc[1:].astype('float64').copy()
        dfs[ticker] = df

        print(f"Raw data for {ticker} loaded and cleaned successfully.")

    except FileNotFoundError:
        print(f"Error: File {file_path} not found found. Please run the data collection notebook first.")

Raw data for PETR4 loaded and cleaned successfully.
Raw data for VALE3 loaded and cleaned successfully.


# Understanding Data

In [38]:
# Load saved data and perform inspection
for ticker in tickers_no_suffix:
    file_path = f"{raw_data_path}/{ticker}_raw.csv"
    
    try:
        # Load the file with the 'Date' column as the index
        df = pd.read_csv(file_path, index_col='Date', parse_dates=True)

        df = df.iloc[1:].copy()
        dfs[ticker] = df.iloc[1:].astype('float64').copy()
        
        print(f"Raw data for {ticker} loaded successfully.")
                
        print(f"\n{'=' * 50}")
        print(f"Quick Inspection for {ticker}")
        print(f"{'=' * 50}")

        print("\n### First 5 lines:")
        print(df.head())

        print("\n### DataFrame Information:")
        print(df.info())

        print("\n### Descriptive Statistics:")
        print(df.describe())

        print("\n### Missing Values:")
        print(df.isnull().sum())
        
        print("\n### Count of Unique Values (Top 5 columns):")
        
        for col in df.columns[:5]:
            print(f"- {col}: {df[col].nunique()} unique values")

    except FileNotFoundError:
        print(f"Error: File {file_path} not found. Please run the data collection step again.")

Raw data for PETR4 loaded successfully.

Quick Inspection for PETR4

### First 5 lines:
                        Close               High                Low  \
Date                                                                  
2020-01-02  9.300814628601074  9.300814628601074  9.182660567423897   
2020-01-03   9.22507381439209  9.464410398337765   9.22507381439209   
2020-01-06  9.334139823913574  9.373524704409501   9.07359622503792   
2020-01-07  9.297784805297852  9.355346431681072  9.231133649501015   
2020-01-08  9.240222930908203  9.322021764389632  9.161453748057209   

                         Open    Volume  
Date                                     
2020-01-02   9.24325242330177  37774500  
2020-01-03  9.355345304068166  71595600  
2020-01-06   9.21901598876109  81844000  
2020-01-07  9.337169106394095  32822000  
2020-01-08  9.297785137192925  48215600  

### DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1401 entries, 2020-01-02 to 2025-08-18


# Engenharia de Features e Criação da Variável-Alvo com RSI e MACD

In [39]:
processed_dfs = {}

for ticker, df in dfs.items():
    print(f"\nProcessing data for {ticker} with advanced features...")
    
    df_processed = df[['Close', 'Volume']].copy()
    
    # Feature Engineering
    df_processed['Daily_Return'] = df_processed['Close'].pct_change()
    df_processed['SMA_5'] = df_processed['Close'].rolling(5).mean()
    df_processed['SMA_10'] = df_processed['Close'].rolling(10).mean()
    df_processed['SMA_20'] = df_processed['Close'].rolling(20).mean()
    df_processed['RSI'] = ta.rsi(df_processed['Close'])
    
    macd_result = ta.macd(df_processed['Close'])
    df_processed['MACD'] = macd_result['MACD_12_26_9']
    
    df_processed['target'] = (df_processed['Close'].shift(-1) > df_processed['Close']).astype(int)
    
    df_processed.dropna(inplace=True)
    processed_dfs[ticker] = df_processed
    
    print(f"Data for {ticker} processed. New shape: {df_processed.shape}")
    print(f"Columns: {list(df_processed.columns)}")

combined_df = pd.concat(processed_dfs.values())

processed_data_path = '../data/processed'
os.makedirs(processed_data_path, exist_ok=True)
combined_df.to_csv(os.path.join(processed_data_path, 'combined_data.csv'))

print(f"\nCombined DataFrame shape: {combined_df.shape}")
print("First 5 rows:")
print(combined_df.head())
print("\nProcessed DataFrame saved successfully.")


Processing data for PETR4 with advanced features...
Data for PETR4 processed. New shape: (1375, 9)
Columns: ['Close', 'Volume', 'Daily_Return', 'SMA_5', 'SMA_10', 'SMA_20', 'RSI', 'MACD', 'target']

Processing data for VALE3 with advanced features...
Data for VALE3 processed. New shape: (1375, 9)
Columns: ['Close', 'Volume', 'Daily_Return', 'SMA_5', 'SMA_10', 'SMA_20', 'RSI', 'MACD', 'target']

Combined DataFrame shape: (2750, 9)
First 5 rows:
               Close      Volume  Daily_Return     SMA_5    SMA_10    SMA_20  \
Date                                                                           
2020-02-07  8.764579  54514600.0     -0.008567  8.683385  8.676115  8.838652   
2020-02-10  8.825171  45328100.0      0.006913  8.740947  8.709440  8.820474   
2020-02-11  8.931205  37518200.0      0.012015  8.792450  8.730041  8.812597   
2020-02-12  9.128127  64851000.0      0.022049  8.897880  8.768820  8.821383   
2020-02-13  9.003916  55277100.0     -0.013608  8.930599  8.792451  8.8