# Import Libs

In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
import pandas_ta as ta
import os

# Define the parameters for data collection

In [3]:
# stock tickers
tickers = ['PETR4.SA', 'VALE3.SA']

# Time period
start_date = '2015-01-01'
end_date = '2025-01-01'
raw_data_path = '../data/raw'

# Create the folder if it does not exist
os.makedirs(raw_data_path, exist_ok=True)

# Collect and save data for each action
for ticker in tickers:
    print(f"Downloading data to {ticker}...")
    
    # Download data using the yfinance library
    df = yf.download(ticker, start = start_date, end = end_date)

    # Check if the DataFrame is not empty
    if not df.empty:
        # Reset the index to turn the date column into a regular column
        df.reset_index(inplace=True)
        
        # Save the DataFrame to a CSV file inside the 'data/raw' folder
        file_path = os.path.join(raw_data_path, f'{ticker.replace(".SA", "")}_raw.csv')
        df.to_csv(file_path, index=False)
        
        print(f"{ticker} data saved to: {file_path}")
        
    else:
        print(f"Error: Unable to download data for {ticker}.")

print("\nData collection complete.")

Downloading data to PETR4.SA...


  df = yf.download(ticker, start = start_date, end = end_date)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start = start_date, end = end_date)


PETR4.SA data saved to: ../data/raw\PETR4_raw.csv
Downloading data to VALE3.SA...


[*********************100%***********************]  1 of 1 completed

VALE3.SA data saved to: ../data/raw\VALE3_raw.csv

Data collection complete.





# Understanding Data

In [4]:
dfs = {}

tickers_no_suffix = ['PETR4', 'VALE3']

# Load saved data and perform inspection
for ticker in tickers_no_suffix:
    file_path = f"{raw_data_path}/{ticker}_raw.csv"
    
    try:
        # Load the file with the 'Date' column as the index
        df = pd.read_csv(file_path, index_col='Date', parse_dates=True)

        df = df.iloc[1:].copy()
        df = df.astype('float64')
        
        # Ensure numeric columns are the correct type
        numeric_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
        
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        dfs[ticker] = df
                
        print(f"\n{'=' * 50}")
        print(f"Quick Inspection for {ticker}")
        print(f"{'=' * 50}")

        print("\n### First 5 lines:")
        print(df.head())

        print("\n### DataFrame Information:")
        print(df.info())

        print("\n### Descriptive Statistics:")
        print(df.describe())

        print("\n### Missing Values:")
        print(df.isnull().sum())
        
        print("\n### Count of Unique Values (Top 5 columns):")
        
        for col in df.columns[:5]:
            print(f"- {col}: {df[col].nunique()} unique values")

    except FileNotFoundError:
        print(f"Error: File {file_path} not found. Please run the data collection step again.")


Quick Inspection for PETR4

### First 5 lines:
               Close      High       Low      Open      Volume
Date                                                          
2015-01-02  2.630870  2.807948  2.628059  2.807948  49559500.0
2015-01-05  2.406008  2.583086  2.397576  2.569032  78385100.0
2015-01-06  2.327308  2.481899  2.259850  2.448170  84723300.0
2015-01-07  2.436927  2.453791  2.349793  2.406009  85531000.0
2015-01-08  2.594330  2.639302  2.456603  2.470657  83306300.0

### DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2487 entries, 2015-01-02 to 2024-12-30
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   2487 non-null   float64
 1   High    2487 non-null   float64
 2   Low     2487 non-null   float64
 3   Open    2487 non-null   float64
 4   Volume  2487 non-null   float64
dtypes: float64(5)
memory usage: 116.6 KB
None

### Descriptive Statistics:
             Close      

# Engenharia de Features e Criação da Variável-Alvo com RSI e MACD

In [5]:
for ticker in tickers_no_suffix:
    file_path = f"{raw_data_path}/{ticker}_raw.csv"
    try:
        df = pd.read_csv(file_path, index_col='Date', parse_dates=True)
        df = df.iloc[1:].astype('float64').copy()
        dfs[ticker] = df
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")

processed_dfs = {}

for ticker, df in dfs.items():
    print(f"\nProcessing data for {ticker} with advanced features...")
    
    df_processed = df[['Close', 'Volume']].copy()
    
    # Feature Engineering
    df_processed['Daily_Return'] = df_processed['Close'].pct_change()
    df_processed['SMA_5'] = df_processed['Close'].rolling(5).mean()
    df_processed['SMA_10'] = df_processed['Close'].rolling(10).mean()
    df_processed['SMA_20'] = df_processed['Close'].rolling(20).mean()
    df_processed['RSI'] = ta.rsi(df_processed['Close'])
    
    macd_result = ta.macd(df_processed['Close'])
    df_processed['MACD'] = macd_result['MACD_12_26_9']
    
    df_processed['target'] = (df_processed['Close'].shift(-1) > df_processed['Close']).astype(int)
    
    df_processed.dropna(inplace=True)
    processed_dfs[ticker] = df_processed
    
    print(f"Data for {ticker} processed. New shape: {df_processed.shape}")
    print(f"Columns: {list(df_processed.columns)}")

combined_df = pd.concat(processed_dfs.values())

processed_data_path = 'data/processed'
os.makedirs(processed_data_path, exist_ok=True)
combined_df.to_csv(os.path.join(processed_data_path, 'combined_data.csv'))

print(f"\nCombined DataFrame shape: {combined_df.shape}")
print("First 5 rows:")
print(combined_df.head())
print("Processed DataFrame saved successfully.")


Processing data for PETR4 with advanced features...
Data for PETR4 processed. New shape: (2462, 9)
Columns: ['Close', 'Volume', 'Daily_Return', 'SMA_5', 'SMA_10', 'SMA_20', 'RSI', 'MACD', 'target']

Processing data for VALE3 with advanced features...
Data for VALE3 processed. New shape: (2462, 9)
Columns: ['Close', 'Volume', 'Daily_Return', 'SMA_5', 'SMA_10', 'SMA_20', 'RSI', 'MACD', 'target']

Combined DataFrame shape: (4924, 9)
First 5 rows:
               Close       Volume  Daily_Return     SMA_5    SMA_10    SMA_20  \
Date                                                                            
2015-02-06  2.563411  117747100.0     -0.072228  2.683150  2.635929  2.641551   
2015-02-09  2.642112   72534200.0      0.030702  2.717441  2.623843  2.649421   
2015-02-10  2.504385   75478700.0     -0.052128  2.657853  2.589270  2.646891   
2015-02-11  2.552168   48681100.0      0.019080  2.605011  2.588427  2.649280   
2015-02-12  2.656168   61706000.0      0.040749  2.583649  2.6103