# Import Libs

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
import os

# split the data imports
from sklearn.model_selection import train_test_split

# Define the parameters for data collection

In [2]:
# stock tickers
tickers = ['PETR4.SA', 'VALE3.SA']

# Time period
start_date = '2015-01-01'
end_date = '2025-01-01'

# save raw data
raw_data_path = 'data/raw'

# Create the folder if it does not exist
os.makedirs(raw_data_path, exist_ok=True)

# Collect and save data for each action
for ticker in tickers:
    print(f"Downloading data to {ticker}...")
    
    # Download data using the yfinance library
    df = yf.download(ticker, start=start_date, end=end_date)

    # Check if the DataFrame is not empty
    if not df.empty:
        # Save the DataFrame to a CSV file inside the 'data/raw' folder
        file_path = os.path.join(raw_data_path, f'{ticker.replace(".SA", "")}_raw.csv')
        df.to_csv(file_path)
        print(f"{ticker} data saved to: {file_path}")
    else:
        print(f"Error: Unable to download data for {ticker}.")

Downloading data to PETR4.SA...


  df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start=start_date, end=end_date)


PETR4.SA data saved to: data/raw\PETR4_raw.csv
Downloading data to VALE3.SA...


[*********************100%***********************]  1 of 1 completed

VALE3.SA data saved to: data/raw\VALE3_raw.csv





Import libs, set tickers/period, download data with yf.download(), save CSV to data/raw with to_csv().

# Understanding Data

In [10]:
# --- Load data from PETR4 and VALE3 ---
try:
    petr4_df = pd.read_csv('data/raw/PETR4_raw.csv', index_col='Date', parse_dates=True)
    vale3_df = pd.read_csv('data/raw/VALE3_raw.csv', index_col='Date', parse_dates=True)
except FileNotFoundError:
    print("Error: No CSV files found. Please verify that Phase 1 was executed correctly.")
    exit()

print("--- PETR4 DataFrame analysis ---")
print(petr4_df.head())

print("--- VALE3 DataFrame analysis ---")
print(vale3_df.head())

# Check data types, non-null values, and memory usage
print("\nInformation about PETR4 data:")
petr4_df.info()

print("\nInformation about VALE3 data:")
vale3_df.info()

# Check for missing values (confirm)
print("\nChecking for null values in PETR4:")
print(petr4_df.isnull().sum())

print("\nChecking for null values in VALE3:")
print(vale3_df.isnull().sum())

# Check for duplicates in the index (data)
print("\nChecking for duplicates in the PETR4 index (dates):")
print(petr4_df.index.duplicated().sum())

print("\Checking for duplicates in the VALE3 index (dates):")
print(vale3_df.index.duplicated().sum())

ValueError: 'Date' is not in list

- df.info() → overview of types and nulls.
- df.isnull().sum() → counts nulls per column.
- df.index.duplicated().sum() → checks for duplicate dates.

# Engenharia de Features e Criação da Variável-Alvo

In [5]:
# --- Function to create features and the target variable ---

def create_features(df):
    """
    Creates predictor variables (features) and the target variable (target) in the DataFrame.
    """

    # daily return
    df["Daily_Return"] = df["Close"].pct_change()

    # 2. Simple Moving Averages (SMA)
    df["SMA_5"] = df["Close"].rolling(winodw = 5).mean()
    df["SMA_10"] = df["Close"].rolling(window = 10).mean()
    df["SMA_20"] = df["Close"].rolling(window = 20).mean()   

    # 3. Target Variable
    df["Target"] = (df["Close"].shift(-1) > df["Close"]).astype(int)
    df.dropna(inplace=True)
    return df

# --- Apply the function to each DataFrame ---
petr4_processed_df = create_features(petr4_df.copy())
vale3_processed_df = create_features(vale3_df.copy())

print("\nPETR4 DataFrame with features and target:")
print(petr4_processed_df.head())
print("\nInformation after feature engineering:")
petr4_processed_df.info()

print("\nVALE3 DataFrame with features and target:")
print(vale3_processed_df.head())
print("\nInformation after feature engineering:")
vale3_processed_df.info()



NameError: name 'petr4_df' is not defined

### 1:
- pct_change() calculates the percentage change between the current and previous values.
### 2:
- window is the number of days to calculate the average.
### 3:
- The target variable is 1 if the next day's closing price is higher than the current day's.
- We use .shift(-1) to compare the current price with the next day's price.
- .astype(int) converts True/False to 1/0.
- After calculating the features, the first 20 rows will have null values in the moving averages.
- For the model, we need to remove these rows.

# split the data