## Data gathering

In [4]:
import pandas as pd
import datetime
import yfinance as yf

In [11]:
import pandas as pd
from datetime import date
import glob

# Define the list of tickers
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN"]

# Get today's date
today = date.today().strftime("%Y-%m-%d")

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()

# Loop through each ticker and get the historical stock data
for ticker in tickers:
    data = yf.download(ticker, start="2018-01-01", end=today)
    
    # Append the data to the combined DataFrame
    combined_data = combined_data.append(data)

# Save the combined data as a CSV file
combined_data.to_csv("combined_stock_data.csv")

[*********************100%***********************]  1 of 1 completed


  combined_data = combined_data.append(data)


[*********************100%***********************]  1 of 1 completed


  combined_data = combined_data.append(data)


[*********************100%***********************]  1 of 1 completed


  combined_data = combined_data.append(data)


[*********************100%***********************]  1 of 1 completed


  combined_data = combined_data.append(data)


In [13]:
# Read the combined CSV file
combined_data = pd.read_csv("combined_stock_data.csv")

# Print the head of the data
print(combined_data.head())

         Date       Open       High        Low      Close  Adj Close  \
0  2018-01-02  42.540001  43.075001  42.314999  43.064999  40.831585   
1  2018-01-03  43.132500  43.637501  42.990002  43.057499  40.824474   
2  2018-01-04  43.134998  43.367500  43.020000  43.257500  41.014111   
3  2018-01-05  43.360001  43.842499  43.262501  43.750000  41.481060   
4  2018-01-08  43.587502  43.902500  43.482498  43.587502  41.326996   

      Volume  
0  102223600  
1  118071600  
2   89738400  
3   94640000  
4   82271200  


In [14]:
# Print the tail of the data
print(combined_data.tail())

            Date        Open        High         Low       Close   Adj Close  \
5431  2023-05-19  118.160004  118.309998  115.699997  116.250000  116.250000   
5432  2023-05-22  116.769997  116.769997  114.250000  115.010002  115.010002   
5433  2023-05-23  114.269997  117.139999  113.779999  114.989998  114.989998   
5434  2023-05-24  115.349998  117.339996  115.019997  116.750000  116.750000   
5435  2023-05-25  116.629997  116.870003  114.309998  115.000000  115.000000   

        Volume  
5431  54990200  
5432  70741100  
5433  67576300  
5434  63487900  
5435  66496700  


## Data cleaning

In [15]:
# check on missing values
missing_values = combined_data.isnull().sum()
print(missing_values)

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [17]:
# remove duplicates
combined_data.drop_duplicates(inplace=True)

### Perform basic data manipulation and feature engineering

In [18]:
# extract date-related featured 
combined_data['Date'] = pd.to_datetime(combined_data['Date'])
combined_data['Year'] = combined_data['Date'].dt.year
combined_data['Month'] = combined_data['Date'].dt.month
combined_data['Day'] = combined_data['Date'].dt.day

# calculate daily returns
combined_data['Daily_Return'] = combined_data['Close'].pct_change()

# Calculate moving averages
combined_data['MA_50'] = combined_data['Close'].rolling(window=50).mean()
combined_data['MA_200'] = combined_data['Close'].rolling(window=200).mean()

# Save the cleaned and preprocessed data to a new CSV file:
combined_data.to_csv("cleaned_stock_data.csv", index=False)

In [19]:
# Read the cleaned CSV file into a DataFrame
cleaned_data = pd.read_csv("cleaned_stock_data.csv")

In [20]:
# Retrieve the head of the data
head_data = cleaned_data.head()
print("Head of the data:")
print(head_data)

Head of the data:
         Date       Open       High        Low      Close  Adj Close  \
0  2018-01-02  42.540001  43.075001  42.314999  43.064999  40.831585   
1  2018-01-03  43.132500  43.637501  42.990002  43.057499  40.824474   
2  2018-01-04  43.134998  43.367500  43.020000  43.257500  41.014111   
3  2018-01-05  43.360001  43.842499  43.262501  43.750000  41.481060   
4  2018-01-08  43.587502  43.902500  43.482498  43.587502  41.326996   

      Volume  Year  Month  Day  Daily_Return  MA_50  MA_200  
0  102223600  2018      1    2           NaN    NaN     NaN  
1  118071600  2018      1    3     -0.000174    NaN     NaN  
2   89738400  2018      1    4      0.004645    NaN     NaN  
3   94640000  2018      1    5      0.011385    NaN     NaN  
4   82271200  2018      1    8     -0.003714    NaN     NaN  


In [21]:
# Retrieve the tail of the data
tail_data = cleaned_data.tail()
print("Tail of the data:")
print(tail_data)

Tail of the data:
            Date        Open        High         Low       Close   Adj Close  \
5431  2023-05-19  118.160004  118.309998  115.699997  116.250000  116.250000   
5432  2023-05-22  116.769997  116.769997  114.250000  115.010002  115.010002   
5433  2023-05-23  114.269997  117.139999  113.779999  114.989998  114.989998   
5434  2023-05-24  115.349998  117.339996  115.019997  116.750000  116.750000   
5435  2023-05-25  116.629997  116.870003  114.309998  115.000000  115.000000   

        Volume  Year  Month  Day  Daily_Return     MA_50     MA_200  
5431  54990200  2023      5   19     -0.016081  103.3994  105.77485  
5432  70741100  2023      5   22     -0.010667  103.8850  105.63705  
5433  67576300  2023      5   23     -0.000174  104.3362  105.50800  
5434  63487900  2023      5   24      0.015306  104.7736  105.39470  
5435  66496700  2023      5   25     -0.014989  105.1496  105.28055  


In [22]:
# clean remaining NaN
# Fill missing values with the mean
cleaned_data_filled = cleaned_data.fillna(cleaned_data.mean())

# Retrieve the head of the filled data
head_data_filled = cleaned_data_filled.head()
print("Head of the filled data:")
print(head_data_filled)

# Retrieve the tail of the filled data
tail_data_filled = cleaned_data_filled.tail()
print("Tail of the filled data:")
print(tail_data_filled)

Head of the filled data:
         Date       Open       High        Low      Close  Adj Close  \
0  2018-01-02  42.540001  43.075001  42.314999  43.064999  40.831585   
1  2018-01-03  43.132500  43.637501  42.990002  43.057499  40.824474   
2  2018-01-04  43.134998  43.367500  43.020000  43.257500  41.014111   
3  2018-01-05  43.360001  43.842499  43.262501  43.750000  41.481060   
4  2018-01-08  43.587502  43.902500  43.482498  43.587502  41.326996   

      Volume  Year  Month  Day  Daily_Return       MA_50      MA_200  
0  102223600  2018      1    2      0.000649  127.270888  128.850617  
1  118071600  2018      1    3     -0.000174  127.270888  128.850617  
2   89738400  2018      1    4      0.004645  127.270888  128.850617  
3   94640000  2018      1    5      0.011385  127.270888  128.850617  
4   82271200  2018      1    8     -0.003714  127.270888  128.850617  
Tail of the filled data:
            Date        Open        High         Low       Close   Adj Close  \
5431  2023-

  cleaned_data_filled = cleaned_data.fillna(cleaned_data.mean())
