# Data Collection and PreProcessing

## Data Source

In [4]:
import yfinance as yf
from datetime import datetime

# Define stock tickers and date range
start_date = "2014-08-01"
end_date = "2016-11-30"
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']

# Download data for each stock
for ticker in tickers:
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    print(stock_data.head())  # Print first few rows
    stock_data.to_csv(f'{ticker}_stock.csv')  # Save to CSV


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Price        Adj Close       Close        High         Low        Open  \
Ticker             IBM         IBM         IBM         IBM         IBM   
Date                                                                     
2014-08-01  115.999710  180.831741  183.078400  180.554489  182.122375   
2014-08-04  116.300217  181.300186  181.596558  180.305923  181.022949   
2014-08-05  114.742508  178.871887  180.879547  178.240921  180.449326   
2014-08-06  114.724068  177.791580  178.661575  176.328873  177.208420   
2014-08-07  113.693855  176.195023  178.470367  175.506699  178.432129   

Price        Volume  
Ticker          IBM  
Date                 
2014-08-01  5419431  
2014-08-04  2223691  
2014-08-05  3460063  
2014-08-06  4023962  
2014-08-07  2833196  
Price       Adj Close      Close       High        Low       Open     Volume
Ticker           AAPL       AAPL       AAPL       AAPL       AAPL       AAPL
Date                                                                        





## Data Cleaning

In [5]:
import pandas as pd

# Function to clean individual stock data
def clean_stock_data(file_path, output_path):
    # Load the dataset
    df = pd.read_csv(file_path, index_col=0)
    
    # Ensure columns are standardized
    df.columns = [col.split(' ')[-1] for col in df.columns]
    
    # Drop unnecessary columns (keeping only essential ones)
    df = df[['Open', 'High', 'Low', 'Close', 'Volume']]
    
    # Handle missing data (drop rows with NaN values)
    df = df.dropna()
    
    # Reset the index and ensure Date column is properly formatted
    df.reset_index(inplace=True)
    df.rename(columns={"index": "Date"}, inplace=True)
    
    # Export the cleaned data to a new CSV
    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")

# Iterate through each stock file and clean the data
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
for ticker in tickers:
    input_file = f'{ticker}_stock.csv'
    output_file = f'{ticker}_stock_cleaned.csv'
    clean_stock_data(input_file, output_file)


Cleaned data saved to IBM_stock_cleaned.csv
Cleaned data saved to AAPL_stock_cleaned.csv
Cleaned data saved to META_stock_cleaned.csv
Cleaned data saved to GOOGL_stock_cleaned.csv


## Stationarity Check - Augmented Dickey-Fuller (ADF) Test

In [7]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

# Function to check stationarity using the Augmented Dickey-Fuller test
def check_stationarity(file_path):
    # Load the dataset
    df = pd.read_csv(file_path)
    
    # Ensure 'Date' is a datetime object and sort by date
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values('Date')
    
    # Focus on the 'Close' column for stationarity check
    close_prices = df['Close']
    
    # Perform Augmented Dickey-Fuller Test
    adf_result = adfuller(close_prices)
    
    # Print the results
    print(f"Results of ADF Test for {file_path.split('_')[0]}:")
    print(f"ADF Statistic: {adf_result[0]}")
    print(f"p-value: {adf_result[1]}")
    print(f"Critical Values: {adf_result[4]}")
    
    # Interpret the p-value
    if adf_result[1] <= 0.05:
        print("The time series is stationary (reject the null hypothesis).")
    else:
        print("The time series is non-stationary (fail to reject the null hypothesis).")
    print("\n")

# Iterate through the cleaned stock files and check stationarity
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
for ticker in tickers:
    cleaned_file = f'{ticker}_stock_cleaned.csv'
    check_stationarity(cleaned_file)


KeyError: 'Date'

In [8]:
df = pd.read_csv('IBM_stock_cleaned.csv')
print(df.head())
print(df.columns)


        Price                Open                High                 Low  \
0      Ticker                 IBM                 IBM                 IBM   
1  2014-08-01  182.12237548828125  183.07839965820312   180.5544891357422   
2  2014-08-04     181.02294921875   181.5965576171875  180.30592346191406   
3  2014-08-05  180.44932556152344  180.87954711914062   178.2409210205078   
4  2014-08-06   177.2084197998047   178.6615753173828  176.32887268066406   

                Close             Close.1   Volume  
0                 IBM                 IBM      IBM  
1  115.99971008300781   180.8317413330078  5419431  
2  116.30021667480469  181.30018615722656  2223691  
3  114.74250793457031  178.87188720703125  3460063  
4  114.72406768798828   177.7915802001953  4023962  
Index(['Price', 'Open', 'High', 'Low', 'Close', 'Close.1', 'Volume'], dtype='object')
