# Data Preparation

In [1]:
import pandas as pd
import yfinance as yf
from datetime import datetime

In [2]:
reddit_file = '../Data/analyzed_stock_data.csv'
reddit_data = pd.read_csv(reddit_file)

In [3]:
reddit_data['date_only'] = pd.to_datetime(reddit_data['date_only'])

In [4]:
def fetch_stock_data(tickers, start_date, end_date):
    stock_data = []
    for ticker in tickers:
        try:
            print(f"Fetching data for {ticker}...")
            data = yf.download(ticker, start=start_date, end=end_date)
            if not data.empty:
                data['ticker'] = ticker
                
                data['date_only_'] = data.index.date
                
                data.columns = [f"{col[0]}_{col[1]}" if isinstance(col, tuple) else col for col in data.columns]
                
                data.rename(columns={col: 'date_only' if 'date_only' in col else col for col in data.columns}, inplace=True)
                print(f"Date_only column added for {ticker}. Columns: {data.columns}")
                stock_data.append(data)
            else:
                print(f"No data found for {ticker}.")
        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")
    
    
    if stock_data:
        combined_data = pd.concat(stock_data, ignore_index=True)
        print(f"Combined stock data columns: {combined_data.columns}")
    else:
        combined_data = pd.DataFrame()  
        print("No stock data fetched.")

    
    if 'date_only' in combined_data.columns:
        combined_data['date_only'] = pd.to_datetime(combined_data['date_only'])
    else:
        print("Warning: 'date_only' column not found in combined stock data.")

    return combined_data



In [5]:
tickers = ['AI', 'MSTR', 'NVDA', 'VOO']
start_date = (datetime.now() - pd.DateOffset(months=6)).strftime('%Y-%m-%d')
end_date = datetime.now().strftime('%Y-%m-%d')

combined_stock_data = fetch_stock_data(tickers, start_date, end_date)

if 'date_only' in combined_stock_data.columns:
    print("date_only column found in stock data!")
else:
    print("date_only column not found in stock data.")


combined_stock_data = combined_stock_data.reset_index(drop=True)
reddit_data = reddit_data.reset_index(drop=True)

Fetching data for AI...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Date_only column added for AI. Columns: Index(['Adj Close_AI', 'Close_AI', 'High_AI', 'Low_AI', 'Open_AI', 'Volume_AI',
       'ticker_', 'date_only'],
      dtype='object')
Fetching data for MSTR...
Date_only column added for MSTR. Columns: Index(['Adj Close_MSTR', 'Close_MSTR', 'High_MSTR', 'Low_MSTR', 'Open_MSTR',
       'Volume_MSTR', 'ticker_', 'date_only'],
      dtype='object')
Fetching data for NVDA...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Date_only column added for NVDA. Columns: Index(['Adj Close_NVDA', 'Close_NVDA', 'High_NVDA', 'Low_NVDA', 'Open_NVDA',
       'Volume_NVDA', 'ticker_', 'date_only'],
      dtype='object')
Fetching data for VOO...
Date_only column added for VOO. Columns: Index(['Adj Close_VOO', 'Close_VOO', 'High_VOO', 'Low_VOO', 'Open_VOO',
       'Volume_VOO', 'ticker_', 'date_only'],
      dtype='object')
Combined stock data columns: Index(['Adj Close_AI', 'Close_AI', 'High_AI', 'Low_AI', 'Open_AI', 'Volume_AI',
       'ticker_', 'date_only', 'Adj Close_MSTR', 'Close_MSTR', 'High_MSTR',
       'Low_MSTR', 'Open_MSTR', 'Volume_MSTR', 'Adj Close_NVDA', 'Close_NVDA',
       'High_NVDA', 'Low_NVDA', 'Open_NVDA', 'Volume_NVDA', 'Adj Close_VOO',
       'Close_VOO', 'High_VOO', 'Low_VOO', 'Open_VOO', 'Volume_VOO'],
      dtype='object')
date_only column found in stock data!





In [6]:
merged_data = pd.merge(
    reddit_data, combined_stock_data, on='date_only', how='inner'
)
print('Merge Successful')

Merge Successful


In [7]:
merged_data.drop(columns=['date_only', 'created_utc'], inplace=True)


In [8]:
output_file = '../Data/stock_sentiment_combined_data.csv'
merged_data.to_csv(output_file, index=False)
print(f'Stock Sentiment Data saved to {output_file}')

Stock Sentiment Data saved to ../Data/stock_sentiment_combined_data.csv
