In [73]:
import os
import pandas as pd
from textblob import TextBlob

# Set the working directory one level up
os.chdir("..")

# Function to load stock data from multiple files
def load_stock_data(file_paths):
    stock_dfs = []
    for path in file_paths:
        df = pd.read_csv(path)
        df.columns = df.columns.str.strip().str.lower()
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df.rename(columns={'date': 'Date', 'close': 'Close'}, inplace=True)
        stock_dfs.append(df)
    return pd.concat(stock_dfs, keys=[p.split('/')[-1].split('.')[0] for p in file_paths]).reset_index(level=0).rename(columns={'level_0': 'Ticker'})

# Define stock file paths
stock_file_paths = [
    'C:/Users/hp/Desktop/week1/data/AAPL_historical_data.csv',
    'C:/Users/hp/Desktop/week1/data/AMZN_historical_data.csv',
    'C:/Users/hp/Desktop/week1/data/GOOG_historical_data.csv',
    'C:/Users/hp/Desktop/week1/data/META_historical_data.csv',
    'C:/Users/hp/Desktop/week1/data/MSFT_historical_data.csv',
    'C:/Users/hp/Desktop/week1/data/NVDA_historical_data.csv',
    'C:/Users/hp/Desktop/week1/data/TSLA_historical_data.csv'
]

# Load stock and news data
stock_data = load_stock_data(stock_file_paths)
news_data = pd.read_csv("C:/Users/hp/Desktop/week1/data/raw_analyst_ratings.csv")
news_data.columns = news_data.columns.str.strip().str.lower()
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')
news_data.rename(columns={'date': 'Date', 'headline': 'Headline'}, inplace=True)

# Standardize timezone to UTC
def standardize_timezone(df):
    if df['Date'].dt.tz is None:
        df['Date'] = df['Date'].dt.tz_localize('UTC')
    else:
        df['Date'] = df['Date'].dt.tz_convert('UTC')
    return df

# Apply timezone standardization
news_data = standardize_timezone(news_data)
stock_data = standardize_timezone(stock_data)

# Merge data
merged_data = pd.merge(news_data, stock_data, on='Date', how='outer')

# Sentiment analysis and daily returns
merged_data['Sentiment'] = merged_data['Headline'].apply(lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else None)
merged_data['Daily_Return'] = merged_data.groupby('Ticker')['Close'].pct_change() * 100

# Clean data and calculate correlation
clean_data = merged_data.dropna(subset=['Sentiment', 'Daily_Return'])

# Debugging: Check the cleaned data
print(clean_data[['Sentiment', 'Daily_Return']].head())
print(f"Number of non-null values: {clean_data[['Sentiment', 'Daily_Return']].notnull().sum()}")

# Calculate correlation only if there are enough data points
if len(clean_data) > 1:  # At least 2 points needed for correlation
    correlation = clean_data[['Sentiment', 'Daily_Return']].corr().iloc[0, 1]
    print(f"Correlation: {correlation:.2f}")
else:
    print("Not enough data to calculate correlation.")

         Sentiment  Daily_Return
1396536        0.0      0.197563
1396537        0.0      0.402248
1396538        0.0      0.077239
1396539        0.0      0.716185
1396540        0.0     -0.603527
Number of non-null values: Sentiment       7
Daily_Return    7
dtype: int64
Correlation: nan
