1. Import Packages

In [19]:
import pandas as pd
from textblob import TextBlob



2. Load datasets 

In [13]:
# Locations of the two raw CSV inputs, relative to the notebook's working directory.
news_csv_path = '../Data/raw_analyst_ratings.csv'
stock_csv_path = '../Data/yfinance_data/AAPL_historical_data.csv'

# Read both files with pandas' default parsing (no date parsing is requested here).
news_data = pd.read_csv(news_csv_path)
stock_data = pd.read_csv(stock_csv_path)


3. Convert Timezone-Aware Datetime to Naive Datetime


In [16]:
def _to_naive_datetime(values):
    """Return ``values`` as timezone-naive datetimes.

    ``pd.read_csv`` is called without ``parse_dates`` when the data is loaded,
    so on a fresh kernel the date columns are not datetime-typed and the
    ``.dt`` accessor would raise AttributeError. Columns that are not yet
    datetime-typed are therefore parsed first; columns that already are
    datetime-typed only have their timezone (if any) removed.
    """
    if not pd.api.types.is_datetime64_any_dtype(values):
        # utc=True yields one tz-aware dtype even when rows carry different
        # UTC offsets, so the .dt accessor below is always available.
        # NOTE(review): if the raw strings mix several layouts, newer pandas
        # may need an explicit ``format=`` here -- confirm against the data.
        values = pd.to_datetime(values, utc=True)
    # Remove timezone information to make the values naive
    return values.dt.tz_localize(None)


# Both merge keys must be naive datetimes; merging a datetime column against
# a string column raises an error.
news_data['date'] = _to_naive_datetime(news_data['date'])
stock_data['Date'] = _to_naive_datetime(stock_data['Date'])

# Now merge the dataframes on the date column
aligned_data = pd.merge(news_data, stock_data, left_on='date', right_on='Date', how='inner')

# Print the first few rows of the merged DataFrame to verify
print(aligned_data.head())

   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

        date stock       Date       Open       High        Low      Close  \
0 2020-06-05     A 2020-06-05  80.837502  82.937500  80.807503 

In [17]:
# Report the dtype of each merge key: news 'date' first, then stock 'Date'.
for date_column in (news_data['date'], stock_data['Date']):
    print(date_column.dtype)


datetime64[ns]
datetime64[ns]


4. Data alignment

In [18]:
# Set the time-of-day of both merge keys to midnight so rows match on calendar date.
for frame, date_column in ((news_data, 'date'), (stock_data, 'Date')):
    frame[date_column] = frame[date_column].dt.normalize()

# Inner-join news rows to stock rows that share the same normalized date.
aligned_data = news_data.merge(
    stock_data,
    left_on='date',
    right_on='Date',
    how='inner',
)

# Show the first rows of the aligned frame as a sanity check.
print(aligned_data.head())

   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

        date stock       Date       Open       High        Low      Close  \
0 2020-06-05     A 2020-06-05  80.837502  82.937500  80.807503 

5. Sentiment analysis

In [20]:

# Sentiment scoring needs the 'headline' column; report and skip when it is absent.
if 'headline' not in news_data.columns:
    print("The 'headline' column is missing in the news dataset.")
else:
    def get_sentiment(text):
        """Return the TextBlob polarity score for ``text``."""
        return TextBlob(text).sentiment.polarity

    def categorize_sentiment(polarity):
        """Label a polarity score as 'positive', 'negative' or 'neutral'."""
        if polarity > 0:
            return 'positive'
        if polarity < 0:
            return 'negative'
        # Reached when polarity is neither above nor below zero.
        return 'neutral'

    # Score every headline, then derive the categorical label from the score.
    headline_scores = news_data['headline'].apply(get_sentiment)
    news_data['sentiment'] = headline_scores
    news_data['sentiment_category'] = news_data['sentiment'].apply(categorize_sentiment)

    # Preview the headline next to its score and label.
    preview_columns = ['headline', 'sentiment', 'sentiment_category']
    print(news_data[preview_columns].head())


                                            headline  sentiment  \
0            Stocks That Hit 52-Week Highs On Friday        0.0   
1         Stocks That Hit 52-Week Highs On Wednesday        0.0   
2                      71 Biggest Movers From Friday        0.0   
3       46 Stocks Moving In Friday's Mid-Day Session        0.0   
4  B of A Securities Maintains Neutral on Agilent...        0.0   

  sentiment_category  
0            neutral  
1            neutral  
2            neutral  
3            neutral  
4            neutral  


6. Calculate daily stock returns

In [21]:



# Index the prices by date. The guard keeps the cell re-runnable: once 'Date'
# has become the index it is no longer a column, and an unconditional
# stock_data['Date'] / set_index('Date') would raise KeyError on a second run.
if 'Date' in stock_data.columns:
    stock_data = stock_data.assign(
        Date=pd.to_datetime(stock_data['Date'])
    ).set_index('Date')

# Compute the returns only once. Recomputing on a frame whose first row was
# already dropped would yield a new leading NaN and drop one more row on
# every re-run.
if 'Daily_Return' not in stock_data.columns:
    # Daily return = percentage change between consecutive closing prices.
    # NOTE(review): assumes rows are already in chronological order -- confirm.
    stock_data['Daily_Return'] = stock_data['Close'].pct_change() * 100

    # Drop rows with a NaN return (the first row has no previous close)
    stock_data = stock_data.dropna(subset=['Daily_Return'])

# Print the first few rows to verify the calculation
print(stock_data[['Close', 'Daily_Return']].head())


               Close  Daily_Return
Date                              
1980-12-15  0.121652     -5.217061
1980-12-16  0.112723     -7.339788
1980-12-17  0.115513      2.475091
1980-12-18  0.118862      2.899246
1980-12-19  0.126116      6.102867


7. Correlation analysis