# Correlation Analysis

## import libraries

In [1]:
import os
import sys
import pandas as pd
import yfinance as yf

# import plotly as px

# Dynamically add the absolute path to the project directory and scripts
project_root = r"C:\Users\hp\Desktop\10Academy\Financial-News-and-Stock-Market-Analysis"
scripts_path = os.path.join(project_root, "scripts")

sys.path.append(project_root)  # Add project root to sys.path
sys.path.append(scripts_path)  # Add scripts directory to sys.path

## Load data combine datas and save

In [2]:
from scripts.load_stock_data import load_stock_data

# Path to the data folder
data_folder = '../Data/yfinance_data/'

# Load and process the data
combined_data = load_stock_data(data_folder)

# Save the combined data to a CSV file
combined_data.to_csv('../Data/yfinance_data/combined_historical_data.csv', index=False)

Files to load: ['AAPL_historical_data.csv', 'AMZN_historical_data.csv', 'combined_historical_data.csv', 'GOOG_historical_data.csv', 'META_historical_data.csv', 'MSFT_historical_data.csv', 'NVDA_historical_data.csv', 'TSLA_historical_data.csv']


In [4]:
print(combined_data.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits', 'Ticker'],
      dtype='object')


### Filter data within a specific date range

In [5]:

filtered_data = combined_data[(combined_data['Date'] >= '2020-01-01') & (combined_data['Date'] <= '2022-12-31')]
print("Filtered Data:", filtered_data.head())

Filtered Data:            Date       Open       High        Low      Close  Adj Close  \
9847 2020-01-02  74.059998  75.150002  73.797501  75.087502  72.876099   
9848 2020-01-03  74.287498  75.144997  74.125000  74.357498  72.167595   
9849 2020-01-06  73.447502  74.989998  73.187500  74.949997  72.742661   
9850 2020-01-07  74.959999  75.224998  74.370003  74.597504  72.400536   
9851 2020-01-08  74.290001  76.110001  74.290001  75.797501  73.565208   

         Volume  Dividends  Stock Splits Ticker  
9847  135480400        0.0           0.0   AAPL  
9848  146322800        0.0           0.0   AAPL  
9849  118387200        0.0           0.0   AAPL  
9850  108872000        0.0           0.0   AAPL  
9851  132079200        0.0           0.0   AAPL  


In [6]:
# File paths
combined_data_path = '../Data/yfinance_data/combined_historical_data.csv'
news_data_path = '../Data/raw_analyst_ratings.csv'
output_path = '../Data/task3_combined_sentiment_data.csv'

# Load data
combined_data = pd.read_csv(combined_data_path, parse_dates=['Date'])
news_data = pd.read_csv(news_data_path)



## normalize_dates

In [7]:
combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='ISO8601', errors='coerce')  # Handle malformed dates
news_data['date'] = pd.to_datetime(news_data['date'], format='ISO8601', errors='coerce')  # Adjust for 'date' column in news
print("Parsing dates in the data...")

Parsing dates in the data...


## Merge datasets and save to data

In [8]:
from scripts.correlatonal_analyzer import analyze_tickers
# Analyze sentiment and stock movements
analyze_tickers(combined_data, news_data, output_path)

Processing data for ticker: AAPL
Processing data for ticker: AMZN
Processing data for ticker: combined
Processing data for ticker: GOOG
Processing data for ticker: META
Processing data for ticker: MSFT
Processing data for ticker: NVDA
Processing data for ticker: TSLA
Combined data saved to ../Data/task3_combined_sentiment_data.csv


  final_combined_data = pd.concat(results, ignore_index=True)


## correlation analysis for AAPL

In [9]:
# Example: Filter data for a specific stock ticker ('AAPL')
aapl_data = combined_data[combined_data['Ticker'] == 'AAPL']
print("AAPL Data:", aapl_data.head())

AAPL Data:         Date      Open      High       Low     Close  Adj Close     Volume  \
0 1980-12-12  0.128348  0.128906  0.128348  0.128348   0.098943  469033600   
1 1980-12-15  0.122210  0.122210  0.121652  0.121652   0.093781  175884800   
2 1980-12-16  0.113281  0.113281  0.112723  0.112723   0.086898  105728000   
3 1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089049   86441600   
4 1980-12-18  0.118862  0.119420  0.118862  0.118862   0.091630   73449600   

   Dividends  Stock Splits Ticker  
0        0.0           0.0   AAPL  
1        0.0           0.0   AAPL  
2        0.0           0.0   AAPL  
3        0.0           0.0   AAPL  
4        0.0           0.0   AAPL  


In [10]:
from scripts.correlatonal_analyzer import generate_random_headlines

# Generate random headlines for a specific ticker ('AAPL')
ticker = 'AAPL'
print(f"Random headlines for {ticker}:")
random_headlines = generate_random_headlines(news_data, ticker, num_headlines=10)
for idx, headline in enumerate(random_headlines, 1):
    print(f"{idx}. {headline}")


Random headlines for AAPL:
1. Walmart Shortens Store Operating Hours In Response To Coronavirus Outbreak
2. Apple Acquires DarkSky, Weather App To Be Pulled From Android
3. Apple shares are trading higher despite market weakness, not currently seeing company-specific news. It was reported the company is expected to begin iPhone 12 production in July.
4. Apple CEO Tim Cook "We are confident in our future and continue to make significant investments in all areas of our business to enrich our customers' lives ... including our five-year commitment to contribute $350 billion to the United States Economy"
5. Bulls And Bears Of The Week: Amazon, Boeing, Disney, Netflix And More
6. Loup Ventures Says Believes It Is Still Likely Apple Will Host A Fall 5G iPhone Launch; Notes China Manufacturing And Assembly, Including Some Apple Partners Are Starting To Ramp Up Production
7. 88 Stocks Moving In Friday's Mid-Day Session
8. Some Staples Stocks Bouncing Back After Sliding In Recent Rally As Cauti

In [12]:
from scripts.correlatonal_analyzer import process_ticker_data
ticker = 'AAPL'
print(f"Processing data for ticker: {ticker}")
aapl_data = process_ticker_data(ticker, combined_data, news_data)


Processing data for ticker: AAPL


In [13]:
print(aapl_data)

           Date      Close  Sentiment  Daily Returns
0    2020-03-09  66.542503  -0.466667            NaN
1    2020-03-09  66.542503   0.000000       0.000000
2    2020-03-09  66.542503   0.000000       0.000000
3    2020-03-10  71.334999   0.000000       0.072022
4    2020-03-10  71.334999   0.000000       0.000000
..          ...        ...        ...            ...
410  2020-06-10  88.209999   0.000000       0.000000
411  2020-06-10  88.209999   0.500000       0.000000
412  2020-06-10  88.209999   0.000000       0.000000
413  2020-06-10  88.209999   0.000000       0.000000
414  2020-06-10  88.209999   0.068182       0.000000

[415 rows x 4 columns]


In [14]:
from scripts.correlatonal_analyzer import analyze_ticker_correlation

# Analyze correlations for a specific ticker, e.g., 'AAPL'
ticker = 'AAPL'
print(f"Analyzing correlations for {ticker}...")
correlations = analyze_ticker_correlation(ticker, combined_data, news_data)

print(f"Correlation between Sentiment and Closing Prices: {correlations['sentiment_close']:.4f}")



Analyzing correlations for AAPL...
Correlation between Sentiment and Closing Prices: 0.0517


In [15]:
print(f"Correlation between Sentiment and Daily Returns: {correlations['sentiment_returns']:.4f}")

Correlation between Sentiment and Daily Returns: 0.1556


In [None]:
correlation = combined_data['Sentiment'].corr(combined_data['Close'])
print("Correlation between sentiment and closing:", correlation)

## correlation Analysis for AMZN 

In [41]:
# Example: Filter data for a specific stock ticker ('AAPL')
amzn_data = combined_data[combined_data['Ticker'] == 'AMZN']
print("AMZN Data:", amzn_data.head())

AMZN Data:             Date      Open      High       Low     Close  Adj Close  \
10998 1997-05-15  0.121875  0.125000  0.096354  0.097917   0.097917   
10999 1997-05-16  0.098438  0.098958  0.085417  0.086458   0.086458   
11000 1997-05-19  0.088021  0.088542  0.081250  0.085417   0.085417   
11001 1997-05-20  0.086458  0.087500  0.081771  0.081771   0.081771   
11002 1997-05-21  0.081771  0.082292  0.068750  0.071354   0.071354   

           Volume  Dividends  Stock Splits Ticker  
10998  1443120000        0.0           0.0   AMZN  
10999   294000000        0.0           0.0   AMZN  
11000   122136000        0.0           0.0   AMZN  
11001   109344000        0.0           0.0   AMZN  
11002   377064000        0.0           0.0   AMZN  


In [24]:
from scripts.correlatonal_analyzer import process_ticker_data
ticker = 'AMZN'
print(f"Processing data for ticker: {ticker}")
amzn_data = process_ticker_data(ticker, combined_data, news_data)


Processing data for ticker: AMZN


In [25]:
print(amzn_data)

           Date       Close  Sentiment  Daily Returns
0    2020-04-27  118.800003   0.350000            NaN
1    2020-04-27  118.800003  -0.250000            0.0
2    2020-04-27  118.800003   0.000000            0.0
3    2020-04-27  118.800003   0.400000            0.0
4    2020-04-27  118.800003   0.000000            0.0
..          ...         ...        ...            ...
260  2020-06-10  132.372498   0.433333            0.0
261  2020-06-10  132.372498   0.500000            0.0
262  2020-06-10  132.372498   0.227273            0.0
263  2020-06-10  132.372498   0.068182            0.0
264  2020-06-10  132.372498   0.000000            0.0

[265 rows x 4 columns]


## Correlation Analysis for GOOG

In [42]:
# Example: Filter data for a specific stock ticker ('AAPL')
goog_data = combined_data[combined_data['Ticker'] == 'GOOG']
print("GOOG Data:", goog_data.head())

GOOG Data:              Date      Open      High       Low     Close  Adj Close  \
290412 2004-08-19  2.490664  2.591785  2.390042  2.499133   2.496292   
290413 2004-08-20  2.515820  2.716817  2.503118  2.697639   2.694573   
290414 2004-08-23  2.758411  2.826406  2.716070  2.724787   2.721690   
290415 2004-08-24  2.770615  2.779581  2.579581  2.611960   2.608991   
290416 2004-08-25  2.614201  2.689918  2.587302  2.640104   2.637103   

           Volume  Dividends  Stock Splits Ticker  
290412  897427216        0.0           0.0   GOOG  
290413  458857488        0.0           0.0   GOOG  
290414  366857939        0.0           0.0   GOOG  
290415  306396159        0.0           0.0   GOOG  
290416  184645512        0.0           0.0   GOOG  


In [43]:
from scripts.correlatonal_analyzer import process_ticker_data
ticker = 'GOOG'
print(f"Processing data for ticker: {ticker}")
goog_data = process_ticker_data(ticker, combined_data, news_data)

Processing data for ticker: GOOG


In [45]:
print(goog_data)

            Date      Close  Sentiment  Daily Returns
0     2018-11-13  51.802502   0.000000            NaN
1     2018-11-13  51.802502   0.000000       0.000000
2     2018-11-14  52.182999   0.113636       0.007345
3     2018-11-14  52.182999   0.000000       0.000000
4     2018-11-15  53.235500   0.000000       0.020169
...          ...        ...        ...            ...
1163  2020-06-09  72.807999   0.000000       0.006602
1164  2020-06-09  72.807999   0.133333       0.000000
1165  2020-06-10  73.292503   0.033333       0.006655
1166  2020-06-10  73.292503   0.000000       0.000000
1167  2020-06-10  73.292503   0.000000       0.000000

[1168 rows x 4 columns]


## Correlation for META

In [46]:
# Example: Filter data for a specific stock ticker
meta_data = combined_data[combined_data['Ticker'] == 'META']
print("META Data:", meta_data.head())

META Data:              Date       Open       High        Low      Close  Adj Close  \
295432 2012-12-12  28.000000  28.139999  27.370001  27.580000  27.523441   
295433 2012-12-13  27.590000  28.750000  27.430000  28.240000  28.182087   
295434 2012-12-14  28.180000  28.330000  26.760000  26.809999  26.755020   
295435 2012-12-17  26.770000  27.000000  26.320000  26.750000  26.695143   
295436 2012-12-18  26.959999  27.910000  26.900000  27.709999  27.653173   

          Volume  Dividends  Stock Splits Ticker  
295432  46704200        0.0           0.0   META  
295433  81051600        0.0           0.0   META  
295434  91631600        0.0           0.0   META  
295435  57742500        0.0           0.0   META  
295436  60512900        0.0           0.0   META  


In [47]:
from scripts.correlatonal_analyzer import process_ticker_data
ticker = 'META'
print(f"Processing data for ticker: {ticker}")
meta_data = process_ticker_data(ticker, combined_data, news_data)

Processing data for ticker: META


In [48]:
print(meta_data)

Empty DataFrame
Columns: [Date, Close, Sentiment, Daily Returns]
Index: []


## correlation for MSFT


In [52]:
# Example: Filter data for a specific stock ticker
msft_data = combined_data[combined_data['Ticker'] == 'MSFT']
print("MSFT Data:", msft_data.head())

MSFT Data:              Date      Open      High       Low     Close  Adj Close  \
298358 1986-03-13  0.088542  0.101563  0.088542  0.097222   0.059946   
298359 1986-03-14  0.097222  0.102431  0.097222  0.100694   0.062087   
298360 1986-03-17  0.100694  0.103299  0.100694  0.102431   0.063158   
298361 1986-03-18  0.102431  0.103299  0.098958  0.099826   0.061552   
298362 1986-03-19  0.099826  0.100694  0.097222  0.098090   0.060482   

            Volume  Dividends  Stock Splits Ticker  
298358  1031788800        0.0           0.0   MSFT  
298359   308160000        0.0           0.0   MSFT  
298360   133171200        0.0           0.0   MSFT  
298361    67766400        0.0           0.0   MSFT  
298362    47894400        0.0           0.0   MSFT  


In [59]:
from scripts.correlatonal_analyzer import process_ticker_data
ticker = 'MSFT'
print(f"Processing data for ticker: {ticker}")
msft_data = process_ticker_data(ticker, combined_data, news_data)

Processing data for ticker: MSFT


In [60]:
print(msft_data)

Empty DataFrame
Columns: [Date, Close, Sentiment, Daily Returns]
Index: []


## Correlation for NVDA


In [61]:
# Example: Filter data for a specific stock ticker 
nvda_data = combined_data[combined_data['Ticker'] == 'NVDA']
print("NVDA Data:", nvda_data.head())

NVDA Data:              Date      Open      High       Low     Close  Adj Close  \
308030 1999-01-22  0.043750  0.048828  0.038802  0.041016   0.037621   
308031 1999-01-25  0.044271  0.045833  0.041016  0.045313   0.041562   
308032 1999-01-26  0.045833  0.046745  0.041146  0.041797   0.038337   
308033 1999-01-27  0.041927  0.042969  0.039583  0.041667   0.038218   
308034 1999-01-28  0.041667  0.041927  0.041276  0.041536   0.038098   

            Volume  Dividends  Stock Splits Ticker  
308030  2714688000        0.0           0.0   NVDA  
308031   510480000        0.0           0.0   NVDA  
308032   343200000        0.0           0.0   NVDA  
308033   244368000        0.0           0.0   NVDA  
308034   227520000        0.0           0.0   NVDA  


In [None]:
# Example: Filter data for a specific stock ticker
from scripts.correlatonal_analyzer import process_ticker_data
ticker = 'NVDA'
print(f"Processing data for ticker: {ticker}")
nvda_data = process_ticker_data(ticker, combined_data, news_data)

Processing data for ticker: NVDA


In [64]:
print(nvda_data)

            Date    Close  Sentiment  Daily Returns
0     2011-03-03  0.52175   -0.30000            NaN
1     2011-03-07  0.51175    0.00000      -0.019166
2     2011-03-07  0.51175   -0.25000       0.000000
3     2011-03-08  0.48875   -0.50000      -0.044944
4     2011-03-08  0.48875    0.50000       0.000000
...          ...      ...        ...            ...
3065  2020-06-08  8.80500    0.00000       0.000000
3066  2020-06-09  9.04600    0.26250       0.027371
3067  2020-06-10  9.36675    0.08125       0.035458
3068  2020-06-10  9.36675    0.00000       0.000000
3069  2020-06-10  9.36675    0.00000       0.000000

[3070 rows x 4 columns]


## Corralation for TSLA

In [65]:
# Example: Filter data for a specific stock ticker
tsla_data = combined_data[combined_data['Ticker'] == 'TSLA']
print("TSLA Data:", tsla_data.head())

TSLA Data:              Date      Open      High       Low     Close  Adj Close  \
314451 2010-06-29  1.266667  1.666667  1.169333  1.592667   1.592667   
314452 2010-06-30  1.719333  2.028000  1.553333  1.588667   1.588667   
314453 2010-07-01  1.666667  1.728000  1.351333  1.464000   1.464000   
314454 2010-07-02  1.533333  1.540000  1.247333  1.280000   1.280000   
314455 2010-07-06  1.333333  1.333333  1.055333  1.074000   1.074000   

           Volume  Dividends  Stock Splits Ticker  
314451  281494500        0.0           0.0   TSLA  
314452  257806500        0.0           0.0   TSLA  
314453  123282000        0.0           0.0   TSLA  
314454   77097000        0.0           0.0   TSLA  
314455  103003500        0.0           0.0   TSLA  


In [69]:
# Example: Filter data for a specific stock ticker ('AAPL')
from scripts.correlatonal_analyzer import process_ticker_data
ticker = 'TSLA'
print(f"Processing data for ticker: {ticker}")
tsla_data = process_ticker_data(ticker, combined_data, news_data)
print(tsla_data.head(10))

Processing data for ticker: TSLA
         Date      Close  Sentiment  Daily Returns
0  2019-07-01  15.144667   0.000000            NaN
1  2019-07-01  15.144667   0.000000       0.000000
2  2019-07-01  15.144667   0.066667       0.000000
3  2019-07-01  15.144667   0.250000       0.000000
4  2019-07-02  14.970000   0.000000      -0.011533
5  2019-07-02  14.970000   0.075000       0.000000
6  2019-07-02  14.970000   0.250000       0.000000
7  2019-07-02  14.970000   0.000000       0.000000
8  2019-07-02  14.970000   0.000000       0.000000
9  2019-07-02  14.970000   0.000000       0.000000


In [67]:
print(tsla_data)

            Date      Close  Sentiment  Daily Returns
0     2019-07-01  15.144667   0.000000            NaN
1     2019-07-01  15.144667   0.000000       0.000000
2     2019-07-01  15.144667   0.066667       0.000000
3     2019-07-01  15.144667   0.250000       0.000000
4     2019-07-02  14.970000   0.000000      -0.011533
...          ...        ...        ...            ...
1780  2020-06-10  68.336670   0.148182       0.000000
1781  2020-06-10  68.336670  -0.400000       0.000000
1782  2020-06-10  68.336670   0.000000       0.000000
1783  2020-06-10  68.336670   0.000000       0.000000
1784  2020-06-10  68.336670   0.250000       0.000000

[1785 rows x 4 columns]
