In [None]:
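# Tweet / price dataset (Kaggle): https://www.kaggle.com/datasets/equinxx/stock-tweets-for-sentiment-analysis-and-prediction?resource=download&select=stock_tweets.csv
# Sentiment model (Hugging Face), presumably the scorer behind scored_tweets.csv: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest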

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.cm import ScalarMappable
from matplotlib.colors import TwoSlopeNorm, LinearSegmentedColormap
from matplotlib.colors import ListedColormap, BoundaryNorm

In [None]:
# Read the two source CSVs into DataFrames. Paths are relative to the
# notebook's working directory.
PRICES_CSV = 'stock_yfinance_data.csv'
TWEETS_CSV = 'stock_tweets.csv'

prices = pd.read_csv(PRICES_CSV)
tweets = pd.read_csv(TWEETS_CSV)

In [None]:
# Tweets that already carry a 'score' column (parsed by list_to_score below).
SCORED_TWEETS_CSV = 'scored_tweets.csv'
scored_tweets = pd.read_csv(SCORED_TWEETS_CSV)

In [None]:
def list_to_score(lst):
    """Collapse a three-entry score vector into one signed number.

    Parameters
    ----------
    lst : str or sequence of numbers
        Either the text form of the vector, e.g. ``"[0.1 0.2 0.7]"``
        (whitespace- or comma-separated, brackets and surrounding
        whitespace optional), or an already-parsed sequence of numbers.

    Returns
    -------
    float
        The third entry minus the first.
        NOTE(review): presumably "positive minus negative" probability;
        confirm against the label order of the model that produced the
        scores.

    Raises
    ------
    ValueError
        If an entry cannot be converted to ``float``.
    IndexError
        If fewer than three entries are present.
    """
    if isinstance(lst, str):
        # Accept both "[a b c]" and "[a, b, c]" spellings of the vector.
        tokens = lst.strip().strip("[]").replace(",", " ").split()
    else:
        tokens = lst
    values = [float(token) for token in tokens]
    return values[2] - values[0]

In [None]:
# Derive the scalar 'sent_score' column from the raw 'score' strings, then
# retire the raw column.
# Guarded and rebinding (no inplace mutation) so that re-running this cell is
# a no-op instead of a KeyError on the already-dropped 'score' column.
if 'score' in scored_tweets.columns:
    scored_tweets = (
        scored_tweets
        .assign(sent_score=scored_tweets['score'].apply(list_to_score))
        .drop(columns='score')
    )

In [None]:
# Price table without the High / Low / Close columns. DataFrame.drop returns
# a new frame, so `prices` itself is left untouched.
clean_prices = prices.drop(columns=["High", "Low", "Close"])

In [None]:
def interval_volatility(stock, window):
    """Rolling standard deviation of period-over-period percentage changes.

    Parameters
    ----------
    stock : pandas object exposing ``pct_change()`` (called here with a Series)
        Price observations, one per row.
    window : int
        Number of consecutive observations in each rolling window.

    Returns
    -------
    Same type and index as ``stock``. Leading entries are NaN until a full
    window of percentage changes is available.
    """
    pct_returns = stock.pct_change()
    windowed_returns = pct_returns.rolling(window=window)
    return windowed_returns.std()

In [None]:
# Per-ticker price changes and rolling volatility.
RETURN_HORIZONS = (1, 2, 3, 7)      # look-back, in rows, for the *_DAY_RETURN columns
VOLATILITY_WINDOWS = (2, 10, 30)    # rolling-window lengths for the VOLATILITY_*D columns

cols = ['Date', 'Open', 'Adj Close', 'Volume', 'Stock Name', '1_DAY_RETURN', '2_DAY_RETURN', '3_DAY_RETURN', '7_DAY_RETURN', 'VOLATILITY_2D', 'VOLATILITY_10D', 'VOLATILITY_30D']
tickers = clean_prices["Stock Name"].unique()

# Collect one enriched frame per ticker and concatenate once at the end.
# Growing a frame with pd.concat inside the loop is quadratic, and seeding it
# with an empty object-dtype frame relies on deprecated dtype handling.
per_ticker_frames = []
for ticker in tickers:
    # .copy() gives an independent frame; assigning new columns onto a
    # filtered slice of clean_prices triggers SettingWithCopyWarning.
    stock = clean_prices[clean_prices["Stock Name"] == ticker].copy()
    adj_close = stock['Adj Close']

    # NOTE(review): shift() works on row position, so this assumes each
    # ticker's rows are already in chronological order. TODO confirm.
    # NOTE(review): despite the column names these are absolute price
    # differences, not percentage returns. Left unchanged on purpose.
    for horizon in RETURN_HORIZONS:
        stock[f'{horizon}_DAY_RETURN'] = adj_close - adj_close.shift(horizon)

    for window in VOLATILITY_WINDOWS:
        stock[f'VOLATILITY_{window}D'] = interval_volatility(adj_close, window)

    per_ticker_frames.append(stock)

if per_ticker_frames:
    stocks_data = pd.concat(per_ticker_frames)
    # Same column layout as before: the declared columns first, then any
    # additional columns carried over from clean_prices.
    extra_cols = [c for c in stocks_data.columns if c not in cols]
    stocks_data = stocks_data.reindex(columns=cols + extra_cols)
else:
    # No tickers at all: keep the expected (empty) schema.
    stocks_data = pd.DataFrame(columns=cols)

In [None]:
def date_formatter(date):
    """Reduce an ISO-format date/datetime string to its ``YYYY-MM-DD`` date.

    Parameters
    ----------
    date : str
        Any string accepted by ``datetime.fromisoformat``.

    Returns
    -------
    str
        The calendar date exactly as written in the input; no timezone
        conversion is applied before formatting.

    Raises
    ------
    ValueError
        If ``date`` is not a valid ISO-format string.
    """
    return datetime.fromisoformat(date).strftime('%Y-%m-%d')

In [None]:
# Normalise every tweet timestamp to a plain 'YYYY-MM-DD' string.
scored_tweets['Date'] = scored_tweets['Date'].map(date_formatter)

In [None]:
# Join each scored tweet with the price row of the same ticker and date
# (DataFrame.merge defaults to an inner join).
merged_df = scored_tweets.merge(stocks_data, on=['Date', 'Stock Name'])

# Drop the left-most column, selected by position.
# NOTE(review): presumably a leftover index column from the CSV export.
# Confirm, and prefer dropping it by name.
merged_df = merged_df.drop(columns=merged_df.columns[0])

# NOTE(review): dropna() returns a new frame and the result is not assigned,
# so this line only displays the NaN-free view as the cell output; merged_df
# itself keeps its NaN rows. Assign the result if the rows should be removed.
merged_df.dropna()

In [None]:
def assign_sent_score(row, daily_group):
    """Overwrite ``row['sent_score']`` with the value stored for the row's date.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a ``'Date'`` entry; it is modified in place.
    daily_group : mapping-like
        Lookup from date to score.

    Returns
    -------
    The same ``row`` object, after the update.

    Raises
    ------
    KeyError
        If the row's date is not present in ``daily_group``.
    """
    day = row['Date']
    row['sent_score'] = daily_group[day]
    return row

In [None]:
# Collapse the tweet-level table to one row per (ticker, day) carrying that
# day's mean sentiment, then keep only tickers with enough tweets.
MIN_TWEETS_PER_STOCK = 1000

clean_merge = merged_df.copy(deep=True)

intervals = ['1_DAY_RETURN', '2_DAY_RETURN', '3_DAY_RETURN', '7_DAY_RETURN', 'VOLATILITY_2D', 'VOLATILITY_10D', 'VOLATILITY_30D']
cols = ['Stock Name', 'sent_score', 'Company Name', 'Open', 'Adj Close', 'Volume'] + intervals

# Work on a separate frame so clean_merge keeps the tweet-level values.
daily = clean_merge.assign(Date=pd.to_datetime(clean_merge['Date']))

# Mean sentiment of all tweets for the same ticker on the same day, written
# back onto every tweet row. This replaces the per-ticker loop that wrote to
# a filtered slice (SettingWithCopyWarning), looked the mean up row by row
# with apply(axis=1), and grew the result with pd.concat on every iteration.
daily['sent_score'] = (
    daily.groupby(['Stock Name', 'Date'])['sent_score'].transform('mean')
)

# One row per (ticker, day): the first tweet row of each day supplies the
# remaining columns.
# NOTE(review): rows keep merged_df's order within each ticker. The lag
# analysis shifts by row position, so confirm that order is chronological.
df = (
    daily
    .drop_duplicates(subset=['Stock Name', 'Date'])
    .reset_index(drop=True)
)
# Declared columns first, then whatever else the merge carried along.
df = df.reindex(columns=cols + [c for c in df.columns if c not in cols])

# Keep only tickers with at least MIN_TWEETS_PER_STOCK scored tweets overall.
counts = scored_tweets['Stock Name'].value_counts()
filtered_stocks = counts[counts >= MIN_TWEETS_PER_STOCK].index
df = df[df['Stock Name'].isin(filtered_stocks)]
tickers = df['Stock Name'].unique()

In [None]:
# For every ticker, find the sentiment shift whose correlation with the
# 1-day price change has the largest magnitude.
MAX_LAG = 7  # shifts tested: -MAX_LAG .. +MAX_LAG, both ends included

lag_records = []
for ticker in tickers:
    stock_data = df[df['Stock Name'] == ticker]
    # Loop-invariant inputs, hoisted out of the lag scan.
    price = stock_data['1_DAY_RETURN']
    sentiment = stock_data['sent_score']

    abs_max_correlation = 0
    max_correlation = 0
    lag = 0

    # MAX_LAG + 1 so that +MAX_LAG itself is tested. The previous
    # range(-7, 7) stopped at +6 although the lag colour scale covers -7..7.
    # shift(i) moves sentiment by i ROWS: a positive i pairs each price change
    # with the sentiment found i rows earlier in the table.
    for i in range(-MAX_LAG, MAX_LAG + 1):
        corr = sentiment.shift(i).corr(price)
        # A NaN correlation makes this comparison False and is skipped.
        if abs_max_correlation < abs(corr):
            abs_max_correlation = abs(corr)
            max_correlation = corr
            lag = i

    lag_records.append({'Stock Name': ticker,
                        'Max Correlation': max_correlation,
                        'Lag': lag})

# Build the frame once from the collected records instead of concatenating
# onto an empty frame on every iteration.
lag_df = (
    pd.DataFrame(lag_records, columns=['Stock Name', 'Max Correlation', 'Lag'])
    .sort_values(by='Max Correlation', ascending=False)
)
lag_df

In [None]:
# snake_case copy of the lag table, intended for export.
EXPORT_COLUMN_NAMES = {
    'Stock Name': 'stock_name',
    'Max Correlation': 'max_correlation',
    'Lag': 'lag',
}
export_lag_df = lag_df.rename(columns=EXPORT_COLUMN_NAMES)

# Uncomment to write the table to disk:
# export_lag_df.to_csv('lag_data.csv', index=False)

In [None]:
# Bar chart of each ticker's strongest sentiment / price-change correlation.
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(lag_df['Stock Name'], lag_df['Max Correlation'], color='skyblue')

# Value label at the tip of every bar: above positive bars, below negative
# ones, so a label never sits inside its own bar.
for bar in bars:
    yval = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        round(yval, 2),
        ha='center',
        va='bottom' if yval >= 0 else 'top',
    )

# Title and axis labels
ax.set_title('Max Correlation by Stock', fontsize=14)
ax.set_xlabel('Stock Name', fontsize=12)
ax.set_ylabel('Max Correlation', fontsize=12)

plt.show()

In [None]:
# Bar chart of each ticker's strongest correlation, with every bar coloured by
# the lag at which that correlation was found.
# (numpy, ListedColormap, BoundaryNorm and ScalarMappable come from the
# notebook's top import cell.)
MAX_LAG = 7  # colour buckets span -MAX_LAG .. +MAX_LAG; keep in sync with the lag scan

# Ensure lags are numeric
numeric_data = pd.to_numeric(lag_df['Lag'], errors='coerce')

# One colour per integer lag, ordered from -MAX_LAG to +MAX_LAG
color_list = [
    '#08306B', '#08519C', '#2171B5', '#4292C6', '#6BAED6', '#9ECAE1', '#C6DBEF',  # -7 to -1 (blues)
    'red',                                                                         #  0 (red)
    '#FEE0D2', '#FCBBA1', '#FC9272', '#FB6A4A', '#EF3B2C', '#CB181D', '#99000D'   #  1 to 7 (oranges/reds)
]
assert len(color_list) == 2 * MAX_LAG + 1, "need exactly one colour per lag bucket"

# Discrete colormap: half-integer edges put every integer lag in the middle
# of its own bucket (2 * MAX_LAG + 2 edges for 2 * MAX_LAG + 1 buckets).
cmap = ListedColormap(color_list)
bounds = np.arange(-MAX_LAG - 0.5, MAX_LAG + 1.5, 1)
norm = BoundaryNorm(bounds, ncolors=len(color_list))

# Map colours based on lag values
mapped_colors = cmap(norm(numeric_data.to_numpy()))

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(lag_df['Stock Name'], lag_df['Max Correlation'], color=mapped_colors)

# Value label at the tip of every bar: above positive bars, below negative
# ones, so a label never sits inside its own bar.
for bar in bars:
    yval = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        round(yval, 2),
        ha='center',
        va='bottom' if yval >= 0 else 'top',
    )

# Colorbar: the mappable carries only the colormap and norm, no data
sm = ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax, ticks=range(-MAX_LAG, MAX_LAG + 1), label="Lag (Days)")

# X-axis formatting
ax.set_xticks(range(len(lag_df['Stock Name'])))
ax.set_xticklabels(lag_df['Stock Name'], rotation=45, ha='right', fontsize=10)

# Labels and title
ax.set_title('Max Correlation by Stock with Discrete Lag-Based Coloring', fontsize=14)
ax.set_xlabel('Stock Name', fontsize=12)
ax.set_ylabel('Max Correlation', fontsize=12)

plt.tight_layout()
plt.show()
