In [None]:
import pandas as pd
import sys
sys.path.append("../scripts")
from db_utils import read_query_as_df

In [2]:
query = """
SELECT 
    s.stock,
    s.date,
    s.close,
    s.volume,
    a.polarity,
    a.subjectivity
FROM stock_prices s
JOIN (
    SELECT stock, date, 
           AVG(polarity) AS polarity,
           AVG(subjectivity) AS subjectivity
    FROM sentiment_scores
    GROUP BY stock, date
) a
ON s.stock = a.stock AND s.date = a.date
ORDER BY s.stock, s.date;
"""

In [3]:
stock_df = read_query_as_df("SELECT * FROM stock_prices")
sentiment_df = read_query_as_df("SELECT * FROM sentiment_scores")
stock_df['date'] = pd.to_datetime(stock_df['date']).dt.date
sentiment_df['date'] = pd.to_datetime(sentiment_df['date']).dt.date



In [4]:
sentiment_agg = sentiment_df.groupby(['stock', 'date']).agg({
    'polarity': 'mean',
    'subjectivity': 'mean'
}).reset_index()
sentiment_agg.head()

Unnamed: 0,stock,date,polarity,subjectivity
0,AAPL,2017-01-31,0.0,0.0
1,AAPL,2017-05-03,0.0,0.0
2,AAPL,2017-05-23,0.0,0.3
3,AAPL,2017-07-29,0.5,1.0
4,AAPL,2018-02-01,0.216667,0.4


In [5]:
merged_df = pd.merge(stock_df, sentiment_agg, on=['stock', 'date'], how='inner')

In [6]:
merged_df.shape

(60, 9)

In [7]:
merged_df['daily_return'] = merged_df.groupby('stock')['close'].pct_change()

In [8]:
merged_df['sentiment_spike'] = merged_df['polarity'] > 0.5
merged_df['neg_sentiment'] = merged_df['polarity'] < -0.3

In [9]:
merged_df['vol_spike'] = merged_df.groupby('stock')['volume'].transform(
    lambda x: x > x.rolling(7).mean() * 1.5
)

In [10]:
merged_df['day_of_week'] = pd.to_datetime(merged_df['date']).dt.day_name()

In [11]:
merged_df['sentiment_label'] = merged_df['polarity'].apply(lambda x: "Positive" if x > 0.05 else "Negative" if x < -0.05 else "Neutral")


In [12]:
def detect_spike(group):
    # Exclude 'stock' column from being recalculated if it's included
    group = group.copy()
    
    group['rolling_mean'] = group['polarity'].rolling(window=3, min_periods=2).mean()
    group['rolling_std'] = group['polarity'].rolling(window=3, min_periods=2).std()
    group['sentiment_spike'] = (abs(group['polarity'] - group['rolling_mean']) > 0.6 * group['rolling_std'])
    return group

# Apply the function to each ticker
merged_df = merged_df.sort_values(['stock', 'date'])  # just in case
grouped = [detect_spike(group) for _, group in merged_df.groupby('stock')]
merged_df = pd.concat(grouped).reset_index(drop=True)

In [13]:
merged_df['spike_direction'] = merged_df.apply(
    lambda row: 'Positive Spike' if row['sentiment_spike'] and row['polarity'] > 0 
    else 'Negative Spike' if row['sentiment_spike'] and row['polarity'] < 0 
    else 'None', axis=1
)

print(merged_df.groupby('spike_direction')['daily_return'].describe())

                 count      mean       std       min       25%       50%  \
spike_direction                                                            
Negative Spike    11.0  0.025738  0.050720 -0.038973  0.005469  0.014446   
None              26.0  0.000030  0.028133 -0.064539 -0.007848  0.002950   
Positive Spike    18.0  0.010249  0.031816 -0.053100 -0.012256  0.001619   

                      75%       max  
spike_direction                      
Negative Spike   0.031724  0.164009  
None             0.013208  0.059436  
Positive Spike   0.030569  0.073456  


In [14]:
merged_df['rolling_sentiment'] = merged_df.groupby('stock')['polarity'].transform(lambda x: x.rolling(7).mean())

In [15]:
def label_spike(row):
    if row['sentiment_spike'] and row['vol_spike']:
        return "Both"
    elif row['sentiment_spike']:
        return "Sentiment"
    elif row['vol_spike']:
        return "Volume"
    else:
        return "None"

merged_df['spike_type'] = merged_df.apply(label_spike, axis=1)

In [16]:
merged_df.to_csv("candlethrob_dataset.csv", index=False)