In [1]:
import os
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import glob

## Initialize FinBERT model

In [2]:
# Use the GPU when torch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and sequence-classification model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Trailing expression: moves the weights to `device` and echoes the module repr.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [3]:
# Load the Reddit posts for the Google series and preview the first five rows.
Google_df = pd.read_csv(filepath_or_buffer="reddit_sentiment_data.csv")
Google_df.head(n=5)

Unnamed: 0,date,subreddit,post
0,2023-06-08,wallstreetbets,I just applied for the AI position. Fingers cr...
1,2023-06-08,wallstreetbets,"""speak clearly"" \n\nThis ain't gonna work at T..."
2,2023-06-08,wallstreetbets,Jim Kramer said AI would replace $24 an hour h...
3,2023-06-08,wallstreetbets,I inquired about the job opportunities mention...
4,2023-06-08,wallstreetbets,Gonna be a long time before people understand ...


## Google data: sentiment scoring and interpolation

In [4]:
def get_sentiment_score(text):
    """Score one text with the loaded FinBERT model and map it onto [0, 1].

    The raw score is ``P(positive) - P(negative)``, which lies in ``[-1, 1]``;
    it is rescaled to ``[0, 1]`` so that 0.5 means both probabilities are equal.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The text to score. Inputs longer than 512 tokens are truncated.

    Returns:
        The normalized score, or ``float("nan")`` when ``text`` is not a
        string (for example a missing post read from a CSV).
    """
    # The tokenizer only accepts strings; report missing text as NaN instead
    # of aborting an entire ``Series.apply`` run on one bad row.
    if not isinstance(text, str):
        return float("nan")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        probabilities = probabilities.cpu().numpy()[0]

    # Look the class positions up in the checkpoint's label map rather than
    # trusting a fixed order; fall back to 0/1 when the labels are not named.
    label_map = getattr(model.config, "label2id", None) or {}
    label2id = {str(label).lower(): idx for label, idx in label_map.items()}
    positive_idx = label2id.get("positive", 0)
    negative_idx = label2id.get("negative", 1)

    # Calculate score and normalize to [0,1]
    sentiment_score = probabilities[positive_idx] - probabilities[negative_idx]
    normalized_score = (sentiment_score + 1) / 2
    return normalized_score

def process_dataframe(df, text_col='text', date_col='date', min_count=3, score_fn=None):
    """Score every row's text and average the scores per date.

    Side effect: a ``sentiment_score`` column is written onto ``df`` itself
    (the caller's frame), as the original implementation did.

    Args:
        df: Frame holding at least ``text_col`` and ``date_col``.
        text_col: Name of the column containing the text to score.
        date_col: Name of the column to group by.
        min_count: Dates with fewer rows than this are skipped (and reported
            on stdout).
        score_fn: Callable mapping one text to a numeric score. Defaults to
            ``get_sentiment_score`` when omitted.

    Returns:
        A new DataFrame with columns ``Date`` and ``Sentiment_Score``, one
        row per retained date. The two columns are present even when no date
        is retained, so downstream column access does not fail.
    """
    if score_fn is None:
        score_fn = get_sentiment_score

    # 1. Apply sentiment scoring to each row's text
    df['sentiment_score'] = df[text_col].apply(score_fn)

    # 2. Group by date and
    # 3. average per date, skipping days with fewer than min_count texts
    results = []
    for date_val, group_df in df.groupby(date_col):
        if len(group_df) < min_count:
            print(f"Skipping {date_val}: Only {len(group_df)} rows found")
            continue

        results.append({
            'Date': date_val,
            'Sentiment_Score': group_df['sentiment_score'].mean()
        })

    # Explicit columns keep the schema stable when `results` is empty.
    result_df = pd.DataFrame(results, columns=['Date', 'Sentiment_Score'])
    return result_df

In [5]:
# Process the dataframe to get daily sentiment
daily_sentiment_df = process_dataframe(Google_df, text_col='post', date_col='date', min_count=1)
daily_sentiment_df.head()

Unnamed: 0,Date,Sentiment_Score
0,2023-06-08,0.42527
1,2023-07-17,0.448462
2,2023-07-20,0.53906
3,2023-07-21,0.462572
4,2023-08-06,0.517055


In [6]:
import pandas as pd

# Parse the dates on the aggregated frame itself so it carries real timestamps.
daily_sentiment_df['Date'] = pd.to_datetime(daily_sentiment_df['Date'])

# Hard-coded daily calendar (not derived from the data); any date outside
# this window is dropped by the reindex below.
full_date_range = pd.date_range(start='2023-06-01', end='2025-02-03', freq='D')

# Align the scores onto the calendar; days without a score become NaN.
df = daily_sentiment_df.set_index('Date').reindex(full_date_range)
df.index.name = 'Date'

# Time-weighted interpolation between known days, then forward/backward fill
# so the edges of the window hold the nearest known value.
df['Sentiment_Score'] = (
    df['Sentiment_Score']
    .interpolate(method='time')
    .ffill()
    .bfill()
)

df = df.reset_index()
print(df)


          Date  Sentiment_Score
0   2023-06-01         0.425270
1   2023-06-02         0.425270
2   2023-06-03         0.425270
3   2023-06-04         0.425270
4   2023-06-05         0.425270
..         ...              ...
609 2025-01-30         0.426807
610 2025-01-31         0.359362
611 2025-02-01         0.269470
612 2025-02-02         0.108197
613 2025-02-03         0.415664

[614 rows x 2 columns]


In [7]:
# Persist the gap-filled Google series; the positional index is not written.
df.to_csv(path_or_buf="Google_sentiment_data_with_interpolation.csv", index=False)

## CVS data interpolation

In [8]:
# Load the Reddit posts for the CVS series and preview the first five rows.
CVS_df = pd.read_csv(filepath_or_buffer="reddit_sentiment_data_cvs.csv")
CVS_df.head(n=5)

Unnamed: 0,date,subreddit,post
0,2023-06-19,pharmacy,I mean.... people just need to stop coming in....
1,2023-06-19,pharmacy,Charities have volunteers and the last I check...
2,2023-06-19,pharmacy,All these pharmacists working for free are hur...
3,2023-07-09,stocks,Iv been and will keep adding to CVS and CI. I ...
4,2023-07-09,stocks,CVS also owns Aetna


In [9]:
# Process the dataframe to get daily sentiment
daily_sentiment_cvs_df = process_dataframe(CVS_df, text_col='post', date_col='date', min_count=1)
daily_sentiment_cvs_df.head()

Unnamed: 0,Date,Sentiment_Score
0,2023-06-19,0.296824
1,2023-07-09,0.55719
2,2023-07-18,0.88388
3,2023-07-29,0.346103
4,2023-08-05,0.327648


In [10]:
# Parse the dates on the aggregated frame itself so it carries real timestamps.
daily_sentiment_cvs_df['Date'] = pd.to_datetime(daily_sentiment_cvs_df['Date'])

# Hard-coded daily calendar (not derived from the data); any date outside
# this window is dropped by the reindex below.
full_date_range = pd.date_range(start='2023-06-01', end='2025-02-03', freq='D')

# Align the scores onto the calendar; days without a score become NaN.
df = daily_sentiment_cvs_df.set_index('Date').reindex(full_date_range)
df.index.name = 'Date'

# Time-weighted interpolation between known days, then forward/backward fill
# so the edges of the window hold the nearest known value.
df['Sentiment_Score'] = (
    df['Sentiment_Score']
    .interpolate(method='time')
    .ffill()
    .bfill()
)

df = df.reset_index()
print(df)

          Date  Sentiment_Score
0   2023-06-01         0.296824
1   2023-06-02         0.296824
2   2023-06-03         0.296824
3   2023-06-04         0.296824
4   2023-06-05         0.296824
..         ...              ...
609 2025-01-30         0.502613
610 2025-01-31         0.502613
611 2025-02-01         0.502613
612 2025-02-02         0.502613
613 2025-02-03         0.502613

[614 rows x 2 columns]


In [11]:
# Persist the gap-filled CVS series; the positional index is not written.
df.to_csv(path_or_buf="CVS_sentiment_data_with_interpolation.csv", index=False)