In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [3]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so their vocabularies match.
checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Kept as the last expression so the notebook displays the model structure.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
def get_sentiment_score(text):
    """Score one piece of text with the module-level model and tokenizer.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``model`` without gradient tracking, and the logits are turned into
    class probabilities with a softmax. The returned value is
    ``(p[0] - p[1] + 1) / 2``, which lies in [0, 1].

    Args:
        text: The text to score. Assumed to be a single string; the
            Hugging Face tokenizer treats a list of strings as a batch, in
            which case only the first item would be scored (see the ``[0]``
            below) -- TODO confirm callers always pass a ``str``.

    Returns:
        A NumPy floating scalar in [0, 1].

    Note:
        Assumes class index 0 is "positive" and index 1 is "negative" for
        this checkpoint -- verify against ``model.config.id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move every input tensor to the device the model lives on.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        logits = model(**on_device).logits
        # Row 0 is the only row when a single text is passed.
        class_probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()

    # Weighted sum in [-1, 1]: +1 for class 0, -1 for class 1; any other
    # class contributes nothing.
    polarity = class_probs[0] * 1 + class_probs[1] * -1

    # Shift and rescale from [-1, 1] to [0, 1].
    return (polarity + 1) / 2

def process_dataframe(df, text_col='text', date_col='date', min_count=3, score_fn=None):
    """Score every text and average the scores per date.

    Args:
        df: DataFrame holding at least ``text_col`` and ``date_col``.
            A ``sentiment_score`` column is added to it in place.
        text_col: Name of the column containing the text to score.
        date_col: Name of the column to group by.
        min_count: Dates with fewer than this many rows are skipped and a
            message is printed for each of them.
        score_fn: Optional callable mapping one text to a numeric score.
            Defaults to ``get_sentiment_score``; pass another callable to
            swap the scoring model.

    Returns:
        A new DataFrame with columns ``Date`` and ``Sentiment_Score``, one
        row per date that met ``min_count``. The two columns are present
        even when no date qualifies, so a CSV written from the result always
        has a header and can be read back.
    """
    if score_fn is None:
        # Resolved at call time so the default follows the notebook's scorer.
        score_fn = get_sentiment_score

    # 1. Score each row's text (one call per row).
    df['sentiment_score'] = df[text_col].apply(score_fn)

    # 2. Average per date, skipping dates with too few texts.
    results = []
    for date_val, group_df in df.groupby(date_col):
        n_rows = len(group_df)
        if n_rows < min_count:
            print(f"Skipping {date_val}: Only {n_rows} rows found")
            continue

        results.append({
            'Date': date_val,
            'Sentiment_Score': group_df['sentiment_score'].mean()
        })

    # 3. Pin the column names: without them an empty `results` list yields a
    #    column-less frame, which is saved as an empty, unreadable CSV.
    return pd.DataFrame(results, columns=['Date', 'Sentiment_Score'])

In [11]:
# Folder holding the AMZN tweet files
amzn_folder_path = "AMZN"

all_texts = []

# Loop over each file in the folder, in sorted filename order
for filename in tqdm(sorted(os.listdir(amzn_folder_path))):
    full_path = os.path.join(amzn_folder_path, filename)

    # Skip directories and hidden files (e.g. ".DS_Store"); neither holds
    # JSON-lines tweet data, and reading them would raise.
    if filename.startswith(".") or os.path.isdir(full_path):
        continue

    # The date is just the filename (e.g. "2020-06-01", "2020-05-31", etc.)
    date_str = filename

    # Open and read line by line
    with open(full_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line (such as a trailing newline) is not a JSON record
            # and would make json.loads raise.
            if not line:
                continue

            # Each line is JSON with a "text" field
            data = json.loads(line)
            text = data.get("text", "")

            # Collect into our master list
            all_texts.append({
                "text": text,
                "date": date_str
            })

# Create a DataFrame; the columns are pinned so that an empty folder still
# yields the "text" and "date" columns that process_dataframe looks up.
df_all = pd.DataFrame(all_texts, columns=["text", "date"])

# Process DataFrame to compute daily sentiment
result_df = process_dataframe(df_all, text_col='text', date_col='date', min_count=3)

# Save results, and report the file name that was actually written
output_csv = "amazon_daily_sentiment(using tweets).csv"
result_df.to_csv(output_csv, index=False)
print(f"Done! Results in {output_csv}")

100%|██████████| 757/757 [00:00<00:00, 10399.90it/s]


Done! Results in amzn_daily_sentiment.csv


In [14]:
# Folder holding the CVS tweet files.
# (The variable keeps its original name, although it now points at CVS.)
amzn_folder_path = "CVS"

all_texts = []

# Loop over each file in the folder, in sorted filename order
for filename in tqdm(sorted(os.listdir(amzn_folder_path))):
    full_path = os.path.join(amzn_folder_path, filename)

    # Skip directories and hidden files (e.g. ".DS_Store"); neither holds
    # JSON-lines tweet data, and reading them would raise.
    if filename.startswith(".") or os.path.isdir(full_path):
        continue

    # The date is just the filename (e.g. "2020-06-01", "2020-05-31", etc.)
    date_str = filename

    # Open and read line by line
    with open(full_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line (such as a trailing newline) is not a JSON record
            # and would make json.loads raise.
            if not line:
                continue

            # Each line is JSON with a "text" field
            data = json.loads(line)
            text = data.get("text", "")

            # Collect into our master list
            all_texts.append({
                "text": text,
                "date": date_str
            })

# Create a DataFrame; the columns are pinned so that an empty folder still
# yields the "text" and "date" columns that process_dataframe looks up.
df_all = pd.DataFrame(all_texts, columns=["text", "date"])

# Process DataFrame to compute daily sentiment
result_df = process_dataframe(df_all, text_col='text', date_col='date', min_count=3)

# Save results, and report the file name that was actually written
output_csv = "cvs_daily_sentiment(using tweets).csv"
result_df.to_csv(output_csv, index=False)
print(f"Done! Results in {output_csv}")

100%|██████████| 755/755 [00:00<00:00, 7210.17it/s]


Done! Results in cvs_daily_sentiment.csv


In [15]:
# Folder holding the GOOG tweet files.
# (The variable keeps its original name, although it now points at GOOG.)
amzn_folder_path = "GOOG"

all_texts = []

# Loop over each file in the folder, in sorted filename order
for filename in tqdm(sorted(os.listdir(amzn_folder_path))):
    full_path = os.path.join(amzn_folder_path, filename)

    # Skip directories and hidden files (e.g. ".DS_Store"); neither holds
    # JSON-lines tweet data, and reading them would raise.
    if filename.startswith(".") or os.path.isdir(full_path):
        continue

    # The date is just the filename (e.g. "2020-06-01", "2020-05-31", etc.)
    date_str = filename

    # Open and read line by line
    with open(full_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line (such as a trailing newline) is not a JSON record
            # and would make json.loads raise.
            if not line:
                continue

            # Each line is JSON with a "text" field
            data = json.loads(line)
            text = data.get("text", "")

            # Collect into our master list
            all_texts.append({
                "text": text,
                "date": date_str
            })

# Create a DataFrame; the columns are pinned so that an empty folder still
# yields the "text" and "date" columns that process_dataframe looks up.
df_all = pd.DataFrame(all_texts, columns=["text", "date"])

# Process DataFrame to compute daily sentiment
result_df = process_dataframe(df_all, text_col='text', date_col='date', min_count=3)

# Save results, and report the file name that was actually written
output_csv = "google_daily_sentiment(using tweets).csv"
result_df.to_csv(output_csv, index=False)
print(f"Done! Results in {output_csv}")

100%|██████████| 756/756 [00:00<00:00, 5355.81it/s]


Done! Results in google_daily_sentiment.csv


In [12]:
# Read the tweet-based daily sentiment file and the interpolated sentiment file.
Amazon_df_1 = pd.read_csv("amazon_daily_sentiment(using tweets).csv")
Amazon_df_2 = pd.read_csv("Amazon_sentiment_data_with_interpolation.csv")

# Merge vertically (stack rows) and renumber the index from 0.
# NOTE(review): concat aligns on column names and fills mismatches with NaN;
# confirm both files use the same column names.
merged_df = pd.concat([Amazon_df_1, Amazon_df_2], ignore_index=True)

# Save the merged dataset
merged_df.to_csv("Amazon_merged_file.csv", index=False)

# Preview goes last: a notebook only renders the final expression of a cell,
# so a mid-cell head() call is silently discarded.
merged_df.head()



In [16]:
# Read the tweet-based daily sentiment file and the interpolated sentiment file.
CVS_df_1 = pd.read_csv("cvs_daily_sentiment(using tweets).csv")
CVS_df_2 = pd.read_csv("CVS_sentiment_data_with_interpolation.csv")

# Merge vertically (stack rows) and renumber the index from 0.
# NOTE(review): concat aligns on column names and fills mismatches with NaN;
# confirm both files use the same column names.
merged_df = pd.concat([CVS_df_1, CVS_df_2], ignore_index=True)

# Save the merged dataset
merged_df.to_csv("CVS_merged_file.csv", index=False)

# Preview goes last: a notebook only renders the final expression of a cell,
# so a mid-cell head() call is silently discarded.
merged_df.head()

In [17]:
# Read the tweet-based daily sentiment file and the interpolated sentiment file.
Google_df_1 = pd.read_csv("google_daily_sentiment(using tweets).csv")
Google_df_2 = pd.read_csv("Google_sentiment_data_with_interpolation.csv")

# Merge vertically (stack rows) and renumber the index from 0.
# NOTE(review): concat aligns on column names and fills mismatches with NaN;
# confirm both files use the same column names.
merged_df = pd.concat([Google_df_1, Google_df_2], ignore_index=True)

# Save the merged dataset
merged_df.to_csv("Google_merged_file.csv", index=False)

# Preview goes last: a notebook only renders the final expression of a cell,
# so a mid-cell head() call is silently discarded.
merged_df.head()