In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
from huggingface_hub import login
from google.colab import drive
import re

In [None]:
# Mount Google Drive into the Colab runtime, then make the exam folder the
# working directory so that relative paths used in later cells (e.g. 'data/...')
# resolve against it.
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/SDU/DS821 - Market Sentiment Analysis/News_and_Market_Sentiment_Analytics-main/exam'

Mounted at /content/gdrive


In [None]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

class StockPostFilterLLM:
    """Binary yes/no filter for Reddit posts, backed by a chat text-generation pipeline.

    The instance is callable: ``llm_filter(post)`` builds a system+user chat
    prompt, lets the model generate a reply, and returns 1 if the reply
    contains the word "yes" after the ``<|assistant|>`` marker, else 0.

    Attributes:
        model_name: Hugging Face model id that was loaded into the pipeline.
        pipe: The ``transformers`` text-generation pipeline.
        context: System prompt describing the classification task.
    """

    def __init__(self, model="HuggingFaceH4/zephyr-7b-beta"):
        """Load the text-generation pipeline for ``model``.

        Args:
            model: Hugging Face model id to load. Note that the answer parsing
                in ``check_response_for_yes`` looks for a ``<|assistant|>``
                marker, so the model's chat template must emit that marker.
        """
        self.model_name = model

        # Load the model and tokenizer. The requested model id is used here
        # (previously the default id was hard-coded and `model` was ignored).
        self.pipe = pipeline("text-generation", model=self.model_name, torch_dtype=torch.bfloat16, device_map="auto")

        # Context for the model
        self.context = """
          You are a financial expert. Your task is to determine if a Reddit post could influence GME stock.
          Look for mentions of GME, price predictions, news, or strong bullish/bearish sentiment.
          Answer 'yes' if the post is likely to impact GME's price or trading activity, otherwise answer 'no'."""

    def construct_message(self, context, post):
        """
        Build the chat message list from the system context and the post.

        Args:
            context: Text used as the "system" message.
            post: Text used as the "user" message.

        Returns:
            A two-element list of ``{"role", "content"}`` dicts (system, user),
            ready to be passed to the tokenizer's ``apply_chat_template``.
        """
        # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
        message = [
            {
                "role": "system",
                "content": context,
            },
            {
                "role": "user",
                "content": post
            },
        ]
        return message

    def check_response_for_yes(self, response):
        """Return 1 if 'yes' appears as a whole word after ``<|assistant|>``, else 0.

        The match is case-insensitive and spans newlines. A response without
        the ``<|assistant|>`` marker always yields 0.
        """
        # Regex pattern to ensure 'yes' appears after <|assistant|>, so that a
        # 'yes' occurring earlier in the text (e.g. in the prompt) is not counted.
        pattern = r'<\|assistant\|>.*?\byes\b'
        return 1 if re.search(pattern, response, re.IGNORECASE | re.DOTALL) else 0

    def __call__(self, post, temperature=0.7):
        """Classify one post.

        Args:
            post: Post text, sent to the model as the user message.
            temperature: Sampling temperature forwarded to the pipeline.

        Returns:
            1 if the generated text contains 'yes' after the assistant marker,
            otherwise 0. The generated text and the verdict are also printed.
        """
        # Format the messages in chat format
        message = self.construct_message(self.context, post)
        prompt = self.pipe.tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

        # Forward the caller's temperature (previously 0.7 was hard-coded and
        # the argument had no effect).
        outputs = self.pipe(prompt, max_new_tokens=256, do_sample=True, temperature=temperature, top_k=50, top_p=0.95)
        resp = outputs[0]["generated_text"]

        # Parse the verdict once and reuse it for both logging and the result.
        verdict = self.check_response_for_yes(resp)
        print(resp)
        print(f"----- {verdict} -----")
        return verdict




In [None]:
# Paths are relative to the working directory selected by the earlier %cd cell.
INPUT_PATH = 'data/rd_clean.csv'
OUTPUT_PATH = 'data/df_gme_sentiment_fixed.csv'

# Set a batch save interval to improve performance
SAVE_INTERVAL = 10  # Save progress every 10 newly processed rows

# Initialize the LLM model
llm_filter = StockPostFilterLLM()

# Resume from the checkpoint if one exists, so the "skip already processed rows"
# check below is effective after an interrupted run. Delete OUTPUT_PATH to force
# a full recomputation from INPUT_PATH.
try:
    df = pd.read_csv(OUTPUT_PATH)
    print(f"Resuming from {OUTPUT_PATH}: {int(df['sentiment'].notna().sum())}/{len(df)} rows already processed.")
except FileNotFoundError:
    df = pd.read_csv(INPUT_PATH, sep=',')
    # .copy() makes the filtered frame independent of the original, so adding
    # a column below does not trigger SettingWithCopyWarning.
    df = df[df['gme'] == 1].copy()
    df["sentiment"] = np.nan  # New column to store results

total_rows = len(df)
processed_count = 0  # Rows processed in this run, used for batched saving

# Loop through the rows. `position` is the 1-based row position used for the
# progress message; `i` is the index label, which after filtering is not
# necessarily contiguous and is only used to address the row.
for position, (i, row) in enumerate(df.iterrows(), start=1):
    # Skip already processed rows
    if not pd.isna(row['sentiment']):
        continue

    torch.cuda.empty_cache()  # Clear GPU cache to free memory

    print(f"\nProcessing row {position}/{total_rows}...")

    # Generate sentiment
    sentiment_result = llm_filter(row['body'])
    df.at[i, 'sentiment'] = sentiment_result  # Update DataFrame

    # Increment processed count
    processed_count += 1

    # Save progress in batches
    if processed_count % SAVE_INTERVAL == 0:
        df.to_csv(OUTPUT_PATH, index=False)
        print(f"Progress saved after {processed_count} rows.")

# Final save
df.to_csv(OUTPUT_PATH, index=False)
print("Sentiment analysis complete!")
