In [None]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
import torch
from google.colab import drive
from huggingface_hub import login
from transformers import pipeline

# Login to Hugging Face. Required for Mistral.
# The token is a credential, so it is never written into the notebook itself:
# it is read from the HF_TOKEN environment variable when that is set, and
# otherwise requested through a hidden prompt (getpass does not echo input).
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token)

In [None]:
# Mount Google Drive at /content/gdrive, then change the working directory to
# the exam project folder so that the relative 'data/...' paths used by the
# later cells resolve inside that folder.
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/SDU/DS821 - Market Sentiment Analysis/News_and_Market_Sentiment_Analytics-main/exam'

Mounted at /content/gdrive


In [None]:
class StockPostFilterLLM:
    """Ask a text-generation LLM whether a Reddit post could influence GME stock.

    The instance builds a Hugging Face ``text-generation`` pipeline once at
    construction time and is then used as a callable: ``llm_filter(post)``
    returns the text the model generated in answer to the prompt.

    Attributes:
        model: Model identifier passed to ``pipeline``.
        context: Instruction text placed in front of every post.
        pipe: The loaded text-generation pipeline.
    """

    def __init__(self, model="mistralai/Mistral-7B-v0.1"):
        """Store the model name, define the prompt context and load the model.

        Args:
            model: Hugging Face model identifier to load.
        """
        self.model = model

        # Context for the model
        self.context = """
          You are a financial expert. Your task is to determine if a Reddit post could influence GME stock.
          Look for mentions of GME, price predictions, news, or strong bullish/bearish sentiment.
          Answer 'yes' if the post is likely to impact GME's price or trading activity, otherwise answer 'no'."""

        print("Starting model loading...")
        self.pipe = self.load_model(self.model)
        print("Model loaded successfully!")

    def load_model(self, model):
        """
        Load the pretrained language model using Hugging Face pipeline.

        Args:
            model: Hugging Face model identifier to load.

        Returns:
            A ``text-generation`` pipeline created with float16 weights and
            ``device_map="auto"``.
        """
        print(f"Loading model '{model}' with device_map='auto'...")
        pipe = pipeline(
            "text-generation",
            model=model,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        print("Pipeline initialized.")
        return pipe

    def construct_prompt(self, post):
        """
        Construct a prompt by combining the system context and the Reddit post.

        Args:
            post: Text of the Reddit post; it is interpolated into the prompt
                with an f-string, so any value is formatted via ``str()``.

        Returns:
            The prompt string: context, the quoted post, then ``Response:``.
        """
        print("Constructing the prompt...")
        prompt = f"{self.context}\nReddit post: '{post}'\nResponse:"
        print("Prompt constructed.")
        return prompt

    def __call__(self, post, temperature=0.5, return_full_text=False):
        """
        Generate a response to determine if the post might influence the stock market.

        Args:
            post: Text of the Reddit post to evaluate.
            temperature: Sampling temperature forwarded to the pipeline.
            return_full_text: Forwarded to the pipeline. With the default
                ``False`` only the newly generated continuation is returned;
                pass ``True`` to get the prompt followed by the continuation.

        Returns:
            The ``generated_text`` of the first sequence the pipeline returns.
        """
        print("Calling LLM with the post...")
        prompt = self.construct_prompt(post)
        print("Starting text generation...")
        # Text-generation pipelines echo the prompt in front of the completion
        # unless told otherwise. Without return_full_text=False the stored
        # "answer" would be the whole context + post + completion.
        outputs = self.pipe(
            prompt,
            max_new_tokens=15,
            do_sample=True,
            temperature=temperature,
            top_p=0.95,
            return_full_text=return_full_text,
        )
        print("Text generation complete!")
        return outputs[0]["generated_text"]

In [None]:
# Initialize the LLM model
llm_filter = StockPostFilterLLM(model="mistralai/Mistral-7B-v0.1")

CHECKPOINT_PATH = 'data/df_gme_sentiment.csv'    # periodic progress saves
FINAL_PATH = 'data/df_gme_sentiment_fixed.csv'   # complete result
# Set to False to ignore an existing checkpoint and start from scratch
# (e.g. after changing the prompt or the model).
RESUME_FROM_CHECKPOINT = True

# Set a save interval to improve performance
SAVE_INTERVAL = 10  # Save progress every 10 rows


def _load_posts():
    """Return the frame to process: the saved checkpoint if resuming, else fresh GME posts."""
    if RESUME_FROM_CHECKPOINT:
        try:
            resumed = pd.read_csv(CHECKPOINT_PATH)
            print(f"Resuming from checkpoint '{CHECKPOINT_PATH}'.")
            return resumed
        except FileNotFoundError:
            pass  # no checkpoint yet: fall through to a fresh start

    fresh = pd.read_csv('data/rd_clean.csv', sep=',')
    # .copy() so the new column is added to an independent frame rather than
    # to a slice of the original (avoids SettingWithCopyWarning).
    fresh = fresh[fresh['gme'] == 1].copy()
    fresh["sentiment"] = None  # New column to store results
    return fresh


df = _load_posts()
# The column holds strings (model output or "Error"), so keep it as object
# dtype; a float NaN column would be an incompatible dtype for those values.
df["sentiment"] = df["sentiment"].astype("object")

processed_count = 0  # Track processed rows for saving
total_rows = len(df)

# Loop through the rows. `position` is the 1-based row position used for the
# progress message; `i` is the index label used to write back into the frame.
for position, (i, row) in enumerate(df.iterrows(), start=1):
    torch.cuda.empty_cache()  # Clear GPU cache to free memory

    # Skip already processed rows (populated when resuming from a checkpoint)
    if not pd.isna(row['sentiment']):
        continue

    print(f"\nProcessing row {position}/{total_rows}...")

    try:
        # Generate sentiment
        sentiment_result = llm_filter(row['body'])
        df.at[i, 'sentiment'] = sentiment_result  # Update DataFrame
        print(f"Post: {row['body']}\nSentiment: {sentiment_result}")

        # Increment processed count
        processed_count += 1

        # Save progress in batches
        if processed_count % SAVE_INTERVAL == 0:
            df.to_csv(CHECKPOINT_PATH, index=False)
            print(f"Progress saved after {processed_count} rows.")

    except Exception as e:
        # Best effort: record the failure for this row and keep going.
        print(f"Error processing row {i}: {e}")
        df.at[i, 'sentiment'] = "Error"  # Log error in the sentiment column
        torch.cuda.empty_cache()  # Clear cache after failure to ensure stability

# Final save
df.to_csv(FINAL_PATH, index=False)
print("Sentiment analysis complete!")
