In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import re

In [None]:

class ContextualSentimentAnalyzer:
    """
    Label Reddit posts about GME stock as 'bullish', 'bearish' or 'neutral'
    by prompting a chat-tuned causal language model.

    Besides the three sentiment labels, methods may return 'uncertain'
    (no label could be found, or there was no text to classify) and 'error'
    (an exception occurred while processing).
    """

    # Text that precedes the assistant turn in a full chat transcript,
    # as expected by extract_sentiment.
    _ASSISTANT_MARKER = "<|assistant|>"

    # Finds a word that starts with one of the three labels ("bullish",
    # "bullishness", "neutrality", ...); group 1 is the canonical label.
    _LABEL_PATTERN = re.compile(r"\b(bullish|bearish|neutral)\w*")

    def __init__(self, model="HuggingFaceH4/zephyr-7b-beta", save_interval=10):
        """
        Load the model and tokenizer and build the system prompt.

        Args:
            model: Model identifier passed to AutoTokenizer.from_pretrained
                and AutoModelForCausalLM.from_pretrained.
            save_interval: Number of newly labelled rows between progress
                saves in process_dataframe. Must be at least 1.

        Raises:
            ValueError: If save_interval is smaller than 1.
        """
        # Fail before the (slow) model load instead of hitting a
        # ZeroDivisionError in process_dataframe after the first row.
        if save_interval < 1:
            raise ValueError(f"save_interval must be >= 1, got {save_interval!r}")

        self.model_name = model
        self.save_interval = save_interval
        print(f"Loading model: {self.model_name}")

        # Load the model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto", torch_dtype=torch.bfloat16)

        print("Model and tokenizer loaded successfully!")

        # System context for the task
        self.context = (
            "You are analyzing Reddit posts about GME stock to determine their sentiment.\n"
            "Classify each post strictly as one of the following:\n"
            "- Bullish: Positive sentiment, optimism, or support for GME.\n"
            "- Bearish: Negative sentiment, pessimism, or doubt about GME.\n"
            "- Neutral: No strong positive or negative sentiment.\n\n"
            "Respond only with 'bullish', 'bearish', or 'neutral'."
        )

    @staticmethod
    def _is_blank(post):
        """Return True when post is None, a missing value (NaN/NA) or whitespace-only text."""
        if isinstance(post, str):
            return not post.strip()
        if post is None:
            return True
        try:
            return bool(pd.isna(post))
        except (TypeError, ValueError):
            # Non-scalar values (lists, arrays) have no single truth value here.
            return False

    def _label_from_text(self, text):
        """Return the first canonical label mentioned in text, or 'uncertain' if there is none."""
        match = self._LABEL_PATTERN.search(text.lower())
        return match.group(1) if match else "uncertain"

    def extract_sentiment(self, response):
        """
        Extract the sentiment label from a full chat transcript.

        Only the text after the first '<|assistant|>' marker is inspected.

        Args:
            response: Decoded transcript containing the '<|assistant|>' marker.

        Returns:
            'bullish', 'bearish' or 'neutral' (variants such as 'Bullishness'
            are normalised to the canonical label); 'uncertain' when the
            marker or a label is missing; 'error' when response is not a string.
        """
        if not isinstance(response, str):
            print(f"Error extracting sentiment: expected str, got {type(response).__name__}")
            return "error"

        _, marker, reply = response.partition(self._ASSISTANT_MARKER)
        if not marker:
            return "uncertain"  # Default if <|assistant|> is missing
        return self._label_from_text(reply)

    def analyze_post(self, post):
        """
        Classify a single post with contextual prompting.

        Args:
            post: Text of the Reddit post.

        Returns:
            'bullish', 'bearish', 'neutral', 'uncertain' (blank/missing post
            or no label in the model reply) or 'error' (any exception raised
            while tokenizing, generating or decoding; the message is printed).
        """
        # Nothing to classify: do not spend a generation on "nan" or "".
        if self._is_blank(post):
            return "uncertain"

        try:
            # Format the messages in chat format
            messages = [
                {"role": "system", "content": self.context},
                {"role": "user", "content": f"Reddit Post: '{post}'\nSentiment:"}
            ]
            # add_generation_prompt appends the assistant header so that all
            # new tokens are spent on the answer rather than on the header.
            model_inputs = self.tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(self.model.device)

            # Greedy decoding; no temperature, since it is unused without sampling.
            generated_ids = self.model.generate(model_inputs, max_new_tokens=20, do_sample=False)

            # The output starts with the prompt tokens; keep only the
            # continuation so that the prompt (which itself contains the
            # words 'bullish', 'bearish' and 'neutral') can never be
            # mistaken for the answer, whatever the chat template looks like.
            new_tokens = generated_ids[0, model_inputs.shape[-1]:]
            completion = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"DEBUG: Raw Model Response: '{completion}'")
            print("DEBUG_END")

            return self._label_from_text(completion)

        except Exception as e:
            # Deliberate best effort: one failing post must not abort a long batch run.
            print(f"Error processing post: {e}")
            return "error"

    def process_dataframe(self, df, text_column, output_file):
        """
        Label every row of df that has no sentiment yet, saving progress periodically.

        df is modified in place: a 'sentiment' column is added if absent and
        filled row by row. Rows whose 'sentiment' is already non-null
        (including 'uncertain' and 'error') are skipped.

        Args:
            df: DataFrame holding the posts.
            text_column: Name of the column containing the post text.
            output_file: CSV path written every save_interval newly labelled
                rows and once more at the end (without the index).
        """
        if 'sentiment' not in df.columns:
            df['sentiment'] = None
        # A column that contains only NaN (e.g. read back from a CSV) is
        # float64; string labels need an object column.
        df['sentiment'] = df['sentiment'].astype(object)

        sentiment_col = df.columns.get_loc('sentiment')
        total_rows = len(df)
        processed_count = 0

        # Track the row position separately from the index label: the label
        # may be non-numeric, non-contiguous or duplicated.
        for position, (_, row) in enumerate(df.iterrows()):
            if pd.notna(row['sentiment']):
                continue  # Skip already processed rows

            torch.cuda.empty_cache()  # Free up GPU memory (optional)

            print(f"\nProcessing row {position + 1}/{total_rows}...")
            sentiment = self.analyze_post(row[text_column])
            df.iloc[position, sentiment_col] = sentiment
            print(f"Post: {row[text_column]}\nSentiment: {sentiment}")

            processed_count += 1

            # Save progress every save_interval rows
            if processed_count % self.save_interval == 0:
                df.to_csv(output_file, index=False)
                print(f"Progress saved after {processed_count} rows.")

        # Final save
        df.to_csv(output_file, index=False)
        print("Sentiment analysis complete! Final file saved.")

In [None]:
# Input posts and the labelled output; the output file doubles as the
# checkpoint that process_dataframe writes while it runs.
input_file = 'data/sentiment_ready_2.csv'
output_file = "data/sentiment_done_2.csv"
text_column = 'post'

# Resume from the checkpoint when one exists, so that a restarted kernel
# does not re-label rows that were already saved. Delete the output file
# to start from scratch.
try:
    df = pd.read_csv(output_file)
    print(f"Resuming from existing progress in {output_file}")
except FileNotFoundError:
    df = pd.read_csv(input_file)

# Check the schema before the expensive model load below.
if text_column not in df.columns:
    raise KeyError(f"Column '{text_column}' not found; available columns: {list(df.columns)}")

# Initialize the ContextualSentimentAnalyzer
sentiment_analyzer = ContextualSentimentAnalyzer(
    model="HuggingFaceH4/zephyr-7b-beta",
    save_interval=10
)
# Process the DataFrame (adds/fills the 'sentiment' column of df in place)
sentiment_analyzer.process_dataframe(df=df, text_column=text_column, output_file=output_file)


# Display the updated DataFrame
print(df.head())