In [1]:
# !git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git

In [None]:
# Colab-only setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Project imports through the `temporal_belief_analysis` package path
# (the directory name produced by the commented-out `git clone` in the first cell).
# NOTE(review): presumably requires the clone to sit in the working directory — confirm.
# NOTE(review): STANCE_LABELS, used as the default in StanceDetector.__init__,
# is not imported here — confirm where it is expected to come from.
from temporal_belief_analysis.src.temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from temporal_belief_analysis.src.temporal_belief.utils.config import TOPIC_STANCE_HYPOTHESES, GENERAL_STANCE_HYPOTHESES, ProjectConfig

# Restart after installing:
!pip install convokit

In [1]:
# Load the ConvoKit corpus into the notebook-global `corpus` used by the cells below.
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit
# Alternative (disabled): fetch the corpus by name through ConvoKit's `download`.
# corpus = Corpus(filename=download("subreddit-PoliticalDiscussion"))
# NOTE(review): absolute path under one user's home directory; this cell will not
# run unchanged on another machine (Colab, runpod, ...).
corpus = Corpus(filename="/Users/leonidas/.convokit/saved-corpora/PoliticalDiscussion_test_100conv_20250618_110426")
# NOTE(review): the disabled path below has no leading '/', so it would be resolved
# relative to the working directory if re-enabled.
# corpus = Corpus(filename="Users/leonidas/GitHub/temporal_belief_analysis/data/reddit-corpus-small")

  import pkg_resources


In [None]:
# For runpod-jupyter or local runs: make the project's `src` directory importable.
import sys
import os

# Absolute path to the src directory, resolved as <current working directory>/../src
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    # Prepend (rather than append) so this path is searched first; the membership
    # check keeps re-running the cell from adding duplicates.
    sys.path.insert(0, src_path)

# Same names as the Colab cell imports, but rooted at the `temporal_belief` package.
# NOTE(review): a later cell imports `src.temporal_belief.utils.logger`, i.e. a
# different root package — confirm both import styles resolve in the same environment.
from temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from temporal_belief.utils.config import TOPIC_STANCE_HYPOTHESES, GENERAL_STANCE_HYPOTHESES, ProjectConfig

In [None]:
def prepare_text(text, max_length=500):
    """Strip surrounding whitespace and cap the text length.

    Args:
        text: Raw utterance text. ``None`` is treated as empty text.
        max_length: Maximum number of characters kept from the stripped text.
            Defaults to 500, the limit that was previously hard-coded.

    Returns:
        The stripped text. If the stripped text is longer than ``max_length``
        it is cut to ``max_length`` characters and ``"..."`` is appended.
    """
    if text is None:
        # Callers check for an empty result afterwards, so map a missing text to "".
        return ""
    clean_text = text.strip()
    if len(clean_text) > max_length:
        clean_text = clean_text[:max_length] + "..."
    return clean_text

In [14]:
def get_contextual_framing_for_topic(topic, text):
    """Wrap a comment in a sentence that states the discussion topic.

    Args:
        topic: Topic name to mention in the framing. Any falsy value
            (``None``, ``""``) selects the generic political framing.
        text: The comment text to embed in the framing sentence.

    Returns:
        ``"In a discussion about {topic}, this comment states: {text}"`` when a
        topic is given, otherwise
        ``"In a political discussion, this comment states: {text}"``.
    """
    # Use the `text` parameter: the previous body read an undefined `clean_text`.
    if topic:
        return f"In a discussion about {topic}, this comment states: {text}"
    return f"In a political discussion, this comment states: {text}"

In [6]:
# Helper function. Return candidate stances based on the topic:
# Pass in the topic
def get_stance_hypotheses_for_topic(topic):
    """Pick the stance hypotheses that match a topic name.

    The topic is lower-cased and searched for keywords by substring match.
    The first keyword group that matches selects an entry of
    ``TOPIC_STANCE_HYPOTHESES``; if none matches (including a falsy topic),
    ``GENERAL_STANCE_HYPOTHESES`` is returned.
    """
    normalized = "" if not topic else topic.lower()

    # (keywords, TOPIC_STANCE_HYPOTHESES key) pairs, checked in order; first hit wins.
    keyword_table = (
        (('abortion', 'reproductive', 'pro-choice', 'pro-life'),
         'abortion and reproductive rights'),
        (('gun', 'firearm', 'second amendment', 'weapon'),
         'gun rights and control'),
        (('immigration', 'border', 'refugee', 'immigrant'),
         'immigration'),
        (('healthcare', 'medical', 'insurance', 'medicare'),
         'healthcare'),
        (('climate', 'environment', 'carbon', 'global warming'),
         'climate change'),
    )

    for keywords, hypotheses_key in keyword_table:
        for keyword in keywords:
            if keyword in normalized:
                return TOPIC_STANCE_HYPOTHESES[hypotheses_key]

    # Unknown topic: fall back to the general hypotheses.
    return GENERAL_STANCE_HYPOTHESES

In [4]:
import re
# Helper function:

def mark_quotes(text):
    """Replace ConvoKit quote markers with standard quotation marks.

    A run of consecutive lines whose stripped form starts with ``&gt;`` is
    treated as one quotation: the marker is removed from every line of the run,
    an opening ``"`` is placed directly before the first line's text and a
    closing ``"`` directly after the last line's text. All other lines are
    passed through unchanged.

    Args:
        text: Utterance text, possibly containing ``&gt;``-prefixed lines.
            ``None`` or an empty string yields ``""``.

    Returns:
        The text with quote runs wrapped in double quotes, lines joined by
        newlines.
    """
    if not text:
        return ""

    marker = '&gt;'
    result_lines = []
    in_quote = False

    for line in text.split('\n'):
        if line.strip().startswith(marker):
            # The marker is the first non-blank token on this line, so slicing it
            # off the left-stripped line removes exactly that occurrence.
            content = line.lstrip()[len(marker):].lstrip()
            if not in_quote:
                # First line of a quote run: the opening quote hugs the text.
                content = '"' + content
                in_quote = True
            result_lines.append(content)
            continue

        if in_quote:
            # `in_quote` guarantees the previous entry is the last quoted line and
            # has not been closed yet, so always close it. Checking whether the
            # line already ends with '"' would wrongly skip quotes whose own text
            # ends in a quotation mark, or that are empty.
            result_lines[-1] = result_lines[-1].rstrip() + '"'
            in_quote = False

        # Non-quote lines (including empty ones) are kept as they are.
        result_lines.append(line)

    # Quote run that extends to the end of the text.
    if in_quote:
        result_lines[-1] = result_lines[-1].rstrip() + '"'

    return '\n'.join(result_lines)

In [5]:
"""Stance detection functionality for conversation analysis."""

import logging
from typing import List, Dict, Any, Optional
from tqdm import tqdm
from src.temporal_belief.utils.logger import setup_logging

logger = setup_logging("DEBUG")

class StanceDetector:
    """Detect political stance in ConvoKit utterances using BART."""

    def __init__(self, stance_labels: Optional[List[str]] = None,
                 config: Optional[ProjectConfig] = None):
        """Initialize stance detector.

        Args:
            stance_labels: Candidate labels handed to the classifier. Falls
                back to ``STANCE_LABELS`` when omitted or empty.
            config: Project configuration; a default ``ProjectConfig()`` is
                built when omitted. Its ``bart_model_name`` selects the model.
        """
        self.config = config or ProjectConfig()
        self.classifier = BARTZeroShotClassifier(self.config.bart_model_name)
        # NOTE(review): STANCE_LABELS is not imported in any visible cell; the
        # default path raises NameError on a fresh kernel unless it is brought
        # into scope elsewhere — confirm its source module.
        self.stance_labels = stance_labels or STANCE_LABELS
        logger.info(f"Initialized stance detector with labels: {self.stance_labels}")


    def detect_utterance_stance(self, utterance) -> Dict[str, Any]:
        """Detect stance for a single utterance.

        The utterance text is cleaned with ``prepare_text``, framed with its
        ``thread_topic`` metadata (generic framing when the key is absent) and
        classified against ``self.stance_labels``.

        Args:
            utterance: Object exposing ``.text`` and a mapping-like ``.meta``.

        Returns:
            Dict with keys ``"stance"``, ``"confidence"`` and ``"all_scores"``.
            Empty, ``[removed]``, ``[deleted]`` or ``.`` texts are not sent to
            the classifier and yield ``"unknown"`` with confidence ``0.0`` and
            empty scores.
        """
        # text = mark_quotes(utterance.text)
        text = prepare_text(utterance.text)

        # Bail out before any topic lookup or framing: placeholder texts carry
        # nothing to classify. `all_scores` is included so callers can read the
        # same keys on every result.
        # NOTE(review): `{}` assumes classifier scores are a mapping — confirm.
        if not text or text in ('[removed]', '[deleted]', '.'):
            # logger.warning(f"No utterance found in {utterance.id}")
            return {"stance": "unknown", "confidence": 0.0, "all_scores": {}}

        # `.get` keeps utterances without a thread topic classifiable; the framing
        # helper falls back to a generic sentence for a falsy topic.
        topic = utterance.meta.get('thread_topic')
        contextual_text = get_contextual_framing_for_topic(topic, text)

        # Classify the topic-framed text (previously it was built but never used).
        result = self.classifier.classify_text(contextual_text, self.stance_labels)
        return {
            "stance": result["label"],
            "confidence": result["confidence"],
            "all_scores": result["all_scores"]
        }

    def process_corpus_utterances(self, corpus, batch_size: int = 50,
                              save_path: Optional[str] = None) -> None:
        """Process all utterances in corpus for stance detection.

        Each utterance receives the metadata keys ``detected_stance``,
        ``stance_confidence`` and ``stance_scores``. An utterance whose
        processing raises is logged and marked ``"unknown"`` instead of
        aborting the run.

        Args:
            corpus: Corpus providing ``iter_utterances()`` and ``dump()``.
            batch_size: Number of utterances per progress-bar step.
            save_path: When given, the corpus is dumped there afterwards.
        """
        utterances = list(corpus.iter_utterances())
        logger.info(f"Processing {len(utterances)} utterances for stance detection")

        for i in tqdm(range(0, len(utterances), batch_size),
                      desc="Processing utterances"):
            batch = utterances[i:i+batch_size]

            for utt in batch:
                try:
                    stance_result = self.detect_utterance_stance(utt)

                    # Add to utterance metadata
                    utt.add_meta("detected_stance", stance_result["stance"])
                    utt.add_meta("stance_confidence", stance_result["confidence"])
                    utt.add_meta("stance_scores", stance_result["all_scores"])

                except Exception as e:
                    # Best effort: one bad utterance must not stop the whole corpus.
                    logger.error(f"Failed to process utterance {utt.id}: {e}")
                    utt.add_meta("detected_stance", "unknown")
                    utt.add_meta("stance_confidence", 0.0)
                    # Keep the metadata keys uniform across all utterances.
                    utt.add_meta("stance_scores", {})

        if save_path:
            corpus.dump(save_path)
            logger.info(f"Saved processed corpus to {save_path}")

        logger.info("Stance detection processing complete")

In [13]:
# Play around with a single part: run the detector on one utterance (index 1).
# print(corpus.random_utterance().text)
stance_detector = StanceDetector()
utts = list(corpus.iter_utterances())
utt_text = utts[1].text
# NOTE: the quote-marked text is only consumed by the disabled print below;
# detect_utterance_stance receives the utterance object, not `utt_text`.
utt_text = mark_quotes(utt_text)
# print(utt_text)
stance = stance_detector.detect_utterance_stance(utts[1])
# NOTE: the result dict has the keys 'stance', 'confidence' and 'all_scores';
# it has no 'thread_topic' key, so the disabled line below would raise KeyError.
# print(stance['thread_topic'])
# The topic lives on the utterance metadata instead.
print(utts[1].meta['thread_topic'])

Device set to use mps
2025-06-28 15:59:39,027 - temporal_belief.models.bart_classifier - INFO - bart_classifier:35 - Loaded BART model: facebook/bart-large-mnli on device: CPU
2025-06-28 15:59:39,033 - temporal_belief - INFO - 2224113277:20 - Initialized stance detector with labels: ['strongly liberal', 'moderately liberal', 'neutral', 'moderately conservative', 'strongly conservative']


abortion and reproductive rights


In [None]:
# Testing 'process_corpus_utterances()': annotate every utterance in `corpus`
# with stance metadata, then dump the corpus to SAVE_PATH.
# NOTE(review): absolute, environment-specific path (looks like a runpod
# workspace) — confirm before running elsewhere.
SAVE_PATH = "/workspace/temporal_belief_analysis/pd_corpus_with_stances"
stance_detector = StanceDetector()
stance_detector.process_corpus_utterances(corpus, save_path=SAVE_PATH)

In [None]:
# Check if metadata gets added: inspect the first utterance for the keys that
# process_corpus_utterances writes ('detected_stance', 'stance_confidence').
utterances = list(corpus.iter_utterances())
first_utt = utterances[0]
print(f"First utterance ID: {first_utt.id}")
print(f"Has stance metadata: {'detected_stance' in first_utt.meta}")
# Only read the values when the key is present, so the cell also runs on an
# unprocessed corpus.
if 'detected_stance' in first_utt.meta:
    print(f"Stance: {first_utt.meta['detected_stance']}")
    print(f"Confidence: {first_utt.meta['stance_confidence']}")

In [5]:
# Testing 'detect_utterance_stance()': log stance and confidence for the first
# few utterances of `corpus`.
logger = setup_logging("DEBUG")

N_PREVIEW = 30  # how many utterances to inspect
# NOTE(review): placeholder cut-off for the low-confidence warning — tune it, or
# take it from the project configuration if one is defined there.
LOW_CONFIDENCE_THRESHOLD = 0.5

utts = list(corpus.iter_utterances())
stance_detector = StanceDetector()

# Slice instead of indexing up to a fixed count so that corpora with fewer than
# N_PREVIEW utterances do not raise IndexError.
for utt in utts[:N_PREVIEW]:
    logger.info("=" * 100)
    logger.info(f"Comment: {utt.text}\n")
    stance = stance_detector.detect_utterance_stance(utt)
    logger.info(f"Stance: {stance['stance']}")
    logger.info(f"Confidence: {stance['confidence']}")
    # Warn only when the score is actually low (it used to fire on every utterance).
    if stance['confidence'] < LOW_CONFIDENCE_THRESHOLD:
        logger.warning(f"Low confidence score: {stance['confidence']}")

# Release the materialised utterance list.
del utts
# Test it saves properly:
# corpus_small.dump("/content/drive/MyDrive/MScProject/Corpora/corpus_small")


Device set to use mps
2025-06-28 12:11:05,913 - temporal_belief.models.bart_classifier - INFO - bart_classifier:35 - Loaded BART model: facebook/bart-large-mnli on device: CPU
2025-06-28 12:11:05,914 - temporal_belief - INFO - 2224113277:20 - Initialized stance detector with labels: ['strongly liberal', 'moderately liberal', 'neutral', 'moderately conservative', 'strongly conservative']
2025-06-28 12:11:05,915 - temporal_belief - INFO - 847246108:8 - Comment: 

2025-06-28 12:11:05,915 - temporal_belief - INFO - 847246108:10 - Stance: unknown
2025-06-28 12:11:05,915 - temporal_belief - INFO - 847246108:11 - Confidence: 0.0
2025-06-28 12:11:05,916 - temporal_belief - INFO - 847246108:8 - Comment: One of the major arguments in favor of abortion is respect for the mother's bodily autonomy. There are many arguments against abortion, but this is the first one I've found that highlights how anti-feminist this rhetoric actually is because it seeks to identify the male as sexually normative. Th