In [12]:
!pip install convokit

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [13]:
# For runpod-jupyter or local (run twice)
import sys
import os

# Change to the correct working directory (same as Jupyter)
os.chdir('/workspace/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

from temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from temporal_belief.utils.config import TOPIC_STANCE_HYPOTHESES, GENERAL_STANCE_HYPOTHESES, ProjectConfig
from temporal_belief.data.preprocessors import StancePreprocessor

Changed working directory to: /workspace/temporal_belief_analysis/notebooks


In [None]:
# For colab:
from google.colab import drive
drive.mount('/content/drive')

# imports:
from temporal_belief_analysis.src.temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from temporal_belief_analysis.src.temporal_belief.utils.config import TOPIC_STANCE_HYPOTHESES, GENERAL_STANCE_HYPOTHESES, ProjectConfig

# Restart after installing:
!pip install convokit

In [4]:
# Unzip with python:
import zipfile
zipfile.ZipFile("/workspace/temporal_belief_analysis/pd_corpus_with_topics.zip").extractall("/workspace/temporal_belief_analysis")

In [None]:
# Download file from Google Drive or cloud service
!pip install gdown
file_id = "1nWaj5N8nsG7u5homv_kAh4CLPDv01M_Z"
!gdown "https://drive.google.com/file/d/1nWaj5N8nsG7u5homv_kAh4CLPDv01M_Z/view?usp=share_link" -O "/workspace/temporal_belief_analysis/pd_corpus_with_topics.zip" --fuzzy

In [14]:
# Run twice
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit

In [None]:
# Load corpus:
corpus = Corpus(filename="/workspace/temporal_belief_analysis/pd_corpus_with_stances100000_chronological")

In [6]:
# Helper functions:
import re

def prepare_text(text):
    clean_text = text.strip()
    if len(clean_text) > 500:
        clean_text = clean_text[:500] + "..."
    return clean_text
    
def get_contextual_framing_for_topic(topic, text):
    if topic:
        contextual_text = f"In a discussion about {topic}, this comment states: {text}"
    else:
        contextual_text = f"In a political discussion, this comment states: {text}"
    return contextual_text

def mark_quotes(text):
    """Replace ConvoKit quote markers with standard quotation marks."""

    # Split text into lines for processing
    lines = text.split('\n')
    result_lines = []
    in_quote = False

    for line in lines:
        # Check if line starts a quote (begins with &gt;)
        if line.strip().startswith('&gt;'):
            if not in_quote:
                # Start of new quote - replace &gt; with opening quote
                cleaned_line = line.replace('&gt;', '"', 1).lstrip()
                result_lines.append(cleaned_line)
                in_quote = True
            else:
                # Continuation of quote - just remove &gt;
                cleaned_line = line.replace('&gt;', '', 1).lstrip()
                result_lines.append(cleaned_line)

        # Check if we're ending a quote (empty line or no more &gt; markers)
        elif in_quote and (line.strip() == '' or not line.strip().startswith('&gt;')):
            # End the quote by adding closing quote to previous line
            if result_lines and not result_lines[-1].strip().endswith('"'):
                result_lines[-1] = result_lines[-1].rstrip() + '"'

            # Add current line if it's not empty
            if line.strip():
                result_lines.append(line)
            else:
                result_lines.append(line)  # Keep empty lines

            in_quote = False

        else:
            # Regular line, not in quote
            result_lines.append(line)

    # Handle case where quote goes to end of text
    if in_quote and result_lines and not result_lines[-1].strip().endswith('"'):
        result_lines[-1] = result_lines[-1].rstrip() + '"'

    return '\n'.join(result_lines)

In [23]:
"""Stance detection functionality for conversation analysis."""
import numpy as np
import logging
from typing import List, Dict, Any, Optional
from tqdm import tqdm
from temporal_belief.utils.logger import setup_logging

logger = setup_logging("DEBUG")

class StanceDetector:
    """Detect political stance in ConvoKit utterances using BART."""

    def __init__(self, stance_labels: Optional[List[str]] = None,
                 config: ProjectConfig = None):
        """Initialize stance detector."""
        self.config = config or ProjectConfig()
        self.classifier = BARTZeroShotClassifier(self.config.bart_model_name)
        self.stance_labels = stance_labels or TOPIC_STANCE_HYPOTHESES
        # logger.info(f"Initialized stance detector with labels: {self.stance_labels}")


    def detect_utterance_stance(self, utterance, corpus) -> Dict[str, Any]:
        """Detect stance for a single utterance."""
        # text = mark_quotes(utterance.text)
        if not utterance.text or utterance.text == '[removed]' or utterance.text == '[deleted]' or utterance.text.strip() == '.':
            # logger.warning(f"No utterance found in {utterance.id}")
            return {"stance": "unknown", "confidence": 0.0}
        convo = corpus.get_conversation(utterance.conversation_id)
        topic = convo.meta['detected_topic']
        clean_text = StancePreprocessor.prepare_text(utterance.text)
        text = StancePreprocessor.get_contextual_framing_for_topic(topic, clean_text)
        # MAKE SURE THE TOPIC KEYS MATCH AND IF NOT RETURN A GENERAL
        stance_hypotheses = TOPIC_STANCE_HYPOTHESES.get(topic, GENERAL_STANCE_HYPOTHESES)
        # logger.info(f"Text: {text}")

        #
        # # Pass one of the candidate stances
        # result = self.classifier.classify_text(text, self.stance_labels)
        # return {
        #     "stance": result["label"],
        #     "confidence": result["confidence"],
        #     "all_scores": result["all_scores"]
        # }

        confidence_threshold = 0.25
        stance_results = {}
        template_consistency_scores = {}

        for stance, hypotheses in stance_hypotheses.items():
            stance_scores = []

            # Test each hypothesis template for this stance
            for hypothesis in hypotheses:
                result = self.classifier.classify_text(text, [hypothesis])
                # Get score for this specific hypothesis
                stance_scores.append(result["all_scores"].get(hypothesis, 0.0))

            # Average across templates for this stance
            avg_confidence = np.mean(stance_scores)
            stance_results[stance] = avg_confidence

            # Measure consistency across templates (lower std = more consistent)
            template_consistency_scores[stance] = 1.0 - (np.std(stance_scores) / (np.mean(stance_scores) + 1e-8))

        # Find best stance
        best_stance = max(stance_results.keys(), key=lambda k: stance_results[k])
        best_confidence = stance_results[best_stance]
        overall_consistency = np.mean(list(template_consistency_scores.values()))

        # Apply confidence threshold
        if best_confidence < confidence_threshold:
            best_stance = 'neutral'
            best_confidence = stance_results.get('neutral', 0.0)

        return {
            'stance': best_stance,
            'confidence': best_confidence,
            'all_scores': stance_results,
            'method_used': 'multi_template_spinos',
            'template_consistency': overall_consistency,
            'reliable': best_confidence > confidence_threshold and overall_consistency > 0.7,
            'topic_context': topic
        }

    def process_corpus_utterances(self, corpus, batch_size: int = 50,
                                  max_utterances: Optional[int] = None,
                              save_path: Optional[str] = None) -> None:
        """Process all utterances in corpus for stance detection."""
        sorted_utts = sorted(list(corpus.iter_utterances()), key=lambda utt: utt.timestamp)
        all_utterances = sorted_utts

        if max_utterances is not None:
            utterances = all_utterances[:max_utterances]
            # logger.info(f"Processing {len(utterances)} of {len(all_utterances)} total utterances")
        else:
            utterances = all_utterances
            # logger.info(f"Processing all {len(utterances)} utterances for stance detection")

        for i in tqdm(range(0, len(utterances), batch_size),
                      desc="Processing utterances"):
            batch = utterances[i:i+batch_size]

            for utt in batch:
                try:
                    stance_result = self.detect_utterance_stance(utt, corpus)

                    # Add to utterance metadata
                    utt.add_meta("detected_stance", stance_result["stance"])
                    utt.add_meta("stance_confidence", stance_result["confidence"])
                    utt.add_meta("stance_scores", stance_result["all_scores"])

                except Exception as e:
                    # logger.error(f"Failed to process utterance {utt.id}: {e}")
                    utt.add_meta("detected_stance", "unknown")
                    utt.add_meta("stance_confidence", 0.0)

        if save_path:
            corpus.dump(save_path)
            # logger.info(f"Saved processed corpus to {save_path}")

        # logger.info("Stance detection processing complete")


In [16]:
# Initialize with 4 GPUs
stance_detector = StanceDetector(num_gpus=1)

# Process corpus (keep your existing parameters)
SAVE_PATH = "/workspace/temporal_belief_analysis/pd_corpus_with_stances50_chronological"
stance_detector.process_corpus_utterances(corpus, max_utterances=50, save_path=SAVE_PATH)

Device set to use cuda:0
2025-07-29 14:50:03,458 - temporal_belief.models.bart_classifier - INFO - bart_classifier:49 - Loaded single classifier on device: GPU
2025-07-29 14:50:03,496 - temporal_belief - INFO - 585778893:19 - Initialized stance detector with 1 GPUs
2025-07-29 14:50:12,284 - temporal_belief - INFO - 585778893:80 - Processing 50 of 4655894 total utterances
Processing utterances: 100%|██████████| 1/1 [00:07<00:00,  7.32s/it]
2025-07-29 14:52:45,239 - temporal_belief - INFO - 585778893:92 - Saved processed corpus to /workspace/temporal_belief_analysis/pd_corpus_with_stances50_chronological
2025-07-29 14:52:45,242 - temporal_belief - INFO - 585778893:94 - Stance detection processing complete


In [24]:
# Testing 'process_corpus_utterances()'
SAVE_PATH = "/workspace/temporal_belief_analysis/pd_corpus_with_stances20000_chronological_old"
stance_detector = StanceDetector()
stance_detector.process_corpus_utterances(corpus, max_utterances=20000, save_path=SAVE_PATH)

Device set to use cuda:0
2025-07-29 15:08:06,565 - temporal_belief.models.bart_classifier - INFO - bart_classifier:49 - Loaded single classifier on device: GPU
Processing utterances:  13%|█▎        | 53/400 [06:01<39:24,  6.81s/it]


KeyboardInterrupt: 

In [None]:
processed_convos = list(corpus.iter_conversations())
print(f"Processed: {processed_convos[1].meta['detected_topic']}\n")

In [None]:
# Play around with a single part
# print(corpus.random_utterance().text)
stance_detector = StanceDetector()
utts = list(corpus.iter_utterances())
utt_text = utts[0].text
# utt_text = mark_quotes(utt_text)
# print(utt_text)
stance = stance_detector.detect_utterance_stance(utts[0])
print(stance['stance'])
# print(utts[1].meta[''])

In [None]:
# Check if metadata gets added
utterances = list(corpus.iter_utterances())
first_utt = utterances[0]
print(f"First utterance ID: {first_utt.id}")
print(f"Has stance metadata: {'detected_stance' in first_utt.meta}")
if 'detected_stance' in first_utt.meta:
    print(f"Stance: {first_utt.meta['detected_stance']}")
    print(f"Confidence: {first_utt.meta['stance_confidence']}")

In [None]:
# Testing 'detect_conversation_topic()' and 'dump()'
logger = setup_logging("DEBUG")
i = 0
utts = list(corpus.iter_utterances())
stance_detector = StanceDetector()
for i in range(30):
  logger.info("=" * 100)
  logger.info(f"Comment: {utts[i].text}\n")
  stance = stance_detector.detect_utterance_stance(utts[i])
  logger.info(f"Stance: {stance['stance']}")
  logger.info(f"Confidence: {stance['confidence']}")
  # logger.debug(f"Raw utterance text length: {len(utts[i].text)}")
  logger.warning(f"Low confidence score: {stance['confidence']}")
  # utts[i].add_meta("detected_stance", stance["stance"])
  # utts[i].add_meta("stance_confidence", stance["stance"])
  # utts[i].add_meta("stance_scores", stance["all_scores"])
  i += 1

del utts
# Test it saves properly:
# corpus_small.dump("/content/drive/MyDrive/MScProject/Corpora/corpus_small")
