In [None]:
!git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git
!pip install convokit
import unsloth
import unsloth_zoo
from convokit import Corpus, download
import convokit

corpus = Corpus(filename=download("subreddit-PoliticalDiscussion"))

In [11]:
"""Topic detection functionality for conversation analysis."""

from typing import List, Dict, Any, Optional
import logging
from tqdm import tqdm

from temporal_belief_analysis.src.temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from temporal_belief_analysis.src.temporal_belief.utils.config import POLITICAL_TOPICS, ProjectConfig

logger = logging.getLogger(__name__)

class TopicDetector:
    """Detect topics in ConvoKit conversations using BART."""

    def __init__(self, topics: Optional[List[str]] = None,
                 config: ProjectConfig = None):
        """Initialize topic detector."""
        self.config = config or ProjectConfig()
        self.classifier = BARTZeroShotClassifier(self.config.bart_model_name)
        self.topics = topics or POLITICAL_TOPICS
        logger.info(f"Initialized topic detector with {len(self.topics)} topics")

    def detect_conversation_topic(self, conversation) -> Dict[str, Any]:
        """Detect topic for a single conversation."""
        utterances = list(conversation.iter_utterances())
        if not utterances:
            logger.warning(f"No utterances found in conversation {conversation.id}")
            return {"topic": "unknown", "confidence": 0.0}

        original_post = utterances[0].text
        result = self.classifier.classify_text(original_post, self.topics)

        # Clean up the result
        # clean_topic = result["label"].replace(" policy", "")

        return {
            "topic": result["label"],
            "confidence": result["confidence"],
            "all_scores": result["all_scores"],
            "text_length": len(original_post),
            "num_utterances": len(utterances)
        }

    def process_corpus(self, corpus, batch_size: int = 50,
                       save_path: Optional[str] = None) -> None:
        """Process entire corpus for topic detection."""
        conversations = list(corpus.iter_conversations())
        logger.info(f"Processing {len(conversations)} conversations for topic detection")

        # Progress bar library:
        for i in tqdm(range(0, len(conversations), batch_size),
                      desc="Processing conversations"):
            batch = conversations[i:i + batch_size]

            for conv in batch:
                try:
                    topic_result = self.detect_conversation_topic(conv)

                    # Add to conversation metadata
                    conv.add_meta("detected_topic", topic_result["topic"])
                    conv.add_meta("topic_confidence", topic_result["confidence"])
                    conv.add_meta("topic_scores", topic_result["all_scores"])

                except Exception as e:
                    logger.error(f"Failed to process conversation {conv.id}: {e}")
                    conv.add_meta("detected_topic", "unknown")
                    conv.add_meta("topic_confidence", 0.0)

        if save_path:
            corpus.dump(save_path)
            logger.info(f"Saved processed corpus to {save_path}")

        logger.info("Topic detection processing complete")

In [42]:
convo = corpus.random_conversation()
utterances = list(convo.iter_utterances())
og_post = utterances[0].text
print(f"OG post:{og_post}")
print(f"title: {convo.meta['title']}")


OG post:**pheww**, glad it's all finally over!

I think that this election shows atleast some significant insight as to the disparity of what (or who) **AMERICA** has been defined by in the past versus what it's starting to be difined as.

Bill O'Reily said America's becoming less "traditional"--by traditional he means less defined by **White testosterone**--what other tradition was there in this country? when we look back we only see racism, de facto segregation, esoterism, and manifest destiny, **that's "tradition!"**

They say the majority who voted for Romney were male, were 72% white and mostly over the age of 40. Obama's were mostly women, were 93% black people, some 27% white and some 70% Hispanic, minority turn out is growing in general.

Judging by the victory Obama had over Romney, and the eclectic nature of his constituency, it's safe to say more people want a say in what the definition of **American** means, a **certain group of elitist individules aren't taking a liking to

In [43]:
topic_detector = TopicDetector()
i = 0
convos = list(corpus.iter_conversations())
for i in range(3):
  title = convos[i].meta['title']
  print(f"Title: {title}")
  topic_detector.detect_conversation_topic(title)
  i += 1

Device set to use cuda:0


Title: If Austrian economics is so wonderful, why don't the Austrians use it?


AttributeError: 'str' object has no attribute 'iter_utterances'