In [None]:
# Clone the project repository; the topic-detection cell further down imports
# its helper modules from the resulting `temporal_belief_analysis` directory.
!git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git

In [None]:
!pip install convokit

In [1]:
import unsloth
import unsloth_zoo
from convokit import Corpus, download
import convokit


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Fetch the "subreddit-PoliticalDiscussion" ConvoKit dataset and load it as a
# Corpus object (the recorded run below reports a ~800 MB archive download).
corpus = Corpus(filename=download("subreddit-PoliticalDiscussion"))

No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
Downloading subreddit-PoliticalDiscussion to /root/.convokit/saved-corpora/subreddit-PoliticalDiscussion
Downloading subreddit-PoliticalDiscussion from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/corpus-zipped/PokkenGameCirclejerk~-~PoliticalVideos/PoliticalDiscussion.corpus.zip (801.4MB)... Done


In [11]:
"""Topic detection functionality for conversation analysis."""

from typing import List, Dict, Any, Optional
import logging
from tqdm import tqdm

from temporal_belief_analysis.src.temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from temporal_belief_analysis.src.temporal_belief.utils.config import POLITICAL_TOPICS, ProjectConfig

logger = logging.getLogger(__name__)

class TopicDetector:
    """Detect topics in ConvoKit conversations using BART.

    Wraps a ``BARTZeroShotClassifier`` and labels a conversation with one of
    ``self.topics`` based on the text of the first utterance it yields. A
    single conversation can be classified with
    :meth:`detect_conversation_topic`; :meth:`process_corpus` classifies every
    conversation in a corpus and stores the outcome as conversation metadata.
    """

    # Values reported when a conversation cannot be classified.
    UNKNOWN_TOPIC = "unknown"
    UNKNOWN_CONFIDENCE = 0.0

    def __init__(self, topics: Optional[List[str]] = None,
                 config: Optional["ProjectConfig"] = None):
        """Initialize topic detector.

        Args:
            topics: Candidate topic labels. Falls back to ``POLITICAL_TOPICS``
                when omitted or empty.
            config: Project configuration; a default ``ProjectConfig()`` is
                created when omitted. Its ``bart_model_name`` selects the
                classifier model.
        """
        self.config = config or ProjectConfig()
        self.classifier = BARTZeroShotClassifier(self.config.bart_model_name)
        self.topics = topics or POLITICAL_TOPICS
        logger.info(f"Initialized topic detector with {len(self.topics)} topics")

    @classmethod
    def _unknown_result(cls, num_utterances: int = 0) -> Dict[str, Any]:
        """Build the placeholder result, with the same keys as a real result."""
        return {
            "topic": cls.UNKNOWN_TOPIC,
            "confidence": cls.UNKNOWN_CONFIDENCE,
            # presumably a label -> score mapping in real results; verify
            # against BARTZeroShotClassifier.classify_text
            "all_scores": {},
            "text_length": 0,
            "num_utterances": num_utterances,
        }

    @staticmethod
    def _write_topic_meta(conversation, topic_result: Dict[str, Any]) -> None:
        """Copy a detection result onto the conversation's metadata."""
        conversation.add_meta("detected_topic", topic_result["topic"])
        conversation.add_meta("topic_confidence", topic_result["confidence"])
        conversation.add_meta("topic_scores", topic_result["all_scores"])

    def detect_conversation_topic(self, conversation) -> Dict[str, Any]:
        """Detect topic for a single conversation.

        Args:
            conversation: Object exposing ``id`` and ``iter_utterances()``
                whose utterances carry a ``text`` attribute.

        Returns:
            Dict with keys ``topic``, ``confidence``, ``all_scores``,
            ``text_length`` and ``num_utterances``. When the conversation has
            no utterances the topic is ``"unknown"`` with confidence ``0.0``,
            empty scores and zero counts, so callers can always rely on the
            same set of keys.
        """
        utterances = list(conversation.iter_utterances())
        if not utterances:
            logger.warning(f"No utterances found in conversation {conversation.id}")
            return self._unknown_result()

        # NOTE(review): treats the first utterance yielded by iter_utterances()
        # as the original post -- confirm the iteration order guarantees this.
        # NOTE(review): an empty post body is passed to the classifier as-is;
        # confirm how BARTZeroShotClassifier handles blank text.
        original_post = utterances[0].text
        result = self.classifier.classify_text(original_post, self.topics)

        return {
            "topic": result["label"],
            "confidence": result["confidence"],
            "all_scores": result["all_scores"],
            "text_length": len(original_post),
            "num_utterances": len(utterances)
        }

    def process_corpus(self, corpus, batch_size: int = 50,
                       save_path: Optional[str] = None) -> None:
        """Process entire corpus for topic detection.

        Every conversation receives the metadata keys ``detected_topic``,
        ``topic_confidence`` and ``topic_scores``. Conversations that fail to
        process are logged and tagged as ``"unknown"`` with confidence ``0.0``
        and empty scores, so the metadata schema is uniform across the corpus.

        Args:
            corpus: Object exposing ``iter_conversations()`` (and ``dump()``
                when ``save_path`` is given).
            batch_size: Number of conversations per progress-bar step; must be
                at least 1.
            save_path: When truthy, passed to ``corpus.dump`` after processing.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A zero step makes range() fail with an opaque message, and a
            # negative step would silently process nothing.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        conversations = list(corpus.iter_conversations())
        logger.info(f"Processing {len(conversations)} conversations for topic detection")

        # Batches only set the granularity of the progress bar; conversations
        # are still classified one at a time.
        for start in tqdm(range(0, len(conversations), batch_size),
                          desc="Processing conversations"):
            for conv in conversations[start:start + batch_size]:
                try:
                    self._write_topic_meta(conv, self.detect_conversation_topic(conv))
                except Exception as e:
                    # Best effort: one bad conversation must not abort the run.
                    logger.error(f"Failed to process conversation {conv.id}: {e}")
                    self._write_topic_meta(conv, self._unknown_result())

        if save_path:
            # NOTE(review): ConvoKit's Corpus.dump appears to take a corpus
            # *name* (plus an optional base_path) -- confirm that passing a
            # path here writes where callers expect.
            corpus.dump(save_path)
            logger.info(f"Saved processed corpus to {save_path}")

        logger.info("Topic detection processing complete")

In [12]:
# Sanity check: display the text of one randomly selected utterance.
# NOTE(review): the output presumably changes on every run unless the
# underlying RNG is seeded -- confirm which RNG ConvoKit uses.
corpus.random_utterance().text

"It was after Sandy Hook, it would have been voluntary having trouble finding an article. \n\nHere's a more recent article on Two Republican and Two democrat gun bills voted down https://www.washingtonpost.com/news/powerpost/wp/2016/06/20/senate-heads-for-gun-control-showdown-likely-to-go-nowhere/?utm_term=.cb0a93df0660&amp;wpisrc=al_alert-COMBO-politics%252Bnation"