In [None]:
# For colab: mount Google Drive at /content/drive so that later cells can
# write the processed corpus under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Clone the project repository; its `src` package is imported in a later cell.
!git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git

Cloning into 'temporal_belief_analysis'...
remote: Enumerating objects: 131, done.
remote: Counting objects: 100% (131/131), done.
remote: Compressing objects: 100% (69/69), done.
remote: Total 131 (delta 60), reused 109 (delta 48), pack-reused 0 (from 0)
Receiving objects: 100% (131/131), 98.40 KiB | 3.64 MiB/s, done.
Resolving deltas: 100% (60/60), done.


In [None]:
# Get latest version: enter the cloned repository and pull any new commits.
# NOTE(review): `%cd` is given a relative path, so the directory it lands in
# depends on the current working directory. The recorded output shows a nested
# .../temporal_belief_analysis/temporal_belief_analysis path - confirm that is
# intended (an absolute path would make this cell safe to re-run).
%cd temporal_belief_analysis
!git pull

/content/temporal_belief_analysis/temporal_belief_analysis
Already up to date.


In [2]:
# For colab:
!pip install convokit

Collecting convokit
  Downloading convokit-3.2.0.tar.gz (205 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 205.0/205.0 kB 4.3 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting clean-text>=0.6.0 (from convokit)
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting unidecode>=1.1.1 (from convokit)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting pymongo>=4.0 (from convokit)
 

In [1]:
# For colab:
import unsloth
import unsloth_zoo
from convokit import Corpus, download
import convokit
# corpus = Corpus(filename=download("subreddit-PoliticalDiscussion"))
from temporal_belief_analysis.src.temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from temporal_belief_analysis.src.temporal_belief.utils.config import POLITICAL_TOPICS, ProjectConfig

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# For local:
from src.temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from src.temporal_belief.utils.config import POLITICAL_TOPICS, ProjectConfig

In [2]:
"""Topic detection functionality for conversation analysis."""

from typing import List, Dict, Any, Optional
import logging
from tqdm import tqdm

logger = logging.getLogger(__name__)

class TopicDetector:
    """Detect topics in ConvoKit conversations using BART.

    A conversation is classified from its title plus the text of the first
    utterance yielded by ``conversation.iter_utterances()`` (assumed to be the
    original post - TODO confirm against the corpus ordering).

    Result dictionaries produced by this class always contain the keys
    ``"topic"``, ``"confidence"`` and ``"all_scores"``. Conversations that
    cannot be classified get ``"topic": "unknown"`` with confidence ``0.0``.
    """

    # Upper bound (in characters) on the text handed to the classifier.
    MAX_TEXT_CHARS = 2000

    def __init__(self, topics: Optional[List[str]] = None,
                 config: Optional["ProjectConfig"] = None):
        """Initialize topic detector.

        Args:
            topics: Candidate topic labels. Falls back to ``POLITICAL_TOPICS``
                when omitted or empty.
            config: Project configuration. A default ``ProjectConfig()`` is
                created when omitted; its ``bart_model_name`` selects the model.
        """
        self.config = config or ProjectConfig()
        self.classifier = BARTZeroShotClassifier(self.config.bart_model_name)
        self.topics = topics or POLITICAL_TOPICS
        logger.info(f"Initialized topic detector with {len(self.topics)} topics")

    @staticmethod
    def _unknown_result() -> Dict[str, Any]:
        """Return a fresh result for a conversation that could not be classified.

        Both ``"all_scores"`` (the key used by successful results) and the
        legacy ``"topic_scores"`` key are present so that either spelling can
        be read without a ``KeyError``.
        """
        return {"topic": "unknown", "confidence": 0.0,
                "all_scores": {}, "topic_scores": {}}

    def _prepare_conversation_text(self, conversation):
        """Extract and prepare text from conversation. Returns (text, metadata).

        Returns:
            ``(combined_text, metadata)`` where ``metadata`` holds
            ``text_length`` and ``num_utterances``; or ``(None, unknown_result)``
            when the conversation has neither a title nor post text, or when
            reading it raises.
        """
        try:
            # `or ''` guards against a title / text that is present but None,
            # which would otherwise be formatted as the literal string "None".
            title = conversation.meta.get('title', '') or ''
            # Materialise once: the list gives both the first post and the count.
            utterances = list(conversation.iter_utterances())
            original_post = (utterances[0].text or '') if utterances else ''

            if not original_post and not title:
                return None, self._unknown_result()

            # Consistent text formatting and truncation
            combined_text = f"{title}. {original_post}"[:self.MAX_TEXT_CHARS]

            metadata = {
                "text_length": len(original_post),
                "num_utterances": len(utterances)
            }

            return combined_text, metadata

        except Exception as e:
            # Deliberate best-effort: one malformed conversation must not abort
            # a whole corpus run, so it is logged and reported as "unknown".
            logger.error(f"Failed to prepare conversation {conversation.id}: {e}")
            return None, self._unknown_result()

    def _set_conversation_metadata(self, conversation, result: Dict[str, Any]):
        """Set topic metadata on conversation.

        Accepts either this class's result shape (``"topic"``) or a raw
        classifier result (``"label"``).
        NOTE(review): ``classify_text`` results are read via ``"label"``
        elsewhere in this class; ``classify_batch`` presumably returns the same
        shape - confirm against the classifier.

        Raises:
            KeyError: if ``result`` has neither ``"topic"`` nor ``"label"``, or
                lacks ``"confidence"``.
        """
        # Resolve every value first so a malformed result raises before any
        # metadata is written (no half-annotated conversations).
        topic = result["topic"] if "topic" in result else result["label"]
        confidence = result["confidence"]
        scores = result.get("all_scores", result.get("topic_scores", {}))

        conversation.add_meta("detected_topic", topic)
        conversation.add_meta("topic_confidence", confidence)
        conversation.add_meta("topic_scores", scores)

    def detect_conversation_topic(self, conversation) -> Dict[str, Any]:
        """Detect topic for a single conversation.

        Returns:
            A dict with ``topic``, ``confidence`` and ``all_scores``; plus
            ``text_length`` and ``num_utterances`` when the conversation had
            text to classify.
        """
        combined_text, metadata = self._prepare_conversation_text(conversation)

        if combined_text is None:
            return metadata  # Already contains error result

        result = self.classifier.classify_text(combined_text, self.topics)

        return {
            "topic": result["label"],
            "confidence": result["confidence"],
            "all_scores": result["all_scores"],
            **metadata  # Include text_length, num_utterances
        }

    def process_corpus(self, corpus, batch_size: int = 200,
                    save_path: Optional[str] = None) -> None:
        """Process entire corpus for topic detection.

        Every conversation receives ``detected_topic``, ``topic_confidence``
        and ``topic_scores`` metadata. If classifying a batch fails, all
        conversations of that batch are marked ``"unknown"``.

        Args:
            corpus: Object providing ``iter_conversations()`` and ``dump()``.
            batch_size: Number of conversations classified per batch call.
            save_path: When given, the corpus is dumped there afterwards.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        # Local import: the notebook's import cell does not bring `time` into scope.
        import time

        if batch_size < 1:
            # A negative step would make the range below empty and silently
            # process nothing.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        conversations = list(corpus.iter_conversations())
        logger.info(f"Processing {len(conversations)} conversations for topic detection")

        for i in tqdm(range(0, len(conversations), batch_size),
                      desc="Processing conversations"):
            batch = conversations[i:i + batch_size]

            # Prepare batch using shared logic
            batch_data = []

            for conv in batch:
                combined_text, metadata = self._prepare_conversation_text(conv)

                if combined_text is None:
                    # Set error metadata and skip
                    self._set_conversation_metadata(conv, metadata)
                    continue

                batch_data.append((conv, combined_text))

            if not batch_data:
                continue

            # Process entire batch at once
            texts = [data[1] for data in batch_data]
            conversations_to_process = [data[0] for data in batch_data]

            try:
                print(f"🚀 Attempting batch of {len(texts)} texts...")
                start = time.perf_counter()

                batch_results = self.classifier.classify_batch(texts, self.topics)

                end = time.perf_counter()
                print(f"✅ Batch completed in {end-start:.2f}s ({(end-start)/len(texts):.3f}s per text)")

                # Apply results using shared logic
                for conv, result in zip(conversations_to_process, batch_results):
                    self._set_conversation_metadata(conv, result)

            except Exception as e:
                print(f"❌ Batch processing failed: {e}")
                logger.error(f"Batch classification failed: {e}")

                # Fallback: mark all as unknown
                for conv in conversations_to_process:
                    self._set_conversation_metadata(conv, self._unknown_result())

        if save_path:
            corpus.dump(save_path)
            logger.info(f"Saved processed corpus to {save_path}")

        logger.info("Topic detection processing complete")

In [3]:
# Fetch the small Reddit corpus (cached by ConvoKit after the first download)
# and load it into memory.
corpus_small_location = download("reddit-corpus-small")
corpus_small = Corpus(filename=corpus_small_location)

No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
Downloading reddit-corpus-small to /root/.convokit/saved-corpora/reddit-corpus-small
Downloading reddit-corpus-small from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/reddit-corpus-small.corpus.zip (37.9MB)... Done


In [4]:
# Testing 'process_corpus()': annotate every conversation in the small corpus
# and write the annotated corpus to Google Drive.
SAVE_PATH = "/content/drive/MyDrive/MScProject/Corpora/corpus_small"
topic_detector = TopicDetector()
topic_detector.process_corpus(
    corpus=corpus_small,
    save_path=SAVE_PATH,
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Processing conversations:   0%|          | 0/42 [00:00<?, ?it/s]

🚀 Attempting batch of 200 texts...


Processing conversations:   0%|          | 0/42 [00:33<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Testing 'detect_conversation_topic()' and 'dump()'
convos_small = list(corpus_small.iter_conversations())
topic_detector = TopicDetector()

# Smoke-test on the first three conversations only. Slicing (rather than
# indexing with range(3)) also copes with a corpus that has fewer than three.
for convo in convos_small[:3]:
  utterances = list(convo.iter_utterances())
  title = convo.meta['title']
  og_post = utterances[0].text
  print(100*'-')
  print(f"Title: {title} \n")
  print(f"OG post: {og_post} \n")
  topic = topic_detector.detect_conversation_topic(convo)
  print(f"Detected topic: {topic['topic']} \n")
  print(f"Confidence: {topic['confidence']} \n")
  # Store the prediction on the conversation so it is part of the dump below.
  convo.add_meta("detected_topic", topic["topic"])
  convo.add_meta("topic_confidence", topic["confidence"])
  # .get(): the detector's "unknown" result may not carry an "all_scores" key.
  convo.add_meta("topic_scores", topic.get("all_scores", {}))

corpus_small.dump("/content/drive/MyDrive/MScProject/Corpora/corpus_small")


Device set to use cuda:0


----------------------------------------------------------------------------------------------------
Title: /r/singapore random discussion and small questions thread for September 02, 2018 

OG post: Talk about your day. Anything goes, but subreddit rules still apply. Please be polite to each other! 
 

Detected topic: media and political commentary 

Confidence: 0.1350071281194687 

----------------------------------------------------------------------------------------------------
Title: What are your biggest complaints about singapore? 

OG post: I went to visit a few days ago and Ioved it. I can’t find any negatives other than how small the place is. I’m also just a visitor so the perspective is entirely different from someone who lives there.  

Detected topic: media and political commentary 

Confidence: 0.09838560968637466 

----------------------------------------------------------------------------------------------------
Title: Worst taxi driver you’ve ever encountered? 

OG 