In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
!git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git

Cloning into 'temporal_belief_analysis'...
remote: Enumerating objects: 120, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 120 (delta 53), reused 101 (delta 44), pack-reused 0 (from 0)[K
Receiving objects: 100% (120/120), 96.09 KiB | 16.01 MiB/s, done.
Resolving deltas: 100% (53/53), done.


In [14]:
# Get latest version
%cd temporal_belief_analysis
!git pull

/content/temporal_belief_analysis/temporal_belief_analysis
Already up to date.


In [3]:
!pip install convokit

Collecting convokit
  Downloading convokit-3.2.0.tar.gz (205 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/205.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.0/205.0 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting clean-text>=0.6.0 (from convokit)
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting unidecode>=1.1.1 (from convokit)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting pymongo>=4.0 (from convokit)
  Downloading pymongo-4.13.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython>=1.16.0 (from 

In [1]:
import unsloth
import unsloth_zoo
from convokit import Corpus, download
import convokit
corpus = Corpus(filename=download("subreddit-PoliticalDiscussion"))

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
Downloading subreddit-PoliticalDiscussion to /root/.convokit/saved-corpora/subreddit-PoliticalDiscussion
Downloading subreddit-PoliticalDiscussion from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/corpus-zipped/PokkenGameCirclejerk~-~PoliticalVideos/PoliticalDiscussion.corpus.zip (801.4MB)... Done


In [2]:
SAVE_PATH = "/content/drive/MyDrive/MScProject/Corpora/corpus_small"

In [15]:
"""Topic detection functionality for conversation analysis."""

from typing import List, Dict, Any, Optional
import logging
from tqdm import tqdm

from temporal_belief_analysis.src.temporal_belief.models.bart_classifier import BARTZeroShotClassifier
from temporal_belief_analysis.src.temporal_belief.utils.config import POLITICAL_TOPICS, ProjectConfig

logger = logging.getLogger(__name__)

class TopicDetector:
    """Detect topics in ConvoKit conversations using BART."""

    def __init__(self, topics: Optional[List[str]] = None,
                 config: ProjectConfig = None):
        """Initialize topic detector."""
        self.config = config or ProjectConfig()
        self.classifier = BARTZeroShotClassifier(self.config.bart_model_name)
        self.topics = topics or POLITICAL_TOPICS
        logger.info(f"Initialized topic detector with {len(self.topics)} topics")

    def detect_conversation_topic(self, conversation) -> Dict[str, Any]:
        """Detect topic for a single conversation."""
        utterances = list(conversation.iter_utterances())
        title = conversation.meta['title']
        original_post = utterances[0].text
        if not original_post and not title:
            logger.warning(f"No utterances or title found in conversation {conversation.id}")
            return {"topic": "unknown", "confidence": 0.0}

        combined_text = f"Title: {title}. Original Post: {original_post}"
        result = self.classifier.classify_text(combined_text, self.topics)

        # Clean up the result
        # clean_topic = result["label"].replace(" policy", "")

        return {
            "topic": result["label"],
            "confidence": result["confidence"],
            "all_scores": result["all_scores"],
            "text_length": len(original_post),
            "num_utterances": len(utterances)
        }

    def process_corpus(self, corpus, batch_size: int = 50,
                    save_path: Optional[str] = None) -> None:
      """Process entire corpus for topic detection."""
      conversations = list(corpus.iter_conversations())
      logger.info(f"Processing {len(conversations)} conversations for topic detection")

      for i in tqdm(range(0, len(conversations), batch_size),
                    desc="Processing conversations"):
          batch = conversations[i:i + batch_size]

          # Prepare all texts for batch processing
          batch_texts = []
          valid_conversations = []

          for conv in batch:
              try:
                  title = conv.meta['title']
                  utterances = list(conv.iter_utterances())
                  if not utterances and not title:
                      logger.warning(f"No utterances or title found in conversation {conv.id}")
                      continue

                  original_post = utterances[0].text
                  combined_text = f"{title}. {original_post}"
                  batch_texts.append(combined_text)
                  valid_conversations.append(conv)

              except Exception as e:
                  logger.error(f"Failed to prepare conversation {conv.id}: {e}")
                  conv.add_meta("detected_topic", "unknown")
                  conv.add_meta("topic_confidence", 0.0)

          # Process entire batch at once
          if batch_texts:
              try:
                  batch_results = self.classifier.classify_batch(batch_texts, self.topics)

                  # Apply results back to conversations
                  for conv, result in zip(valid_conversations, batch_results):
                      conv.add_meta("detected_topic", result["label"])
                      conv.add_meta("topic_confidence", result["confidence"])
                      conv.add_meta("topic_scores", result["all_scores"])

              except Exception as e:
                  logger.error(f"Batch classification failed: {e}")
                  # Fallback to individual processing
                  for conv in valid_conversations:
                      try:
                          topic_result = self.detect_conversation_topic(conv)
                          conv.add_meta("detected_topic", topic_result["topic"])
                          conv.add_meta("topic_confidence", topic_result["confidence"])
                          conv.add_meta("topic_scores", topic_result["all_scores"])
                      except Exception as e2:
                          logger.error(f"Individual fallback failed for {conv.id}: {e2}")
                          conv.add_meta("detected_topic", "unknown")
                          conv.add_meta("topic_confidence", 0.0)

      if save_path:
          corpus.dump(save_path)
          logger.info(f"Saved processed corpus to {save_path}")

      logger.info("Topic detection processing complete")

In [4]:
corpus_small = Corpus(filename=download("reddit-corpus-small"))

Downloading reddit-corpus-small to /root/.convokit/saved-corpora/reddit-corpus-small
Downloading reddit-corpus-small from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/reddit-corpus-small.corpus.zip (37.9MB)... Done


In [16]:
topic_detector = TopicDetector()
topic_detector.process_corpus(corpus_small, save_path=SAVE_PATH)

Device set to use cuda:0
Processing conversations:   0%|          | 0/166 [00:00<?, ?it/s]ERROR:__main__:Batch classification failed: 'BARTZeroShotClassifier' object has no attribute 'classify_batch'
Processing conversations:   0%|          | 0/166 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
from bart_classifier import BARTZeroShotClassifier
from config import ProjectConfig

In [28]:
topic_detector = TopicDetector()
i = 0
convos = list(corpus.iter_conversations())
for i in range(20):
  utterances = list(convos[i].iter_utterances())
  title = convos[i].meta['title']
  og_post = utterances[0].text
  print(100*'-')
  print(f"Title: {title} \n")
  print(f"OG post: {og_post} \n")
  topic = topic_detector.detect_conversation_topic(convos[i])
  print(f"Detected topic: {topic['topic']} \n")
  print(f"Confidence: {topic['confidence']} \n")
  i += 1

Device set to use cuda:0


----------------------------------------------------------------------------------------------------
Title: If Austrian economics is so wonderful, why don't the Austrians use it? 

OG post:  

Detected topic: economic policy 

Confidence: 0.3973345160484314 

----------------------------------------------------------------------------------------------------
Title: congrats r/politicaldiscussion, you are turning into an r/politics clone 

OG post: With such gems as:

- Is there a giant worldwide conspiracy against alternate energy? 
- Between Bush and Obama, who has done more damage to America?
- If industry is over regulated how on earth did BP have one of the most profitable years during one of the largest spills on record?

which thankully &amp; mercifully was followed by:

- How on earth did BP have one of the most profitable years during one of the largest spills on record? They didn't. 30 seconds on Wikipedia would have debunked this.

but, this being an r/politics clone, you can

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Detected topic: media and political commentary 

Confidence: 0.3968943655490875 

----------------------------------------------------------------------------------------------------
Title: This sentence "It was Newt who derisively described his wife as too ugly to be the wife of a president." has been seen in many articles, but I have yet to see a source for it. Any help? 

OG post: http://www.dailykos.com/story/2011/12/26/1048994/-Newt-Gingrich-apparently-lies-about-his-first-divorce

http://www.moneyweek.com/news-and-charts/economics/us/newt-gingrich-the-man-were-backing-for-president-56842

I can't find the source, and it's being quoted in forums and blogs all over the internet.

Is it true, did he say these things, or is it hearsay? 

Detected topic: media and political commentary 

Confidence: 0.6322570443153381 

----------------------------------------------------------------------------------------------------
Title: Does anyone still think 46% of working Americans 
don't pay 

In [7]:
convo = corpus.random_conversation()
utterances = list(convo.iter_utterances())
title = convo.meta['title']
og_post = utterances[0].text
print(f"Title: {title}")
print(f"OG post:{og_post}")


Title: Was the DNC's preference for Clinton actually the likely reason she won the primaries?
OG post:I should emphasis the word "likely" because of course no one can know for sure, but would having debate questions before hand and other advantages really have made that much of a difference in primary voting? 

There seems to be a lot of belief at the moment that this is the main reason she got the nomination.

I'm not from the US and just genuinely curious. Do not mean to ask it as a loaded question.


In [9]:
corpus_small = Corpus(filename=download("reddit-corpus-small"))

Downloading reddit-corpus-small to /root/.convokit/saved-corpora/reddit-corpus-small
Downloading reddit-corpus-small from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/reddit-corpus-small.corpus.zip (37.9MB)... Done


In [15]:
i = 0
convos_small = list(corpus_small.iter_conversations())
topic_detector = TopicDetector()
for i in range(3):
  utterances = list(convos_small[i].iter_utterances())
  title = convos_small[i].meta['title']
  og_post = utterances[0].text
  print(100*'-')
  print(f"Title: {title} \n")
  print(f"OG post: {og_post} \n")
  topic = topic_detector.detect_conversation_topic(convos_small[i])
  print(f"Detected topic: {topic['topic']} \n")
  print(f"Confidence: {topic['confidence']} \n")
  convos_small[i].add_meta("detected_topic", topic["topic"])
  convos_small[i].add_meta("topic_confidence", topic["confidence"])
  convos_small[i].add_meta("topic_scores", topic["all_scores"])
  i += 1

corpus_small.dump("/content/drive/MyDrive/MScProject/Corpora/corpus_small")


Device set to use cuda:0


----------------------------------------------------------------------------------------------------
Title: /r/singapore random discussion and small questions thread for September 02, 2018 

OG post: Talk about your day. Anything goes, but subreddit rules still apply. Please be polite to each other! 
 

Detected topic: media and political commentary 

Confidence: 0.1350071281194687 

----------------------------------------------------------------------------------------------------
Title: What are your biggest complaints about singapore? 

OG post: I went to visit a few days ago and Ioved it. I can’t find any negatives other than how small the place is. I’m also just a visitor so the perspective is entirely different from someone who lives there.  

Detected topic: media and political commentary 

Confidence: 0.09838560968637466 

----------------------------------------------------------------------------------------------------
Title: Worst taxi driver you’ve ever encountered? 

OG 