In [None]:
# Need to restart after:
!pip install convokit

In [None]:
# Download file from Google Drive to colab directory
!pip install gdown
file_id = "1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0"
!gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip" --fuzzy

In [2]:
# Unzip with python:
import zipfile
zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip").extractall("/content/temporal_belief_analysis")

In [3]:
# For runpod-jupyter or local (run twice)
import sys
import os

# Change to the correct working directory (workspace if runpod, content if colab)
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Comment out if in colab:
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.change_detection import ChangeDetector
from temporal_belief.core.WindowExtraction import WindowExtractor
from temporal_belief.core.OpPathPairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor

Changed working directory to: /content/temporal_belief_analysis/notebooks


In [None]:
# Run twice
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit

In [6]:
# Load a corpus:
# corpus = Corpus(filename="/Users/leonidas/.convokit/saved-corpora/pd_corpus_with_stances1000_chronological")
corpus = Corpus(filename="/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological")

In [7]:
print(corpus.meta)

ConvoKitMeta({'subreddit': 'PoliticalDiscussion', 'num_posts': 102848, 'num_comments': 4553046})


In [8]:
# Test timeline builder:
timeline_builder = TimelineBuilder(corpus, min_posts_per_topic=0, min_topics_per_user=0)
timelines = timeline_builder.build_timelines()

# Filter for analysis
change_detector_preprocessor = ChangeDetectorPreprocessor()
filtered_timelines = change_detector_preprocessor.filter_for_change_detection(timelines, min_posts_per_topic=5, min_topics_per_user=2)

# Get a specific user's timeline for a specific topic
user_id = "HardCoreModerate"
topic = "media and political commentary"
topic_timeline = filtered_timelines[user_id][topic]  # This is {utterance_id: stance}

# Convert to list of tuples
topic_timeline_list = list(topic_timeline.items())

2025-08-07 12:24:37,707 - temporal_belief.core.timeline_building - INFO - timeline_building:71 - Built timelines for 4781 users
INFO:temporal_belief.core.timeline_building:Built timelines for 4781 users


In [9]:
# Test the change detector:
persistence_detector = ChangeDetector()
change_points = persistence_detector.detect_persistent_changes(topic_timeline_list)

Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: strongly_against and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:neutral, Previous: moderately_against and Next:neutral
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_against, Previous: strongly_against and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_favor, Previous: moderately_against and Next:moderately_favor
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Nex

In [11]:
# Test the window extractor:
window_extractor = WindowExtractor(corpus, timelines=timelines)
candidate_convos = window_extractor.get_conversations_around_change_point(change_points=change_points, corpus=corpus)
for convo in candidate_convos:
  print(f'ID:{convo.id}')

ID:muccw
ID:mv2yv
ID:mv3ou


In [83]:
def calculate_interplay_features(op_text, reply_path_text, stop_words_set):
    """Calculate 12 interplay features between OP and reply path."""

    # Tokenize and clean
    op_words = op_text.lower().split()
    reply_words = reply_path_text.lower().split()

    # Create word sets
    op_all = set(op_words)
    reply_all = set(reply_words)
    op_stop = set(w for w in op_words if w in stop_words_set)
    reply_stop = set(w for w in reply_words if w in stop_words_set)
    op_content = set(w for w in op_words if w not in stop_words_set)
    reply_content = set(w for w in reply_words if w not in stop_words_set)

    # Calculate 4 metrics for each word type
    features = {}

    for word_type, (op_set, reply_set) in [
        ('all', (op_all, reply_all)),
        ('stop', (op_stop, reply_stop)),
        ('content', (op_content, reply_content))
    ]:
        intersection = len(op_set & reply_set)
        union = len(op_set | reply_set)

        features[f'common_words_{word_type}'] = intersection
        features[f'sim_frac_reply_{word_type}'] = intersection / len(reply_set) if reply_set else 0
        features[f'sim_frac_op_{word_type}'] = intersection / len(op_set) if op_set else 0
        features[f'jaccard_{word_type}'] = intersection / union if union else 0

    return features

In [154]:
import re

class PairPreprocessor:

    def tokenize_quotes(self, utterance_text):
        lines = utterance_text.split('\n')
        processed_lines = []

        for line in lines:
            line = line.strip()
            if line.startswith('&gt;') or line.startswith('>'):
                processed_lines.append('[QUOTE]')
            else:
                processed_lines.append(line)

        return '\n'.join(processed_lines)

    def concatenate_path(self, path_dict):
        concatenated_path = {}
        for key, utt in path_dict.items():
            concatenated_path[key] = utt.text  # or utt.text.strip()
        return concatenated_path

    def tokenize_and_lower(op_text, reply_path_text, stop_words_set):
        op_words = op_text.lower().split()
        reply_words = reply_path_text.lower().split()

        return (op_words, reply_words)

    # This pattern keeps letters, numbers, whitespace, and apostrophes (for contractions)
    def remove_punctuation(op_text, reply_path_text):
        op_text = re.sub(r"[^\w\s']", '', op_text)
        reply_path_text = re.sub(r"[^\w\s']", '', reply_path_text)

        return op_text, reply_path_text

    def remove_quotes_from_all(self, op_path_pairs):
        marked_pairs = []
        for op_path_pair in op_path_pairs:
            # Process the OP utterance
            op_text = self.tokenize_quotes(op_path_pair[0].text)

            # Process each utterance path
            processed_paths = []
            for utterances in op_path_pair[1].values():
                path = [self.tokenize_quotes(utt.text) for utt in utterances]
                processed_paths.append(path)

            marked_pairs.append((op_text, processed_paths))

        return marked_pairs

    def concatenate_utts_in_paths(self, op_path_pair):
        op = op_path_pair[0]
        paths = op_path_pair[1]

        concatenated_paths = []
        for path in paths:
            concatenated = self.concatenate_path(path)  # still a dict with same keys
            concatenated_paths.append(concatenated)

        return (op, concatenated_paths)

    def clean_and_tokenize(self, op_text, reply_path_text):
        # Step 1: Remove punctuation
        op_text, reply_path_text = self.remove_punctuation(op_text, reply_path_text)

        # Step 2: Tokenize and lowercase
        op_words, reply_words = self.tokenize_and_lower(op_text, reply_path_text)

        return op_words, reply_words


In [139]:
# Test the OP and path pairer:
utt_id = change_points[0][1]
user_id = corpus.get_utterance(change_points[0][1]).speaker.id
op_path_pairer = OpPathPairer(corpus, timelines=timelines)
op_path_pairs = op_path_pairer.extract_rooted_path_from_candidate_convos(candidate_convos, user_id)

# def print_user_path_utterances(utterances):
#   for i, utt in enumerate(utterances):
#     print(f'{i}, {utt.text}\n')


for op_path_pair in op_path_pairs:
  print(1000*'=')
  print(f'\nop: {op_path_pair[0].speaker.id}, op_post: {op_path_pair[0].text}')
  for user, utterances in op_path_pair[1].items():
    print(f'user: {user}, utterances: {[utt.text for utt in utterances]}\n')
  print(f'paths: {op_path_pair[1]}\n')

my input user_id: HardCoreModerate
Utt_id: c344ch2 and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344cv7 and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344lkp and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c3443ch and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344801 and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c349430 and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c347vmm and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344ksk and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344k47 and user_id: HardCoreModerate

In [153]:
# Test the preprocessor:
pair_preprocessor = PairPreprocessor()
first_pair = op_path_pairs[3]
preprocessed_pair = pair_preprocessor.concatenate_utts_in_paths(first_pair)

print(preprocessed_pair)
# for op, paths in preprocessed_pair:
#     print()
    # print(f'op: {op}')
    # for path in paths:
    #   print(f'Path: {path}')

AttributeError: 'str' object has no attribute 'items'

In [43]:
# Test the interplay feature extractor:
# interplay = Interplay(corpus, timelines=timelines)

In [None]:
# Test interplay scorer: