In [None]:
# Need to restart after:
!pip install convokit

In [None]:
# Download file from Google Drive to colab directory
!pip install gdown
file_id = "1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0"
!gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip" --fuzzy

In [2]:
# Unzip with python:
import zipfile

# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip") as archive:
    archive.extractall("/content/temporal_belief_analysis")

In [3]:
# Path setup for runpod-jupyter or local runs (run twice)
# NOTE(review): it is not visible from this cell why a second run is needed — confirm.
import sys
import os

# Change to the correct working directory (workspace if runpod, content if colab).
# The path below is the Colab location; adjust it for other environments.
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to the src directory that sits next to the notebooks directory;
# it is put at the front of sys.path so the project package resolves first.
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Comment out if in colab:
from temporal_belief.core.timeline_building import TimelineBuilder

Changed working directory to: /content/temporal_belief_analysis/notebooks


In [None]:
# For colab:
from temporal_belief_analysis.src.temporal_belief.core.timeline_building import TimelineBuilder

In [None]:
# Run twice
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit

In [None]:
# Load a corpus from a saved ConvoKit corpus directory.
# Local alternative:
# corpus = Corpus(filename="/Users/leonidas/.convokit/saved-corpora/pd_corpus_with_stances1000_chronological")
# NOTE(review): the download/unzip cells use an archive named
# "pd_corpus_with_topics10000_chronological", while this loads
# "pd_corpus_with_stances100000_chronological" — confirm this directory exists.
corpus = Corpus(filename="/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological")

In [50]:
# Show the corpus-level metadata as a quick check that the corpus loaded.
print(corpus.meta)

ConvoKitMeta({'subreddit': 'PoliticalDiscussion', 'num_posts': 102848, 'num_comments': 4553046})


In [None]:
!pip install scipy
!pip install statsmodels

In [10]:
def filter_for_change_detection(timelines, min_posts_per_topic=5, min_topics_per_user=2,
                                min_confidence=0.0, corpus=None):
    """Filter timelines to only include users/topics suitable for change detection.

    Args:
        timelines: Dict of {user_id: {topic: {utterance_id: stance}}}.
        min_posts_per_topic: A topic is kept only if it has at least this many
            (reliable) posts.
        min_topics_per_user: A user is kept only if at least this many topics
            survive the per-topic filter.
        min_confidence: Minimum ``stance_confidence`` a post needs to count as
            reliable. Only applied when ``corpus`` is given and the threshold
            is above 0; otherwise every post is treated as reliable.
        corpus: Optional object exposing ``get_utterance(utterance_id)`` whose
            result has a ``meta`` mapping with a ``stance_confidence`` entry.
            Posts without a confidence value are treated as confidence 0.

    Returns:
        A new dict with the same nesting as ``timelines`` containing only the
        surviving users, topics and posts. The input is not modified.
    """
    # Without a corpus there is nowhere to look confidences up, so the
    # threshold cannot be applied (this is the behaviour of the original code).
    apply_confidence = corpus is not None and min_confidence > 0.0

    def _is_reliable(utt_id):
        confidence = corpus.get_utterance(utt_id).meta.get('stance_confidence')
        return (confidence or 0.0) >= min_confidence

    filtered_timelines = {}

    for user_id, user_timeline in timelines.items():
        filtered_user_timeline = {}

        for topic, topic_posts in user_timeline.items():
            if apply_confidence:
                reliable_posts = {
                    utt_id: stance
                    for utt_id, stance in topic_posts.items()
                    if _is_reliable(utt_id)
                }
            else:
                reliable_posts = dict(topic_posts)

            # Check minimum posts per topic
            if len(reliable_posts) >= min_posts_per_topic:
                filtered_user_timeline[topic] = reliable_posts

        # Check minimum topics per user
        if len(filtered_user_timeline) >= min_topics_per_user:
            filtered_timelines[user_id] = filtered_user_timeline

    return filtered_timelines

In [3]:
class ConversationWindowExtractor:
    """Collect the conversations and reply paths around a user's stance change.

    The class works on ConvoKit-style objects: a corpus exposing
    ``iter_conversations()`` / ``get_utterance()``, conversations exposing
    ``iter_utterances()`` / ``get_root_to_leaf_paths()`` / ``id``, and
    utterances exposing ``id``, ``speaker.id``, ``timestamp`` and
    ``get_conversation()``.
    """

    def __init__(self, corpus, timelines):
        """Store the corpus and the user timelines for later use."""
        self.corpus = corpus
        self.timelines = timelines

    def get_user_conversations_chronological(self, corpus, speaker_id):
        """Get all conversations for a user in chronological order.

        A conversation's position is decided by its earliest utterance
        timestamp.
        """
        # Get all conversations where the speaker participated; any() stops at
        # the first matching utterance instead of building a full id list.
        user_conversations = [
            convo for convo in corpus.iter_conversations()
            if any(utt.speaker.id == speaker_id for utt in convo.iter_utterances())
        ]

        # Sort conversations by their earliest timestamp
        user_conversations.sort(
            key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances())
        )

        return user_conversations

    def get_conversations_around_change_point(self, corpus, change_points, num_prior=2):
        """Return the conversation of the first change point and the ones just before it.

        Args:
            corpus: Corpus used to resolve the utterance and the user's conversations.
            change_points: List of ``(index, utterance_id)`` tuples; only the
                first entry is used.
            num_prior: How many of the user's immediately preceding
                conversations to include (default 2).

        Returns:
            Conversations in chronological order, ending with the conversation
            that contains the change point. Fewer than ``num_prior`` prior
            conversations are returned when the user has fewer before the
            change. An empty list is returned when ``change_points`` is empty.
        """
        if not change_points:
            return []

        # Get first change (probably only one I need)
        utterance = corpus.get_utterance(change_points[0][1])

        # Find the convo this utterance belongs to:
        conversation = utterance.get_conversation()

        # Put all user's convos in a list
        user_conversations = self.get_user_conversations_chronological(
            corpus, utterance.speaker.id
        )

        for index, convo in enumerate(user_conversations):
            if convo.id == conversation.id:
                # Clamp at 0: a negative start would wrap around and pick the
                # user's *latest* conversations instead of earlier ones.
                start = max(0, index - num_prior)
                return user_conversations[start:index] + [conversation]

        # The conversation was not found among the user's conversations.
        return [conversation]

    def _trim_paths(self, op_utterance):
        """Return, for every root-to-leaf path containing ``op_utterance``, the part after it.

        Returns an empty list when the utterance's conversation cannot be
        resolved.
        """
        try:
            conversation = op_utterance.get_conversation()
        except Exception as e:
            print(f"Can't access convo from utterance, error{e}")
            # Without a conversation there are no paths to trim.
            return []

        if conversation is None:
            return []

        trimmed_paths = []
        for path in conversation.get_root_to_leaf_paths():
            if op_utterance in path:
                # Keep only what follows op_utterance on this path
                op_index = path.index(op_utterance)
                trimmed_paths.append(path[op_index + 1:])

        return trimmed_paths

    def _filter_paths(self, trimmed_paths):
        """Group the utterances of each path by speaker.

        Returns:
            Dict keyed by ``"<speaker_id>_path_<path_index>"`` whose values are
            that speaker's utterances on that path, in path order.
        """
        filtered_paths = {}
        for path_index, path in enumerate(trimmed_paths):
            for utt in path:
                key = f"{utt.speaker.id}_path_{path_index}"
                filtered_paths.setdefault(key, []).append(utt)

        return filtered_paths

    def extract_rooted_paths(self, op_utterance):
        """Return the per-speaker reply paths that follow ``op_utterance``."""
        trimmed_paths = self._trim_paths(op_utterance)
        return self._filter_paths(trimmed_paths)

    def extract_op_utterances_from_convo(self, candidate_convo, user_id):
        """Find the op_utterances of a convo: the user's first utterance on each path.

        Each utterance is returned once even when several root-to-leaf paths
        share it. Only the first utterance by the user on a path is ever
        considered, so a deeper utterance is never picked up just because the
        first one was already collected from another path.
        """
        op_utterances = []
        seen_ids = set()
        for path in candidate_convo.get_root_to_leaf_paths():
            first_by_user = next(
                (utt for utt in path if utt.speaker.id == user_id), None
            )
            if first_by_user is not None and first_by_user.id not in seen_ids:
                seen_ids.add(first_by_user.id)
                op_utterances.append(first_by_user)

        return op_utterances

    def extract_op_utterances_from_all_convos(self, candidate_convos, user_id):
        """Get all op_utterances across every candidate convo, in convo order."""
        all_op_utterances = []
        for candidate_convo in candidate_convos:
            all_op_utterances.extend(
                self.extract_op_utterances_from_convo(candidate_convo, user_id)
            )

        return all_op_utterances

    def extract_rooted_path_from_candidate_convos(self, candidate_convos, user_id):
        """Pair every op_utterance with the reply paths rooted at it.

        Returns:
            List of ``(op_utterance, rooted_paths)`` tuples, where
            ``rooted_paths`` is the dict produced by ``extract_rooted_paths``.
        """
        all_op_utterances = self.extract_op_utterances_from_all_convos(candidate_convos, user_id)

        # debug: uses the corpus stored on the instance rather than a
        # notebook-level global, so the class also works outside this notebook.
        for op_utt in all_op_utterances:
            print(f'my input user_id: {user_id}')
            speaker_id = self.corpus.get_utterance(op_utt.id).speaker.id
            print(f'Utt_id: {op_utt.id} and user_id: {speaker_id} in the list of all op utterances.')

        all_ops_n_paths = []
        for op_utt in all_op_utterances:
            rooted_paths = self.extract_rooted_paths(op_utt)
            all_ops_n_paths.append((op_utt, rooted_paths))

        return all_ops_n_paths

In [4]:
def calculate_interplay_features(op_text, reply_path_text, stop_words_set):
    """Compute 12 word-overlap features between an OP text and a reply path text.

    Both texts are lower-cased and split on whitespace. For each of three
    vocabularies ('all' words, 'stop' words, 'content' words) four values are
    produced: the number of shared words, that number as a fraction of the
    reply vocabulary, as a fraction of the OP vocabulary, and the Jaccard
    similarity. A fraction whose denominator is empty is reported as 0.

    Returns:
        Dict with keys ``common_words_<type>``, ``sim_frac_reply_<type>``,
        ``sim_frac_op_<type>`` and ``jaccard_<type>`` for each word type.
    """

    def _vocabularies(text):
        # Split the distinct words of a text into the three vocabularies.
        vocab = set(text.lower().split())
        stops = {word for word in vocab if word in stop_words_set}
        return {'all': vocab, 'stop': stops, 'content': vocab - stops}

    op_vocab = _vocabularies(op_text)
    reply_vocab = _vocabularies(reply_path_text)

    features = {}
    for word_type in ('all', 'stop', 'content'):
        op_set = op_vocab[word_type]
        reply_set = reply_vocab[word_type]
        shared = len(op_set & reply_set)
        combined = len(op_set | reply_set)

        features[f'common_words_{word_type}'] = shared

        if reply_set:
            features[f'sim_frac_reply_{word_type}'] = shared / len(reply_set)
        else:
            features[f'sim_frac_reply_{word_type}'] = 0

        if op_set:
            features[f'sim_frac_op_{word_type}'] = shared / len(op_set)
        else:
            features[f'sim_frac_op_{word_type}'] = 0

        if combined:
            features[f'jaccard_{word_type}'] = shared / combined
        else:
            features[f'jaccard_{word_type}'] = 0

    return features

In [28]:
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection
from collections import Counter
import logging

class ChangeDetector:
    """Detect stance changes in a user's per-topic timeline.

    Three detectors are provided:

    * ``detect_simple_stance_changes`` - every adjacent pair of posts whose
      stance labels differ.
    * ``detect_persistent_changes`` - a post whose stance differs from the
      previous post and is repeated by the next post.
    * ``detect_changes_with_significance`` - sliding-window Mann-Whitney U
      tests with Benjamini-Hochberg FDR correction.

    NOTE(review): with the default ``window_size=3`` the significance detector
    has very little power. An exact two-sided Mann-Whitney U test on two
    samples of three cannot produce p < 0.1, and tie-corrected approximations
    are only marginally lower, so few or no changes can survive FDR correction
    at alpha=0.05. Consider a larger window when using it.
    """

    def __init__(self, window_size=3, significance_level=0.05):
        """Store the window size, the significance level and the stance scale."""
        self.window_size = window_size
        self.alpha = significance_level
        # Ordinal encoding of the stance labels; unknown labels map to 0.
        self.stance_values = {
            'strongly_against': -2, 'moderately_against': -1,
            'neutral': 0, 'moderately_favor': 1, 'strongly_favor': 2
        }

    def detect_simple_stance_changes(self, topic_timeline):
        """Report every position where the stance label differs from the previous post.

        Args:
            topic_timeline: Dict of {utterance_id: stance} in chronological order.

        Returns:
            List of dicts with keys ``position``, ``current_utterance_id``,
            ``previous_utterance_id``, ``from_stance``, ``to_stance``,
            ``change_type`` and ``change_magnitude``.
        """
        if len(topic_timeline) < 2:
            return []

        changes = []
        timeline_items = list(topic_timeline.items())  # [(utterance_id, stance), ...]

        for i in range(1, len(timeline_items)):
            current_utterance_id, current_stance = timeline_items[i]
            previous_utterance_id, previous_stance = timeline_items[i - 1]

            # Check if stance changed
            if current_stance != previous_stance:
                changes.append({
                    'position': i,
                    'current_utterance_id': current_utterance_id,
                    'previous_utterance_id': previous_utterance_id,
                    'from_stance': previous_stance,
                    'to_stance': current_stance,
                    'change_type': self._classify_change_direction(previous_stance, current_stance),
                    'change_magnitude': self._calculate_simple_magnitude(previous_stance, current_stance)
                })

        return changes

    def detect_persistent_changes(self, topic_timeline, verbose=True):
        """Detect persistent changes in stance.

        A change point is a post whose stance differs from the previous post
        and equals the stance of the next post. The first and last posts can
        never be change points because they lack a predecessor or a successor.

        Args:
            topic_timeline: Chronological list of ``(utterance_id, stance)``
                tuples, or a dict of {utterance_id: stance}.
            verbose: When True (default) print the current/previous/next
                stance of every detected change point.

        Returns:
            List of ``(index, utterance_id)`` tuples.
        """
        # Accept the dict form used by the other detectors as well.
        if hasattr(topic_timeline, 'items'):
            topic_timeline = list(topic_timeline.items())

        change_points = []

        # Start at 1 and stop before the last index: index 0 would compare
        # against the last element (i-1 == -1) and the last index would read
        # past the end (i+1).
        for i in range(1, len(topic_timeline) - 1):
            previous_stance = topic_timeline[i - 1][1]
            current_stance = topic_timeline[i][1]
            next_stance = topic_timeline[i + 1][1]

            # Stance differs from the prior post and persists into the next one
            if current_stance != previous_stance and current_stance == next_stance:
                change_points.append((i, topic_timeline[i][0]))
                if verbose:
                    print(f"Current:{current_stance}, Previous: {previous_stance} and Next:{next_stance}")

        return change_points

    def _classify_change_direction(self, from_stance, to_stance):
        """Classify the direction of stance change."""
        from_value = self.stance_values.get(from_stance, 0)
        to_value = self.stance_values.get(to_stance, 0)

        if to_value > from_value:
            return 'more_favorable'
        elif to_value < from_value:
            return 'less_favorable'
        else:
            # Different labels that map to the same value (e.g. unknown labels)
            return 'neutral_shift'

    def _calculate_simple_magnitude(self, from_stance, to_stance):
        """Calculate the magnitude of stance change on the -2..2 scale."""
        from_value = self.stance_values.get(from_stance, 0)
        to_value = self.stance_values.get(to_stance, 0)
        return abs(to_value - from_value)

    def detect_changes_with_significance(self, topic_timeline):
        """Detect changes with statistical significance testing.

        For every position that has ``window_size`` posts on each side, the
        stance values before and after are compared with ``two_sample_test``;
        the resulting p-values are FDR-corrected together.

        Args:
            topic_timeline: Dict of {utterance_id: stance} in chronological order.

        Returns:
            Tuple ``(significant_changes, p_values, p_corrected)``:
            the change dicts that survive FDR correction, the raw p-values of
            all tested positions, and the corrected p-values. All three are
            empty lists when the timeline is too short to test.
        """
        if len(topic_timeline) < self.window_size * 2:
            return [], [], []

        # Convert to lists to maintain order and get IDs
        timeline_items = list(topic_timeline.items())  # [(utterance_id, stance), ...]
        stance_sequence = [self.stance_values.get(stance, 0) for _, stance in timeline_items]

        potential_changes = []
        p_values = []

        # Sliding window approach
        for i in range(self.window_size, len(stance_sequence) - self.window_size):
            # Left window (before potential change) / right window (after it)
            left_window = stance_sequence[i - self.window_size:i]
            right_window = stance_sequence[i:i + self.window_size]

            # Statistical test: are these two windows significantly different?
            statistic, p_value = self.two_sample_test(left_window, right_window)
            p_values.append(p_value)

            left_mean = np.mean(left_window)
            right_mean = np.mean(right_window)
            potential_changes.append({
                'position': i,
                'utterance_id': timeline_items[i][0],  # The utterance where change detected
                'p_value': p_value,
                'test_statistic': statistic,
                'magnitude': abs(right_mean - left_mean),
                'left_mean': left_mean,
                'right_mean': right_mean,
                'left_window': left_window.copy(),
                'right_window': right_window.copy()
            })

        # Apply FDR correction to all p-values
        if not p_values:
            return [], [], []

        rejected, p_corrected = self.multiple_testing_correction(p_values)

        # Keep only changes that survive FDR correction
        significant_changes = []
        for i, change in enumerate(potential_changes):
            if rejected[i]:
                change.update({
                    'p_corrected': p_corrected[i],
                    'statistically_significant': True,
                    'survives_fdr_correction': True,
                    'significance_level': self.alpha
                })
                significant_changes.append(change)

        return significant_changes, p_values, p_corrected

    def two_sample_test(self, left_window, right_window):
        """Statistical test for difference between two windows.

        Uses the two-sided Mann-Whitney U test and falls back to a t-test if
        that raises ``ValueError``.

        Returns:
            Tuple ``(statistic, p_value)``. An undefined (NaN) p-value, e.g.
            from a t-test on constant windows, is reported as 1.0 so that it
            counts as "no evidence of change" instead of corrupting the FDR
            correction applied to the p-values afterwards.
        """
        try:
            statistic, p_value = mannwhitneyu(left_window, right_window,
                                              alternative='two-sided')
        except ValueError:
            # Fallback to t-test if Mann-Whitney fails
            statistic, p_value = ttest_ind(left_window, right_window)

        if np.isnan(p_value):
            p_value = 1.0

        return statistic, p_value

    def multiple_testing_correction(self, p_values):
        """Correct for multiple testing using Benjamini-Hochberg.

        Returns:
            Tuple ``(rejected, p_corrected)`` as returned by ``fdrcorrection``.
        """
        rejected, p_corrected = fdrcorrection(p_values, alpha=self.alpha)
        return rejected, p_corrected

    def analyze_user_belief_changes(self, user_timeline):
        """Analyze belief changes across all topics for a user.

        Args:
            user_timeline: Dict of {topic: {utterance_id: stance}}

        Returns:
            Dict with ``changes_by_topic`` ({topic: [significant change, ...]})
            and ``total_changes`` (their overall count).
        """
        all_changes = {}
        total_changes = 0

        for topic, topic_timeline in user_timeline.items():
            significant_changes, _, _ = self.detect_changes_with_significance(topic_timeline)
            all_changes[topic] = significant_changes
            total_changes += len(significant_changes)

        return {
            'changes_by_topic': all_changes,
            'total_changes': total_changes
        }

    def analyze_all_users_belief_changes(self, timelines):
        """Analyze belief changes across all users.

        Args:
            timelines: Dict of {user_id: {topic: {utterance_id: stance}}}

        Returns:
            Dict with ``changes_by_user`` ({user_id: result of
            ``analyze_user_belief_changes``}) and ``total_changes``.
        """
        all_user_changes = {}
        total_changes = 0

        for user_id, user_timeline in timelines.items():
            user_result = self.analyze_user_belief_changes(user_timeline)
            all_user_changes[user_id] = user_result
            total_changes += user_result['total_changes']

        return {
            'changes_by_user': all_user_changes,
            'total_changes': total_changes
        }

In [46]:
# Test persistence detector:
# Build unfiltered timelines first (both minimums are 0), then filter them below.
timeline_builder = TimelineBuilder(corpus, min_posts_per_topic=0, min_topics_per_user=0)
timelines = timeline_builder.build_timelines()

# Filter for analysis: keep topics with >= 5 posts and users with >= 2 such topics
filtered_timelines = filter_for_change_detection(timelines, min_posts_per_topic=5, min_topics_per_user=2)

# Get a specific user's timeline for a specific topic
user_id = "HardCoreModerate"
topic = "media and political commentary"
topic_timeline = filtered_timelines[user_id][topic]  # This is {utterance_id: stance}

# Convert to a list of (utterance_id, stance) tuples, the form that
# detect_persistent_changes indexes by position
topic_timeline_list = list(topic_timeline.items())

persistence_detector = ChangeDetector()

2025-08-06 10:45:42,431 - temporal_belief.core.timeline_building - INFO - timeline_building:71 - Built timelines for 4781 users
INFO:temporal_belief.core.timeline_building:Built timelines for 4781 users


In [47]:
# Detect persistent stance changes; the result is a list of (index, utterance_id) tuples.
change_points = persistence_detector.detect_persistent_changes(topic_timeline_list)

Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: strongly_against and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:neutral, Previous: moderately_against and Next:neutral
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_against, Previous: strongly_against and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_favor, Previous: moderately_against and Next:moderately_favor
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Nex

In [48]:
# Test the window extractor:
window_extractor = ConversationWindowExtractor(corpus, timelines=timelines)
# Recomputed here so this cell does not depend on the previous cell having been run.
change_points = persistence_detector.detect_persistent_changes(topic_timeline_list)
# Only the first change point is used by get_conversations_around_change_point.
candidate_convos = window_extractor.get_conversations_around_change_point(change_points=change_points, corpus=corpus)
for convo in candidate_convos:
  print(f'ID:{convo.id}')

Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: strongly_against and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:neutral, Previous: moderately_against and Next:neutral
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_against, Previous: strongly_against and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_favor, Previous: moderately_against and Next:moderately_favor
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Nex

In [None]:
# Should return [(op_utterance, paths), (op_utterance2, paths), ...] for the candidate convos.
# utt_id is the utterance id of the first change point (not used further in this cell).
utt_id = change_points[0][1]
# The user whose op_utterances are collected is the speaker of that utterance.
user_id = corpus.get_utterance(change_points[0][1]).speaker.id
rooted_units_per_convo = window_extractor.extract_rooted_path_from_candidate_convos(candidate_convos, user_id)

In [None]:
# Show every detected change point as (index in the timeline list, utterance_id).
print(change_points)

[(20, 'c345fu2'), (26, 'c34bigz'), (30, 'c34e3d0'), (39, 'c37rgne'), (42, 'c37rsmm'), (52, 'c37tt9j'), (54, 'c37tts1'), (64, 'c37vnob'), (71, 'c381lme'), (77, 'c382x1h'), (80, 'c386bvm'), (82, 'c387opl'), (89, 'c3ctdtd'), (104, 'nz1xu'), (108, 'c3d3mf9'), (117, 'c3d8hjg'), (125, 'c3dc5rj'), (135, 'c3df7re'), (142, 'c3dq15f')]


In [None]:
# The first change point — the only one get_conversations_around_change_point uses.
print(change_points[0])

(20, 'c345fu2')


In [None]:
# Utterance id of the first entry in the topic timeline list.
print(topic_timeline_list[0][0])

lnrey


In [None]:
# Detect changes with significance:
timeline_builder = TimelineBuilder(corpus, min_posts_per_topic=0, min_topics_per_user=0)
timelines = timeline_builder.build_timelines()

# Filter for analysis
filtered_timelines = filter_for_change_detection(timelines, min_posts_per_topic=5, min_topics_per_user=2)

# Get a specific user's timeline for a specific topic
user_id = "HardCoreModerate"
topic = "media and political commentary"
topic_timeline = filtered_timelines[user_id][topic]  # This is {utterance_id: stance}

# Initialize detector and detect changes
detector = ChangeDetector()
significant_changes, p_values, p_corrected = detector.detect_changes_with_significance(topic_timeline)

# Print the results
print(f"Detected {len(significant_changes)} statistically significant stance changes for user {user_id} on topic {topic}:")
for change in significant_changes:
    # The change dicts carry window means ('left_mean'/'right_mean'), not
    # 'stance_before'/'stance_after'; reading those keys would raise KeyError.
    print(f"  {change['left_mean']:.2f} → {change['right_mean']:.2f} (magnitude: {change['magnitude']:.3f}, p={change['p_corrected']:.4f})")

Detected 0 statistically significant stance changes for user HardCoreModerate on topic media and political commentary:


In [None]:
# Test window extractor
user_id = "HardCoreModerate"
topic = "taxation and government spending"
topic_timeline = timelines[user_id][topic]

# Get significant changes
detector = ChangeDetector()
significant_changes, p_values, p_corrected = detector.detect_changes_with_significance(topic_timeline)

if significant_changes:
    # Test the window extractor. ConversationWindowExtractor has no
    # get_conversations_around_change / print_window_summary methods; the
    # available entry point takes (index, utterance_id) change points.
    extractor = ConversationWindowExtractor(corpus, timelines)
    first_change = significant_changes[0]
    window_convos = extractor.get_conversations_around_change_point(
        corpus=corpus,
        change_points=[(first_change['position'], first_change['utterance_id'])],
    )

    # Print summary
    for convo in window_convos:
        print(f'ID:{convo.id}')
else:
    print("No significant changes found to test with")

No significant changes found to test with


In [None]:
# Most populated topic for a user
def topic_with_most_contributions(user_id, user_timelines=None):
    """Return the topic a user posted in most often, with its post count.

    Args:
        user_id: Key of the user in the timelines dict.
        user_timelines: Dict of {user_id: {topic: {utterance_id: stance}}}.
            Defaults to the notebook-level ``timelines`` variable.

    Returns:
        Tuple ``(topic, number_of_posts)``; ``(None, 0)`` when the user has no
        topics. On ties the first topic in dict order wins.
    """
    if user_timelines is None:
        user_timelines = timelines  # notebook-level global

    posts_in_topic = {
        topic: len(posts) for topic, posts in user_timelines[user_id].items()
    }
    if not posts_in_topic:
        return None, 0

    # key with the largest value
    topic = max(posts_in_topic, key=posts_in_topic.get)

    return topic, posts_in_topic[topic]

# Note: an earlier wrong count came from posts_in_topic living outside the
# function; it is now local to topic_with_most_contributions.
user_id = 'HardCoreModerate'
topic, number = topic_with_most_contributions(user_id)
print(f"{topic}: {number}")
# print(posts_in_topic)

media and political commentary: 145


In [None]:
# Total number of users in `timelines` (as built by the most recent TimelineBuilder run)
print(len(timelines))

In [None]:
# User with the most utterances in a single topic:
# take each user's most populated topic, then the overall maximum across users.
# (The earlier version stored the topic string in users[user_id] and then tried
# to item-assign into that string, which raises TypeError.)
users = {}
for uid in timelines:
    # Separate loop names so the notebook-level user_id/topic are not clobbered.
    best_topic, post_count = topic_with_most_contributions(uid)
    users[uid] = (best_topic, post_count)

if users:
    most_active_user = max(users, key=lambda uid: users[uid][1])
    best_topic, post_count = users[most_active_user]
    print(f"{most_active_user}: {best_topic} ({post_count} posts)")
else:
    print("No users in timelines")

In [None]:
# Detect simple stance change:
# Rebuilds `timelines` with stricter thresholds; this replaces the notebook-level
# `timelines` built by earlier cells.
timeline_builder = TimelineBuilder(corpus, min_posts_per_topic=3, min_topics_per_user=1)
timelines = timeline_builder.build_timelines()

# Get a specific user's timeline for a specific topic
user_id = "HardCoreModerate"
topic = "taxation and government spending"
topic_timeline = timelines[user_id][topic]  # This is {utterance_id: stance}

# Initialize detector and detect changes (every adjacent pair with differing stance)
detector = ChangeDetector()
changes = detector.detect_simple_stance_changes(topic_timeline)

# Print the results
print(f"Detected {len(changes)} stance changes for user {user_id} on topic {topic}:")
for change in changes:
    print(f"  {change['from_stance']} → {change['to_stance']} (magnitude: {change['change_magnitude']})")

Detected 2 stance changes for user HardCoreModerate on topic taxation and government spending:
  moderately_against → neutral (magnitude: 1)
  neutral → moderately_against (magnitude: 1)


In [None]:
# for user_id in

In [None]:
# Run detection for all topics for a user:
# Get complete user timeline
user_timeline = timelines["pixel8"]  # All topics for this user

# Analyze changes across all topics
detector = ChangeDetector()
all_changes = detector.analyze_user_belief_changes(user_timeline)

# Results: analyze_user_belief_changes returns
# {'changes_by_topic': {topic: [change, ...]}, 'total_changes': int},
# so the per-topic lists live under 'changes_by_topic'.
for topic, changes in all_changes['changes_by_topic'].items():
    print(f"Topic: {topic}")
    for change in changes:
        print(f"  Change at position {change['position']}: magnitude {change['magnitude']}")
print(f"Total changes: {all_changes['total_changes']}")

In [None]:
# Users present in `timelines` (only the first 20 ids are shown):
print("Available users:")
print(list(timelines.keys())[:20])

In [None]:
# What topics the users have posted about:
# NOTE(review): the slice takes the first 20 users, but the `break` below stops
# after the first one, so only a single user's topics are printed — confirm intent.
for user_id in list(timelines.keys())[:20]:  # candidate users (first 20)
    topics = list(timelines[user_id].keys())
    print(f"{user_id}: {topics}")
    break

In [None]:
# confidence score:
# Print the metadata of the second utterance to inspect the stance fields.
# NOTE(review): list(...) materialises every utterance in the corpus just to
# look at one; if `utterances` is not needed elsewhere, take a single item
# from the iterator instead.
utterances = list(corpus.iter_utterances())
print(utterances[1].meta)

ConvoKitMeta({'score': 29, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'PoliticalDiscussion', 'stickied': False, 'permalink': '/r/PoliticalDiscussion/comments/nz1xu/congrats_rpoliticaldiscussion_you_are_turning/', 'author_flair_text': '', 'detected_stance': 'moderately_against', 'stance_confidence': 0.8540321985880533, 'stance_scores': {'strongly_favor': 0.0016047263949682626, 'moderately_favor': 0.5134096046288809, 'neutral': 0.0072105322033166885, 'moderately_against': 0.8540321985880533, 'strongly_against': 0.3021060957883795}})
