In [None]:
# Need to restart after:
!pip install convokit[llm]
!pip install convokit

In [None]:
import time
import sys
import os
!pip install gdown
import zipfile
import nltk
from nltk.corpus import stopwords
from convokit import Corpus, download
import convokit
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.change_detection import ChangeDetector
from temporal_belief.core.window_extraction import WindowExtractor
from temporal_belief.core.op_path_pairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor
from temporal_belief.data.preprocessors import PairPreprocessor
from temporal_belief.core.interplay import Interplay
nltk.download('stopwords')

In [None]:
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

In [37]:
from typing import Dict, Any
from collections import defaultdict
import logging

class TimelineBuilder:
    """Simple timeline builder for user belief tracking.

    Builds structure: {user_id: {topic: {utterance_id: stance}}}
    """

    def __init__(self, corpus, min_posts_per_topic: int = 0, min_topics_per_user: int = 0):
        self.corpus = corpus
        self.min_posts_per_topic = min_posts_per_topic
        self.min_topics_per_user = min_topics_per_user
        self.logger = logging.getLogger(__name__)

    def build_timelines(self, include_all=True) -> Dict[str, Dict[str, Dict[str, str]]]:
        """Build user timelines from corpus with stance metadata.

        Returns:
            {user_id: {topic: {utterance_id: stance}}}
        """
        # Group by user and topic
        user_topic_posts = defaultdict(lambda: defaultdict(list))

        for utterance in self.corpus.iter_utterances():
            # Skip if no stance metadata on utterance
            if include_all == False:
                if not utterance.meta or 'detected_stance' not in utterance.meta:
                    continue

            # Get topic from conversation metadata
            conversation = utterance.get_conversation()
            if not conversation or not conversation.meta or 'detected_topic' not in conversation.meta:
                continue

            if not utterance.timestamp:
                continue

            user_id = utterance.speaker.id
            topic = conversation.meta['detected_topic']
            stance = utterance.meta.get('detected_stance', 'Unknown')

            user_topic_posts[user_id][topic].append({
                'utterance_id': utterance.id,
                'timestamp': utterance.timestamp,
                'stance': stance
            })

        # Filter and sort
        timelines = {}
        for user_id, topic_posts in user_topic_posts.items():
            user_timeline = {}

            for topic, posts in topic_posts.items():
                if len(posts) >= self.min_posts_per_topic:
                    # Sort chronologically
                    posts.sort(key=lambda x: x['timestamp'])

                    # Create topic timeline
                    topic_timeline = {}
                    for post in posts:
                        topic_timeline[post['utterance_id']] = post['stance']

                    user_timeline[topic] = topic_timeline

            # Only include users with enough topics
            if len(user_timeline) >= self.min_topics_per_user:
                timelines[user_id] = user_timeline

        self.logger.info(f"Built timelines for {len(timelines)} users")
        return timelines

In [52]:
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection
from collections import Counter
import logging


class ChangeDetector:
    """Sliding window change detection with persistence threshold."""

    def __init__(self, window_size=3, persistence_threshold=4, significance_level=0.05):
        self.window_size = window_size
        self.persistence_threshold = persistence_threshold
        self.alpha = significance_level
        self.stance_values = {
            'left-leaning': -1,
            'neutral': 0,
            'right-leaning': 1
        }
        self.all_change_points = []
        self.all_no_change_points = []

    def detect_persistent_changes(self, topic_timeline):
        """
        Detect persistent changes using sliding window with numerical averages.
        This is your main detection method.
        """
        if len(topic_timeline) < self.window_size * 2:
            return {'change_points': [], 'no_change_points': []}

        print('new one')

        # Convert stances to numerical values
        numerical_stances = []
        for utt_id, stance in topic_timeline:
            numerical_stances.append(self.stance_values.get(stance, 0))

        change_points = []
        no_change_points = []

        # Calculate sliding window averages
        for i in range(self.window_size, len(numerical_stances) - self.window_size):

            # Get before and after windows
            before_window = numerical_stances[i - self.window_size:i]
            after_window = numerical_stances[i:i + self.window_size]

            # Calculate means
            before_mean = np.mean(before_window)
            after_mean = np.mean(after_window)

            # Check for significant change (simple threshold approach)
            change_magnitude = abs(after_mean - before_mean)

            # Require both magnitude and direction consistency
            if change_magnitude > 0.5:  # Adjust threshold as needed
                # Check if change persists
                future_window = numerical_stances[i + self.window_size:i + 2 * self.window_size]
                if len(future_window) >= self.window_size:
                    future_mean = np.mean(future_window)

                    # If the change direction is maintained
                    if (after_mean - before_mean) * (future_mean - before_mean) > 0:
                        utt_id = topic_timeline[i][0]
                        change_points.append(utt_id)
                        # print(f"Sliding window change at index {i}: "
                        #       f"before={before_mean:.2f}, after={after_mean:.2f}, "
                        #       f"future={future_mean:.2f}")

        # Add non-change points
        for i, (utt_id, stance) in enumerate(topic_timeline):
            if utt_id not in change_points:
                no_change_points.append(utt_id)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def detect_persistent_changes_simple(self, topic_timeline):
        """
        Alternative: Simple persistence detection (your original approach, but fixed).
        Call this method if you want the simpler approach.
        """
        change_points = []
        no_change_points = []

        if len(topic_timeline) < self.persistence_threshold + 1:
            # Timeline too short for meaningful analysis
            return {'change_points': change_points, 'no_change_points': no_change_points}

        # Track detected changes to avoid duplicates
        already_detected = set()

        for i in range(1, len(topic_timeline) - self.persistence_threshold):
            current_stance = topic_timeline[i][1]
            previous_stance = topic_timeline[i - 1][1]

            # Check if stance changed
            if current_stance != previous_stance and i not in already_detected:

                # Check if new stance persists for required threshold
                persistence_count = 1  # Current post counts as 1

                for j in range(i + 1, min(i + self.persistence_threshold, len(topic_timeline))):
                    if topic_timeline[j][1] == current_stance:
                        persistence_count += 1
                    else:
                        break  # Persistence broken

                # If new stance persists for threshold, mark as change point
                if persistence_count >= self.persistence_threshold:
                    utt_id = topic_timeline[i][0]
                    change_points.append(utt_id)

                    # Mark this range as detected to avoid overlapping detections
                    for k in range(i, min(i + self.persistence_threshold, len(topic_timeline))):
                        already_detected.add(k)

                    print(f"Change detected at index {i}: {previous_stance} → {current_stance} "
                          f"(persisted for {persistence_count} posts)")

        # Add non-change points (utterances that didn't cause changes)
        for i, (utt_id, stance) in enumerate(topic_timeline):
            if utt_id not in change_points:
                no_change_points.append(utt_id)

        # Store for global analysis
        self.all_change_points.extend(change_points)
        self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def get_two_groups(self, timelines, method='sliding_window'):
        """
        Categorize users into those with/without changes using specified method.

        Args:
            timelines: User timeline data
            method: 'sliding_window' (default) or 'simple'
        """
        with_changes = {}
        no_changes = {}

        # Choose detection method
        if method == 'sliding_window':
            detect_func = self.detect_persistent_changes  # Uses numerical sliding windows
        elif method == 'simple':
            detect_func = self.detect_persistent_changes_simple  # Your original approach
        else:
            raise ValueError(f"Unknown method: {method}. Use 'sliding_window' or 'simple'")

        for user_id, topic_timelines in timelines.items():
            user_has_changes = False

            for topic_name, topic_timeline in topic_timelines.items():
                topic_timeline_list = list(topic_timeline.items())
                changes = detect_func(topic_timeline_list)

                if changes['change_points']:
                    user_has_changes = True
                    # Store change-causing utterances
                    if user_id not in with_changes:
                        with_changes[user_id] = {}
                    with_changes[user_id][topic_name] = {
                        utt_id: topic_timeline[utt_id]
                        for utt_id in changes['change_points']
                    }

            # If user had no changes in any topic, add to no_changes group
            if not user_has_changes:
                no_changes[user_id] = topic_timelines

        return {
            'with_changes': with_changes,
            'no_changes': no_changes
        }

In [72]:
class WindowExtractor:
    """ Find the conversations around the change point """

    def __init__(self, corpus, timelines):
        self.corpus = corpus
        self.timelines = timelines
        self.user_conversations_cache = {}  # Add cache

    def build_global_user_conversations_index(self, max_convos):
        """Build sorted conversations for ALL users upfront"""
        print("Building global user conversations index...")
        user_conversations = {}

        convos = list(corpus.iter_conversations())[:max_convos]
        for convo in convos:
            # Get all speakers in this conversation
            speakers = {utt.speaker.id for utt in convo.iter_utterances()}

            # Add this conversation to each speaker's list
            for speaker_id in speakers:
                if speaker_id not in user_conversations:
                    user_conversations[speaker_id] = []
                user_conversations[speaker_id].append(convo)

        # Sort each user's conversations once
        for speaker_id in user_conversations:
            user_conversations[speaker_id].sort(
                key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances())
            )

        print(f"Index built for {len(user_conversations)} users!")

        self.user_conversations_cache = user_conversations

    def get_user_conversations_chronological_old(self, corpus, speaker_id):
        """Get all conversations for a user in chronological order."""

        # Check cache first
        if speaker_id in self.user_conversations_cache:
            return self.user_conversations_cache[speaker_id]

        # Get all conversations where the speaker participated
        user_conversations = [convo for convo in corpus.iter_conversations()
                              if speaker_id in [utt.speaker.id for utt in convo.iter_utterances()]]

        # Sort conversations by their earliest timestamp
        user_conversations.sort(key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances()))

        # Cache the result
        self.user_conversations_cache[speaker_id] = user_conversations

        return user_conversations

    def get_user_conversations_chronological(self, corpus, speaker_id):
        return self.user_conversations_cache.get(speaker_id, [])

    def get_conversations_around_change_point(self, corpus, change_point):
        # Get first change (probably only one I need)
        utterance = corpus.get_utterance(change_point)

        # Find the convo this utterance belongs to:
        conversation = utterance.get_conversation()

        # Put all user's convos in a list
        speaker_id = utterance.speaker.id
        user_conversations = self.get_user_conversations_chronological(corpus, speaker_id)

        candidate_convos = []
        # find the index of the convo, and return the convo id of the 3 prior convos
        for i, convo in enumerate(user_conversations):
            if conversation.id == user_conversations[i].id:
                # Check if there are at least two conversations before the current one
                if i >= 2:
                    candidate_convos.append(user_conversations[i - 2])
                    candidate_convos.append(user_conversations[i - 1])
                elif i == 1:
                    # If only one conversation before, append that one
                    candidate_convos.append(user_conversations[i - 1])

                # Append the current conversation with the change point
                candidate_convos.append(conversation)
                break  # Found the conversation, no need to continue the loop

        return candidate_convos

In [7]:
# Download and unzip with python (Dataloading):
# !gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1DLFY6JLMZqNjwvNRZmhlV4-rnoQP_eyH/view?usp=sharing" -O "/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1nWaj5N8nsG7u5homv_kAh4CLPDv01M_Z/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics.zip" --fuzzy
!gdown "https://drive.google.com/file/d/15NMRXEkGRoGjK6TXFBHIMOPjkTyZ0keP/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances200000_llm.zip" --fuzzy

# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics.zip").extractall("/content/temporal_belief_analysis")
zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances200000_llm.zip").extractall("/content/temporal_belief_analysis")

Downloading...
From (original): https://drive.google.com/uc?id=15NMRXEkGRoGjK6TXFBHIMOPjkTyZ0keP
From (redirected): https://drive.google.com/uc?id=15NMRXEkGRoGjK6TXFBHIMOPjkTyZ0keP&confirm=t&uuid=93ac337e-b26a-48f0-a227-b7dfc088b8c3
To: /content/temporal_belief_analysis/pd_corpus_with_stances200000_llm.zip
100% 834M/834M [00:07<00:00, 115MB/s]


In [14]:
corpus = Corpus(filename="/content/temporal_belief_analysis/pd_corpus_with_stances200000_chronological_second_attempt")

In [15]:
timeline_builder = TimelineBuilder(corpus)
timelines = timeline_builder.build_timelines()

2025-08-18 13:17:13,881 - temporal_belief.core.timeline_building - INFO - timeline_building:71 - Built timelines for 586 users
INFO:temporal_belief.core.timeline_building:Built timelines for 586 users


In [69]:
user_count = 0
for user_id, topic_timeline in timelines.items():
    for topic_name, utt_stances in topic_timeline.items():
        if len(utt_stances.values()) > 100:
            print(f"User: {user_id}, Topic name: {topic_name}, utterances: {len(utt_stances.values())}")
            print(f"Stances: {[stance for stance in utt_stances.values()]}")
            user_count += 1
        else:
            continue
print(f"users: {user_count}")


User: seltaeb4, Topic name: media and political commentary, utterances: 153
Stances: ['right-leaning', 'left-leaning', 'left-leaning', 'neutral', 'right-leaning', 'left-leaning', 'right-leaning', 'right-leaning', 'left-leaning', 'neutral', 'neutral', 'left-leaning', 'neutral', 'left-leaning', 'neutral', 'left-leaning', 'neutral', 'left-leaning', 'left-leaning', 'neutral', 'neutral', 'right-leaning', 'right-leaning', 'right-leaning', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'left-leaning', 'left-leaning', 'left-leaning', 'left-leaning', 'left-leaning', 'neutral', 'left-leaning', 'neutral', 'left-leaning', 'neutral', 'neutral', 'neutral', 'neutral', 'left-leaning', 'left-leaning', 'neutral', 'right-leaning', 'left-leaning', 'left-leaning', 'left-leaning', 'neutral', 'neutral', 'neutral', 'neutral', 'right-leaning', 'neutral', 'neutral', 'left-leaning', 'left-leaning', 'left-leaning', 'left-leaning', 'neutral', 'neutral'

In [None]:
change_detector = ChangeDetector()
user_id = "HardCoreModerate"
topic = "taxation and government spending"
topic_timeline = timelines[user_id][topic]
topic_timeline_list = list(topic_timeline.items())
change_point = change_detector.detect_persistent_changes(topic_timeline_list)
groups = change_detector.get_two_groups(topic_timeline_list)
print(change_point)

In [78]:
window_extractor = WindowExtractor(corpus, timelines)
window_extractor.build_global_user_conversations_index(max_convos = 400)
candidate_convos = window_extractor.get_conversations_around_change_point(change_point=change_point, corpus=corpus)
print(f"Candidate convos: {[convo for convo in candidate_convos]}")

Building global user conversations index...
Index built for 1235 users!
Candidate convos: [Conversation({'obj_type': 'conversation', 'vectors': [], 'tree': None, 'owner': <convokit.model.corpus.Corpus object at 0x7b52f436f550>, 'id': 'ocea8', 'meta': ConvoKitMeta({'title': "Mitt: It isn't envy, it is fear of exploitation", 'num_comments': 191, 'domain': 'self.PoliticalDiscussion', 'timestamp': 1326294511, 'subreddit': 'PoliticalDiscussion', 'gilded': -1, 'gildings': None, 'stickied': False, 'author_flair_text': '', 'detected_topic': 'political figures and campaigns', 'topic_confidence': 0.5010017156600952, 'topic_scores': {'political figures and campaigns': 0.5010017156600952, 'media and political commentary': 0.3437584340572357, 'electoral politics': 0.033539239317178726, 'economic policy': 0.030718829482793808, 'civil rights and social issues': 0.013739150017499924, 'congressional politics': 0.009991119615733624, 'political parties and ideology': 0.009239627979695797, 'gun rights and

In [79]:
op_path_pairer = OpPathPairer(corpus, timelines)
op_path_pairs = op_path_pairer.extract_rooted_path_from_candidate_convos(candidate_convos, 'user_id')
pair_preprocessor = PairPreprocessor()
preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)
print(preprocessed_pairs)

[]


In [20]:
persuasion_analyzer = Interplay()
interplay_features = persuasion_analyzer.calculate_interplay_features()

In [70]:
# Load English stop words
stop_words_set = set(stopwords.words('english'))

# use the groups
groups = change_detector.get_two_groups(timelines)
groups_tuple = (groups['with_changes'], groups['no_changes'])

# Init
i = 0
group_means = [] # Initialize as a list to append means
group_scores = []
utts_num = 0

# For each group
for group in groups_tuple:
    current_group_scores = []

    for user_id, topic_timelines in group.items():
        user_start_time = time.time()
        user_change_points = 0

        for topic_timeline in topic_timelines.values():

            for change_point in topic_timeline.keys(): # Iterate through change points (keys)
                print(f'User: {user_id}, topic: {topic_timeline}, change point {change_point}')
                utts_num += 1

                user_change_points += 1

                # TIME: Window extraction
                start_time = time.time()
                try:
                    candidate_convos = window_extractor.get_conversations_around_change_point(
                        change_point=change_point, corpus=corpus
                    )
                    window_time = time.time() - start_time
                    print(f'⏱️ Window extraction: {window_time:.3f}s')
                except ValueError as e:
                    print(f"Skipping change point {change_point}: {e}")
                    continue


                # TIME: Path extraction
                start_time = time.time()
                op_path_pairs = []
                for candidate_convo in candidate_convos:
                    try:
                        op_path_pairs.extend(op_path_pairer.extract_rooted_path_from_candidate_convos(
                            [candidate_convo], user_id
                        ))
                    except ValueError as e:
                        print(f"Skipping conversation {candidate_convo.id}: {e}")
                        continue
                path_time = time.time() - start_time
                print(f'⏱️ Path extraction: {path_time:.3f}s')


                # TIME: Preprocessing
                start_time = time.time()
                preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)
                preprocess_time = time.time() - start_time
                print(f'⏱️ Preprocessing: {preprocess_time:.3f}s')


                # TIME: Feature extraction
                start_time = time.time()
                features_list = []
                for op, paths in preprocessed_pairs:
                    for k, concatenated_utts in paths.items():
                        interplay_features = persuasion_analyzer.calculate_interplay_features(
                            op.text, concatenated_utts, stop_words_set
                        )
                        features_list.append(interplay_features)
                feature_time = time.time() - start_time
                print(f'⏱️ Feature extraction: {feature_time:.3f}s')


                # TIME: Scoring
                start_time = time.time()
                scores = []
                for interplay_features in features_list:
                    score = persuasion_analyzer.calculate_persuasion_score(interplay_features)
                    scores.append(score)
                scoring_time = time.time() - start_time
                print(f'⏱️ Scoring: {scoring_time:.3f}s')
                total_time = window_time + path_time + preprocess_time + feature_time + scoring_time
                print(f'🔥 TOTAL for change point: {total_time:.3f}s\n')

                # Print total time for this change point
                total_time = window_time + path_time + preprocess_time + feature_time + scoring_time
                print(f'🔥 TOTAL for change point: {total_time:.3f}s\n')

                current_group_scores.extend(scores)

        # TIME: End timing this user
        user_total_time = time.time() - user_start_time
        print(f'👤 USER {user_id} TOTAL: {user_total_time:.3f}s ({user_change_points} change points)')
        print(f'📊 Average per change point: {user_total_time/max(1, user_change_points):.3f}s\n')

    # Calculate mean for this group
    total = 0
    num_of_scores = 0
    for score in current_group_scores: # Iterate through individual scores
        total += score
        num_of_scores += 1

    group_mean = total / num_of_scores if num_of_scores > 0 else 0 # Handle division by zero
    group_means.append(group_mean) # Append mean to the list

# Print the calculated group means
print(f'Group Means: {group_means}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction: 0.000s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.000s

🔥 TOTAL for change point: 0.000s

User: a200ftmonster, topic: {'c34akkc': 'neutral', 'c34w7jd': 'neutral', 'c3bam9p': 'neutral'}, change point c3bam9p
⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.000s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction: 0.000s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.000s

🔥 TOTAL for change point: 0.000s

User: a200ftmonster, topic: {'c35qqmq': 'neutral'}, change point c35qqmq
⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.000s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction: 0.000s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.000s

🔥 TOTAL for change point: 0.000s

User: a200ftmonster, topic: {'c37o6aj': 'neutral', 'c37yfgg': 'neutral'}, change point c37o6aj
⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.000s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction: 0.000s
⏱