In [3]:
!git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git

Cloning into 'temporal_belief_analysis'...
remote: Enumerating objects: 511, done.[K
remote: Counting objects: 100% (122/122), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 511 (delta 70), reused 54 (delta 28), pack-reused 389 (from 1)[K
Receiving objects: 100% (511/511), 3.16 MiB | 6.41 MiB/s, done.
Resolving deltas: 100% (318/318), done.


In [None]:
# Need to restart after:
!pip install convokit[llm]
!pip install convokit

In [5]:
import sys
import os
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

Changed working directory to: /content/temporal_belief_analysis/notebooks


In [6]:
import time
!pip install gdown
import zipfile
import nltk
from nltk.corpus import stopwords
from convokit import Corpus, download
import convokit
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.change_detection import ChangeDetector
from temporal_belief.core.window_extraction import WindowExtractor
from temporal_belief.core.op_path_pairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor
from temporal_belief.data.preprocessors import PairPreprocessor
from temporal_belief.core.interplay import Interplay
nltk.download('stopwords')

An error occurred: Unsloth currently only works on NVIDIA GPUs and Intel GPUs.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Download and unzip with python (Dataloading):
# !gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1DLFY6JLMZqNjwvNRZmhlV4-rnoQP_eyH/view?usp=sharing" -O "/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1nWaj5N8nsG7u5homv_kAh4CLPDv01M_Z/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics.zip" --fuzzy
!gdown "https://drive.google.com/file/d/1AIrstrzE259fcVyxJQW4-RwvAkoUyK1x/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip" --fuzzy

# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics.zip").extractall("/content/temporal_belief_analysis")
zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip").extractall("/content/temporal_belief_analysis")

In [11]:
CORPUS_PATH = "/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned"

In [17]:
from typing import Dict, Any
from collections import defaultdict
import logging

MERGED_TOPIC = {
    # Economy
    'economic policy': 'Economy & Tax',
    'taxation and government spending': 'Economy & Tax',

    # Healthcare
    'healthcare policy': 'Healthcare',

    # Civil rights / justice / education / voting
    'civil rights and social issues': 'Civil Rights, Justice & Education',
    'criminal justice and policing': 'Civil Rights, Justice & Education',
    'voting rights and elections': 'Civil Rights, Justice & Education',
    'education policy': 'Civil Rights, Justice & Education',

    # Hot-button singles
    'gun rights and control': 'Guns',
    'abortion and reproductive rights': 'Abortion',
    'immigration policy': 'Immigration',
    'climate change and energy policy': 'Climate & Energy',

    # Foreign / defense
    'foreign policy and defense': 'Foreign & Defense',

    # Meta / process / actors
    'political figures and campaigns': 'Process & Actors (Meta)',
    'congressional politics': 'Process & Actors (Meta)',
    'electoral politics': 'Process & Actors (Meta)',
    'political parties and ideology': 'Process & Actors (Meta)',
    'media and political commentary': 'Process & Actors (Meta)',
}


class TimelineBuilder:
    """Simple timeline builder for user belief tracking.

    Builds structure: {user_id: {topic: {utterance_id: stance}}}
    """

    def __init__(self, corpus, min_posts_per_topic: int = 0, min_topics_per_user: int = 0):
        self.corpus = corpus
        self.min_posts_per_topic = min_posts_per_topic
        self.min_topics_per_user = min_topics_per_user
        self.logger = logging.getLogger(__name__)

    def build_timelines(self, include_all=True) -> Dict[str, Dict[str, Dict[str, str]]]:
        """Build user timelines from corpus with stance metadata.

        Returns:
            {user_id: {topic: {utterance_id: stance}}}
        """
        # Group by user and topic
        user_topic_posts = defaultdict(lambda: defaultdict(list))

        for utterance in self.corpus.iter_utterances():
            # Skip if no stance metadata on utterance
            if include_all == False:
                if not utterance.meta or 'detected_stance' not in utterance.meta:
                    continue

            # Get topic from conversation metadata
            conversation = utterance.get_conversation()
            if not conversation or not conversation.meta or 'detected_topic' not in conversation.meta:
                continue

            if not utterance.timestamp:
                continue

            user_id = utterance.speaker.id
            old_topic = conversation.meta['detected_topic']
            topic = MERGED_TOPIC.get(old_topic, old_topic)
            stance = utterance.meta.get('detected_stance', 'Unknown')

            user_topic_posts[user_id][topic].append({
                'utterance_id': utterance.id,
                'timestamp': utterance.timestamp,
                'stance': stance
            })

        # Filter and sort
        timelines = {}
        for user_id, topic_posts in user_topic_posts.items():
            user_timeline = {}

            for topic, posts in topic_posts.items():
                if len(posts) >= self.min_posts_per_topic:
                    # Sort chronologically
                    posts.sort(key=lambda x: x['timestamp'])

                    # Create topic timeline
                    topic_timeline = {}
                    for post in posts:
                        topic_timeline[post['utterance_id']] = post['stance']

                    user_timeline[topic] = topic_timeline

            # Only include users with enough topics
            if len(user_timeline) >= self.min_topics_per_user:
                timelines[user_id] = user_timeline

        self.logger.info(f"Built timelines for {len(timelines)} users")
        return timelines

In [15]:
import numpy as np
from collections import Counter
import logging

class ChangeDetector:
    """Change detection with multiple methods (sliding window, simple, option A)."""

    def __init__(self, window_size=3, persistence_threshold=4, significance_level=0.05):
        self.window_size = window_size
        self.persistence_threshold = persistence_threshold
        self.alpha = significance_level
        self.stance_values = {
            'left-leaning': -1,
            'neutral': 0,
            'right-leaning': 1
        }
        self.all_change_points = []
        self.all_no_change_points = []

    def detect_persistent_changes(self, topic_timeline, conf=0.60, k=3, tie_band=0.10):

        if not topic_timeline:
            return {'change_points': [], 'no_change_points': []}

        def _to_probs(item):
            if isinstance(item, str):
                if item == 'left-leaning':  return (1.0, 0.0, 0.0)
                if item == 'neutral':       return (0.0, 1.0, 0.0)
                if item == 'right-leaning': return (0.0, 0.0, 1.0)
                return (0.0, 1.0, 0.0)
            if isinstance(item, dict):
                return (float(item.get('pL',0.0)), float(item.get('pN',0.0)), float(item.get('pR',0.0)))
            if isinstance(item, (list, tuple)) and len(item) == 3:
                pL, pN, pR = item
                return (float(pL), float(pN), float(pR))
            return (0.0, 1.0, 0.0)

        def _label_from_probs(pL, pN, pR):
            mx = max(pL, pN, pR)
            if mx < conf:
                return 'neutral'  # low confidence → neutral (uncertain)
            # break near-ties between left/right
            if abs(pR - pL) < tie_band and mx in (pR, pL):
                return 'neutral'
            return ['left-leaning', 'neutral', 'right-leaning'][[pL, pN, pR].index(mx)]

        utt_ids = [u for u,_ in topic_timeline]
        labels  = [_label_from_probs(*_to_probs(x)) for _, x in topic_timeline]

        change_points = []
        no_change_points = []

        current = labels[0]
        i = 1
        n = len(labels)

        while i < n:
            if labels[i] == current:
                i += 1
                continue

            # candidate new state; must persist for k steps
            cand = labels[i]
            j = i
            run_len = 0
            while j < n and labels[j] == cand and run_len < k:
                run_len += 1
                j += 1

            if run_len >= k:
                change_points.append(utt_ids[i])  # first idx of new regime
                current = cand
                i = j  # jump to end of confirmed run
            else:
                # not persistent; ignore and move one step
                i += 1

        # Non-change points: everything that didn't trigger a regime change
        changed_set = set(change_points)
        for uid in utt_ids:
            if uid not in changed_set:
                no_change_points.append(uid)

        # store globally if you want aggregate stats
        self.all_change_points.extend(change_points)
        self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def get_two_groups(self, timelines, method='sliding_window', **kwargs):
        """
        Group users into with/without changes using the chosen method.
        method: 'sliding_window' | 'simple' | 'option_a'
        Extra kwargs are passed to the detector (e.g., conf=0.7, k=4, tie_band=0.1).
        """
        with_changes = {}
        no_changes = {}

        if method == 'sliding_window':
            detect_func = self.detect_persistent_changes
        elif method == 'simple':
            detect_func = self.detect_persistent_changes_simple
        elif method == 'option_a':
            # wrap to pass kwargs
            def detect_func(tl): return self.detect_changes_option_a(tl, **kwargs)
        else:
            raise ValueError(f"Unknown method: {method}")

        for user_id, topic_timelines in timelines.items():
            user_has_changes = False

            for topic_name, topic_timeline in topic_timelines.items():
                topic_timeline_list = list(topic_timeline.items())
                changes = detect_func(topic_timeline_list)

                if changes['change_points']:
                    user_has_changes = True
                    if user_id not in with_changes:
                        with_changes[user_id] = {}
                    with_changes[user_id][topic_name] = {
                        utt_id: topic_timeline[utt_id]
                        for utt_id in changes['change_points']
                    }

            if not user_has_changes:
                no_changes[user_id] = topic_timelines

        return {'with_changes': with_changes, 'no_changes': no_changes}

In [42]:
class WindowExtractor:
    """ Find the conversations around the change point """

    def __init__(self, corpus, timelines):
        self.corpus = corpus
        self.timelines = timelines
        self.user_conversations_cache = {}  # Add cache

    def build_global_user_conversations_index(self):
        """Build sorted conversations for ALL users upfront"""
        print("Building global user conversations index...")
        user_conversations = {}

        convos = list(corpus.iter_conversations())
        for convo in convos:
            # Get all speakers in this conversation
            speakers = {utt.speaker.id for utt in convo.iter_utterances()}

            # Add this conversation to each speaker's list
            for speaker_id in speakers:
                if speaker_id not in user_conversations:
                    user_conversations[speaker_id] = []
                user_conversations[speaker_id].append(convo)

        # Sort each user's conversations once
        for speaker_id in user_conversations:
            user_conversations[speaker_id].sort(
                key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances())
            )

        print(f"Index built for {len(user_conversations)} users!")

        self.user_conversations_cache = user_conversations

    def get_user_conversations_chronological_old(self, corpus, speaker_id):
        """Get all conversations for a user in chronological order."""

        # Check cache first
        if speaker_id in self.user_conversations_cache:
            return self.user_conversations_cache[speaker_id]

        # Get all conversations where the speaker participated
        user_conversations = [convo for convo in corpus.iter_conversations()
                              if speaker_id in [utt.speaker.id for utt in convo.iter_utterances()]]

        # Sort conversations by their earliest timestamp
        user_conversations.sort(key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances()))

        # Cache the result
        self.user_conversations_cache[speaker_id] = user_conversations

        return user_conversations

    def get_user_conversations_chronological(self, corpus, speaker_id):
        return self.user_conversations_cache.get(speaker_id, [])

    def get_conversations_around_change_point(self, corpus, change_point, test=False):
        # Get first change (probably only one I need)
        utterance = corpus.get_utterance(change_point)

        # Find the convo this utterance belongs to:
        conversation = utterance.get_conversation()

        # Put all user's convos in a list
        speaker_id = utterance.speaker.id
        if test is True:
            user_conversations = self.get_user_conversations_chronological_old(corpus, speaker_id)
        else:
            user_conversations = self.get_user_conversations_chronological(corpus, speaker_id)

        candidate_convos = []
        # find the index of the convo, and return the convo id of the 3 prior convos
        for i, convo in enumerate(user_conversations):
            if conversation.id == user_conversations[i].id:
                # Check if there are at least two conversations before the current one
                if i >= 2:
                    candidate_convos.append(user_conversations[i - 2])
                    candidate_convos.append(user_conversations[i - 1])
                elif i == 1:
                    # If only one conversation before, append that one
                    candidate_convos.append(user_conversations[i - 1])

                # Append the current conversation with the change point
                candidate_convos.append(conversation)
                break  # Found the conversation, no need to continue the loop

        return candidate_convos

In [None]:
corpus = Corpus(filename=CORPUS_PATH)

In [18]:
timeline_builder = TimelineBuilder(corpus)
timelines = timeline_builder.build_timelines()

In [47]:
# User to test:
user_id = "HardCoreModerate"
topic = "Economy & Tax"

In [45]:
change_detector = ChangeDetector()
topic_timeline = timelines[user_id][topic]
topic_timeline_list = list(topic_timeline.items())
change_points = change_detector.detect_persistent_changes(topic_timeline_list)['change_points']
groups = change_detector.get_two_groups(timelines)
print(change_points)

['c8jdp06', 'c8n8gvw', 'cd3j9yr', 'cg9y7y4']


In [46]:
window_extractor = WindowExtractor(corpus, timelines)
candidate_convos = window_extractor.get_conversations_around_change_point(change_point=change_points[0], corpus=corpus, test=True)
print(f"Candidate convos: {[convo for convo in candidate_convos]}")

Candidate convos: [Conversation({'obj_type': 'conversation', 'vectors': [], 'tree': None, 'owner': <convokit.model.corpus.Corpus object at 0x7ce1ce496450>, 'id': '18x1dr', 'meta': ConvoKitMeta({'title': "If there weren't so many people volunteering to go to war, would a government have to think harder about whether or not to start them?", 'num_comments': 96, 'domain': 'self.PoliticalDiscussion', 'timestamp': 1361402705, 'subreddit': 'PoliticalDiscussion', 'gilded': 0, 'gildings': None, 'stickied': False, 'author_flair_text': '', 'detected_topic': 'foreign policy and defense', 'topic_confidence': 0.42644694447517395, 'topic_scores': {'foreign policy and defense': 0.42644694447517395, 'media and political commentary': 0.06354590505361557, 'civil rights and social issues': 0.05673328414559364, 'gun rights and control': 0.04096480831503868, 'political figures and campaigns': 0.040524519979953766, 'criminal justice and policing': 0.04010428115725517, 'climate change and energy policy': 0.03

In [None]:
window_extractor.build_global_user_conversations_index()

In [66]:
# This should be a list of tuples, where the second part is the text of the concatenated utterances of a user
op_path_pairer = OpPathPairer(corpus, timelines)
op_path_pairs = op_path_pairer.extract_rooted_path_from_candidate_convos(candidate_convos, user_id)
pair_preprocessor = PairPreprocessor()
preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)
# print(preprocessed_pairs)

for pair in preprocessed_pairs:
    print(100*'===')
    op = pair[0]
    paths = pair[1]
    print(f"OP: {op.speaker.id}, Text: {op.text}")
    for id, text in paths.items():
        print(100*'---')
        print(f"ID: {id}, Text: {text}")
    break

OP: HardCoreModerate, Text: &gt;We don't invest in teachers who are competent

Says who? They have a bachelors degree and have to get certified, how much advanced training does one need to teach K-8? Lets get real here. 

&gt;and on top of that the public education system isn't about teaching kids to think freely and for themselves, it's about learning to operate within the system and make the grade.

Guess what? LIFE is about learning to operate within the system and making the grade. This is why so many people can't hold a job, they want to be independent free thinkers. Well, that's not how our society works. It rewards people who work within the system. 

Obviously we need bright people who think out of the system, but there are programs for these kids once they get identified in school systems. Here in NJ there is a program called heroes that runs conferences for parents &amp; children who are unusually gifted. They find target schools to send them to etc. 
------------------------

In [61]:
candidate_convos[0].print_conversation_structure()

Traxmyth
    cdb03b
        Traxmyth
            TwistedDrum5
        werdna24
            gjhgjh
                werdna24
                Kingsley-Zissou
                ctindel
        bartink
    Elliptical_Tangent
    devilcraft
        IAmA-Steve
            [deleted]
            MazInger-Z
            TuriGuiliano
        [deleted]
            therealpaulyd
                [deleted]
                    anticonventionalwisd
                        zulavos
                tonyray
                    therealpaulyd
                        ninja8ball
                            EonBlueAegis
                                HardCoreModerate
                                    way2lazy2care
                                        HardCoreModerate
                                            sailorbrendan
                                                HardCoreModerate
                                                    sailorbrendan
                                            [deleted]
  

In [None]:
persuasion_analyzer = Interplay()
interplay_features = persuasion_analyzer.calculate_interplay_features()

In [None]:
# Load English stop words
stop_words_set = set(stopwords.words('english'))

# use the groups
groups = change_detector.get_two_groups(timelines)
groups_tuple = (groups['with_changes'], groups['no_changes'])

# Init
i = 0
group_means = [] # Initialize as a list to append means
group_scores = []
utts_num = 0

# For each group
for group in groups_tuple:
    current_group_scores = []

    for user_id, topic_timelines in group.items():
        user_start_time = time.time()
        user_change_points = 0

        for topic_timeline in topic_timelines.values():

            for change_point in topic_timeline.keys(): # Iterate through change points (keys)
                print(f'User: {user_id}, topic: {topic_timeline}, change point {change_point}')
                utts_num += 1

                user_change_points += 1

                # TIME: Window extraction
                start_time = time.time()
                try:
                    candidate_convos = window_extractor.get_conversations_around_change_point(
                        change_point=change_point, corpus=corpus
                    )
                    window_time = time.time() - start_time
                    print(f'⏱️ Window extraction: {window_time:.3f}s')
                except ValueError as e:
                    print(f"Skipping change point {change_point}: {e}")
                    continue


                # TIME: Path extraction
                start_time = time.time()
                op_path_pairs = []
                for candidate_convo in candidate_convos:
                    try:
                        op_path_pairs.extend(op_path_pairer.extract_rooted_path_from_candidate_convos(
                            [candidate_convo], user_id
                        ))
                    except ValueError as e:
                        print(f"Skipping conversation {candidate_convo.id}: {e}")
                        continue
                path_time = time.time() - start_time
                print(f'⏱️ Path extraction: {path_time:.3f}s')


                # TIME: Preprocessing
                start_time = time.time()
                preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)
                preprocess_time = time.time() - start_time
                print(f'⏱️ Preprocessing: {preprocess_time:.3f}s')


                # TIME: Feature extraction
                start_time = time.time()
                features_list = []
                for op, paths in preprocessed_pairs:
                    for k, concatenated_utts in paths.items():
                        interplay_features = persuasion_analyzer.calculate_interplay_features(
                            op.text, concatenated_utts, stop_words_set
                        )
                        features_list.append(interplay_features)
                feature_time = time.time() - start_time
                print(f'⏱️ Feature extraction: {feature_time:.3f}s')


                # TIME: Scoring
                start_time = time.time()
                scores = []
                for interplay_features in features_list:
                    score = persuasion_analyzer.calculate_persuasion_score(interplay_features)
                    scores.append(score)
                scoring_time = time.time() - start_time
                print(f'⏱️ Scoring: {scoring_time:.3f}s')
                total_time = window_time + path_time + preprocess_time + feature_time + scoring_time
                print(f'🔥 TOTAL for change point: {total_time:.3f}s\n')

                # Print total time for this change point
                total_time = window_time + path_time + preprocess_time + feature_time + scoring_time
                print(f'🔥 TOTAL for change point: {total_time:.3f}s\n')

                current_group_scores.extend(scores)

        # TIME: End timing this user
        user_total_time = time.time() - user_start_time
        print(f'👤 USER {user_id} TOTAL: {user_total_time:.3f}s ({user_change_points} change points)')
        print(f'📊 Average per change point: {user_total_time/max(1, user_change_points):.3f}s\n')

    # Calculate mean for this group
    total = 0
    num_of_scores = 0
    for score in current_group_scores: # Iterate through individual scores
        total += score
        num_of_scores += 1

    group_mean = total / num_of_scores if num_of_scores > 0 else 0 # Handle division by zero
    group_means.append(group_mean) # Append mean to the list

# Print the calculated group means
print(f'Group Means: {group_means}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction: 0.000s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.000s

🔥 TOTAL for change point: 0.000s

User: a200ftmonster, topic: {'c34akkc': 'neutral', 'c34w7jd': 'neutral', 'c3bam9p': 'neutral'}, change point c3bam9p
⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.000s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction: 0.000s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.000s

🔥 TOTAL for change point: 0.000s

User: a200ftmonster, topic: {'c35qqmq': 'neutral'}, change point c35qqmq
⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.000s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction: 0.000s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.000s

🔥 TOTAL for change point: 0.000s

User: a200ftmonster, topic: {'c37o6aj': 'neutral', 'c37yfgg': 'neutral'}, change point c37o6aj
⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.000s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction: 0.000s
⏱