In [2]:
pwd

'/Users/leonidas/GitHub/temporal_belief_analysis/notebooks'

In [3]:
# Run twice
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit

In [4]:
# Load a corpus:
corpus = Corpus(filename="/workspace/temporal_belief_analysis/pd_corpus_with_stances100000_chronological")

In [5]:
print(corpus.meta)

ConvoKitMeta({'subreddit': 'PoliticalDiscussion', 'num_posts': 102848, 'num_comments': 4553046})


In [7]:
# This code does filter out utterances with no metadata
from typing import Dict, Any
from collections import defaultdict
import logging

class TimelineBuilder:
    """Simple timeline builder for user belief tracking.

    Builds structure: {user_id: {topic: {utterance_id: stance}}}
    """

    def __init__(self, corpus, min_posts_per_topic: int = 0, min_topics_per_user: int = 0):
        self.corpus = corpus
        self.min_posts_per_topic = min_posts_per_topic
        self.min_topics_per_user = min_topics_per_user
        self.logger = logging.getLogger(__name__)

    def build_timelines(self) -> Dict[str, Dict[str, Dict[str, str]]]:
        """Build user timelines from corpus with stance metadata.

        Returns:
            {user_id: {topic: {utterance_id: stance}}}
        """
        # Group by user and topic
        user_topic_posts = defaultdict(lambda: defaultdict(list))

        for utterance in self.corpus.iter_utterances():
            # Skip if no stance metadata on utterance
            if not utterance.meta or 'detected_stance' not in utterance.meta:
                continue

            # Get topic from conversation metadata
            conversation = utterance.get_conversation()
            if not conversation or not conversation.meta or 'detected_topic' not in conversation.meta:
                continue

            if not utterance.timestamp:
                continue

            user_id = utterance.speaker.id
            topic = conversation.meta['detected_topic']
            stance = utterance.meta['detected_stance']

            user_topic_posts[user_id][topic].append({
                'utterance_id': utterance.id,
                'timestamp': utterance.timestamp,
                'stance': stance
            })

        # Filter and sort
        timelines = {}
        for user_id, topic_posts in user_topic_posts.items():
            user_timeline = {}

            for topic, posts in topic_posts.items():
                if len(posts) >= self.min_posts_per_topic:
                    # Sort chronologically
                    posts.sort(key=lambda x: x['timestamp'])

                    # Create topic timeline
                    topic_timeline = {}
                    for post in posts:
                        topic_timeline[post['utterance_id']] = post['stance']

                    user_timeline[topic] = topic_timeline

            # Only include users with enough topics
            if len(user_timeline) >= self.min_topics_per_user:
                timelines[user_id] = user_timeline

        self.logger.info(f"Built timelines for {len(timelines)} users")
        return timelines

user_topic_posts = {
    "user_123": {
        "healthcare": [
            {'utterance_id': 'utterance_12345_reddit', 'timestamp': '2023-01-01', 'stance': 'moderately_in_favor'},
            {'utterance_id': 'utterance_67890_reddit', 'timestamp': '2023-01-15', 'stance': 'strongly_against'}
        ],
        "education": [
            {'utterance_id': 'utterance_32890_reddit', 'timestamp': '2023-02-01', 'stance': 'strongly_in_favor'}
        ]
    }
}

timelines = {
    "user_1": {
        "healthcare": {
            'utterance_12345_reddit': 'strongly_in_favor',
            'utterance_67890_reddit': 'strongly_against'
        },
        "education": {
            'utterance_32890_reddit': 'moderately_in_favor'
        }
    }
    "user_2": {
        "taxation": {
            'utterance_12355_reddit': 'strongly_in_favor',
            'utterance_67830_reddit': 'strongly_in_favor'
        },
        "taxation": {
            'utterance_32290_reddit': 'moderately_in_favor'
    .
    .
    .

}

In [8]:
builder = TimelineBuilder(corpus, 0, 0)  # min_topics_per_user
timelines = builder.build_timelines()

In [9]:
# How do get number of utterances in this topic?
# Maybe loop through and find for which one he has the most posts? or print the one with the most utterances yea.
print(len(list(timelines['HardCoreModerate']['media and political commentary'])))

145


In [38]:
# Get the first user
first_user = list(timelines.keys())[2]

# Print their complete timeline across all topics
print(f"Timeline for user: {first_user}")
for topic, posts in timelines[first_user].items():
   print(f"\n  Topic: {topic}")
   for utterance_id, stance in posts.items():
       print(f"    {utterance_id}: {stance}")

Timeline for user: [deleted]

  Topic: climate change and energy policy
    c27e03w: neutral
    c27o2yl: neutral
    nyu5u: moderately_against

  Topic: media and political commentary
    gq004: unknown
    c1pk4gc: moderately_against
    c1pt5h6: moderately_favor
    c1ptgcy: moderately_against
    c1ptq7w: moderately_against
    c1pvhwh: moderately_against
    c1pvu5h: moderately_against
    c1pwfg4: unknown
    c1pwryi: moderately_against
    c1qvzwa: neutral
    c1t3f88: moderately_against
    c1t5q5y: moderately_against
    c1t8gg4: moderately_against
    h9oq6: strongly_against
    c1to9to: unknown
    c1ttf3v: unknown
    c1ttghq: unknown
    c1ttglt: unknown
    hhgc7: moderately_against
    hm96v: moderately_favor
    c1wjidt: moderately_against
    c1wjmpt: moderately_against
    c1wjut2: moderately_against
    c1wkgas: moderately_against
    c1wkgj1: moderately_against
    hp4u4: moderately_favor
    c1xagfv: moderately_against
    c1xovhl: moderately_against
    c1xrx8a: m

In [1]:
print(f"Number of users in timelines: {len(timelines)}")

NameError: name 'timelines' is not defined

In [35]:
print(f"Total utterances in corpus: {len(list(corpus.iter_utterances()))}")


Total utterances in corpus: 4655894


In [12]:
count_with_metadata = 0
for utterance in corpus.iter_utterances():
    if (utterance.meta and
        'detected_stance' in utterance.meta and
        'detected_topic' in utterance.meta):
        count_with_metadata += 1

print(f"Utterances with stance/topic metadata: {count_with_metadata}")


Utterances with stance/topic metadata: 0


In [23]:
utts = list(corpus.iter_utterances())
print(utts[0].meta)

ConvoKitMeta({'score': 0, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'PoliticalDiscussion', 'stickied': False, 'permalink': '/r/PoliticalDiscussion/comments/nz333/if_austrian_economics_is_so_wonderful_why_dont/', 'author_flair_text': '', 'detected_stance': 'unknown', 'stance_confidence': 0.0})


In [36]:
# Count total utterances in your built timelines
total_timeline_utterances = 0
for user_id, user_timeline in timelines.items():
    for topic, topic_posts in user_timeline.items():
        total_timeline_utterances += len(topic_posts)

print(f"Total utterances in timelines: {total_timeline_utterances}")
# Should be close to 1000 (your test batch)

Total utterances in timelines: 1050
