In [None]:
# Need to restart after:
!pip install convokit

In [None]:
# Download file from Google Drive to colab directory
!pip install gdown
file_id = "1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0"
!gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip" --fuzzy

In [3]:
# Unzip with python:
import zipfile

# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip") as archive:
    archive.extractall("/content/temporal_belief_analysis")

In [4]:
# Environment setup: switch to the notebooks directory and expose ../src on sys.path
# so the temporal_belief package imported further down in this cell can be found.
# NOTE(review): the original note says "For runpod-jupyter or local (run twice)";
# why a second run is needed is not evident from this cell -- confirm.
import sys
import os

# Change to the correct working directory (workspace if runpod, content if colab).
# The path below is the colab one.
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to the src directory, resolved relative to the directory set above.
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
# Guard keeps sys.path free of duplicates when the cell is re-run.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Comment out if in colab:
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.change_detection import ChangeDetector
from temporal_belief.core.WindowExtraction import WindowExtractor
from temporal_belief.core.OpPathPairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor

Changed working directory to: /content/temporal_belief_analysis/notebooks


In [5]:
# Run twice
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit

Error from Unsloth: NotImplementedError: Unsloth currently only works on NVIDIA GPUs and Intel GPUs.



Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


In [6]:
# Load a corpus from disk.
# Local alternative used previously:
# corpus = Corpus(filename="/Users/leonidas/.convokit/saved-corpora/pd_corpus_with_stances1000_chronological")
# NOTE(review): the archive downloaded earlier in this notebook is named
# pd_corpus_with_topics10000_chronological.zip -- confirm it actually provides
# the pd_corpus_with_stances100000_chronological directory loaded here.
corpus = Corpus(filename="/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological")

No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem


In [7]:
# Print the corpus-level metadata as a quick check of what was loaded.
print(corpus.meta)

ConvoKitMeta({'subreddit': 'PoliticalDiscussion', 'num_posts': 102848, 'num_comments': 4553046})


In [8]:
# Test timeline builder:
# Both thresholds are 0 in this call; the stricter limits are passed to the
# preprocessor's filter below instead.
timeline_builder = TimelineBuilder(corpus, min_posts_per_topic=0, min_topics_per_user=0)
timelines = timeline_builder.build_timelines()

# Filter for analysis (at least 5 posts per topic and 2 topics per user are requested)
change_detector_preprocessor = ChangeDetectorPreprocessor()
filtered_timelines = change_detector_preprocessor.filter_for_change_detection(timelines, min_posts_per_topic=5, min_topics_per_user=2)

# Get a specific user's timeline for a specific topic.
# NOTE(review): user and topic are hard-coded; presumably this lookup fails if the
# pair did not survive the filter -- confirm against filter_for_change_detection.
user_id = "HardCoreModerate"
topic = "media and political commentary"
topic_timeline = filtered_timelines[user_id][topic]  # This is {utterance_id: stance}

# Convert to list of (utterance_id, stance) tuples
topic_timeline_list = list(topic_timeline.items())

2025-08-08 14:45:24,183 - temporal_belief.core.timeline_building - INFO - timeline_building:71 - Built timelines for 4781 users
INFO:temporal_belief.core.timeline_building:Built timelines for 4781 users


In [9]:
# Test the change detector on the single user/topic timeline list built earlier.
persistence_detector = ChangeDetector()
# Later cells read index 1 of a change point and pass it to corpus.get_utterance,
# i.e. they treat that element as an utterance id.
change_points = persistence_detector.detect_persistent_changes(topic_timeline_list)

Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: strongly_against and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:neutral, Previous: moderately_against and Next:neutral
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_against, Previous: strongly_against and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: neutral and Next:moderately_against
Current:moderately_favor, Previous: moderately_against and Next:moderately_favor
Current:moderately_against, Previous: moderately_favor and Next:moderately_against
Current:moderately_against, Previous: moderately_favor and Nex

In [10]:
# Test the window extractor:
# Note: this is given the unfiltered `timelines`, not `filtered_timelines`.
window_extractor = WindowExtractor(corpus, timelines=timelines)
candidate_convos = window_extractor.get_conversations_around_change_point(change_points=change_points, corpus=corpus)
# Print the id of every candidate conversation that was returned.
for convo in candidate_convos:
  print(f'ID:{convo.id}')

ID:muccw
ID:mv2yv
ID:mv3ou


In [59]:
import re

class PairPreprocessor:
    """Text preprocessing helpers for (OP utterance, reply paths) pairs.

    A "pair" is a 2-tuple: the first element is the OP utterance and the second is
    a dict mapping a path key to a list of utterances. The only attribute read from
    an utterance is its ``.text`` string.
    """

    def tokenize_quotes(self, utterance_text):
        """Replace every quoted line of ``utterance_text`` with ``[QUOTE]``.

        Each line is stripped of surrounding whitespace; a line that then starts
        with ``>`` or its HTML-escaped form ``&gt;`` is replaced as a whole.

        Returns:
            The processed lines re-joined with newlines.
        """
        processed_lines = []
        for line in utterance_text.split('\n'):
            line = line.strip()
            if line.startswith(('&gt;', '>')):
                processed_lines.append('[QUOTE]')
            else:
                processed_lines.append(line)

        return '\n'.join(processed_lines)

    def concatenate_path(self, paths):
        """Collapse each path into one space-separated, quote-tokenized string.

        Args:
            paths: dict of ``{path_key: [utterance, ...]}``.

        Returns:
            dict of ``{path_key: str}``; an empty path maps to ``''``.
        """
        concatenated_paths = {}
        for key, utt_list in paths.items():
            quoted_texts = (self.tokenize_quotes(utt.text) for utt in utt_list)
            concatenated_paths[key] = ' '.join(quoted_texts).strip()
        return concatenated_paths

    @staticmethod
    def tokenize_and_lower(op_text, reply_path_text, stop_words_set=None):
        """Lower-case both texts and split them on whitespace.

        Declared static because it was written without ``self``; this makes the
        call from ``clean_and_tokenize`` work while ``PairPreprocessor.tokenize_and_lower(...)``
        keeps working. ``stop_words_set`` is unused and only accepted for backward
        compatibility.

        Returns:
            ``(op_words, reply_words)`` as two lists of tokens.
        """
        op_words = op_text.lower().split()
        reply_words = reply_path_text.lower().split()

        return (op_words, reply_words)

    @staticmethod
    def remove_punctuation(op_text, reply_path_text):
        """Strip punctuation from both texts.

        The pattern keeps letters, numbers, underscores, whitespace and apostrophes
        (for contractions); every other character is removed.

        Returns:
            ``(op_text, reply_path_text)`` with punctuation removed.
        """
        op_text = re.sub(r"[^\w\s']", '', op_text)
        reply_path_text = re.sub(r"[^\w\s']", '', reply_path_text)

        return op_text, reply_path_text

    def remove_quotes_from_all(self, op_path_pairs):
        """Quote-tokenize the OP text and every path utterance of each pair.

        Returns:
            A list of ``(op_text, processed_paths)`` tuples where ``processed_paths``
            is a list (one entry per path, keys dropped) of lists of processed texts.
        """
        marked_pairs = []
        for op_path_pair in op_path_pairs:
            # Process the OP utterance
            op_text = self.tokenize_quotes(op_path_pair[0].text)

            # Process each utterance path
            processed_paths = [
                [self.tokenize_quotes(utt.text) for utt in utterances]
                for utterances in op_path_pair[1].values()
            ]

            marked_pairs.append((op_text, processed_paths))

        return marked_pairs

    def concatenate_path_in_pair(self, pair):
        """Return ``(op, concatenated_paths)`` for one ``(op, paths)`` pair.

        The OP element is passed through unchanged.
        """
        op = pair[0]
        paths = pair[1]

        return (op, self.concatenate_path(paths))

    def concatenate_path_in_all_pairs(self, op_path_pairs):
        """Apply ``concatenate_path_in_pair`` to every pair and return the list."""
        return [self.concatenate_path_in_pair(pair) for pair in op_path_pairs]

    def clean_and_tokenize(self, op_text, reply_path_text):
        """Remove punctuation, then lower-case and tokenize both texts.

        Returns:
            ``(op_words, reply_words)`` as two lists of tokens.
        """
        # Step 1: Remove punctuation
        op_text, reply_path_text = self.remove_punctuation(op_text, reply_path_text)

        # Step 2: Tokenize and lowercase
        op_words, reply_words = self.tokenize_and_lower(op_text, reply_path_text)

        return op_words, reply_words

In [65]:
def calculate_interplay_features(op_text, reply_path_text, stop_words_set):
    """Compute the 12 word-overlap ("interplay") features of an OP/reply-path pair.

    Both texts have every character other than word characters, whitespace and
    apostrophes removed, are lower-cased and split on whitespace. The two resulting
    vocabularies are then compared three times: over all words, over stop words
    only, and over content (non-stop) words only.

    Args:
        op_text: Text of the original post.
        reply_path_text: Concatenated text of the reply path.
        stop_words_set: Container of stop words; only membership tests against
            the lower-cased tokens are performed on it.

    Returns:
        dict with the keys ``common_words_<t>``, ``sim_frac_reply_<t>``,
        ``sim_frac_op_<t>`` and ``jaccard_<t>`` for ``<t>`` in ``all``, ``stop``
        and ``content``. A ratio whose denominator would be zero is reported as 0.
    """
    def vocabulary(text):
        # Strip punctuation first, then lower-case and tokenize into a set.
        stripped = re.sub(r"[^\w\s']", '', text)
        return set(stripped.lower().split())

    def split_by_stopword(vocab):
        # Partition a vocabulary into (stop words, content words).
        stop, content = set(), set()
        for word in vocab:
            if word in stop_words_set:
                stop.add(word)
            else:
                content.add(word)
        return stop, content

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0

    op_vocab = vocabulary(op_text)
    reply_vocab = vocabulary(reply_path_text)
    op_stop, op_content = split_by_stopword(op_vocab)
    reply_stop, reply_content = split_by_stopword(reply_vocab)

    features = {}
    for label, op_group, reply_group in (
        ('all', op_vocab, reply_vocab),
        ('stop', op_stop, reply_stop),
        ('content', op_content, reply_content),
    ):
        shared = len(op_group & reply_group)
        combined = len(op_group | reply_group)

        # Four metrics per word type, inserted in a fixed order.
        features[f'common_words_{label}'] = shared
        features[f'sim_frac_reply_{label}'] = ratio(shared, len(reply_group))
        features[f'sim_frac_op_{label}'] = ratio(shared, len(op_group))
        features[f'jaccard_{label}'] = ratio(shared, combined)

    return features

In [66]:
# Test the OP and path pairer:
# Index 1 of the first change point is used as an utterance id; its speaker is the
# user whose rooted paths are extracted.
utt_id = change_points[0][1]
user_id = corpus.get_utterance(utt_id).speaker.id
op_path_pairer = OpPathPairer(corpus, timelines=timelines)

# List of (op_utterance, {path_key: [utterances]}) tuples:
op_path_pairs = op_path_pairer.extract_rooted_path_from_candidate_convos(candidate_convos, user_id)

# Dump every pair: the OP id first, then the texts of each path.
for op_path_pair in op_path_pairs:
    print(1000*'=')
    print(f'\nop: {op_path_pair[0].id}\n')
    for path, utterances in op_path_pair[1].items():
        print(f'path: {path}, utterances: {[utt.text for utt in utterances]}\n')

my input user_id: HardCoreModerate
Utt_id: c344cv7 and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344ch2 and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344lkp and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c34bomu and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c3443ch and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344ksk and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c344801 and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c3449zr and user_id: HardCoreModerate in the list of all op utterances.
my input user_id: HardCoreModerate
Utt_id: c345pi7 and user_id: HardCoreModerate

In [67]:
# Inspect the (OP utterance, paths) pair stored at index 3 of op_path_pairs.
print(op_path_pairs[3])

(Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x7eba74a27b90>, 'id': 'HardCoreModerate', 'meta': ConvoKitMeta({})}), 'owner': <convokit.model.corpus.Corpus object at 0x7eba74a27b90>, 'id': 'c34bomu', 'meta': ConvoKitMeta({'score': 1, 'top_level_comment': 'c342t0k', 'retrieved_on': 1427956771, 'gilded': 0, 'gildings': None, 'subreddit': 'PoliticalDiscussion', 'stickied': False, 'permalink': '', 'author_flair_text': '', 'detected_stance': 'moderately_favor', 'stance_confidence': 0.8944957256317139, 'stance_scores': {'strongly_favor': 0.3542829751968384, 'moderately_favor': 0.8944957256317139, 'neutral': 0.14797292401393256, 'moderately_against': 0.7100401421387991, 'strongly_against': 0.2817333862185478}})}), {})


In [68]:
# Test the preprocessor:
pair_preprocessor = PairPreprocessor()
pair = op_path_pairs[3]
# The preprocessor takes a tuple whose second element is a dict of
# {path_key: [utterances]}.

# NOTE(review): open question left by the author -- why does the iteration below
# work on this pair? Kept for reference:
# for k, v in pair[1].items():
#     print(k, v)

preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)

# `pair` is rebound by this loop; the assignment above only serves the
# commented-out check.
for pair in preprocessed_pairs:
    for k, utt_text in pair[1].items():
        print(1000*'=')
        print(f'Text: {utt_text}\n')

# TODO: now do it for all pairs.

# TODO: don't forget to remove '[deleted]' (and similar) utterances.
# Open question: can they be kept and analysed somehow?
# Possibly yes, as long as they stay within the same path.


Text: Ok, I'll bite.. Link to podcast please? Great. So far I like your ideas. I'll check it out.

Text: I didn't think anyone would bite lol...

http://urbannerd.com/category/podcast/


Text: [deleted] [deleted] [deleted] [deleted] [deleted]

Text: I do not work in politics. I don't think you need to work in politics to create change. He wants to change the direction of politics. That means he must get active. To not do so would be the definition of lazy, would it not? I think I will have to disagree. If you don't like public policy, but complain about public policy, and instead of doing something about it, would rather just flee, then yes I think that is lazy.

Every generation thinks we are going to hell in a handbasket. That is nothing new. We need to start teaching youth to stop sensationalizing things and instead to analyze them and get involved. I fear that sites like reddit encourage a FOX News-like knee jerk reactive &amp; lazy liberal youth.

Also, you are talking about grass

In [69]:
# Hand-written stop-word list passed to calculate_interplay_features, which uses it
# to split tokens into stop words and content words.
# Several entries are repeated ('that', 'would', 'will', 'same'); this is harmless
# because the literal is a set.
# NOTE(review): the list also contains topical words such as 'public', 'important'
# and 'work', which are therefore never counted as content words -- confirm this is
# intended or replace the list with a standard stop-word list.
stop_words_set = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with', 'would', 'she', 'her', 'him',
    'his', 'have', 'had', 'you', 'your', 'yours', 'we', 'our', 'ours',
    'they', 'them', 'their', 'theirs', 'i', 'me', 'my', 'mine', 'this',
    'these', 'that', 'those', 'am', 'been', 'being', 'do', 'does', 'did',
    'doing', 'can', 'could', 'should', 'would', 'may', 'might', 'must',
    'shall', 'will', 'about', 'above', 'after', 'again', 'against', 'all',
    'any', 'because', 'before', 'below', 'between', 'both', 'but', 'down',
    'during', 'each', 'few', 'further', 'here', 'how', 'if', 'into', 'more',
    'most', 'no', 'nor', 'not', 'now', 'only', 'or', 'other', 'out', 'over',
    'own', 'same', 'so', 'some', 'such', 'than', 'then', 'there', 'through',
    'too', 'under', 'until', 'up', 'very', 'what', 'when', 'where', 'which',
    'while', 'who', 'why', 'once', 'off', 'just', 'also', 'get', 'got',
    'go', 'goes', 'going', 'gone', 'come', 'came', 'coming', 'take', 'took',
    'taken', 'taking', 'make', 'made', 'making', 'see', 'saw', 'seen',
    'seeing', 'know', 'knew', 'known', 'knowing', 'say', 'said', 'saying',
    'says', 'think', 'thought', 'thinking', 'thinks', 'use', 'used', 'using',
    'uses', 'want', 'wanted', 'wanting', 'wants', 'need', 'needed', 'needing',
    'needs', 'give', 'gave', 'given', 'giving', 'gives', 'like', 'liked',
    'liking', 'likes', 'look', 'looked', 'looking', 'looks', 'way', 'ways',
    'time', 'times', 'work', 'worked', 'working', 'works', 'day', 'days',
    'year', 'years', 'back', 'good', 'new', 'first', 'last', 'long', 'great',
    'little', 'right', 'old', 'high', 'different', 'small', 'large', 'next',
    'early', 'young', 'important', 'public', 'bad', 'same', 'able'
}

In [70]:
# Score every (OP, reply path) combination: one feature dict per path.
scores = []
for op, paths in preprocessed_pairs:
    for path_key, concatenated_utts in paths.items():
        # Append inside the inner loop so that every path is recorded and a pair
        # without paths contributes nothing (instead of re-appending a stale result).
        features = calculate_interplay_features(op.text, concatenated_utts, stop_words_set)
        scores.append(features)

# Show the first feature dict as a sanity check.
if scores:
    print(scores[0])

{'common_words_all': 47, 'sim_frac_reply_all': 0.22488038277511962, 'sim_frac_op_all': 0.281437125748503, 'jaccard_all': 0.14285714285714285, 'common_words_stop': 37, 'sim_frac_reply_stop': 0.5138888888888888, 'sim_frac_op_stop': 0.6065573770491803, 'jaccard_stop': 0.3854166666666667, 'common_words_content': 10, 'sim_frac_reply_content': 0.072992700729927, 'sim_frac_op_content': 0.09433962264150944, 'jaccard_content': 0.04291845493562232}


In [None]:
# TODO: list everything that still needs to be tidied up.
# TODO: score paths properly according to what was actually said in them.
# TODO: find the earlier chat that provided the interplay code and, in general, tighten up the interplay code.
# TODO: look into whether more preprocessing is needed.
# TODO: find a proper stop-word list.
# TODO: make the pipeline run on the entire dataset.