In [None]:
# Need to restart after:
!pip install convokit

In [1]:
# Download file from Google Drive to colab directory
!pip install gdown
file_id = "1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0"
!gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip" --fuzzy

Downloading...
From (original): https://drive.google.com/uc?id=1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0
From (redirected): https://drive.google.com/uc?id=1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0&confirm=t&uuid=dec8e496-9995-4457-9c6f-cc93932fa365
To: /content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip
100% 841M/841M [00:30<00:00, 27.2MB/s]


In [2]:
# Unzip with python:
import zipfile
zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip").extractall("/content/temporal_belief_analysis")

In [3]:
# For runpod-jupyter or local (run twice)
import sys
import os

# Change to the correct working directory (workspace if runpod, content if colab)
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Comment out if in colab:
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.change_detection import ChangeDetector
from temporal_belief.core.WindowExtraction import WindowExtractor
from temporal_belief.core.OpPathPairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor

Changed working directory to: /content/temporal_belief_analysis/notebooks


In [4]:
# Run twice
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit

Error from Unsloth: NotImplementedError: Unsloth currently only works on NVIDIA GPUs and Intel GPUs.



Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


In [5]:
# Load a corpus:
# corpus = Corpus(filename="/Users/leonidas/.convokit/saved-corpora/pd_corpus_with_stances1000_chronological")
corpus = Corpus(filename="/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological")

No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem


In [6]:
print(corpus.meta)

ConvoKitMeta({'subreddit': 'PoliticalDiscussion', 'num_posts': 102848, 'num_comments': 4553046})


In [7]:
# Test timeline builder:
timeline_builder = TimelineBuilder(corpus, min_posts_per_topic=0, min_topics_per_user=0)
timelines = timeline_builder.build_timelines()

# Filter for analysis
change_detector_preprocessor = ChangeDetectorPreprocessor()
filtered_timelines = change_detector_preprocessor.filter_for_change_detection(timelines, min_posts_per_topic=5, min_topics_per_user=2)

# Get a specific user's timeline for a specific topic
user_id = "HardCoreModerate"
topic = "media and political commentary"
topic_timeline = filtered_timelines[user_id][topic]  # This is {utterance_id: stance}

# Convert to list of tuples
topic_timeline_list = list(topic_timeline.items())

2025-08-09 11:08:42,211 - temporal_belief.core.timeline_building - INFO - timeline_building:71 - Built timelines for 4781 users
INFO:temporal_belief.core.timeline_building:Built timelines for 4781 users


In [8]:
# Test the change detector:
persistence_detector = ChangeDetector()
change_points = persistence_detector.detect_persistent_changes(topic_timeline_list)

In [9]:
# Test the window extractor:
window_extractor = WindowExtractor(corpus, timelines=timelines)
candidate_convos = window_extractor.get_conversations_around_change_point(change_points=change_points, corpus=corpus)
for convo in candidate_convos:
  print(f'ID:{convo.id}')

ID:muccw
ID:mv2yv
ID:mv3ou


In [10]:
import re

class PairPreprocessor:

    def tokenize_quotes(self, utterance_text):
        lines = utterance_text.split('\n')
        processed_lines = []

        for line in lines:
            line = line.strip()
            if line.startswith('&gt;') or line.startswith('>'):
                processed_lines.append('[QUOTE]')
            else:
                processed_lines.append(line)

        return '\n'.join(processed_lines)

    def concatenate_path(self, paths):
        concatenated_paths = {}
        for key, utt_list in paths.items():
            path_text = ''
            for utt in utt_list:
                utt_text_quoted = self.tokenize_quotes(utt.text)
                path_text += utt_text_quoted + ' '
            concatenated_paths[key] = path_text.strip()
        return concatenated_paths

    def tokenize_and_lower(op_text, reply_path_text, stop_words_set):
        op_words = op_text.lower().split()
        reply_words = reply_path_text.lower().split()

        return (op_words, reply_words)

    # This pattern keeps letters, numbers, whitespace, and apostrophes (for contractions)
    def remove_punctuation(op_text, reply_path_text):
        op_text = re.sub(r"[^\w\s']", '', op_text)
        reply_path_text = re.sub(r"[^\w\s']", '', reply_path_text)

        return op_text, reply_path_text

    def remove_quotes_from_all(self, op_path_pairs):
        marked_pairs = []
        for op_path_pair in op_path_pairs:
            # Process the OP utterance
            op_text = self.tokenize_quotes(op_path_pair[0].text)

            # Process each utterance path
            processed_paths = []
            for utterances in op_path_pair[1].values():
                path = [self.tokenize_quotes(utt.text) for utt in utterances]
                processed_paths.append(path)

            marked_pairs.append((op_text, processed_paths))

        return marked_pairs

    def concatenate_path_in_pair(self, pair):
        op = pair[0]
        paths = pair[1]

        concatenated_paths = self.concatenate_path(paths)

        return (op, concatenated_paths)

    def concatenate_path_in_all_pairs(self, op_path_pairs):
        # op_path_pairs_quoted = self.remove_quotes_from_all(op_path_pairs)
        preprocessed_pairs = []
        for pair in op_path_pairs:
            pair = self.concatenate_path_in_pair(pair)
            preprocessed_pairs.append(pair)

        return preprocessed_pairs

    def clean_and_tokenize(self, op_text, reply_path_text):
        # Step 1: Remove punctuation
        op_text, reply_path_text = self.remove_punctuation(op_text, reply_path_text)

        # Step 2: Tokenize and lowercase
        op_words, reply_words = self.tokenize_and_lower(op_text, reply_path_text)

        return op_words, reply_words

In [12]:
def calculate_interplay_features(op_text, reply_path_text, stop_words_set):
    """Calculate 12 interplay features between OP and reply path."""

    # Remove punctuation
    op_text = re.sub(r"[^\w\s']", '', op_text)
    reply_path_text = re.sub(r"[^\w\s']", '', reply_path_text)

    # Tokenize and clean
    op_words = op_text.lower().split()
    reply_words = reply_path_text.lower().split()

    # Create word sets
    op_all = set(op_words)
    reply_all = set(reply_words)
    op_stop = set(w for w in op_words if w in stop_words_set)
    reply_stop = set(w for w in reply_words if w in stop_words_set)
    op_content = set(w for w in op_words if w not in stop_words_set)
    reply_content = set(w for w in reply_words if w not in stop_words_set)

    # Calculate 4 metrics for each word type
    features = {}

    for word_type, (op_set, reply_set) in [
        ('all', (op_all, reply_all)),
        ('stop', (op_stop, reply_stop)),
        ('content', (op_content, reply_content))
    ]:
        intersection = len(op_set & reply_set)
        union = len(op_set | reply_set)

        features[f'common_words_{word_type}'] = intersection
        features[f'sim_frac_reply_{word_type}'] = intersection / len(reply_set) if reply_set else 0
        features[f'sim_frac_op_{word_type}'] = intersection / len(op_set) if op_set else 0
        features[f'jaccard_{word_type}'] = intersection / union if union else 0

    return features

In [59]:
def calculate_persuasion_score(interplay_features):
    """
    Calculate persuasion score based on Tan et al.'s CMV findings.
    Higher scores indicate higher persuasion likelihood.
    """

    # Extract the key predictive features
    reply_frac_content = interplay_features.get('sim_frac_reply_content', 0)
    jaccard_content = interplay_features.get('jaccard_content', 0)
    op_frac_stop = interplay_features.get('sim_frac_op_stop', 0)
    reply_frac_all = interplay_features.get('sim_frac_reply_all', 0)

    # Apply their findings (↓↓↓↓ means negative correlation, ↑↑↑↑ means positive)
    score = 0

    # Strongest predictor: less content word similarity → more persuasive
    score += (1 - reply_frac_content) * 0.4  # Weight of 0.4 for strongest predictor

    # Less content overlap → more persuasive
    score += (1 - jaccard_content) * 0.3     # Weight of 0.3

    # More stopword similarity → more persuasive
    score += op_frac_stop * 0.2              # Weight of 0.2

    # Less overall similarity → more persuasive
    score += (1 - reply_frac_all) * 0.1      # Weight of 0.1

    return score

In [13]:
# Test the OP and path pairer:
utt_id = change_points[0][1]
user_id = corpus.get_utterance(change_points[0][1]).speaker.id
op_path_pairer = OpPathPairer(corpus, timelines=timelines)

# List of tuples:
op_path_pairs = op_path_pairer.extract_rooted_path_from_candidate_convos(candidate_convos, user_id)

# def print_user_path_utterances(utterances):
#   for i, utt in enumerate(utterances):
#     print(f'{i}, {utt.text}\n')

for op_path_pair in op_path_pairs:
  print(1000*'=')
  print(f'\nop: {op_path_pair[0].id}\n')
  for path, utterances in op_path_pair[1].items():
    print(f'path: {path}, utterances: {[utt.text for utt in utterances]}\n')


op: c344cv7


op: c344ch2


op: c344lkp


op: c3443ch

path: [deleted]_path_0, utterances: ['[deleted]', '[deleted]']

path: HardCoreModerate_path_0, utterances: ["I do not work in politics. I don't think you need to work in politics to create change. He wants to change the direction of politics. That means he must get active. To not do so would be the definition of lazy, would it not? ", "I think I will have to disagree. If you don't like public policy, but complain about public policy, and instead of doing something about it, would rather just flee, then yes I think that is lazy. \n\nEvery generation thinks we are going to hell in a handbasket. That is nothing new. We need to start teaching youth to stop sensationalizing things and instead to analyze them and get involved. I fear that sites like reddit encourage a FOX News-like knee jerk reactive &amp; lazy liberal youth. \n\nAlso, you are talking about grassroots movements nationwide, I am talking about getting involved locally. Th

In [14]:
print(op_path_pairs[3])

(Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x7b84f22532d0>, 'id': 'HardCoreModerate', 'meta': ConvoKitMeta({})}), 'owner': <convokit.model.corpus.Corpus object at 0x7b84f22532d0>, 'id': 'c3443ch', 'meta': ConvoKitMeta({'score': 18, 'top_level_comment': 'c3443ch', 'retrieved_on': 1427953119, 'gilded': 0, 'gildings': None, 'subreddit': 'PoliticalDiscussion', 'stickied': False, 'permalink': '', 'author_flair_text': '', 'detected_stance': 'moderately_against', 'stance_confidence': 0.517206738392512, 'stance_scores': {'strongly_favor': 0.013105638635655245, 'moderately_favor': 0.3341254343589147, 'neutral': 0.07730480283498764, 'moderately_against': 0.517206738392512, 'strongly_against': 0.041570129338651896}})}), {'[deleted]_path_0': [Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': <convokit.model.corpus.

In [15]:
# Test the preprocessor:
pair_preprocessor = PairPreprocessor()
pair = op_path_pairs[3]
# So it should take a tuple, where the second part of the tuple is a dictionary of path_key, list of utterance pairs

# WHY THIS WORKS????
# for k, v in pair[1].items():
#     print(k, v)

preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)

for pair in preprocessed_pairs:
    for k, utt_text in pair[1].items():
        print(1000*'=')
        print(f'Text: {utt_text}\n')

# Now do it for all pairs

# DONT FORGET TO REMOVE DELETED ETC.
# SHIT BUT CAN I KEEP THEM TO ANALYSE THEM SOMEHOW?
# AS LONG AS THEY ARE IN THE SAME PATH I THINK I CAN KEEP THEM?


Text: [deleted] [deleted]

Text: I do not work in politics. I don't think you need to work in politics to create change. He wants to change the direction of politics. That means he must get active. To not do so would be the definition of lazy, would it not? I think I will have to disagree. If you don't like public policy, but complain about public policy, and instead of doing something about it, would rather just flee, then yes I think that is lazy.

Every generation thinks we are going to hell in a handbasket. That is nothing new. We need to start teaching youth to stop sensationalizing things and instead to analyze them and get involved. I fear that sites like reddit encourage a FOX News-like knee jerk reactive &amp; lazy liberal youth.

Also, you are talking about grassroots movements nationwide, I am talking about getting involved locally. These are 2 different things. Even if he moves abroad, he should be involved locally, and in fact will have to if he will ever have a chance at 

In [33]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Load English stop words
stop_words_set = set(stopwords.words('english'))

print(len(stop_words_set))
print(list(stop_words_set)[:20])  # Show first 20 stop words

198
["didn't", "we've", 'herself', 'were', 'again', 'by', 'shan', 'this', "we'll", 'shouldn', 'to', 'because', 'hasn', "wasn't", 'here', 'they', 'through', 'if', 'will', 'between']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
# Test interplay features extraction
features_list = []
for op, paths in preprocessed_pairs:
    for k, concatenated_utts in paths.items():
        interplay_features = calculate_interplay_features(op.text, concatenated_utts, stop_words_set)
        features_list.append(interplay_features)

for interplay_features in features_list:
    print(interplay_features)

{'common_words_all': 0, 'sim_frac_reply_all': 0.0, 'sim_frac_op_all': 0.0, 'jaccard_all': 0.0, 'common_words_stop': 0, 'sim_frac_reply_stop': 0, 'sim_frac_op_stop': 0.0, 'jaccard_stop': 0.0, 'common_words_content': 0, 'sim_frac_reply_content': 0.0, 'sim_frac_op_content': 0.0, 'jaccard_content': 0.0}
{'common_words_all': 55, 'sim_frac_reply_all': 0.4365079365079365, 'sim_frac_op_all': 0.23809523809523808, 'jaccard_all': 0.18211920529801323, 'common_words_stop': 32, 'sim_frac_reply_stop': 0.8205128205128205, 'sim_frac_op_stop': 0.47058823529411764, 'jaccard_stop': 0.4266666666666667, 'common_words_content': 23, 'sim_frac_reply_content': 0.26436781609195403, 'sim_frac_op_content': 0.1411042944785276, 'jaccard_content': 0.1013215859030837}
{'common_words_all': 65, 'sim_frac_reply_all': 0.32338308457711445, 'sim_frac_op_all': 0.2813852813852814, 'jaccard_all': 0.1771117166212534, 'common_words_stop': 42, 'sim_frac_reply_stop': 0.7, 'sim_frac_op_stop': 0.6176470588235294, 'jaccard_stop': 0.4

In [70]:
# Test interplay scoring
scores = []
for interplay_features in features_list:
    score = calculate_persuasion_score(interplay_features)
    scores.append(score)

print(scores)

[0.7999999999999999, 0.7143232512003231, 0.8013877162142911, 0.6374143142332578, 0.5365569228786804, 0.6374143142332578, 0.720804325160596, 0.7126425570228091, 0.865419356456039, 0.8494429211333775, 0.7661277295008913, 0.7018592214898645, 0.7426964401822087, 0.7999999999999999, 0.7143232512003231, 0.632953027364792, 0.6861296242634984, 0.7637032360439859, 0.7040075120957473, 0.7999999999999999, 0.8294804903075728, 0.7033945370055538, 0.6423529411764706, 0.7075121840643772, 0.7832832501357317, 0.7713659305041733, 0.661607843137255, 0.7194536241462878, 0.6703145337737084, 0.7261863516906513, 0.6654980121318101, 0.7999999999999999, 0.8054213106295149, 0.7110688017579186, 0.7999999999999999, 0.8294804903075728, 0.6997746598639456, 0.7546045334488353, 0.7152603731551099, 0.8739133353213501, 0.8877298086909815, 0.7999999999999999, 0.9003314686482425, 0.9196638691043264, 0.7999999999999999, 0.9003314686482425, 0.8357142857142856, 0.7999999999999999, 0.9647805974149126, 0.8557142857142856, 0.7

In [30]:
# So now thnk of all things that need to be tidied up.
# Need to score things properly according to what they said.
# Need to find that previous chat where I was given the interplay code and in general tighten up the interplay code
# Need to look into potentially more preprocessing if needed
# Need to find proper stop words
# Need to make it so that it runs on the entire dataset.

In [84]:
# Test persuation analysis coordinator

# For topic_timeline in timelines:
pair_preprocessor = PairPreprocessor()
persistence_detector = ChangeDetector()
window_extractor = WindowExtractor(corpus, timelines=timelines)
op_path_pairer = OpPathPairer(corpus, timelines=timelines)
i = 0
# repeat for both groups
# repeat for all users
for topic_timeline in timelines.values():
    if i < 3:
        # But what if there's multiple genuine change points in the same timeline?
        change_point = persistence_detector.detect_persistent_changes(topic_timeline_list)[0]
        print("change point:", change_point)

        # Now find convos around that change_point
        candidate_convos = window_extractor.get_conversations_around_change_point(change_points=change_points, corpus=corpus)

        # Now make op_path_pairs
        utt_id = change_points[0][1]
        user_id = corpus.get_utterance(change_points[0][1]).speaker.id
        op_path_pairs = op_path_pairer.extract_rooted_path_from_candidate_convos(candidate_convos, user_id)

        preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)

        # Interplay Feature extraction:
        features_list = []
        for op, paths in preprocessed_pairs:
            for k, concatenated_utts in paths.items():
                interplay_features = calculate_interplay_features(op.text, concatenated_utts, stop_words_set)
                features_list.append(interplay_features)

        # Interplay scoring:
        scores = []
        for interplay_features in features_list:
            score = calculate_persuasion_score(interplay_features)
            scores.append(score)

change point: (20, 'c345fu2')
{'economic policy': {'j3g36': 'moderately_against', 'c28uhbc': 'moderately_against', 'c28vhor': 'moderately_against', 'c28vyhs': 'moderately_against', 'c28wegs': 'moderately_against', 'c28yyll': 'moderately_against', 'c28yzm6': 'moderately_against', 'c28yzso': 'moderately_against', 'c28z353': 'neutral', 'c290fut': 'strongly_against', 'c290g67': 'neutral', 'c290h6h': 'moderately_against', 'c290mll': 'neutral', 'c291t8z': 'moderately_against', 'c291yto': 'strongly_against', 'c291zvv': 'moderately_against', 'c2928gw': 'neutral', 'c292b02': 'neutral', 'c29c7v8': 'moderately_against', 'c2a0tlm': 'moderately_against', 'c2anbs3': 'moderately_against', 'c2ds943': 'moderately_against', 'c2folul': 'neutral', 'c2foq3m': 'neutral', 'c2fvkc9': 'moderately_against', 'c2fz060': 'strongly_against', 'c2fzrn2': 'moderately_against', 'c2ggect': 'neutral', 'c2gggm3': 'moderately_against', 'c2hkg3r': 'strongly_against', 'c2jbvy0': 'moderately_against', 'c2llp7v': 'moderately_a


# Maybe first collect each group, then have them in a list/tuple and run the coordinator on that.

# So then that new function would take the features of this path and attatch a score to it.

# And that's it. Now I only need to glue things together.

# Should I make less topics?

# I think I could make a simple model quick and go manual as a backup



# Filter timelines, then
# For each user in timelines:
# Loops through all all topic_timelines and finds all change points (should be 1 for each topic_timeline)
# Loops through all then takes these from a list and finds the conversations around that period (5-10)
# Loops through all convos in that structure and creates a list of op,paths pairs
# Extracts features and calculates the score


# Then another function,
# Does the same but for each user:
# Loops through all topic_timelines and finds no change points (should be 1 for each topic_timeline)
# Then does the same as the previous function

# Then at the end I run a stat test for the two groups.