In [None]:

# Need to restart after:
!pip install convokit

In [9]:
# Download file from Google Drive to colab directory
!pip install gdown
file_id = "1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0"
!gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip" --fuzzy

Downloading...
From (original): https://drive.google.com/uc?id=1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0
From (redirected): https://drive.google.com/uc?id=1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0&confirm=t&uuid=c3c90372-e564-43fc-b7be-a9ac3cac024b
To: /content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip
100% 841M/841M [00:16<00:00, 52.4MB/s]


In [10]:
# Unzip with python:
import zipfile
zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics10000_chronological.zip").extractall("/content/temporal_belief_analysis")

In [11]:
# For runpod-jupyter or local (run twice)
import sys
import os

# Change to the correct working directory (workspace if runpod, content if colab)
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Comment out if in colab:
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.change_detection import ChangeDetector
from temporal_belief.core.WindowExtraction import WindowExtractor
from temporal_belief.core.OpPathPairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor

Changed working directory to: /content/temporal_belief_analysis/notebooks


In [12]:
# Run twice
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit

In [13]:
# Load a corpus:
# corpus = Corpus(filename="/Users/leonidas/.convokit/saved-corpora/pd_corpus_with_stances1000_chronological")
corpus = Corpus(filename="/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological")

In [40]:
print(corpus.meta)

ConvoKitMeta({'subreddit': 'PoliticalDiscussion', 'num_posts': 102848, 'num_comments': 4553046})


In [24]:
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection
from collections import Counter
import logging

class ChangeDetector:
    """Sliding window change detection with proper statistical significance."""

    def __init__(self, window_size=3, significance_level=0.05):
        self.window_size = window_size
        self.alpha = significance_level
        self.stance_values = {
            'strongly_against': -2, 'moderately_against': -1,
            'neutral': 0, 'moderately_favor': 1, 'strongly_favor': 2
        }
        self.all_change_points = []
        self.all_no_change_points = []

    def detect_persistent_changes(self, topic_timeline):
        """Detect persistent changes in stance."""

        # Convert to (utt_id, detected_stance) tuple
        # topic_timeline_list = list(topic_timeline.items())

        # Collect the tuples where the stance is persistent across n utterances
        change_points = []
        no_change_points = []

        for i in range(len(topic_timeline) - 1):
          # if current stance is different than prior
            if topic_timeline[i][1] != topic_timeline[i-1][1]:
                # Check if change persists for more than 1 post
                if topic_timeline[i][1] == topic_timeline[i+1][1]:
                    change_index = i
                    utt_id = topic_timeline[i][0]
                    # change_point = (change_index, utt_id)
                    change_point = (utt_id)
                    change_points.append(change_point)
                    # print(f"Current:{topic_timeline[i][1]}, Previous: {topic_timeline[i-1][1]} and Next:{topic_timeline[i+1][1]}")
                    self.all_change_points.extend(change_points)
                    self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
          }

    def get_two_groups(self):
        with_changes = {}
        no_changes = {}

        for user_id, topic_timelines in timelines.items():
            for topic_name, topic_timeline in topic_timelines.items():  # Added topic_name
                topic_timeline_list = list(topic_timeline.items())
                changes = self.detect_persistent_changes(topic_timeline_list)

                if changes['change_points']:
                    # User experienced changes - store only change-causing utterances
                    if user_id not in with_changes:
                        with_changes[user_id] = {}
                    with_changes[user_id][topic_name] = {utt_id: topic_timeline[utt_id] for utt_id in changes['change_points']}
                else:
                    # User had no changes - store all utterances
                    if user_id not in no_changes:
                        no_changes[user_id] = {}
                    no_changes[user_id][topic_name] = topic_timeline

        return {
            'with_changes': with_changes,
            'no_changes': no_changes
        }

# Maybe I call the change detector to run a change analysis or something.
# What this does is it saves both the change points AND the groups in variables

In [25]:
# Test timeline builder:
timeline_builder = TimelineBuilder(corpus, min_posts_per_topic=0, min_topics_per_user=0)
timelines = timeline_builder.build_timelines()

# Filter for analysis
change_detector_preprocessor = ChangeDetectorPreprocessor()

# Use filtered ones for detecting changes but the full ones for interplay score. Although maybe it doesn't matter.
filtered_timelines = change_detector_preprocessor.filter_for_change_detection(timelines, min_posts_per_topic=5, min_topics_per_user=2)

# Get a specific user's timeline for a specific topic
user_id = "HardCoreModerate"
topic = "media and political commentary"
topic_timeline = filtered_timelines[user_id][topic]  # This is {utterance_id: stance}

# Convert to list of tuples
# topic_timeline_list = list(topic_timeline.items())

2025-08-13 15:01:23,578 - temporal_belief.core.timeline_building - INFO - timeline_building:71 - Built timelines for 4781 users
INFO:temporal_belief.core.timeline_building:Built timelines for 4781 users


In [148]:
print(len(timelines))

4781


In [117]:
print(topic_timeline)

{'j3g36': 'moderately_against', 'c28uhbc': 'moderately_against', 'c28vhor': 'moderately_against', 'c28vyhs': 'moderately_against', 'c28wegs': 'moderately_against', 'c28yyll': 'moderately_against', 'c28yzm6': 'moderately_against', 'c28yzso': 'moderately_against', 'c28z353': 'neutral', 'c290fut': 'strongly_against', 'c290g67': 'neutral', 'c290h6h': 'moderately_against', 'c290mll': 'neutral', 'c291t8z': 'moderately_against', 'c291yto': 'strongly_against', 'c291zvv': 'moderately_against', 'c2928gw': 'neutral', 'c292b02': 'neutral', 'c29c7v8': 'moderately_against', 'c2a0tlm': 'moderately_against', 'c2anbs3': 'moderately_against', 'c2ds943': 'moderately_against', 'c2folul': 'neutral', 'c2foq3m': 'neutral', 'c2fvkc9': 'moderately_against', 'c2fz060': 'strongly_against', 'c2fzrn2': 'moderately_against', 'c2ggect': 'neutral', 'c2gggm3': 'moderately_against', 'c2hkg3r': 'strongly_against', 'c2jbvy0': 'moderately_against', 'c2llp7v': 'moderately_against', 'c2s4z4h': 'moderately_against', 'c2sq8yf

In [21]:
# Can I iterate over all timelines of this user?
persistence_detector_new = ChangeDetector()
for user_id, topic_timelines in timelines.items():
    for topic_timeline in topic_timelines.values():
        topic_timeline_list = list(topic_timeline.items())
        changes = persistence_detector_new.detect_persistent_changes(topic_timeline_list)
        # self.detect_persistent_changes(topic_timeline)

In [111]:
# Test the change detector:
persistence_detector_new = ChangeDetector()
topic_timeline_list = list(topic_timeline.items())
change_points = persistence_detector_new.detect_persistent_changes(topic_timeline_list)
# with_changes, no_changes = persistence_detector_new.get_two_groups()

In [26]:
# Groups:
persistence_detector_new = ChangeDetector()
groups = persistence_detector_new.get_two_groups()

print(len(groups['with_changes']))
print(len(groups['no_changes']))

print(groups['with_changes'])

# Numbers don't add up cause some users could appear in both groups (had changes in some topics but not in others).

# works!!!!

1439
4409
{'seltaeb4': {'economic policy': {'j3g36': 'moderately_against', 'c2928gw': 'neutral', 'c29c7v8': 'moderately_against', 'c2folul': 'neutral', 'c2jbvy0': 'moderately_against', 'c2sq8yf': 'neutral', 'c30q1zx': 'moderately_against', 'c33bcnl': 'moderately_against', 'c3f9ne8': 'moderately_against'}, 'political figures and campaigns': {'c28yjvi': 'moderately_favor', 'c2cc375': 'moderately_against', 'c2cik1e': 'moderately_against', 'c2cyyrb': 'moderately_against', 'c2hs5w3': 'moderately_against', 'c2ikj19': 'moderately_against', 'c2kzdfm': 'moderately_favor', 'kpnj8': 'moderately_against', 'c2se06m': 'moderately_favor', 'c2yrzwc': 'moderately_against', 'c37n8s9': 'moderately_against', 'c38lpul': 'moderately_against', 'c3ad7n2': 'moderately_against'}, 'media and political commentary': {'c294h29': 'moderately_against', 'c2ansxo': 'moderately_against', 'c2dxv0h': 'moderately_against', 'c2fw9xc': 'moderately_against', 'c2h07sx': 'moderately_against', 'c2ic73g': 'moderately_against', 'c

In [12]:
# Test the window extractor:
window_extractor = WindowExtractor(corpus, timelines=timelines)
candidate_convos = window_extractor.get_conversations_around_change_point(change_points=change_points, corpus=corpus)
for convo in candidate_convos:
  print(f'ID:{convo.id}')

ID:muccw
ID:mv2yv
ID:mv3ou


In [15]:
import re

class PairPreprocessor:

    def tokenize_quotes(self, utterance_text):
        lines = utterance_text.split('\n')
        processed_lines = []

        for line in lines:
            line = line.strip()
            if line.startswith('&gt;') or line.startswith('>'):
                processed_lines.append('[QUOTE]')
            else:
                processed_lines.append(line)

        return '\n'.join(processed_lines)

    def concatenate_path(self, paths):
        concatenated_paths = {}
        for key, utt_list in paths.items():
            path_text = ''
            for utt in utt_list:
                utt_text_quoted = self.tokenize_quotes(utt.text)
                path_text += utt_text_quoted + ' '
            concatenated_paths[key] = path_text.strip()
        return concatenated_paths

    def tokenize_and_lower(op_text, reply_path_text, stop_words_set):
        op_words = op_text.lower().split()
        reply_words = reply_path_text.lower().split()

        return (op_words, reply_words)

    # This pattern keeps letters, numbers, whitespace, and apostrophes (for contractions)
    def remove_punctuation(op_text, reply_path_text):
        op_text = re.sub(r"[^\w\s']", '', op_text)
        reply_path_text = re.sub(r"[^\w\s']", '', reply_path_text)

        return op_text, reply_path_text

    def remove_quotes_from_all(self, op_path_pairs):
        marked_pairs = []
        for op_path_pair in op_path_pairs:
            # Process the OP utterance
            op_text = self.tokenize_quotes(op_path_pair[0].text)

            # Process each utterance path
            processed_paths = []
            for utterances in op_path_pair[1].values():
                path = [self.tokenize_quotes(utt.text) for utt in utterances]
                processed_paths.append(path)

            marked_pairs.append((op_text, processed_paths))

        return marked_pairs

    def concatenate_path_in_pair(self, pair):
        op = pair[0]
        paths = pair[1]

        concatenated_paths = self.concatenate_path(paths)

        return (op, concatenated_paths)

    def concatenate_path_in_all_pairs(self, op_path_pairs):
        # op_path_pairs_quoted = self.remove_quotes_from_all(op_path_pairs)
        preprocessed_pairs = []
        for pair in op_path_pairs:
            pair = self.concatenate_path_in_pair(pair)
            preprocessed_pairs.append(pair)

        return preprocessed_pairs

    def clean_and_tokenize(self, op_text, reply_path_text):
        # Step 1: Remove punctuation
        op_text, reply_path_text = self.remove_punctuation(op_text, reply_path_text)

        # Step 2: Tokenize and lowercase
        op_words, reply_words = self.tokenize_and_lower(op_text, reply_path_text)

        return op_words, reply_words

In [16]:
def calculate_interplay_features(op_text, reply_path_text, stop_words_set):
    """Calculate 12 interplay features between OP and reply path."""

    # Remove punctuation
    op_text = re.sub(r"[^\w\s']", '', op_text)
    reply_path_text = re.sub(r"[^\w\s']", '', reply_path_text)

    # Tokenize and clean
    op_words = op_text.lower().split()
    reply_words = reply_path_text.lower().split()

    # Create word sets
    op_all = set(op_words)
    reply_all = set(reply_words)
    op_stop = set(w for w in op_words if w in stop_words_set)
    reply_stop = set(w for w in reply_words if w in stop_words_set)
    op_content = set(w for w in op_words if w not in stop_words_set)
    reply_content = set(w for w in reply_words if w not in stop_words_set)

    # Calculate 4 metrics for each word type
    features = {}

    for word_type, (op_set, reply_set) in [
        ('all', (op_all, reply_all)),
        ('stop', (op_stop, reply_stop)),
        ('content', (op_content, reply_content))
    ]:
        intersection = len(op_set & reply_set)
        union = len(op_set | reply_set)

        features[f'common_words_{word_type}'] = intersection
        features[f'sim_frac_reply_{word_type}'] = intersection / len(reply_set) if reply_set else 0
        features[f'sim_frac_op_{word_type}'] = intersection / len(op_set) if op_set else 0
        features[f'jaccard_{word_type}'] = intersection / union if union else 0

    return features

In [17]:
def calculate_persuasion_score(interplay_features):
    """
    Calculate persuasion score based on Tan et al.'s CMV findings.
    Higher scores indicate higher persuasion likelihood.
    """

    # Extract the key predictive features
    reply_frac_content = interplay_features.get('sim_frac_reply_content', 0)
    jaccard_content = interplay_features.get('jaccard_content', 0)
    op_frac_stop = interplay_features.get('sim_frac_op_stop', 0)
    reply_frac_all = interplay_features.get('sim_frac_reply_all', 0)

    # Apply their findings (↓↓↓↓ means negative correlation, ↑↑↑↑ means positive)
    score = 0

    # Strongest predictor: less content word similarity → more persuasive
    score += (1 - reply_frac_content) * 0.4  # Weight of 0.4 for strongest predictor

    # Less content overlap → more persuasive
    score += (1 - jaccard_content) * 0.3     # Weight of 0.3

    # More stopword similarity → more persuasive
    score += op_frac_stop * 0.2              # Weight of 0.2

    # Less overall similarity → more persuasive
    score += (1 - reply_frac_all) * 0.1      # Weight of 0.1

    return score

In [16]:
# Test the OP and path pairer:
utt_id = change_points[0][1]
user_id = corpus.get_utterance(change_points[0][1]).speaker.id
op_path_pairer = OpPathPairer(corpus, timelines=timelines)

# List of tuples:
op_path_pairs = op_path_pairer.extract_rooted_path_from_candidate_convos(candidate_convos, user_id)

# def print_user_path_utterances(utterances):
#   for i, utt in enumerate(utterances):
#     print(f'{i}, {utt.text}\n')

for op_path_pair in op_path_pairs:
  print(1000*'=')
  print(f'\nop: {op_path_pair[0].id}\n')
  for path, utterances in op_path_pair[1].items():
    print(f'path: {path}, utterances: {[utt.text for utt in utterances]}\n')


op: c344ch2


op: c344cv7


op: c344lkp


op: c3443ch

path: theratking862_path_0, utterances: ['some people don\'t have time to drop their work and start getting involved locally. I don\'t think it\'s unethical or lazy to "change teams" if living in your current country is unsatisfactory to you. Obviously trying to change the system from within is a choice, but so is leaving for greener pastures. \n\nLaziness would be complaining about your country without doing *anything*. At least he\'s taking action. ']

path: repmack_path_0, utterances: ["&gt;some people don't have time to drop their work and start getting involved locally.\n\nBut they have time to move to a whole new country?\n"]

path: [deleted]_path_1, utterances: ['[deleted]', '[deleted]']

path: HardCoreModerate_path_1, utterances: ["I do not work in politics. I don't think you need to work in politics to create change. He wants to change the direction of politics. That means he must get active. To not do so would be the def

In [17]:
print(op_path_pairs[3])

(Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x7ac7e5e17790>, 'id': 'HardCoreModerate', 'meta': ConvoKitMeta({})}), 'owner': <convokit.model.corpus.Corpus object at 0x7ac7e5e17790>, 'id': 'c3443ch', 'meta': ConvoKitMeta({'score': 18, 'top_level_comment': 'c3443ch', 'retrieved_on': 1427953119, 'gilded': 0, 'gildings': None, 'subreddit': 'PoliticalDiscussion', 'stickied': False, 'permalink': '', 'author_flair_text': '', 'detected_stance': 'moderately_against', 'stance_confidence': 0.517206738392512, 'stance_scores': {'strongly_favor': 0.013105638635655245, 'moderately_favor': 0.3341254343589147, 'neutral': 0.07730480283498764, 'moderately_against': 0.517206738392512, 'strongly_against': 0.041570129338651896}})}), {'theratking862_path_0': [Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': <convokit.model.cor

In [None]:
# Test the preprocessor:
pair_preprocessor = PairPreprocessor()
pair = op_path_pairs[3]
# So it should take a tuple, where the second part of the tuple is a dictionary of path_key, list of utterance pairs

# WHY THIS WORKS????
# for k, v in pair[1].items():
#     print(k, v)

preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)

for pair in preprocessed_pairs:
    for k, utt_text in pair[1].items():
        print(1000*'=')
        print(f'Text: {utt_text}\n')

# Now do it for all pairs

# DONT FORGET TO REMOVE DELETED ETC.
# SHIT BUT CAN I KEEP THEM TO ANALYSE THEM SOMEHOW?
# AS LONG AS THEY ARE IN THE SAME PATH I THINK I CAN KEEP THEM?


In [35]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Load English stop words
stop_words_set = set(stopwords.words('english'))

print(len(stop_words_set))
print(list(stop_words_set)[:20])  # Show first 20 stop words

198
["we've", 'some', 'is', "hasn't", 'a', 'than', 'under', 'has', 'with', 'at', 'been', 'for', 'ourselves', "hadn't", 'hers', "couldn't", 'him', "it's", 'wasn', 'when']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
# Test interplay features extraction
features_list = []
for op, paths in preprocessed_pairs:
    for k, concatenated_utts in paths.items():
        interplay_features = calculate_interplay_features(op.text, concatenated_utts, stop_words_set)
        features_list.append(interplay_features)

for interplay_features in features_list:
    print(interplay_features)

{'common_words_all': 31, 'sim_frac_reply_all': 0.5344827586206896, 'sim_frac_op_all': 0.1341991341991342, 'jaccard_all': 0.12015503875968993, 'common_words_stop': 19, 'sim_frac_reply_stop': 0.76, 'sim_frac_op_stop': 0.27941176470588236, 'jaccard_stop': 0.25675675675675674, 'common_words_content': 12, 'sim_frac_reply_content': 0.36363636363636365, 'sim_frac_op_content': 0.0736196319018405, 'jaccard_content': 0.06521739130434782}
{'common_words_all': 8, 'sim_frac_reply_all': 0.7272727272727273, 'sim_frac_op_all': 0.03463203463203463, 'jaccard_all': 0.03418803418803419, 'common_words_stop': 5, 'sim_frac_reply_stop': 1.0, 'sim_frac_op_stop': 0.07352941176470588, 'jaccard_stop': 0.07352941176470588, 'common_words_content': 3, 'sim_frac_reply_content': 0.5, 'sim_frac_op_content': 0.018404907975460124, 'jaccard_content': 0.018072289156626505}
{'common_words_all': 0, 'sim_frac_reply_all': 0.0, 'sim_frac_op_all': 0.0, 'jaccard_all': 0.0, 'common_words_stop': 0, 'sim_frac_reply_stop': 0, 'sim_fr

In [37]:
# Test interplay scoring
scores = []
for interplay_features in features_list:
    score = calculate_persuasion_score(interplay_features)
    scores.append(score)

for score in scores:
  print(score)

0.7751994737288855
0.8717945644701436
0.8741695804195805
0.7999999999999999
0.8193796181895149
0.8619657984003488
0.4971284965034965
0.8587523652675951
0.8836067279156045
0.7580402930402932
0.7999999999999999
0.6302083333333334
0.7999999999999999
0.7759745528305366
0.9203549492687231
0.7032167832167832
0.8915664348824612
0.9158800267798517
0.7855237154150198
0.5025032938076417
0.758695652173913
0.7926066727731853
0.6848577437888349
0.791123188405797


In [None]:
# So now thnk of all things that need to be tidied up.
# Need to score things properly according to what they said.
# Need to find that previous chat where I was given the interplay code and in general tighten up the interplay code
# Need to look into potentially more preprocessing if needed
# Need to find proper stop words
# Need to make it so that it runs on the entire dataset.

In [42]:
class WindowExtractor:
    """ Find the conversations around the change point """
    def __init__(self, corpus, timelines):
        self.corpus = corpus
        self.timelines = timelines

    def get_user_conversations_chronological(self, corpus, speaker_id):
        """Get all conversations for a user in chronological order."""

        # Get all conversations where the speaker participated
        user_conversations = [convo for convo in corpus.iter_conversations()
                              if speaker_id in [utt.speaker.id for utt in convo.iter_utterances()]]

        # Sort conversations by their earliest timestamp
        user_conversations.sort(key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances()))

        return user_conversations

    def get_conversations_around_change_point(self, corpus, change_point):
        # Get first change (probably only one I need)
        utterance = corpus.get_utterance(change_point)

        # Find the convo this utterance belongs to:
        conversation = utterance.get_conversation()

        # Put all user's convos in a list
        speaker_id = utterance.speaker.id
        user_conversations = self.get_user_conversations_chronological(corpus, speaker_id)

        candidate_convos = []
        # find the index of the convo, and return the convo id of the 3 prior convos
        for i, convo in enumerate(user_conversations):
            if conversation.id == user_conversations[i].id:
                # Check if there are at least two conversations before the current one
                if i >= 2:
                    candidate_convos.append(user_conversations[i - 2])
                    candidate_convos.append(user_conversations[i - 1])
                elif i == 1:
                     # If only one conversation before, append that one
                     candidate_convos.append(user_conversations[i-1])

                # Append the current conversation with the change point
                candidate_convos.append(conversation)
                break # Found the conversation, no need to continue the loop

        return candidate_convos

In [None]:
# Test persuation analysis coordinator

# For topic_timeline in timelines:
pair_preprocessor = PairPreprocessor()
persistence_detector_new = ChangeDetector()
window_extractor = WindowExtractor(corpus, timelines=timelines)
op_path_pairer = OpPathPairer(corpus, timelines=timelines)

# use the groups
timelines = timeline_builder.build_timelines()
groups = persistence_detector_new.get_two_groups()
groups_tuple = (groups['with_changes'], groups['no_changes'])

# Init
i = 0
group_means = [] # Initialize as a list to append means
group_scores = []
utts_num = 0

# For each group
for group in groups_tuple:
    current_group_scores = [] # List to store scores for the current group
    for user_id, topic_timelines in group.items(): # Iterate through users in the current group
        for topic_timeline in topic_timelines.values():
            # Limit to 3 iterations for debugging:
            if i < 3:
                for change_point in topic_timeline.keys(): # Iterate through change points (keys)
                    print(f'change point {change_point}')
                    utts_num += 1

                    # Get candidate convos
                    # Pass the individual change_point string
                    try:
                        candidate_convos = window_extractor.get_conversations_around_change_point(change_point=change_point, corpus=corpus)
                        print(f'Candidate convos{[convo.id for convo in candidate_convos]}') # Print convo ids
                    except ValueError as e:
                        print(f"Skipping change point {change_point} due to conversation integrity error in get_conversations_around_change_point: {e}")
                        continue # Skip this change point and move to the next

                    # Now make op_path_pairs
                    op_path_pairs = []
                    for candidate_convo in candidate_convos:
                        try:
                            # Corrected method name here
                            op_path_pairs.extend(op_path_pairer.extract_rooted_path_from_candidate_convos([candidate_convo], user_id))
                        except ValueError as e:
                            print(f"Skipping conversation {candidate_convo.id} due to integrity error in extract_rooted_path_from_convo: {e}")
                            continue # Skip this conversation and move to the next one

                    preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)
                    print(f'Number of op path pairs: {len(preprocessed_pairs)}') # Print count

                    # Interplay Feature extraction:
                    features_list = []
                    for op, paths in preprocessed_pairs:
                        for k, concatenated_utts in paths.items():
                            interplay_features = calculate_interplay_features(op.text, concatenated_utts, stop_words_set)
                            features_list.append(interplay_features)
                    print(f'Number of feature lists: {len(features_list)}') # Print count

                    # Interplay scoring for everything around this change point
                    scores = []
                    for interplay_features in features_list:
                        score = calculate_persuasion_score(interplay_features)
                        scores.append(score) # Append the individual score
                    print(f'Scores: {scores}')
                    current_group_scores.extend(scores) # Extend with scores from this change point
                    break # Remove this break later to process all change points
                break # Remove this break later to process all topics
            break # Remove this break later to process all users


    # Calculate mean for this group
    total = 0
    num_of_scores = 0
    for score in current_group_scores: # Iterate through individual scores
        total += score
        num_of_scores += 1

    group_mean = total / num_of_scores if num_of_scores > 0 else 0 # Handle division by zero
    group_means.append(group_mean) # Append mean to the list

# Print the calculated group means
print(f'Group Means: {group_means}')

In [30]:
for user_id, topic_timelines in timelines.items():
    for topic_timeline in topic_timelines.values():
        print(topic_timeline)
        break
    break

{'j3g36': 'moderately_against', 'c28uhbc': 'moderately_against', 'c28vhor': 'moderately_against', 'c28vyhs': 'moderately_against', 'c28wegs': 'moderately_against', 'c28yyll': 'moderately_against', 'c28yzm6': 'moderately_against', 'c28yzso': 'moderately_against', 'c28z353': 'neutral', 'c290fut': 'strongly_against', 'c290g67': 'neutral', 'c290h6h': 'moderately_against', 'c290mll': 'neutral', 'c291t8z': 'moderately_against', 'c291yto': 'strongly_against', 'c291zvv': 'moderately_against', 'c2928gw': 'neutral', 'c292b02': 'neutral', 'c29c7v8': 'moderately_against', 'c2a0tlm': 'moderately_against', 'c2anbs3': 'moderately_against', 'c2ds943': 'moderately_against', 'c2folul': 'neutral', 'c2foq3m': 'neutral', 'c2fvkc9': 'moderately_against', 'c2fz060': 'strongly_against', 'c2fzrn2': 'moderately_against', 'c2ggect': 'neutral', 'c2gggm3': 'moderately_against', 'c2hkg3r': 'strongly_against', 'c2jbvy0': 'moderately_against', 'c2llp7v': 'moderately_against', 'c2s4z4h': 'moderately_against', 'c2sq8yf


# Maybe first collect each group, then have them in a list/tuple and run the coordinator on that.

# So then that new function would take the features of this path and attatch a score to it.

# And that's it. Now I only need to glue things together.

# Should I make less topics?

# I think I could make a simple model quick and go manual as a backup



# Filter timelines, then
# For each user in timelines:
# Loops through all all topic_timelines and finds all change points (should be 1 for each topic_timeline)
# Loops through all then takes these from a list and finds the conversations around that period (5-10)
# Loops through all convos in that structure and creates a list of op,paths pairs
# Extracts features and calculates the score


# Then another function,
# Does the same but for each user:
# Loops through all topic_timelines and finds no change points (should be 1 for each topic_timeline)
# Then does the same as the previous function

# Then at the end I run a stat test for the two groups.

In [None]:
import scipy.stats as stats

# Assuming group_means[0] is the mean for 'with_changes' and group_means[1] is for 'no_changes'
# and current_group_scores contains the individual scores for the last processed group.
# We need to store scores for both groups to perform the test.

# Let's restructure the scoring part to store scores for both groups
all_group_scores = []

# For each group
for group_index, group in enumerate(groups_tuple):
    current_group_scores = [] # List to store scores for the current group
    # ... (previous code to get features_list) ...
    # For now, let's use the features_list generated from the last run for demonstration
    # In the final code, this part needs to be inside the loop that processes each group and user

    # Interplay scoring for everything around this change point
    scores = []
    for interplay_features in features_list:
        score = calculate_persuasion_score(interplay_features)
        scores.append(score) # Append the individual score
    current_group_scores.extend(scores) # Extend with scores from this change point

    all_group_scores.append(current_group_scores)


# Perform Mann-Whitney U test
# Ensure both groups have scores
if len(all_group_scores) == 2 and all_group_scores[0] and all_group_scores[1]:
    stat, p_value = stats.mannwhitneyu(all_group_scores[0], all_group_scores[1])

    print(f"Mann-Whitney U Statistic: {stat}")
    print(f"P-value: {p_value}")

    alpha = 0.05
    if p_value < alpha:
        print("The difference in persuasion scores between the two groups is statistically significant.")
    else:
        print("There is no statistically significant difference in persuasion scores between the two groups.")
else:
    print("Insufficient data in one or both groups to perform the Mann-Whitney U test.")

**Interpretation:**

*   **Mann-Whitney U Statistic:** This is the test statistic calculated by the Mann-Whitney U test.
*   **P-value:** The p-value indicates the probability of observing the data, or something more extreme, if the null hypothesis (that there is no difference between the groups) is true.
*   **Significance Level (alpha):** We typically use a significance level of 0.05. If the p-value is less than alpha, we reject the null hypothesis and conclude that there is a statistically significant difference between the groups.

Based on the p-value, we can determine if the persuasion scores are significantly different for users who changed their stance compared to those who did not.