In [None]:
# Need to restart after:
!pip install convokit

In [None]:
# Download file from Google Drive to colab directory
!pip install gdown
import zipfile

In [None]:
# Download and unzip with python:
!gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip" --fuzzy
!gdown "https://drive.google.com/file/d/1DLFY6JLMZqNjwvNRZmhlV4-rnoQP_eyH/view?usp=sharing" -O "/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip" --fuzzy
zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip").extractall("/content/temporal_belief_analysis")
zipfile.ZipFile("/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip").extractall("/content/temporal_belief_analysis")


In [None]:
# For runpod-jupyter or local (run twice)
import sys
import os

# Change to the correct working directory (workspace if runpod, content if colab)
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Comment out if in colab:
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.persistence_change_detection import ChangeDetector
from temporal_belief.core.window_extraction import WindowExtractor
from temporal_belief.core.op_path_pairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor

In [None]:
# Run twice
# import unsloth
# import unsloth_zoo
from convokit import Corpus, download
import convokit

In [None]:
# Load a corpus:
corpus = Corpus(filename="/content/temporal_belief_analysis/merged_corpus_checkpoint_5")

In [None]:
print(corpus.meta)

In [None]:
utts = list(corpus.iter_utterances())
print(utts[0].meta)

In [None]:
# Check if the 300k labeled utterances are the LAST 300k chronologically
sorted_utts = sorted(corpus.iter_utterances(), key=lambda u: getattr(u, 'timestamp', 0))
total_utts = len(sorted_utts)

print(f"Total utterances: {total_utts}")

# Walk backwards from the newest utterance and count the unbroken run of labels.
consecutive_from_end = 0
for utt in reversed(sorted_utts):
    if 'detected_stance' not in utt.meta:
        break  # First utterance (going backwards) without stance
    consecutive_from_end += 1

print(f"Consecutive utterances with stance from END: {consecutive_from_end}")

# Positions of the earliest and latest labeled utterances (None when nothing is labeled).
first_labeled_position = next(
    (i for i, utt in enumerate(sorted_utts) if 'detected_stance' in utt.meta),
    None,
)
last_labeled_position = next(
    (i for i in range(total_utts - 1, -1, -1) if 'detected_stance' in sorted_utts[i].meta),
    None,
)

if first_labeled_position is None:
    print("No labeled utterances found")
else:
    print(f"First labeled utterance is at position: {first_labeled_position}")
    # Report the real last labeled position instead of assuming the labels reach the end.
    print(f"Labels span positions {first_labeled_position} to {last_labeled_position}")

In [None]:
# Record the chronological position of every utterance that carries a stance label.
labeled_positions = []
for i, utt in enumerate(sorted_utts):
    if 'detected_stance' not in utt.meta:
        continue
    labeled_positions.append(i)

if labeled_positions:
    first_position, last_position = labeled_positions[0], labeled_positions[-1]
    print(f"First labeled position: {first_position}")
    print(f"Last labeled position: {last_position}")
    print(f"Total labeled: {len(labeled_positions)}")

    # Consecutive means every position is exactly one past its predecessor.
    is_consecutive = all(
        later == earlier + 1
        for earlier, later in zip(labeled_positions, labeled_positions[1:])
    )
    print(f"Are they consecutive? {is_consecutive}")

In [None]:
# Collect (utterance id, stance) for every labeled utterance, in corpus iteration order.
found_stances = [
    (utt.id, utt.meta['detected_stance'])
    for utt in corpus.iter_utterances()
    if 'detected_stance' in utt.meta
]

# Show a short preview only: the full list can hold hundreds of thousands of entries.
print("Utterances WITH stance labels (first 10):", found_stances[:10])
print(len(found_stances))

In [None]:
# Check the first N utterances (corpus iteration order) for stance labels.
N_TO_CHECK = 300000  # how many leading utterances to inspect
N_TO_PRINT = 20      # how many per-utterance lines to actually print

utts = list(corpus.iter_utterances())
checked_utts = utts[:N_TO_CHECK]

# Print a bounded preview instead of one line per utterance.
for i, utt in enumerate(checked_utts[:N_TO_PRINT]):
    has_stance = 'detected_stance' in utt.meta
    print(f"Utterance {i}: {utt.id} - Has stance: {has_stance}")

# Summarise the whole checked range in a single line.
labeled_count = sum('detected_stance' in utt.meta for utt in checked_utts)
print(f"{labeled_count} of the first {len(checked_utts)} utterances have a stance label")

In [None]:
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection
from collections import Counter
import logging

class ChangeDetector:
    """Detect persistent stance changes in a user's per-topic timeline.

    The detection methods take a timeline as an ordered list of
    ``(utterance_id, stance_label)`` tuples; the caller is expected to supply
    it in chronological order.
    """

    def __init__(self, window_size=3, persistence_threshold=4, significance_level=0.05,
                 change_magnitude_threshold=0.5, verbose=True):
        """
        Args:
            window_size: Number of posts in each before/after/future window of
                the sliding-window method.
            persistence_threshold: Number of consecutive posts (including the
                changing post) that must share the new stance in the simple method.
            significance_level: Stored as ``self.alpha``; not read by any
                method of this class.
            change_magnitude_threshold: The absolute difference between the
                before and after window means must exceed this value for a
                candidate change.
            verbose: When True, print one line per detected change.
        """
        self.window_size = window_size
        self.persistence_threshold = persistence_threshold
        self.alpha = significance_level
        self.change_magnitude_threshold = change_magnitude_threshold
        self.verbose = verbose
        # Ordinal encoding of the stance labels; unknown labels are treated as 0.
        self.stance_values = {
            'strongly_against': -2, 'moderately_against': -1,
            'neutral': 0, 'moderately_favor': 1, 'strongly_favor': 2
        }
        # Accumulated across every timeline passed to either detection method.
        self.all_change_points = []
        self.all_no_change_points = []

    @staticmethod
    def _non_change_ids(topic_timeline, change_points):
        """Return the timeline's utterance ids that are not change points, in order."""
        changed = set(change_points)  # set lookup keeps this linear in the timeline length
        return [utt_id for utt_id, _ in topic_timeline if utt_id not in changed]

    def detect_persistent_changes(self, topic_timeline):
        """Detect persistent changes by comparing sliding-window stance averages.

        Position ``i`` is a change point when the mean stance of the
        ``window_size`` posts starting at ``i`` differs from the mean of the
        ``window_size`` posts before it by more than
        ``change_magnitude_threshold``, and the following full window has
        moved in the same direction relative to the "before" mean.

        Args:
            topic_timeline: List of ``(utterance_id, stance_label)`` tuples.

        Returns:
            Dict with ``'change_points'`` and ``'no_change_points'`` lists of
            utterance ids. Both are empty when the timeline is shorter than
            ``2 * window_size``.
        """
        if len(topic_timeline) < self.window_size * 2:
            return {'change_points': [], 'no_change_points': []}

        # Convert stances to numerical values
        numerical_stances = [
            self.stance_values.get(stance, 0) for _, stance in topic_timeline
        ]

        change_points = []
        width = self.window_size

        for i in range(width, len(numerical_stances) - width):
            before_mean = np.mean(numerical_stances[i - width:i])
            after_mean = np.mean(numerical_stances[i:i + width])

            change_magnitude = abs(after_mean - before_mean)
            if change_magnitude > self.change_magnitude_threshold:
                # Only a complete future window can confirm persistence.
                future_window = numerical_stances[i + width:i + 2 * width]
                if len(future_window) >= width:
                    future_mean = np.mean(future_window)

                    # A positive product means both shifts have the same sign.
                    if (after_mean - before_mean) * (future_mean - before_mean) > 0:
                        change_points.append(topic_timeline[i][0])
                        if self.verbose:
                            print(f"Sliding window change at index {i}: "
                                  f"before={before_mean:.2f}, after={after_mean:.2f}, "
                                  f"future={future_mean:.2f}")

        no_change_points = self._non_change_ids(topic_timeline, change_points)

        # Store for global analysis, as the simple method does.
        self.all_change_points.extend(change_points)
        self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def detect_persistent_changes_simple(self, topic_timeline):
        """Detect changes where a new stance label repeats for enough posts.

        A post is a change point when its stance differs from the previous
        post's stance and the same stance holds for ``persistence_threshold``
        consecutive posts, counting the changing post itself.

        Args:
            topic_timeline: List of ``(utterance_id, stance_label)`` tuples.

        Returns:
            Dict with ``'change_points'`` and ``'no_change_points'`` lists of
            utterance ids. Both are empty when the timeline has fewer than
            ``persistence_threshold + 1`` posts.
        """
        change_points = []
        no_change_points = []

        if len(topic_timeline) < self.persistence_threshold + 1:
            # Timeline too short for meaningful analysis
            return {'change_points': change_points, 'no_change_points': no_change_points}

        # Indices already covered by a detected change, to avoid overlapping detections.
        already_detected = set()

        # The last usable start index is len - persistence_threshold: from there the
        # persistence window still fits entirely inside the timeline.
        last_start = len(topic_timeline) - self.persistence_threshold
        for i in range(1, last_start + 1):
            current_stance = topic_timeline[i][1]
            previous_stance = topic_timeline[i - 1][1]

            if current_stance == previous_stance or i in already_detected:
                continue

            # Count how long the new stance holds; the current post counts as 1.
            persistence_count = 1
            window_end = min(i + self.persistence_threshold, len(topic_timeline))
            for j in range(i + 1, window_end):
                if topic_timeline[j][1] != current_stance:
                    break  # Persistence broken
                persistence_count += 1

            if persistence_count >= self.persistence_threshold:
                change_points.append(topic_timeline[i][0])
                already_detected.update(range(i, window_end))

                if self.verbose:
                    print(f"Change detected at index {i}: {previous_stance} → {current_stance} "
                          f"(persisted for {persistence_count} posts)")

        no_change_points = self._non_change_ids(topic_timeline, change_points)

        # Store for global analysis
        self.all_change_points.extend(change_points)
        self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def get_two_groups(self, timelines, method='sliding_window'):
        """Split users into those with at least one detected change and those with none.

        Args:
            timelines: ``{user_id: {topic_name: {utterance_id: stance}}}``.
            method: ``'sliding_window'`` (default) or ``'simple'``.

        Returns:
            Dict with two entries:
              * ``'with_changes'``: ``{user_id: {topic_name: {utterance_id: stance}}}``
                restricted to the change-point utterances of topics that had a change.
              * ``'no_changes'``: ``{user_id: topic_timelines}`` holding the full,
                unmodified timelines of users with no change in any topic.

        Raises:
            ValueError: If ``method`` is not one of the two supported names.
        """
        with_changes = {}
        no_changes = {}

        # Choose detection method
        if method == 'sliding_window':
            detect_func = self.detect_persistent_changes
        elif method == 'simple':
            detect_func = self.detect_persistent_changes_simple
        else:
            raise ValueError(f"Unknown method: {method}. Use 'sliding_window' or 'simple'")

        for user_id, topic_timelines in timelines.items():
            user_has_changes = False

            for topic_name, topic_timeline in topic_timelines.items():
                changes = detect_func(list(topic_timeline.items()))

                if changes['change_points']:
                    user_has_changes = True
                    # Keep only the change-causing utterances for this topic.
                    with_changes.setdefault(user_id, {})[topic_name] = {
                        utt_id: topic_timeline[utt_id]
                        for utt_id in changes['change_points']
                    }

            # If user had no changes in any topic, add to no_changes group
            if not user_has_changes:
                no_changes[user_id] = topic_timelines

        return {
            'with_changes': with_changes,
            'no_changes': no_changes
        }

In [None]:
# Test timeline builder:
timeline_builder = TimelineBuilder(corpus, min_posts_per_topic=0, min_topics_per_user=0)
timelines = timeline_builder.build_timelines()

# Filter for analysis
change_detector_preprocessor = ChangeDetectorPreprocessor()

# Use filtered ones for detecting changes but the full ones for interplay score. Although maybe it doesn't matter.
filtered_timelines = change_detector_preprocessor.filter_for_change_detection(timelines, min_posts_per_topic=5, min_topics_per_user=2)

# Get a specific user's timeline for a specific topic
user_id = "HardCoreModerate"
topic = "media and political commentary"
topic_timeline = filtered_timelines[user_id][topic]  # This is {utterance_id: stance}

# Convert to list of tuples
# topic_timeline_list = list(topic_timeline.items())

In [None]:
print(len(timelines))

In [None]:
print(topic_timeline)

In [None]:
# Can I iterate over all timelines of this user?
persistence_detector_new = ChangeDetector()
for user_id, topic_timelines in timelines.items():
    for topic_timeline in topic_timelines.values():
        topic_timeline_list = list(topic_timeline.items())
        changes = persistence_detector_new.detect_persistent_changes(topic_timeline_list)
        # self.detect_persistent_changes(topic_timeline)

In [None]:
# Test the change detector:
persistence_detector_new = ChangeDetector()
topic_timeline_list = list(topic_timeline.items())
change_points = persistence_detector_new.detect_persistent_changes(topic_timeline_list)
# with_changes, no_changes = persistence_detector_new.get_two_groups()

In [None]:
# Groups:
persistence_detector_new = ChangeDetector()
groups = persistence_detector_new.get_two_groups(timelines)

print(len(groups['with_changes']))
print(len(groups['no_changes']))

print(groups['with_changes'])

# Numbers don't add up cause some users could appear in both groups (had changes in some topics but not in others).

# works!!!!

In [None]:
# Test the window extractor:
window_extractor = WindowExtractor(corpus, timelines=timelines)
candidate_convos = window_extractor.get_conversations_around_change_point(change_points=change_points, corpus=corpus)
for convo in candidate_convos:
  print(f'ID:{convo.id}')

In [None]:
import re

class PairPreprocessor:
    """Text preprocessing helpers for (OP utterance, reply paths) pairs."""

    def tokenize_quotes(self, utterance_text):
        """Replace each quoted line with the placeholder ``[QUOTE]``.

        A line counts as quoted when, after stripping surrounding whitespace,
        it starts with ``>`` or its HTML-escaped form ``&gt;``. Every other
        line is kept, stripped of surrounding whitespace.
        """
        processed_lines = []

        for line in utterance_text.split('\n'):
            line = line.strip()
            if line.startswith('&gt;') or line.startswith('>'):
                processed_lines.append('[QUOTE]')
            else:
                processed_lines.append(line)

        return '\n'.join(processed_lines)

    def concatenate_path(self, paths):
        """Join the quote-tokenized text of each path's utterances with spaces.

        Args:
            paths: ``{path_key: [utterance, ...]}``; each utterance exposes ``.text``.

        Returns:
            ``{path_key: str}`` with leading/trailing whitespace stripped.
        """
        return {
            key: ' '.join(self.tokenize_quotes(utt.text) for utt in utt_list).strip()
            for key, utt_list in paths.items()
        }

    @staticmethod
    def tokenize_and_lower(op_text, reply_path_text, stop_words_set=None):
        """Lowercase both texts and split them on whitespace.

        ``stop_words_set`` is accepted for backward compatibility and is not used.

        Returns:
            Tuple ``(op_words, reply_words)`` of word lists.
        """
        op_words = op_text.lower().split()
        reply_words = reply_path_text.lower().split()

        return (op_words, reply_words)

    @staticmethod
    def remove_punctuation(op_text, reply_path_text):
        """Drop every character that is not a word character, whitespace or apostrophe.

        Apostrophes are kept so contractions stay intact.
        """
        op_text = re.sub(r"[^\w\s']", '', op_text)
        reply_path_text = re.sub(r"[^\w\s']", '', reply_path_text)

        return op_text, reply_path_text

    def remove_quotes_from_all(self, op_path_pairs):
        """Quote-tokenize the OP and every path utterance of each pair.

        Args:
            op_path_pairs: Iterable of ``(op_utterance, {path_key: [utterance, ...]})``.

        Returns:
            List of ``(op_text, [[utterance_text, ...], ...])`` tuples, one
            inner list per path.
        """
        marked_pairs = []
        for op_path_pair in op_path_pairs:
            op_text = self.tokenize_quotes(op_path_pair[0].text)

            processed_paths = [
                [self.tokenize_quotes(utt.text) for utt in utterances]
                for utterances in op_path_pair[1].values()
            ]

            marked_pairs.append((op_text, processed_paths))

        return marked_pairs

    def concatenate_path_in_pair(self, pair):
        """Return ``(op, {path_key: concatenated_text})`` for one ``(op, paths)`` pair."""
        op, paths = pair[0], pair[1]
        return (op, self.concatenate_path(paths))

    def concatenate_path_in_all_pairs(self, op_path_pairs):
        """Apply ``concatenate_path_in_pair`` to every pair and return the results as a list."""
        return [self.concatenate_path_in_pair(pair) for pair in op_path_pairs]

    def clean_and_tokenize(self, op_text, reply_path_text):
        """Strip punctuation, then lowercase and whitespace-split both texts.

        Returns:
            Tuple ``(op_words, reply_words)`` of word lists.
        """
        # Step 1: Remove punctuation
        op_text, reply_path_text = self.remove_punctuation(op_text, reply_path_text)

        # Step 2: Tokenize and lowercase
        op_words, reply_words = self.tokenize_and_lower(op_text, reply_path_text)

        return op_words, reply_words

In [None]:
def calculate_interplay_features(op_text, reply_path_text, stop_words_set):
    """Compute 12 word-overlap features between an OP text and a reply-path text.

    For each of three vocabularies (``all`` words, ``stop`` words, ``content``
    words) four values are produced: the number of shared words, the shared
    fraction of the reply vocabulary, the shared fraction of the OP
    vocabulary, and the Jaccard similarity. A fraction whose denominator is
    empty is reported as 0.

    Args:
        op_text: Text of the original post.
        reply_path_text: Concatenated text of the reply path.
        stop_words_set: Container of stop words (membership-tested).

    Returns:
        Dict keyed ``common_words_*``, ``sim_frac_reply_*``, ``sim_frac_op_*``
        and ``jaccard_*`` for ``all``, ``stop`` and ``content``.
    """

    def vocabulary(text):
        # Keep word characters, whitespace and apostrophes, then lowercase and split.
        return set(re.sub(r"[^\w\s']", '', text).lower().split())

    def partition(vocab):
        # Split a vocabulary into (stop words, content words).
        stop = {word for word in vocab if word in stop_words_set}
        return stop, vocab - stop

    op_vocab = vocabulary(op_text)
    reply_vocab = vocabulary(reply_path_text)
    op_stop, op_content = partition(op_vocab)
    reply_stop, reply_content = partition(reply_vocab)

    vocab_pairs = {
        'all': (op_vocab, reply_vocab),
        'stop': (op_stop, reply_stop),
        'content': (op_content, reply_content),
    }

    features = {}
    for label, (op_set, reply_set) in vocab_pairs.items():
        shared = len(op_set & reply_set)
        combined = len(op_set | reply_set)

        features[f'common_words_{label}'] = shared
        features[f'sim_frac_reply_{label}'] = shared / len(reply_set) if reply_set else 0
        features[f'sim_frac_op_{label}'] = shared / len(op_set) if op_set else 0
        features[f'jaccard_{label}'] = shared / combined if combined else 0

    return features

In [None]:
def calculate_persuasion_score(interplay_features):
    """Combine four interplay features into one weighted persuasion score.

    The weighting follows the direction of Tan et al.'s CMV findings as
    encoded here: lower content-word and overall similarity raise the score,
    higher stop-word similarity raises it. Missing features count as 0.

    Args:
        interplay_features: Dict as produced by ``calculate_interplay_features``.

    Returns:
        The weighted sum; higher means higher persuasion likelihood.
    """
    # (feature key, weight, inverted) - an inverted feature contributes (1 - value).
    weighted_terms = (
        ('sim_frac_reply_content', 0.4, True),   # strongest predictor
        ('jaccard_content', 0.3, True),
        ('sim_frac_op_stop', 0.2, False),
        ('sim_frac_reply_all', 0.1, True),
    )

    score = 0
    # Accumulate term by term, in this fixed order, so rounding is reproducible.
    for key, weight, inverted in weighted_terms:
        value = interplay_features.get(key, 0)
        contribution = (1 - value) if inverted else value
        score += contribution * weight

    return score

In [None]:
# Test the OP and path pairer:
# `change_points` is the dict returned by ChangeDetector.detect_persistent_changes
# ({'change_points': [...], 'no_change_points': [...]}), so the utterance ids are
# looked up by key; indexing it with [0] raises KeyError.
detected_change_ids = change_points['change_points']
if not detected_change_ids:
    raise ValueError("No change points were detected for this timeline; pick another user/topic")

utt_id = detected_change_ids[0]
user_id = corpus.get_utterance(utt_id).speaker.id
op_path_pairer = OpPathPairer(corpus, timelines=timelines)

# List of tuples:
op_path_pairs = op_path_pairer.extract_rooted_path_from_candidate_convos(candidate_convos, user_id)

# def print_user_path_utterances(utterances):
#   for i, utt in enumerate(utterances):
#     print(f'{i}, {utt.text}\n')

for op_path_pair in op_path_pairs:
  print(1000*'=')
  print(f'\nop: {op_path_pair[0].id}\n')
  for path, utterances in op_path_pair[1].items():
    print(f'path: {path}, utterances: {[utt.text for utt in utterances]}\n')

In [None]:
print(op_path_pairs[3])

In [None]:
# Test the preprocessor:
pair_preprocessor = PairPreprocessor()
pair = op_path_pairs[3]
# So it should take a tuple, where the second part of the tuple is a dictionary of path_key, list of utterance pairs

# WHY THIS WORKS????
# for k, v in pair[1].items():
#     print(k, v)

preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)

for pair in preprocessed_pairs:
    for k, utt_text in pair[1].items():
        print(1000*'=')
        print(f'Text: {utt_text}\n')

# Now do it for all pairs

# DONT FORGET TO REMOVE DELETED ETC.
# SHIT BUT CAN I KEEP THEM TO ANALYSE THEM SOMEHOW?
# AS LONG AS THEY ARE IN THE SAME PATH I THINK I CAN KEEP THEM?


In [None]:
# REMEMBER TO EXTRACT THIS FOR PYCHARM
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Load English stop words
stop_words_set = set(stopwords.words('english'))

print(len(stop_words_set))
print(list(stop_words_set)[:20])  # Show first 20 stop words

In [None]:
# Test interplay features extraction
features_list = []
for op, paths in preprocessed_pairs:
    for k, concatenated_utts in paths.items():
        interplay_features = calculate_interplay_features(op.text, concatenated_utts, stop_words_set)
        features_list.append(interplay_features)

for interplay_features in features_list:
    print(interplay_features)

In [None]:
# Test interplay scoring
scores = []
for interplay_features in features_list:
    score = calculate_persuasion_score(interplay_features)
    scores.append(score)

for score in scores:
  print(score)

In [None]:
# So now think of all the things that need to be tidied up.
# Need to score things properly according to what they said.
# Need to find that previous chat where I was given the interplay code and in general tighten up the interplay code
# Need to look into potentially more preprocessing if needed
# Need to find proper stop words
# Need to make it so that it runs on the entire dataset.

In [None]:
# Build once at the start
def build_global_user_conversations_index(corpus):
    """Build, for every speaker, the list of conversations they took part in.

    Each speaker's conversations are sorted by the conversation's earliest
    utterance timestamp. That timestamp is computed once per conversation
    rather than once per (speaker, conversation) during sorting.

    Args:
        corpus: Object exposing ``iter_conversations()``; each conversation
            exposes ``iter_utterances()`` and each utterance exposes
            ``.speaker.id`` and ``.timestamp``.

    Returns:
        ``{speaker_id: [conversation, ...]}`` in chronological order.
    """
    print("Building global user conversations index...")
    # speaker id -> [(earliest timestamp of the conversation, conversation), ...]
    dated_conversations = {}

    for convo in corpus.iter_conversations():
        utterances = list(convo.iter_utterances())
        if not utterances:
            continue  # no speakers to index, and min() would fail on an empty sequence

        earliest = min(utt.timestamp for utt in utterances)

        # Add this conversation once to each distinct participant's list.
        for speaker_id in {utt.speaker.id for utt in utterances}:
            dated_conversations.setdefault(speaker_id, []).append((earliest, convo))

    user_conversations = {}
    for speaker_id, dated in dated_conversations.items():
        # Sort on the timestamp only; the stable sort keeps ties in corpus order.
        dated.sort(key=lambda pair: pair[0])
        user_conversations[speaker_id] = [convo for _, convo in dated]

    print(f"Index built for {len(user_conversations)} users!")
    return user_conversations

# I can also have it as a global variable instead:
# global_user_convos = build_global_user_conversations_index(corpus)

In [None]:
class WindowExtractor:
    """Find the conversations around a user's change point."""

    def __init__(self, corpus, timelines):
        """Store the corpus and timelines and start with an empty conversation cache.

        ``user_conversations_cache`` maps speaker id to that speaker's
        conversations in chronological order. It can be pre-filled from outside
        with a full index, or it fills lazily per speaker on first lookup.
        """
        self.corpus = corpus
        self.timelines = timelines
        self.user_conversations_cache = {}

    def get_user_conversations_chronological_old(self, corpus, speaker_id):
        """Return a speaker's conversations in chronological order, scanning the corpus if needed.

        The result is cached per speaker, so the full corpus scan runs at
        most once for each speaker id.
        """
        # Check cache first
        if speaker_id in self.user_conversations_cache:
            return self.user_conversations_cache[speaker_id]

        # Get all conversations where the speaker participated
        user_conversations = [
            convo for convo in corpus.iter_conversations()
            if any(utt.speaker.id == speaker_id for utt in convo.iter_utterances())
        ]

        # Sort conversations by their earliest timestamp
        user_conversations.sort(
            key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances())
        )

        # Cache the result
        self.user_conversations_cache[speaker_id] = user_conversations

        return user_conversations

    def get_user_conversations_chronological(self, corpus, speaker_id):
        """Return a speaker's conversations in chronological order.

        Served from ``user_conversations_cache`` when present. On a cache
        miss the conversations are computed on demand instead of returning an
        empty list, so an extractor without a pre-filled index still returns
        correct (if slower) results.
        """
        return self.get_user_conversations_chronological_old(corpus, speaker_id)

    def get_conversations_around_change_point(self, corpus, change_point, n_prior=2):
        """Return the change point's conversation preceded by the speaker's prior ones.

        Args:
            corpus: Corpus exposing ``get_utterance``.
            change_point: Utterance id of the change point.
            n_prior: Maximum number of the speaker's immediately preceding
                conversations to include (default 2).

        Returns:
            List of up to ``n_prior`` earlier conversations, oldest first,
            followed by the change point's own conversation. Empty when that
            conversation is not among the speaker's conversations.
        """
        utterance = corpus.get_utterance(change_point)

        # Find the convo this utterance belongs to:
        conversation = utterance.get_conversation()

        speaker_id = utterance.speaker.id
        user_conversations = self.get_user_conversations_chronological(corpus, speaker_id)

        for position, convo in enumerate(user_conversations):
            if convo.id == conversation.id:
                # Clamp at 0 so fewer than n_prior predecessors is handled naturally.
                start = max(0, position - n_prior)
                return user_conversations[start:position] + [conversation]

        return []

In [None]:
timelines = timeline_builder.build_timelines()

In [None]:
speakers = list(corpus.iter_speakers())
print(len(speakers))

In [None]:
window_extractor = WindowExtractor(corpus, timelines=timelines)
# Build the cache
print("Pre-building user conversation index...")
window_extractor.user_conversations_cache = build_global_user_conversations_index(corpus)

In [None]:
# Test persuasion analysis coordinator
import time

pair_preprocessor = PairPreprocessor()
persistence_detector_new = ChangeDetector()
op_path_pairer = OpPathPairer(corpus, timelines=timelines)

# Reuse the `window_extractor` from the previous cell: its conversation index is already
# built. A fresh WindowExtractor starts with an empty cache, so every window lookup
# would return no conversations and both group means would come out as 0.
if not window_extractor.user_conversations_cache:
    window_extractor.user_conversations_cache = build_global_user_conversations_index(corpus)

# Split users into the two groups to compare.
groups = persistence_detector_new.get_two_groups(timelines)
groups_tuple = (groups['with_changes'], groups['no_changes'])

group_means = []   # one mean persuasion score per group, in groups_tuple order
group_scores = []  # one list of raw scores per group, kept for later statistical tests
utts_num = 0       # total number of utterances processed across both groups

for group in groups_tuple:
    current_group_scores = []

    for user_id, topic_timelines in group.items():
        user_start_time = time.time()
        user_change_points = 0

        for topic_name, topic_timeline in topic_timelines.items():

            # The keys of each topic timeline are the utterance ids to analyse.
            for change_point in topic_timeline.keys():
                print(f'User: {user_id}, topic: {topic_name}, change point {change_point}')
                utts_num += 1
                user_change_points += 1

                # TIME: Window extraction
                start_time = time.time()
                try:
                    candidate_convos = window_extractor.get_conversations_around_change_point(
                        change_point=change_point, corpus=corpus
                    )
                    window_time = time.time() - start_time
                    print(f'⏱️ Window extraction: {window_time:.3f}s')
                except ValueError as e:
                    print(f"Skipping change point {change_point}: {e}")
                    continue

                # TIME: Path extraction (one conversation at a time so a bad one can be skipped)
                start_time = time.time()
                op_path_pairs = []
                for candidate_convo in candidate_convos:
                    try:
                        op_path_pairs.extend(op_path_pairer.extract_rooted_path_from_candidate_convos(
                            [candidate_convo], user_id
                        ))
                    except ValueError as e:
                        print(f"Skipping conversation {candidate_convo.id}: {e}")
                        continue
                path_time = time.time() - start_time
                print(f'⏱️ Path extraction: {path_time:.3f}s')

                # TIME: Preprocessing
                start_time = time.time()
                preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)
                preprocess_time = time.time() - start_time
                print(f'⏱️ Preprocessing: {preprocess_time:.3f}s')

                # TIME: Feature extraction
                start_time = time.time()
                features_list = []
                for op, paths in preprocessed_pairs:
                    for k, concatenated_utts in paths.items():
                        interplay_features = calculate_interplay_features(
                            op.text, concatenated_utts, stop_words_set
                        )
                        features_list.append(interplay_features)
                feature_time = time.time() - start_time
                print(f'⏱️ Feature extraction: {feature_time:.3f}s')

                # TIME: Scoring
                start_time = time.time()
                scores = [calculate_persuasion_score(features) for features in features_list]
                scoring_time = time.time() - start_time
                print(f'⏱️ Scoring: {scoring_time:.3f}s')

                # Print total time for this change point
                total_time = window_time + path_time + preprocess_time + feature_time + scoring_time
                print(f'🔥 TOTAL for change point: {total_time:.3f}s\n')

                current_group_scores.extend(scores)

        # TIME: End timing this user
        user_total_time = time.time() - user_start_time
        print(f'👤 USER {user_id} TOTAL: {user_total_time:.3f}s ({user_change_points} change points)')
        print(f'📊 Average per change point: {user_total_time/max(1, user_change_points):.3f}s\n')

    # Mean score for this group; an empty group is reported as 0.
    group_mean = sum(current_group_scores) / len(current_group_scores) if current_group_scores else 0
    group_means.append(group_mean)
    group_scores.append(current_group_scores)

# Print the calculated group means
print(f'Group Means: {group_means}')

In [None]:
for user_id, topic_timelines in timelines.items():
    for topic_timeline in topic_timelines.values():
        print(topic_timeline)
        break
    break


# Maybe first collect each group, then have them in a list/tuple and run the coordinator on that.

# So then that new function would take the features of this path and attach a score to it.

# And that's it. Now I only need to glue things together.

# Should I make less topics?

# I think I could make a simple model quick and go manual as a backup



# Filter timelines, then
# For each user in timelines:
# Loops through all topic_timelines and finds all change points (should be 1 for each topic_timeline)
# Loops through all then takes these from a list and finds the conversations around that period (5-10)
# Loops through all convos in that structure and creates a list of op,paths pairs
# Extracts features and calculates the score


# Then another function,
# Does the same but for each user:
# Loops through all topic_timelines and finds no change points (should be 1 for each topic_timeline)
# Then does the same as the previous function

# Then at the end I run a stat test for the two groups.