In [None]:
!git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git

In [None]:
# Need to restart after:
!pip install convokit[llm]
!pip install convokit

In [None]:
!pip install statsmodels

In [None]:
dw

In [None]:
import sys
import os
os.chdir('/workspace/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

In [None]:
import time
!pip install gdown
import zipfile
import nltk
from nltk.corpus import stopwords
from convokit import Corpus, download
import convokit
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.persistence_change_detection import ChangeDetector
from temporal_belief.core.window_extraction import WindowExtractor
from temporal_belief.core.op_path_pairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor
from temporal_belief.data.preprocessors import PairPreprocessor
from temporal_belief.data.preprocessors import ExtractFeatures
from temporal_belief.data.preprocessors import GroupPreprocessor
from temporal_belief.core.interplay import Interplay
import numpy as np
nltk.download('stopwords')

In [16]:
from tqdm import tqdm

class WindowExtractor:
    """ Find the conversations around the change point """

    def __init__(self, corpus, timelines):
        self.corpus = corpus
        self.timelines = timelines
        self.user_conversations_cache = {}  # Add cache

    def build_global_user_conversations_index(self):
        """Build sorted conversations for ALL users upfront"""
        print("Building global user conversations index...")
        user_conversations = {}

        convos = list(self.corpus.iter_conversations())
        print(f"Processing {len(convos)} conversations...")

        for convo in convos:
            # Get all speakers in this conversation
            speakers = {utt.speaker.id for utt in convo.iter_utterances()}

            # Add this conversation to each speaker's list
            for speaker_id in speakers:
                if speaker_id not in user_conversations:
                    user_conversations[speaker_id] = []
                user_conversations[speaker_id].append(convo)

        # Sort each user's conversations once
        print(f"Sorting conversations for {len(user_conversations)} users...")
        for speaker_id in user_conversations:
            user_conversations[speaker_id].sort(
                key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances())
            )

        print(f"Index built for {len(user_conversations)} users!")

        self.user_conversations_cache = user_conversations

    def get_user_conversations_chronological_old(self, corpus, speaker_id):
        """Get all conversations for a user in chronological order."""

        # Check cache first
        if speaker_id in self.user_conversations_cache:
            return self.user_conversations_cache[speaker_id]

        # Get all conversations where the speaker participated
        user_conversations = [convo for convo in corpus.iter_conversations()
                              if speaker_id in [utt.speaker.id for utt in convo.iter_utterances()]]

        # Sort conversations by their earliest timestamp
        user_conversations.sort(key=lambda convo: min(utt.timestamp for utt in convo.iter_utterances()))

        # Cache the result
        self.user_conversations_cache[speaker_id] = user_conversations

        return user_conversations

    def get_user_conversations_chronological(self, corpus, speaker_id):
        return self.user_conversations_cache.get(speaker_id, [])

    def get_conversations_around_change_point(self, corpus, change_point, test=False, window=10):
        # Get first change (probably only one I need)
        utterance = corpus.get_utterance(change_point)

        # Find the convo this utterance belongs to:
        conversation = utterance.get_conversation()

        # Put all user's convos in a list
        speaker_id = utterance.speaker.id
        if test is True:
            user_conversations = self.get_user_conversations_chronological_old(corpus, speaker_id)
        else:
            user_conversations = self.get_user_conversations_chronological(corpus, speaker_id)
            # print(f"Cache: {user_conversations}")

        candidate_convos = []
        # find the index of the convo, and return the convo id of the 3 prior convos
        for i, convo in enumerate(user_conversations):
            if conversation.id == user_conversations[i].id:
                # Check if there are at least two conversations before the current one
                # To this:
                if i >= window:
                    # Get the 'window' number of conversations before the current one
                    candidate_convos.extend(user_conversations[i - 10:i])
                else:
                    # If there are fewer than 10 conversations before, get all of them
                    candidate_convos.extend(user_conversations[:i])

                # Append the current conversation with the change point
                candidate_convos.append(conversation)
                break  # Found the conversation, no need to continue the loop

        return candidate_convos

In [None]:
# Download and unzip with python (Dataloading):
# !gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1DLFY6JLMZqNjwvNRZmhlV4-rnoQP_eyH/view?usp=sharing" -O "/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1nWaj5N8nsG7u5homv_kAh4CLPDv01M_Z/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics.zip" --fuzzy
!gdown "https://drive.google.com/file/d/1AIrstrzE259fcVyxJQW4-RwvAkoUyK1x/view?usp=sharing" -O "/workspace/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip" --fuzzy

# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics.zip").extractall("/content/temporal_belief_analysis")
zipfile.ZipFile("/workspace/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip").extractall("/workspace/temporal_belief_analysis")

In [6]:
CORPUS_PATH = "/workspace/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned"
corpus = Corpus(filename=CORPUS_PATH)

In [11]:
timeline_builder = TimelineBuilder(corpus)
timelines = timeline_builder.build_timelines()

2025-08-27 17:06:43,913 - temporal_belief.core.timeline_building - INFO - timeline_building:73 - Built timelines for 122040 users
INFO:temporal_belief.core.timeline_building:Built timelines for 122040 users


In [17]:
window_extractor = WindowExtractor(corpus, timelines)
window_extractor.build_global_user_conversations_index()

Building global user conversations index...
Processing 102848 conversations...
Sorting conversations for 122040 users...
Index built for 122040 users!


In [20]:
op_path_pairer = OpPathPairer(corpus, timelines)
pair_preprocessor = PairPreprocessor()
feature_extractor = ExtractFeatures()
persuasion_analyzer = Interplay()

In [113]:
import numpy as np
from collections import Counter
import logging
from typing import Dict, List, Tuple, Any, Optional


class ChangeDetector:
    """CUSUM-based change detection for political stance shifts.

    Focuses on detecting changes between 'left-leaning' and 'right-leaning' positions,
    ignoring neutral stances. Uses cumulative sum control charts to identify
    significant shifts in political orientation over time.
    """

    def __init__(self, threshold=6.0, drift=0.5, min_change_separation=5):
        """Initialize CUSUM detector with control parameters.

        Args:
            threshold: Detection threshold for CUSUM statistic (higher = less sensitive)
            drift: Reference drift value for change detection (typically 0.5-1.0)
            min_change_separation: Minimum posts between detected changes
        """
        self.threshold = threshold
        self.drift = drift
        self.min_change_separation = min_change_separation

        # Map stances to numeric values for CUSUM
        self.stance_values = {
            'left-leaning': -1.0,
            'neutral': 0.0,  # Will be filtered out
            'right-leaning': 1.0
        }

        self.all_change_points = []
        self.all_no_change_points = []

        # Logging setup
        self.logger = logging.getLogger(__name__)

    def _to_probs(self, item):
        """Convert various input formats to probability tuple (pL, pN, pR)."""
        if isinstance(item, str):
            if item == 'left-leaning':  return (1.0, 0.0, 0.0)
            if item == 'neutral':       return (0.0, 1.0, 0.0)
            if item == 'right-leaning': return (0.0, 0.0, 1.0)
            return (0.0, 1.0, 0.0)
        if isinstance(item, dict):
            return (float(item.get('pL', 0.0)), float(item.get('pN', 0.0)), float(item.get('pR', 0.0)))
        if isinstance(item, (list, tuple)) and len(item) == 3:
            pL, pN, pR = item
            return (float(pL), float(pN), float(pR))
        return (0.0, 1.0, 0.0)

    def _get_political_signal(self, prob_tuple, conf_threshold=0.6):
        """Extract political signal from probability tuple, ignoring neutral.

        Args:
            prob_tuple: (pL, pN, pR) probability tuple
            conf_threshold: Minimum confidence to consider stance reliable

        Returns:
            Float value: -1.0 (left), +1.0 (right), or None (neutral/uncertain)
        """
        pL, pN, pR = prob_tuple

        # Only consider if we have sufficient confidence in left or right
        if pL >= conf_threshold:
            return -1.0  # left-leaning
        elif pR >= conf_threshold:
            return 1.0  # right-leaning
        else:
            return None  # neutral or uncertain - ignore for CUSUM

    def detect_cusum_changes(self, topic_timeline, conf_threshold=0.6):
        """Detect political stance changes using CUSUM algorithm.

        Args:
            topic_timeline: List of (utterance_id, stance_data) tuples
            conf_threshold: Minimum confidence for reliable stance detection

        Returns:
            Dictionary with change_points and no_change_points lists
        """
        if not topic_timeline:
            return {'change_points': [], 'no_change_points': []}

        # Extract political signals, filtering out neutral/uncertain
        signals = []
        valid_utterances = []

        for utt_id, stance_data in topic_timeline:
            prob_tuple = self._to_probs(stance_data)
            signal = self._get_political_signal(prob_tuple, conf_threshold)

            if signal is not None:
                signals.append(signal)
                valid_utterances.append(utt_id)

        if len(signals) < 3:
            self.logger.warning(f"Insufficient political signals for CUSUM: {len(signals)}")
            return {'change_points': [], 'no_change_points': [utt_id for utt_id, _ in topic_timeline]}

        # CUSUM change detection
        change_indices = self._cusum_detect_changes(signals)

        # Convert indices back to utterance IDs
        change_points = [valid_utterances[idx] for idx in change_indices if idx < len(valid_utterances)]

        # All other utterances are no-change points
        change_set = set(change_points)
        no_change_points = [utt_id for utt_id, _ in topic_timeline if utt_id not in change_set]

        # Store for aggregate statistics
        self.all_change_points.extend(change_points)
        self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def _cusum_detect_changes(self, signals):
        """Core CUSUM algorithm for detecting mean shifts in political stance.

        Args:
            signals: List of political stance values (-1.0 or +1.0)

        Returns:
            List of indices where significant changes were detected
        """
        if len(signals) < 2:
            return []

        signals = np.array(signals)
        n = len(signals)
        change_points = []

        # Calculate overall mean for reference
        overall_mean = np.mean(signals)

        # Initialize CUSUM statistics
        cusum_pos = 0.0  # Positive CUSUM (detecting upward shifts)
        cusum_neg = 0.0  # Negative CUSUM (detecting downward shifts)

        for i in range(1, n):
            # Calculate deviations from reference mean
            deviation = signals[i] - overall_mean

            # Update CUSUM statistics
            cusum_pos = max(0, cusum_pos + deviation - self.drift)
            cusum_neg = max(0, cusum_neg - deviation - self.drift)

            # Check for threshold crossings
            change_detected = False

            if cusum_pos > self.threshold:
                # Positive shift detected (towards right-leaning)
                change_points.append(i)
                cusum_pos = 0.0  # Reset after detection
                change_detected = True
                self.logger.debug(f"CUSUM: Positive shift detected at index {i}")

            elif cusum_neg > self.threshold:
                # Negative shift detected (towards left-leaning)
                change_points.append(i)
                cusum_neg = 0.0  # Reset after detection
                change_detected = True
                self.logger.debug(f"CUSUM: Negative shift detected at index {i}")

            # Enforce minimum separation between changes
            if change_detected and len(change_points) > 1:
                if i - change_points[-2] < self.min_change_separation:
                    change_points.pop()  # Remove this change point
                    self.logger.debug(f"CUSUM: Removed change point at {i} due to minimum separation")

        return change_points

    def detect_cusum_changes_advanced(self, topic_timeline, conf_threshold=0.6,
                                      adaptive_threshold=True):
        """Advanced CUSUM with adaptive thresholding and confidence weighting.

        Args:
            topic_timeline: List of (utterance_id, stance_data) tuples
            conf_threshold: Minimum confidence for reliable stance detection
            adaptive_threshold: Whether to adapt threshold based on signal variance

        Returns:
            Dictionary with change_points and no_change_points lists
        """
        if not topic_timeline:
            return {'change_points': [], 'no_change_points': []}

        # Extract weighted political signals
        signals = []
        confidences = []
        valid_utterances = []

        for utt_id, stance_data in topic_timeline:
            prob_tuple = self._to_probs(stance_data)
            signal = self._get_political_signal(prob_tuple, conf_threshold)

            if signal is not None:
                signals.append(signal)
                # Extract confidence from stance_data if available
                confidence = self._extract_confidence(stance_data)
                confidences.append(confidence)
                valid_utterances.append(utt_id)

        if len(signals) < 3:
            return {'change_points': [], 'no_change_points': [utt_id for utt_id, _ in topic_timeline]}

        # Adaptive threshold based on signal variance
        threshold = self.threshold
        if adaptive_threshold:
            signal_std = np.std(signals)
            threshold = max(self.threshold, 2.0 * signal_std)
            self.logger.debug(f"CUSUM: Adaptive threshold set to {threshold:.2f}")

        # Confidence-weighted CUSUM
        change_indices = self._cusum_detect_changes_weighted(signals, confidences, threshold)

        change_points = [valid_utterances[idx] for idx in change_indices if idx < len(valid_utterances)]
        change_set = set(change_points)
        no_change_points = [utt_id for utt_id, _ in topic_timeline if utt_id not in change_set]

        self.all_change_points.extend(change_points)
        self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def _cusum_detect_changes_weighted(self, signals, confidences, threshold):
        """CUSUM with confidence weighting for more reliable change detection."""
        signals = np.array(signals)
        confidences = np.array(confidences)
        n = len(signals)
        change_points = []

        # Confidence-weighted mean
        weighted_mean = np.average(signals, weights=confidences)

        # Initialize CUSUM with confidence weighting
        cusum_pos = 0.0
        cusum_neg = 0.0

        for i in range(1, n):
            # Weight deviation by confidence
            deviation = (signals[i] - weighted_mean) * confidences[i]

            # Update CUSUM statistics
            cusum_pos = max(0, cusum_pos + deviation - self.drift)
            cusum_neg = max(0, cusum_neg - deviation - self.drift)

            # Detection with separation enforcement
            if cusum_pos > threshold or cusum_neg > threshold:
                if not change_points or i - change_points[-1] >= self.min_change_separation:
                    change_points.append(i)
                    cusum_pos = 0.0
                    cusum_neg = 0.0

                    direction = "right" if cusum_pos > cusum_neg else "left"
                    self.logger.debug(
                        f"CUSUM: {direction} shift detected at index {i}, confidence={confidences[i]:.2f}")

        return change_points

    def _extract_confidence(self, stance_data):
        """Extract confidence score from stance data."""
        if isinstance(stance_data, dict):
            return stance_data.get('confidence', 1.0)
        elif isinstance(stance_data, (list, tuple)) and len(stance_data) == 3:
            # Use max probability as confidence
            return max(stance_data)
        else:
            return 1.0  # Default confidence

    def _get_political_signal(self, prob_tuple, conf_threshold=0.6):
        """Extract political signal, ignoring neutral positions."""
        pL, pN, pR = prob_tuple

        # Only consider confident left/right positions
        if pL >= conf_threshold:
            return -1.0  # left-leaning
        elif pR >= conf_threshold:
            return 1.0  # right-leaning
        else:
            return None  # neutral/uncertain - ignore

    def get_two_groups(self, timelines, method='cusum', conf_threshold=0.6,
                       advanced=True, **kwargs):
        """
        Group users into with/without changes using CUSUM detection.

        Args:
            timelines: Dictionary of {user_id: {topic: timeline}} data
            method: Detection method ('cusum' or 'cusum_advanced')
            conf_threshold: Minimum confidence for reliable stance detection
            advanced: Whether to use confidence-weighted CUSUM
            **kwargs: Additional parameters (threshold, drift, etc.)

        Returns:
            Dictionary with 'with_changes' and 'no_changes' user groups
        """
        with_changes = {}
        no_changes = {}

        # Update detector parameters from kwargs
        if 'threshold' in kwargs:
            self.threshold = kwargs['threshold']
        if 'drift' in kwargs:
            self.drift = kwargs['drift']
        if 'min_change_separation' in kwargs:
            self.min_change_separation = kwargs['min_change_separation']

        # Select detection method
        if advanced:
            detect_func = lambda tl: self.detect_cusum_changes_advanced(
                tl, conf_threshold=conf_threshold, **kwargs
            )
        else:
            detect_func = lambda tl: self.detect_cusum_changes(
                tl, conf_threshold=conf_threshold
            )

        self.logger.info(f"Starting CUSUM change detection with threshold={self.threshold}, "
                         f"drift={self.drift}, advanced={advanced}")

        for user_id, topic_timelines in timelines.items():
            if user_id == '[deleted]':
                continue

            user_has_changes = False

            for topic_name, topic_timeline in topic_timelines.items():
                # Convert to list format expected by detection methods
                topic_timeline_list = list(topic_timeline.items())

                # Run CUSUM change detection
                changes = detect_func(topic_timeline_list)

                if changes['change_points']:
                    user_has_changes = True
                    if user_id not in with_changes:
                        with_changes[user_id] = {}

                    # Store change points with their stance data
                    with_changes[user_id][topic_name] = {
                        utt_id: topic_timeline[utt_id]
                        for utt_id in changes['change_points']
                    }

            # Users without any detected changes
            if not user_has_changes:
                no_changes[user_id] = topic_timelines

        # Log summary statistics
        self.logger.info(f"CUSUM Results: {len(with_changes)} users with changes, "
                         f"{len(no_changes)} users without changes")
        self.logger.info(f"Total change points detected: {len(self.all_change_points)}")

        return {
            'with_changes': with_changes,
            'no_changes': no_changes,
            'summary': {
                'users_with_changes': len(with_changes),
                'users_without_changes': len(no_changes),
                'total_change_points': len(self.all_change_points),
                'detection_parameters': {
                    'threshold': self.threshold,
                    'drift': self.drift,
                    'min_separation': self.min_change_separation,
                    'conf_threshold': conf_threshold
                }
            }
        }

    def analyze_change_patterns(self, with_changes_data):
        """Analyze patterns in detected political stance changes.

        Args:
            with_changes_data: Users with detected changes from get_two_groups()

        Returns:
            Dictionary containing change pattern analysis
        """
        all_changes = []

        for user_id, topics in with_changes_data.items():
            for topic_name, change_points in topics.items():
                for utt_id, stance_data in change_points.items():
                    prob_tuple = self._to_probs(stance_data)
                    signal = self._get_political_signal(prob_tuple)

                    if signal is not None:
                        all_changes.append({
                            'user_id': user_id,
                            'topic': topic_name,
                            'utterance_id': utt_id,
                            'direction': 'left_shift' if signal < 0 else 'right_shift',
                            'magnitude': abs(signal),
                            'confidence': self._extract_confidence(stance_data)
                        })

        if not all_changes:
            return {'total_changes': 0}

        # Analyze patterns
        change_directions = [c['direction'] for c in all_changes]
        change_magnitudes = [c['magnitude'] for c in all_changes]
        change_confidences = [c['confidence'] for c in all_changes]

        direction_counts = Counter(change_directions)

        return {
            'total_changes': len(all_changes),
            'direction_distribution': dict(direction_counts),
            'average_magnitude': np.mean(change_magnitudes),
            'average_confidence': np.mean(change_confidences),
            'left_shifts': direction_counts.get('left_shift', 0),
            'right_shifts': direction_counts.get('right_shift', 0),
            'most_common_direction': direction_counts.most_common(1)[0] if direction_counts else None
        }

    def tune_cusum_parameters(self, validation_timeline, known_changes=None):
        """Tune CUSUM parameters for optimal performance on validation data.

        Args:
            validation_timeline: Timeline with known change points for tuning
            known_changes: List of known change points for comparison

        Returns:
            Dictionary with optimal parameters and performance metrics
        """
        # Parameter grid for tuning
        threshold_values = [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0]
        drift_values = [0.3, 0.5, 0.7, 1.0]

        best_params = None
        best_score = -1.0
        results = []

        for threshold in threshold_values:
            for drift in drift_values:
                # Temporarily set parameters
                original_threshold = self.threshold
                original_drift = self.drift

                self.threshold = threshold
                self.drift = drift

                # Test detection
                detected = self.detect_cusum_changes(validation_timeline)

                # Calculate performance metrics
                if known_changes:
                    precision, recall, f1 = self._calculate_detection_metrics(
                        detected['change_points'], known_changes
                    )
                    score = f1
                else:
                    # Use change detection rate as proxy metric
                    score = len(detected['change_points']) / max(1, len(validation_timeline))

                results.append({
                    'threshold': threshold,
                    'drift': drift,
                    'score': score,
                    'change_points': len(detected['change_points'])
                })

                if score > best_score:
                    best_score = score
                    best_params = {'threshold': threshold, 'drift': drift}

                # Restore original parameters
                self.threshold = original_threshold
                self.drift = original_drift

        # Set best parameters
        if best_params:
            self.threshold = best_params['threshold']
            self.drift = best_params['drift']

        self.logger.info(f"CUSUM tuning complete. Best params: {best_params}, Score: {best_score:.3f}")

        return {
            'best_parameters': best_params,
            'best_score': best_score,
            'all_results': results
        }

    def _calculate_detection_metrics(self, detected_changes, known_changes):
        """Calculate precision, recall, and F1 for change detection."""
        detected_set = set(detected_changes)
        known_set = set(known_changes)

        true_positives = len(detected_set & known_set)
        false_positives = len(detected_set - known_set)
        false_negatives = len(known_set - detected_set)

        precision = true_positives / max(1, true_positives + false_positives)
        recall = true_positives / max(1, true_positives + false_negatives)
        f1 = 2 * precision * recall / max(1, precision + recall)

        return precision, recall, f1

    def get_change_statistics(self):
        """Get aggregate statistics across all processed timelines."""
        total_points = len(self.all_change_points) + len(self.all_no_change_points)
        change_rate = len(self.all_change_points) / max(1, total_points)

        return {
            'total_change_points': len(self.all_change_points),
            'total_no_change_points': len(self.all_no_change_points),
            'overall_change_rate': change_rate,
            'detection_parameters': {
                'threshold': self.threshold,
                'drift': self.drift,
                'min_separation': self.min_change_separation
            }
        }

In [96]:
class GroupPreprocessor:

    def filter_groups(self, groups, groups_tuple):
        # Calculate activity per user in treatment group
        treatment_total = 0
        control_total = 0
        for group_idx, group in enumerate(tqdm(groups_tuple, desc="Processing groups")):
            for user_id, topic_timelines in group.items():
                for topic_timeline in topic_timelines.values():
                    for change_point in topic_timeline.keys():
                        if group_idx == 0:  # Iterate through change points (keys)
                            treatment_total += 1
                        elif group_idx == 1:
                            control_total += 1

        print(f"treatment: {treatment_total}")
        # print(f"Control: {control_total}")

        treatment_activity = []
        for user_id, timelines in groups['with_changes'].items():
            total_points = sum(len(timeline) for timeline in timelines.values())
            treatment_activity.append(total_points)

        target_activity = sum(treatment_activity) // len(treatment_activity)  # Average activity
        target_total = treatment_total  # Match treatment group size

        # Filter control group users by similar activity level
        filtered_control = {}
        control_total = 0

        for user_id, timelines in groups['no_changes'].items():
            user_activity = sum(len(timeline) for timeline in timelines.values())

            # Keep users with similar activity level
            if target_activity * 0.5 <= user_activity <= target_activity * 2:
                filtered_control[user_id] = timelines
                control_total += user_activity

                # Stop when we reach target total
                if control_total >= target_total:
                    break

        # Replace control group
        groups_tuple = (groups['with_changes'], filtered_control)
        print(f"Filtered control group: {len(filtered_control)} users, ~{control_total} total points")

        return groups_tuple

In [136]:
change_detector = ChangeDetector()

In [137]:
groups_preprocessor = GroupPreprocessor()

In [138]:
groups = change_detector.get_two_groups(timelines)

In [139]:
groups_tuple = (groups['with_changes'], groups['no_changes'])
groups_tuple = groups_preprocessor.filter_groups(groups, groups_tuple)

Processing groups: 100%|██████████| 2/2 [00:00<00:00,  2.95it/s]

treatment: 2280
Filtered control group: 1826 users, ~2280 total points





In [145]:
# Add this code BEFORE your main processing loop to count change points
print("\n=== CHANGE POINT ANALYSIS ===")

for group_idx, group in enumerate(groups_tuple):
    print(f"\n📊 Group {group_idx + 1}:")
    print(f"   Total users: {len(group)}")
    
    group_total_change_points = 0
    user_change_points = []
    
    for user_id, topic_timelines in group.items():
        user_total = 0
        
        for topic_name, topic_timeline in topic_timelines.items():
            topic_change_points = len(topic_timeline.keys())
            user_total += topic_change_points
        
        user_change_points.append((user_id, user_total))
        group_total_change_points += user_total
    
    print(f"   Total change points: {group_total_change_points}")
    print(f"   Average per user: {group_total_change_points / len(group):.1f}")
    
    # Show the first few users (the ones that will actually be processed)
    print(f"   First 2 users (these will be processed):")
    for i, (user_id, count) in enumerate(user_change_points[:2]):
        print(f"     User {user_id}: {count} change points")
    
    # Show distribution
    user_counts = [count for _, count in user_change_points]
    if user_counts:
        print(f"   Min change points per user: {min(user_counts)}")
        print(f"   Max change points per user: {max(user_counts)}")

print("\n" + "="*50 + "\n")


=== CHANGE POINT ANALYSIS ===

📊 Group 1:
   Total users: 1287
   Total change points: 2280
   Average per user: 1.8
   First 2 users (these will be processed):
     User HardCoreModerate: 3 change points
     User kingvitaman: 9 change points
   Min change points per user: 1
   Max change points per user: 56

📊 Group 2:
   Total users: 1826
   Total change points: 2280
   Average per user: 1.2
   First 2 users (these will be processed):
     User amade183: 1 change points
     User TheBromanticOne: 1 change points
   Min change points per user: 1
   Max change points per user: 2




In [172]:
# Calculate the number of utterances in each group
print("Calculating group sizes...")
group_sizes = []

for group_idx, group in enumerate(tqdm(groups_tuple, desc="Counting utterances in groups")):
    utterance_count = 0
    
    for user_id, topic_timelines in group.items():
        for topic_timeline in topic_timelines.values():
            utterance_count += len(topic_timeline.keys())  # Each key is a change point/utterance
    
    group_sizes.append(utterance_count)
    print(f"Group {group_idx + 1}: {utterance_count} utterances")

# Set target_utterances to the smallest group size
target_utterances = min(group_sizes)
print(f"\nSmallest group has {target_utterances} utterances")
print(f"Setting target_utterances = {target_utterances}")

# Show the sampling strategy
print(f"\nSampling strategy:")
for group_idx, size in enumerate(group_sizes):
    print(f"Group {group_idx + 1}: {size} total → {target_utterances} sampled ({target_utterances/size*100:.1f}%)")

Calculating group sizes...


Counting utterances in groups: 100%|██████████| 2/2 [00:00<00:00, 997.93it/s]

Group 1: 2280 utterances
Group 2: 2280 utterances

Smallest group has 2280 utterances
Setting target_utterances = 2280

Sampling strategy:
Group 1: 2280 total → 2280 sampled (100.0%)
Group 2: 2280 total → 2280 sampled (100.0%)





In [173]:
# For each group
for group_idx, group in enumerate(tqdm(groups_tuple, desc="Processing groups")):
    # Initialize dictionary for this group's scores (one score per utterance)
    current_group_scores = {
        'interplay': [],
        'politeness': [],
        'argument_complexity': [],
        'evidence_markers': [],
        'hedging': []
    }

    utterances_processed = 0
    target_reached = False  # Flag to control all nested loops
    
    for user_id, topic_timelines in group.items():
        if target_reached:  # Check flag at user level
            break
            
        user_start_time = time.time()
        user_change_points = 0

        for topic_timeline in topic_timelines.values():
            if target_reached:  # Check flag at topic level
                break
                
            for change_point in topic_timeline.keys():  # Each utterance/change point
                if utterances_processed >= target_utterances:
                    target_reached = True  # Set flag instead of just breaking
                    break
                        
                utts_num += 1
                user_change_points += 1
                utterances_processed += 1

                # Window extraction
                start_time = time.time()
                try:
                    candidate_convos = window_extractor.get_conversations_around_change_point(
                        change_point=change_point, corpus=corpus, test=True
                    )
                    window_time = time.time() - start_time
                    print(f'Window extraction: {window_time:.3f}s')
                except ValueError as e:
                    print(f"Skipping change point {change_point}: {e}")
                    continue

                # Path extraction
                start_time = time.time()
                timeout_duration = 0.25
                op_path_pairs = []

                for candidate_convo in candidate_convos:
                    if time.time() - start_time > timeout_duration:
                        print(f"Path extraction timeout reached ({timeout_duration}s)")
                        break

                    try:
                        op_path_pairs.extend(op_path_pairer.extract_rooted_path_from_candidate_convos(
                            [candidate_convo], user_id
                        ))
                    except ValueError as e:
                        print(f"Skipping conversation {candidate_convo.id}: {e}")
                        continue

                path_time = time.time() - start_time
                print(f'Path extraction: {path_time:.3f}s')

                # Preprocessing
                start_time = time.time()
                preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)
                preprocess_time = time.time() - start_time
                print(f'Preprocessing: {preprocess_time:.3f}s')

                # Feature extraction - collect ALL scores for this utterance
                start_time = time.time()
                utterance_interplay_scores = []
                utterance_politeness_scores = []
                utterance_complexity_scores = []
                utterance_evidence_scores = []
                utterance_hedging_scores = []

                for op, paths in preprocessed_pairs:
                    for k, concatenated_utts in paths.items():
                        # Extract features
                        interplay_features = persuasion_analyzer.calculate_interplay_features(
                            op.text, concatenated_utts, stop_words_set
                        )
                        politeness_features = feature_extractor.get_politeness_features(concatenated_utts)
                        complexity_features = feature_extractor.extract_argument_complexity_features(concatenated_utts)
                        evidence_features = feature_extractor.extract_evidence_features(concatenated_utts)
                        hedging_features = feature_extractor.extract_hedging_features(concatenated_utts)

                        # Calculate scores
                        interplay_score = persuasion_analyzer.calculate_persuasion_score(interplay_features)
                        politeness_score = sum(politeness_features.values())
                        complexity_score = feature_extractor.calculate_complexity_score(complexity_features)
                        evidence_score = feature_extractor.calculate_evidence_score(evidence_features)
                        hedging_score = feature_extractor.calculate_hedging_score_from_features(hedging_features)

                        # Collect all scores for this utterance
                        utterance_interplay_scores.append(interplay_score)
                        utterance_politeness_scores.append(politeness_score)
                        utterance_complexity_scores.append(complexity_score)
                        utterance_evidence_scores.append(evidence_score)
                        utterance_hedging_scores.append(hedging_score)

                feature_time = time.time() - start_time

                # Take mean across all paths for this single utterance
                start_time = time.time()
                if utterance_interplay_scores:  # Only if we have scores
                    # One score per utterance (mean of all conversation paths)
                    utterance_mean_interplay = np.mean(utterance_interplay_scores)
                    utterance_mean_politeness = np.mean(utterance_politeness_scores)
                    utterance_mean_complexity = np.mean(utterance_complexity_scores)
                    utterance_mean_evidence = np.mean(utterance_evidence_scores)
                    utterance_mean_hedging = np.mean(utterance_hedging_scores)

                    # Add ONE score per utterance to group scores
                    current_group_scores['interplay'].append(utterance_mean_interplay)
                    current_group_scores['politeness'].append(utterance_mean_politeness)
                    current_group_scores['argument_complexity'].append(utterance_mean_complexity)
                    current_group_scores['evidence_markers'].append(utterance_mean_evidence)
                    current_group_scores['hedging'].append(utterance_mean_hedging)

                    print(f"Utterance {change_point}: {len(utterance_interplay_scores)} paths -> 1 mean score")
                    print(f"Group {group_idx + 1}: {utterances_processed}/{target_utterances} utterances processed")
                else:
                    print(f"Utterance {change_point}: No valid paths found, skipping")

                scoring_time = time.time() - start_time
                print(f'Scoring: {scoring_time:.3f}s')

                # Print total time for this change point
                total_time = window_time + path_time + preprocess_time + feature_time + scoring_time
                print(f'TOTAL for utterance: {total_time:.3f}s\n')

        user_total_time = time.time() - user_start_time
        if user_change_points > 0:  # Only print if user had utterances
            print(f'USER {user_id}: {user_total_time:.3f}s ({user_change_points} utterances)')

    # Calculate means for each predictor for this group
    group_mean = {}
    for predictor_name, scores in current_group_scores.items():
        if scores:
            group_mean[predictor_name] = np.mean(scores)
        else:
            group_mean[predictor_name] = 0

    print(f"\nGroup {group_idx + 1} final sample sizes:")
    for predictor_name, scores in current_group_scores.items():
        print(f"  {predictor_name}: n={len(scores)}")
    
    print(f"Group {group_idx + 1}: Processed exactly {utterances_processed} utterances")

    group_means.append(group_mean)
    group_scores.append(current_group_scores)

Processing groups:   0%|          | 0/2 [00:00<?, ?it/s]

Window extraction: 0.000s
Path extraction: 0.056s
Preprocessing: 0.000s
Utterance c4w912m: 29 paths -> 1 mean score
Group 1: 1/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.065s

Window extraction: 0.000s
Path extraction: 0.204s
Preprocessing: 0.014s
Utterance caqzidp: 294 paths -> 1 mean score
Group 1: 2/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.603s

Window extraction: 0.000s
Skipping conversation 1jcj4v: Conversation failed integrity check. It is either missing an utterance in the reply-to chain and/or has multiple root nodes. Run check_integrity() to diagnose issues.
Path extraction: 0.060s
Preprocessing: 0.001s
Utterance cbfq2p4: 45 paths -> 1 mean score
Group 1: 3/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.075s

USER HardCoreModerate: 0.744s (3 utterances)
Window extraction: 0.000s
Path extraction: 0.024s
Preprocessing: 0.001s
Utterance c7i4p5r: 215 paths -> 1 mean score
Group 1: 4/2280 utterances processed
Scoring: 

Processing groups:  50%|█████     | 1/2 [09:04<09:04, 544.25s/it]

Utterance e46g2bb: 243 paths -> 1 mean score
Group 1: 2279/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.217s

Window extraction: 0.000s
Path extraction: 0.102s
Preprocessing: 0.001s
Utterance e63zpk0: 165 paths -> 1 mean score
Group 1: 2280/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.143s

USER ElginPoker60123: 0.360s (2 utterances)

Group 1 final sample sizes:
  interplay: n=2278
  politeness: n=2278
  argument_complexity: n=2278
  evidence_markers: n=2278
  hedging: n=2278
Group 1: Processed exactly 2280 utterances
Window extraction: 0.000s
Path extraction: 0.001s
Preprocessing: 0.000s
Utterance ny6mh: 93 paths -> 1 mean score
Group 2: 1/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.025s

USER amade183: 0.025s (1 utterances)
Window extraction: 0.000s
Path extraction: 0.000s
Preprocessing: 0.000s
Utterance o0395: 16 paths -> 1 mean score
Group 2: 2/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.002s

USER Th

Processing groups: 100%|██████████| 2/2 [09:14<00:00, 277.29s/it]

Path extraction: 0.001s
Preprocessing: 0.000s
Utterance 2jq17y: 85 paths -> 1 mean score
Group 2: 2254/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.014s

USER dazzzzzed: 0.014s (1 utterances)
Window extraction: 0.000s
Path extraction: 0.000s
Preprocessing: 0.000s
Utterance 2jqqkt: 5 paths -> 1 mean score
Group 2: 2255/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.002s

Window extraction: 0.000s
Path extraction: 0.000s
Preprocessing: 0.000s
Utterance 2p7yjp: 9 paths -> 1 mean score
Group 2: 2256/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.002s

USER nadircroatia: 0.004s (2 utterances)
Window extraction: 0.000s
Path extraction: 0.002s
Preprocessing: 0.000s
Utterance 2ju3sy: 162 paths -> 1 mean score
Group 2: 2257/2280 utterances processed
Scoring: 0.000s
TOTAL for utterance: 0.018s

Window extraction: 0.000s
Path extraction: 0.002s
Preprocessing: 0.000s
Utterance clf2tbz: 162 paths -> 1 mean score
Group 2: 2258/2280 utterances p




In [176]:
import numpy as np
from scipy import stats
from scipy.stats import mannwhitneyu, levene, shapiro
import pandas as pd

def cohen_d(group1, group2):
    """Calculate Cohen's d for effect size"""
    n1, n2 = len(group1), len(group2)
    pooled_std = np.sqrt(((n1-1)*np.var(group1, ddof=1) + (n2-1)*np.var(group2, ddof=1)) / (n1+n2-2))
    return (np.mean(group1) - np.mean(group2)) / pooled_std

def interpret_effect_size(d):
    """Interpret Cohen's d effect size"""
    abs_d = abs(d)
    if abs_d < 0.2:
        return "Negligible"
    elif abs_d < 0.5:
        return "Small"
    elif abs_d < 0.8:
        return "Medium"
    else:
        return "Large"

print("=== STATISTICAL SIGNIFICANCE TESTING ===\n")

if len(group_scores) < 2:
    print("Error: Need at least 2 groups for comparison")
else:
    # Extract scores for the two groups
    group1_scores = group_scores[0]
    group2_scores = group_scores[1]
    
    # Results storage
    results_df = []
    
    print(f"Group 1 sample sizes: {[len(scores) for scores in group1_scores.values()]}")
    print(f"Group 2 sample sizes: {[len(scores) for scores in group2_scores.values()]}")
    print()
    
    # Test each predictor
    for predictor in group1_scores.keys():
        print(f"=== {predictor.upper()} ===")
        
        # Get scores for both groups
        g1_scores = np.array(group1_scores[predictor])
        g2_scores = np.array(group2_scores[predictor])
        
        # Skip if either group has no scores
        if len(g1_scores) == 0 or len(g2_scores) == 0:
            print(f"Skipping {predictor}: One or both groups have no scores\n")
            continue
            
        # Basic descriptive statistics
        g1_mean, g1_std = np.mean(g1_scores), np.std(g1_scores, ddof=1)
        g2_mean, g2_std = np.mean(g2_scores), np.std(g2_scores, ddof=1)
        
        print(f"Group 1: μ = {g1_mean:.4f}, σ = {g1_std:.4f}, n = {len(g1_scores)}")
        print(f"Group 2: μ = {g2_mean:.4f}, σ = {g2_std:.4f}, n = {len(g2_scores)}")
        
        # Calculate difference and percentage change
        difference = g1_mean - g2_mean
        percent_change = (difference / g2_mean * 100) if g2_mean != 0 else 0
        print(f"Difference: {difference:.4f} ({percent_change:+.1f}%)")
        
        # Test for normality (if sample size allows)
        normal_g1 = normal_g2 = None
        if len(g1_scores) >= 3:
            _, p_norm_g1 = shapiro(g1_scores[:5000] if len(g1_scores) > 5000 else g1_scores)
            normal_g1 = p_norm_g1 > 0.05
        if len(g2_scores) >= 3:
            _, p_norm_g2 = shapiro(g2_scores[:5000] if len(g2_scores) > 5000 else g2_scores)
            normal_g2 = p_norm_g2 > 0.05
            
        # Test for equal variances
        equal_var = None
        if len(g1_scores) >= 2 and len(g2_scores) >= 2:
            _, p_levene = levene(g1_scores, g2_scores)
            equal_var = p_levene > 0.05
        
        print(f"Normality: G1={normal_g1}, G2={normal_g2}")
        print(f"Equal variances: {equal_var}")
        
        # Choose appropriate test
        if normal_g1 and normal_g2 and equal_var:
            # Two-sample t-test (equal variances)
            t_stat, p_value = stats.ttest_ind(g1_scores, g2_scores, equal_var=True)
            test_used = "Two-sample t-test (equal var)"
        elif normal_g1 and normal_g2 and not equal_var:
            # Welch's t-test (unequal variances)
            t_stat, p_value = stats.ttest_ind(g1_scores, g2_scores, equal_var=False)
            test_used = "Welch's t-test (unequal var)"
        else:
            # Mann-Whitney U test (non-parametric)
            u_stat, p_value = mannwhitneyu(g1_scores, g2_scores, alternative='two-sided')
            test_used = "Mann-Whitney U test"
            t_stat = u_stat
        
        # Effect size (Cohen's d)
        effect_size = cohen_d(g1_scores, g2_scores)
        effect_interpretation = interpret_effect_size(effect_size)
        
        # Significance interpretation
        if p_value < 0.001:
            significance = "***"
            sig_text = "p < 0.001"
        elif p_value < 0.01:
            significance = "**"
            sig_text = "p < 0.01"
        elif p_value < 0.05:
            significance = "*"
            sig_text = "p < 0.05"
        elif p_value < 0.1:
            significance = "."
            sig_text = "p < 0.1 (marginal)"
        else:
            significance = ""
            sig_text = "not significant"
        
        print(f"Test used: {test_used}")
        print(f"Test statistic: {t_stat:.4f}")
        print(f"p-value: {p_value:.6f} {significance}")
        print(f"Result: {sig_text}")
        print(f"Effect size (Cohen's d): {effect_size:.4f} ({effect_interpretation})")
        
        # Store results
        results_df.append({
            'Predictor': predictor,
            'Group_1_Mean': g1_mean,
            'Group_1_SD': g1_std,
            'Group_1_N': len(g1_scores),
            'Group_2_Mean': g2_mean,
            'Group_2_SD': g2_std,
            'Group_2_N': len(g2_scores),
            'Difference': difference,
            'Percent_Change': percent_change,
            'Test_Used': test_used,
            'Test_Statistic': t_stat,
            'P_Value': p_value,
            'Significance': significance,
            'Effect_Size_d': effect_size,
            'Effect_Interpretation': effect_interpretation,
            'Significant': p_value < 0.05
        })
        
        print("-" * 50)
        print()
    
    # Create summary table
    results_df = pd.DataFrame(results_df)
    
    print("=== SUMMARY TABLE ===")
    summary_table = results_df[['Predictor', 'Group_1_Mean', 'Group_2_Mean', 'Difference', 
                               'P_Value', 'Significance', 'Effect_Size_d', 'Effect_Interpretation']]
    print(summary_table.to_string(index=False, float_format='%.4f'))
    
    print(f"\n=== OVERALL RESULTS ===")
    significant_predictors = results_df[results_df['Significant'] == True]
    print(f"Significant predictors (p < 0.05): {len(significant_predictors)}/{len(results_df)}")
    
    if len(significant_predictors) > 0:
        print("\nSignificant findings:")
        for _, row in significant_predictors.iterrows():
            direction = "higher" if row['Difference'] > 0 else "lower"
            print(f"  • {row['Predictor']}: Group 1 {direction} than Group 2")
            print(f"    Difference: {row['Difference']:.4f} ({row['Percent_Change']:+.1f}%)")
            print(f"    p = {row['P_Value']:.6f}, d = {row['Effect_Size_d']:.4f} ({row['Effect_Interpretation']})")
    
    # Multiple comparison correction (Bonferroni)
    n_tests = len(results_df)
    bonferroni_alpha = 0.05 / n_tests
    bonferroni_significant = results_df[results_df['P_Value'] < bonferroni_alpha]
    
    print(f"\n=== MULTIPLE COMPARISON CORRECTION ===")
    print(f"Bonferroni corrected α = 0.05/{n_tests} = {bonferroni_alpha:.6f}")
    print(f"Significant after correction: {len(bonferroni_significant)}/{len(results_df)}")
    
    if len(bonferroni_significant) > 0:
        print("\nBonferroni-corrected significant findings:")
        for _, row in bonferroni_significant.iterrows():
            direction = "higher" if row['Difference'] > 0 else "lower"
            print(f"  • {row['Predictor']}: Group 1 {direction} than Group 2")
            print(f"    p = {row['P_Value']:.6f} < {bonferroni_alpha:.6f}")
    
    # Store results for later use
    globals()['statistical_results'] = results_df
    print(f"\nResults saved to 'statistical_results' DataFrame")

=== STATISTICAL SIGNIFICANCE TESTING ===

Group 1 sample sizes: [999, 999, 999, 999, 999]
Group 2 sample sizes: [776, 776, 776, 776, 776]

=== INTERPLAY ===
Group 1: μ = 0.7991, σ = 0.0396, n = 999
Group 2: μ = 0.7618, σ = 0.1126, n = 776
Difference: 0.0372 (+4.9%)
Normality: G1=False, G2=False
Equal variances: False
Test used: Mann-Whitney U test
Test statistic: 510563.0000
p-value: 0.000000 ***
Result: p < 0.001
Effect size (Cohen's d): 0.4646 (Small)
--------------------------------------------------

=== POLITENESS ===
Group 1: μ = 0.8166, σ = 0.6758, n = 999
Group 2: μ = 1.7652, σ = 1.4155, n = 776
Difference: -0.9486 (-53.7%)
Normality: G1=False, G2=False
Equal variances: False
Test used: Mann-Whitney U test
Test statistic: 232593.0000
p-value: 0.000000 ***
Result: p < 0.001
Effect size (Cohen's d): -0.8912 (Large)
--------------------------------------------------

=== ARGUMENT_COMPLEXITY ===
Group 1: μ = 1.0110, σ = 0.0440, n = 999
Group 2: μ = 0.9733, σ = 0.0708, n = 776
Diffe