In [None]:
!git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git

In [None]:
# Need to restart after:
!pip install convokit[llm]
!pip install convokit

In [5]:
import sys
import os
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

Changed working directory to: /content/temporal_belief_analysis/notebooks


In [None]:
import time
!pip install gdown
import zipfile
import nltk
from nltk.corpus import stopwords
from convokit import Corpus, download
import convokit
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.persistence_change_detection import ChangeDetector
from temporal_belief.core.window_extraction import WindowExtractor
from temporal_belief.core.op_path_pairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor
from temporal_belief.data.preprocessors import PairPreprocessor
from temporal_belief.data.preprocessors import ExtractFeatures
from temporal_belief.data.preprocessors import GroupPreprocessor
from temporal_belief.core.interplay import Interplay
import numpy as np
nltk.download('stopwords')

In [4]:
# Download and unzip with python (Dataloading):
# !gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1DLFY6JLMZqNjwvNRZmhlV4-rnoQP_eyH/view?usp=sharing" -O "/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1nWaj5N8nsG7u5homv_kAh4CLPDv01M_Z/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics.zip" --fuzzy
!gdown "https://drive.google.com/file/d/1AIrstrzE259fcVyxJQW4-RwvAkoUyK1x/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip" --fuzzy

# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics.zip").extractall("/content/temporal_belief_analysis")
zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip").extractall("/content/temporal_belief_analysis")

Downloading...
From (original): https://drive.google.com/uc?id=1AIrstrzE259fcVyxJQW4-RwvAkoUyK1x
From (redirected): https://drive.google.com/uc?id=1AIrstrzE259fcVyxJQW4-RwvAkoUyK1x&confirm=t&uuid=31dc8d50-9d2d-4a12-8f0b-5ca2d858ad6e
To: /content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip
100% 1.07G/1.07G [00:10<00:00, 103MB/s]


In [5]:
CORPUS_PATH = "/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned"
corpus = Corpus(filename=CORPUS_PATH)

No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem


In [7]:
import logging

class PersistenceChangeDetector:
    """Class for detecting persistent changes in stance."""

    def __init__(self, logger=None):
        """Initialize the detector with optional logger."""
        self.logger = logger or logging.getLogger(__name__)

    def detect_persistent_changes(self, topic_timeline, min_persistence=2):
        """Detect persistent changes in stance."""
        if len(topic_timeline) < min_persistence + 1:
            return []

        change_points = []

        for i in range(1, len(topic_timeline) - min_persistence + 1):
            # Check if current stance differs from previous
            if topic_timeline[i][1] != topic_timeline[i-1][1]:
                # Check if new stance persists for min_persistence utterances
                current_stance = topic_timeline[i][1]
                is_persistent = all(
                    topic_timeline[i + j][1] == current_stance
                    for j in range(min_persistence)
                )

                if is_persistent:
                    change_index = i
                    utt_id = topic_timeline[i][0]
                    change_points.append((change_index, utt_id))

        return change_points

    def get_two_groups(self, timelines, min_persistence=2, conf_threshold=0.6, **kwargs):
        """
        Group users into with/without changes using persistence-based detection.

        Args:
            timelines: Dictionary of {user_id: {topic: timeline}} data
            min_persistence: Minimum number of consecutive utterances for persistent change
            conf_threshold: Minimum confidence for reliable stance detection
            **kwargs: Additional parameters (for compatibility)

        Returns:
            Dictionary with 'with_changes' and 'no_changes' user groups
        """
        with_changes = {}
        no_changes = {}
        total_change_points = 0

        self.logger.info(f"Starting persistence-based change detection with "
                        f"min_persistence={min_persistence}, conf_threshold={conf_threshold}")

        for user_id, topic_timelines in timelines.items():
            if user_id == '[deleted]':
                continue

            user_has_changes = False

            for topic_name, topic_timeline in topic_timelines.items():
                # Convert to list format expected by detection method
                topic_timeline_list = list(topic_timeline.items())

                # Filter by confidence threshold if stance data includes confidence
                filtered_timeline = []
                for utt_id, stance_data in topic_timeline_list:
                    if isinstance(stance_data, dict) and 'confidence' in stance_data:
                        if stance_data['confidence'] >= conf_threshold:
                            filtered_timeline.append((utt_id, stance_data['stance']))
                    else:
                        # Assume stance_data is just the stance value
                        filtered_timeline.append((utt_id, stance_data))

                # Run persistence-based change detection
                change_points = self.detect_persistent_changes(filtered_timeline, min_persistence)

                if change_points:
                    user_has_changes = True
                    if user_id not in with_changes:
                        with_changes[user_id] = {}

                    # Store change points with their stance data
                    with_changes[user_id][topic_name] = {}
                    for change_index, utt_id in change_points:
                        with_changes[user_id][topic_name][utt_id] = topic_timeline[utt_id]
                        total_change_points += 1

            # Users without any detected changes
            if not user_has_changes:
                no_changes[user_id] = topic_timelines

        # Log summary statistics
        self.logger.info(f"Persistence Results: {len(with_changes)} users with changes, "
                        f"{len(no_changes)} users without changes")
        self.logger.info(f"Total change points detected: {total_change_points}")

        return {
            'with_changes': with_changes,
            'no_changes': no_changes,
            'summary': {
                'users_with_changes': len(with_changes),
                'users_without_changes': len(no_changes),
                'total_change_points': total_change_points,
                'detection_parameters': {
                    'min_persistence': min_persistence,
                    'conf_threshold': conf_threshold
                }
            }
        }

In [8]:
import numpy as np
from collections import Counter
import logging
from typing import Dict, List, Tuple, Any, Optional


class ChangeDetector:
    """CUSUM-based change detection for political stance shifts.

    Focuses on detecting changes between 'left-leaning' and 'right-leaning' positions,
    ignoring neutral stances. Uses cumulative sum control charts to identify
    significant shifts in political orientation over time.
    """

    def __init__(self, threshold=6.0, drift=0.5, min_change_separation=5):
        """Initialize CUSUM detector with control parameters.

        Args:
            threshold: Detection threshold for CUSUM statistic (higher = less sensitive)
            drift: Reference drift value for change detection (typically 0.5-1.0)
            min_change_separation: Minimum posts between detected changes
        """
        self.threshold = threshold
        self.drift = drift
        self.min_change_separation = min_change_separation

        # Map stances to numeric values for CUSUM
        self.stance_values = {
            'left-leaning': -1.0,
            'neutral': 0.0,  # Will be filtered out
            'right-leaning': 1.0
        }

        self.all_change_points = []
        self.all_no_change_points = []

        # Logging setup
        self.logger = logging.getLogger(__name__)

    def _to_probs(self, item):
        """Convert various input formats to probability tuple (pL, pN, pR)."""
        if isinstance(item, str):
            if item == 'left-leaning':  return (1.0, 0.0, 0.0)
            if item == 'neutral':       return (0.0, 1.0, 0.0)
            if item == 'right-leaning': return (0.0, 0.0, 1.0)
            return (0.0, 1.0, 0.0)
        if isinstance(item, dict):
            return (float(item.get('pL', 0.0)), float(item.get('pN', 0.0)), float(item.get('pR', 0.0)))
        if isinstance(item, (list, tuple)) and len(item) == 3:
            pL, pN, pR = item
            return (float(pL), float(pN), float(pR))
        return (0.0, 1.0, 0.0)

    def _get_political_signal(self, prob_tuple, conf_threshold=0.6):
        """Extract political signal from probability tuple, ignoring neutral.

        Args:
            prob_tuple: (pL, pN, pR) probability tuple
            conf_threshold: Minimum confidence to consider stance reliable

        Returns:
            Float value: -1.0 (left), +1.0 (right), or None (neutral/uncertain)
        """
        pL, pN, pR = prob_tuple

        # Only consider if we have sufficient confidence in left or right
        if pL >= conf_threshold:
            return -1.0  # left-leaning
        elif pR >= conf_threshold:
            return 1.0  # right-leaning
        else:
            return None  # neutral or uncertain - ignore for CUSUM

    def detect_cusum_changes(self, topic_timeline, conf_threshold=0.6):
        """Detect political stance changes using CUSUM algorithm.

        Args:
            topic_timeline: List of (utterance_id, stance_data) tuples
            conf_threshold: Minimum confidence for reliable stance detection

        Returns:
            Dictionary with change_points and no_change_points lists
        """
        if not topic_timeline:
            return {'change_points': [], 'no_change_points': []}

        # Extract political signals, filtering out neutral/uncertain
        signals = []
        valid_utterances = []

        for utt_id, stance_data in topic_timeline:
            prob_tuple = self._to_probs(stance_data)
            signal = self._get_political_signal(prob_tuple, conf_threshold)

            if signal is not None:
                signals.append(signal)
                valid_utterances.append(utt_id)

        if len(signals) < 3:
            self.logger.warning(f"Insufficient political signals for CUSUM: {len(signals)}")
            return {'change_points': [], 'no_change_points': [utt_id for utt_id, _ in topic_timeline]}

        # CUSUM change detection
        change_indices = self._cusum_detect_changes(signals)

        # Convert indices back to utterance IDs
        change_points = [valid_utterances[idx] for idx in change_indices if idx < len(valid_utterances)]

        # All other utterances are no-change points
        change_set = set(change_points)
        no_change_points = [utt_id for utt_id, _ in topic_timeline if utt_id not in change_set]

        # Store for aggregate statistics
        self.all_change_points.extend(change_points)
        self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def _cusum_detect_changes(self, signals):
        """Core CUSUM algorithm for detecting mean shifts in political stance.

        Args:
            signals: List of political stance values (-1.0 or +1.0)

        Returns:
            List of indices where significant changes were detected
        """
        if len(signals) < 2:
            return []

        signals = np.array(signals)
        n = len(signals)
        change_points = []

        # Calculate overall mean for reference
        overall_mean = np.mean(signals)

        # Initialize CUSUM statistics
        cusum_pos = 0.0  # Positive CUSUM (detecting upward shifts)
        cusum_neg = 0.0  # Negative CUSUM (detecting downward shifts)

        for i in range(1, n):
            # Calculate deviations from reference mean
            deviation = signals[i] - overall_mean

            # Update CUSUM statistics
            cusum_pos = max(0, cusum_pos + deviation - self.drift)
            cusum_neg = max(0, cusum_neg - deviation - self.drift)

            # Check for threshold crossings
            change_detected = False

            if cusum_pos > self.threshold:
                # Positive shift detected (towards right-leaning)
                change_points.append(i)
                cusum_pos = 0.0  # Reset after detection
                change_detected = True
                self.logger.debug(f"CUSUM: Positive shift detected at index {i}")

            elif cusum_neg > self.threshold:
                # Negative shift detected (towards left-leaning)
                change_points.append(i)
                cusum_neg = 0.0  # Reset after detection
                change_detected = True
                self.logger.debug(f"CUSUM: Negative shift detected at index {i}")

            # Enforce minimum separation between changes
            if change_detected and len(change_points) > 1:
                if i - change_points[-2] < self.min_change_separation:
                    change_points.pop()  # Remove this change point
                    self.logger.debug(f"CUSUM: Removed change point at {i} due to minimum separation")

        return change_points

    def detect_cusum_changes_advanced(self, topic_timeline, conf_threshold=0.6,
                                      adaptive_threshold=True):
        """Advanced CUSUM with adaptive thresholding and confidence weighting.

        Args:
            topic_timeline: List of (utterance_id, stance_data) tuples
            conf_threshold: Minimum confidence for reliable stance detection
            adaptive_threshold: Whether to adapt threshold based on signal variance

        Returns:
            Dictionary with change_points and no_change_points lists
        """
        if not topic_timeline:
            return {'change_points': [], 'no_change_points': []}

        # Extract weighted political signals
        signals = []
        confidences = []
        valid_utterances = []

        for utt_id, stance_data in topic_timeline:
            prob_tuple = self._to_probs(stance_data)
            signal = self._get_political_signal(prob_tuple, conf_threshold)

            if signal is not None:
                signals.append(signal)
                # Extract confidence from stance_data if available
                confidence = self._extract_confidence(stance_data)
                confidences.append(confidence)
                valid_utterances.append(utt_id)

        if len(signals) < 3:
            return {'change_points': [], 'no_change_points': [utt_id for utt_id, _ in topic_timeline]}

        # Adaptive threshold based on signal variance
        threshold = self.threshold
        if adaptive_threshold:
            signal_std = np.std(signals)
            threshold = max(self.threshold, 2.0 * signal_std)
            self.logger.debug(f"CUSUM: Adaptive threshold set to {threshold:.2f}")

        # Confidence-weighted CUSUM
        change_indices = self._cusum_detect_changes_weighted(signals, confidences, threshold)

        change_points = [valid_utterances[idx] for idx in change_indices if idx < len(valid_utterances)]
        change_set = set(change_points)
        no_change_points = [utt_id for utt_id, _ in topic_timeline if utt_id not in change_set]

        self.all_change_points.extend(change_points)
        self.all_no_change_points.extend(no_change_points)

        return {
            'change_points': change_points,
            'no_change_points': no_change_points
        }

    def _cusum_detect_changes_weighted(self, signals, confidences, threshold):
        """CUSUM with confidence weighting for more reliable change detection."""
        signals = np.array(signals)
        confidences = np.array(confidences)
        n = len(signals)
        change_points = []

        # Confidence-weighted mean
        weighted_mean = np.average(signals, weights=confidences)

        # Initialize CUSUM with confidence weighting
        cusum_pos = 0.0
        cusum_neg = 0.0

        for i in range(1, n):
            # Weight deviation by confidence
            deviation = (signals[i] - weighted_mean) * confidences[i]

            # Update CUSUM statistics
            cusum_pos = max(0, cusum_pos + deviation - self.drift)
            cusum_neg = max(0, cusum_neg - deviation - self.drift)

            # Detection with separation enforcement
            if cusum_pos > threshold or cusum_neg > threshold:
                if not change_points or i - change_points[-1] >= self.min_change_separation:
                    change_points.append(i)
                    cusum_pos = 0.0
                    cusum_neg = 0.0

                    direction = "right" if cusum_pos > cusum_neg else "left"
                    self.logger.debug(
                        f"CUSUM: {direction} shift detected at index {i}, confidence={confidences[i]:.2f}")

        return change_points

    def _extract_confidence(self, stance_data):
        """Extract confidence score from stance data."""
        if isinstance(stance_data, dict):
            return stance_data.get('confidence', 1.0)
        elif isinstance(stance_data, (list, tuple)) and len(stance_data) == 3:
            # Use max probability as confidence
            return max(stance_data)
        else:
            return 1.0  # Default confidence

    def _get_political_signal(self, prob_tuple, conf_threshold=0.6):
        """Extract political signal, ignoring neutral positions."""
        pL, pN, pR = prob_tuple

        # Only consider confident left/right positions
        if pL >= conf_threshold:
            return -1.0  # left-leaning
        elif pR >= conf_threshold:
            return 1.0  # right-leaning
        else:
            return None  # neutral/uncertain - ignore

    def get_two_groups(self, timelines, method='cusum', conf_threshold=0.6,
                       advanced=True, **kwargs):
        """
        Group users into with/without changes using CUSUM detection.

        Args:
            timelines: Dictionary of {user_id: {topic: timeline}} data
            method: Detection method ('cusum' or 'cusum_advanced')
            conf_threshold: Minimum confidence for reliable stance detection
            advanced: Whether to use confidence-weighted CUSUM
            **kwargs: Additional parameters (threshold, drift, etc.)

        Returns:
            Dictionary with 'with_changes' and 'no_changes' user groups
        """
        with_changes = {}
        no_changes = {}

        # Update detector parameters from kwargs
        if 'threshold' in kwargs:
            self.threshold = kwargs['threshold']
        if 'drift' in kwargs:
            self.drift = kwargs['drift']
        if 'min_change_separation' in kwargs:
            self.min_change_separation = kwargs['min_change_separation']

        # Select detection method
        if advanced:
            detect_func = lambda tl: self.detect_cusum_changes_advanced(
                tl, conf_threshold=conf_threshold, **kwargs
            )
        else:
            detect_func = lambda tl: self.detect_cusum_changes(
                tl, conf_threshold=conf_threshold
            )

        self.logger.info(f"Starting CUSUM change detection with threshold={self.threshold}, "
                         f"drift={self.drift}, advanced={advanced}")

        for user_id, topic_timelines in timelines.items():
            if user_id == '[deleted]':
                continue

            user_has_changes = False

            for topic_name, topic_timeline in topic_timelines.items():
                # Convert to list format expected by detection methods
                topic_timeline_list = list(topic_timeline.items())

                # Run CUSUM change detection
                changes = detect_func(topic_timeline_list)

                if changes['change_points']:
                    user_has_changes = True
                    if user_id not in with_changes:
                        with_changes[user_id] = {}

                    # Store change points with their stance data
                    with_changes[user_id][topic_name] = {
                        utt_id: topic_timeline[utt_id]
                        for utt_id in changes['change_points']
                    }

            # Users without any detected changes
            if not user_has_changes:
                no_changes[user_id] = topic_timelines

        # Log summary statistics
        self.logger.info(f"CUSUM Results: {len(with_changes)} users with changes, "
                         f"{len(no_changes)} users without changes")
        self.logger.info(f"Total change points detected: {len(self.all_change_points)}")

        return {
            'with_changes': with_changes,
            'no_changes': no_changes,
            'summary': {
                'users_with_changes': len(with_changes),
                'users_without_changes': len(no_changes),
                'total_change_points': len(self.all_change_points),
                'detection_parameters': {
                    'threshold': self.threshold,
                    'drift': self.drift,
                    'min_separation': self.min_change_separation,
                    'conf_threshold': conf_threshold
                }
            }
        }

    def analyze_change_patterns(self, with_changes_data):
        """Analyze patterns in detected political stance changes.

        Args:
            with_changes_data: Users with detected changes from get_two_groups()

        Returns:
            Dictionary containing change pattern analysis
        """
        all_changes = []

        for user_id, topics in with_changes_data.items():
            for topic_name, change_points in topics.items():
                for utt_id, stance_data in change_points.items():
                    prob_tuple = self._to_probs(stance_data)
                    signal = self._get_political_signal(prob_tuple)

                    if signal is not None:
                        all_changes.append({
                            'user_id': user_id,
                            'topic': topic_name,
                            'utterance_id': utt_id,
                            'direction': 'left_shift' if signal < 0 else 'right_shift',
                            'magnitude': abs(signal),
                            'confidence': self._extract_confidence(stance_data)
                        })

        if not all_changes:
            return {'total_changes': 0}

        # Analyze patterns
        change_directions = [c['direction'] for c in all_changes]
        change_magnitudes = [c['magnitude'] for c in all_changes]
        change_confidences = [c['confidence'] for c in all_changes]

        direction_counts = Counter(change_directions)

        return {
            'total_changes': len(all_changes),
            'direction_distribution': dict(direction_counts),
            'average_magnitude': np.mean(change_magnitudes),
            'average_confidence': np.mean(change_confidences),
            'left_shifts': direction_counts.get('left_shift', 0),
            'right_shifts': direction_counts.get('right_shift', 0),
            'most_common_direction': direction_counts.most_common(1)[0] if direction_counts else None
        }

    def tune_cusum_parameters(self, validation_timeline, known_changes=None):
        """Tune CUSUM parameters for optimal performance on validation data.

        Args:
            validation_timeline: Timeline with known change points for tuning
            known_changes: List of known change points for comparison

        Returns:
            Dictionary with optimal parameters and performance metrics
        """
        # Parameter grid for tuning
        threshold_values = [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0]
        drift_values = [0.3, 0.5, 0.7, 1.0]

        best_params = None
        best_score = -1.0
        results = []

        for threshold in threshold_values:
            for drift in drift_values:
                # Temporarily set parameters
                original_threshold = self.threshold
                original_drift = self.drift

                self.threshold = threshold
                self.drift = drift

                # Test detection
                detected = self.detect_cusum_changes(validation_timeline)

                # Calculate performance metrics
                if known_changes:
                    precision, recall, f1 = self._calculate_detection_metrics(
                        detected['change_points'], known_changes
                    )
                    score = f1
                else:
                    # Use change detection rate as proxy metric
                    score = len(detected['change_points']) / max(1, len(validation_timeline))

                results.append({
                    'threshold': threshold,
                    'drift': drift,
                    'score': score,
                    'change_points': len(detected['change_points'])
                })

                if score > best_score:
                    best_score = score
                    best_params = {'threshold': threshold, 'drift': drift}

                # Restore original parameters
                self.threshold = original_threshold
                self.drift = original_drift

        # Set best parameters
        if best_params:
            self.threshold = best_params['threshold']
            self.drift = best_params['drift']

        self.logger.info(f"CUSUM tuning complete. Best params: {best_params}, Score: {best_score:.3f}")

        return {
            'best_parameters': best_params,
            'best_score': best_score,
            'all_results': results
        }

    def _calculate_detection_metrics(self, detected_changes, known_changes):
        """Calculate precision, recall, and F1 for change detection."""
        detected_set = set(detected_changes)
        known_set = set(known_changes)

        true_positives = len(detected_set & known_set)
        false_positives = len(detected_set - known_set)
        false_negatives = len(known_set - detected_set)

        precision = true_positives / max(1, true_positives + false_positives)
        recall = true_positives / max(1, true_positives + false_negatives)
        f1 = 2 * precision * recall / max(1, precision + recall)

        return precision, recall, f1

    def get_change_statistics(self):
        """Get aggregate statistics across all processed timelines."""
        total_points = len(self.all_change_points) + len(self.all_no_change_points)
        change_rate = len(self.all_change_points) / max(1, total_points)

        return {
            'total_change_points': len(self.all_change_points),
            'total_no_change_points': len(self.all_no_change_points),
            'overall_change_rate': change_rate,
            'detection_parameters': {
                'threshold': self.threshold,
                'drift': self.drift,
                'min_separation': self.min_change_separation
            }
        }

In [9]:
persistence_change_detector = PersistenceChangeDetector()
timeline_builder = TimelineBuilder(corpus)
timelines = timeline_builder.build_timelines()
change_detector = ChangeDetector()
window_extractor = WindowExtractor(corpus, timelines)

2025-08-30 15:53:25,771 - temporal_belief.core.timeline_building - INFO - timeline_building:73 - Built timelines for 122040 users
INFO:temporal_belief.core.timeline_building:Built timelines for 122040 users


In [10]:
cusum_groups = change_detector.get_two_groups(timelines)
cusum_groups_tuple = (cusum_groups['with_changes'], cusum_groups['no_changes'])

In [11]:
persistence_groups = persistence_change_detector.get_two_groups(timelines)
persistence_groups_tuple = (persistence_groups['with_changes'], persistence_groups['no_changes'])

In [14]:
# Evaluate true positives from cusum with extended temporal context
import datetime
target_utterances = 200

# Only process the first group
first_group = cusum_groups_tuple[0]

utterances_processed = 0
target_reached = False

def get_time_difference_days(timestamp1, timestamp2):
    """Calculate difference in days between two timestamps"""
    if isinstance(timestamp1, (int, float)):
        timestamp1 = datetime.datetime.fromtimestamp(timestamp1)
    if isinstance(timestamp2, (int, float)):
        timestamp2 = datetime.datetime.fromtimestamp(timestamp2)

    return abs((timestamp1 - timestamp2).days)

for user_id, topic_timelines in first_group.items():
    if target_reached:
        break

    user_start_time = time.time()
    user_change_points = 0

    for topic_name, topic_timeline in topic_timelines.items():
        if target_reached:
            break

        for change_point_utterance_id in topic_timeline.keys():  # Each utterance/change point
            if utterances_processed >= target_utterances:
                target_reached = True
                break

            # Get the change point utterance from corpus
            try:
                change_point_utterance = corpus.get_utterance(change_point_utterance_id)
                change_point_conversation = corpus.get_conversation(change_point_utterance.conversation_id)

                print("=" * 80)
                print("CHANGE POINT DETECTED:")
                print("=" * 80)
                print(f"Change point utterance ID: {change_point_utterance_id}")
                print(f"Title: {change_point_conversation.meta.get('title', 'N/A')}")
                print(f"Main post: {change_point_conversation.meta.get('selftext', 'N/A')}")
                print(f"User: {change_point_utterance.speaker.id}")
                print(f"Utterance: {change_point_utterance.text}")
                print(f"Stance label: {change_point_utterance.meta.get('stance', 'N/A')}")
                print(f"Change point timestamp: {change_point_utterance.timestamp}")

                # Get the reply-to post if it exists
                if change_point_utterance.reply_to:
                    try:
                        reply_to_utterance = corpus.get_utterance(change_point_utterance.reply_to)
                        print(f"Reply-to post: {reply_to_utterance.text}")
                    except KeyError:
                        print("Reply-to post: Referenced utterance not found in corpus")
                else:
                    print("Reply-to post: This is a top-level comment")

            except KeyError as e:
                print(f"Error: Change point utterance {change_point_utterance_id} not found in corpus: {e}")
                continue

            print("\n" + "-" * 80)
            print("TEMPORAL CONTEXT AROUND CHANGE POINT:")
            print("-" * 80)

            # Collect ALL utterances from this user across the entire corpus
            change_point_timestamp = change_point_utterance.timestamp
            all_user_utterances = []

            print("Searching corpus for all user utterances...")
            for conversation in corpus.iter_conversations():
                try:
                    user_utterances = [
                        (utt, conversation) for utt in conversation.iter_utterances()
                        if (utt.speaker.id == user_id and
                            utt.timestamp is not None and
                            utt.timestamp < change_point_timestamp)  # Only utterances before change point
                    ]
                    all_user_utterances.extend(user_utterances)
                except Exception as e:
                    print(f"Error processing conversation {conversation.id}: {e}")
                    continue

            if all_user_utterances:
                # Sort all utterances by timestamp (oldest first)
                all_user_utterances.sort(key=lambda x: x[0].timestamp)

                print(f"Found {len(all_user_utterances)} utterances from user before change point")

                if all_user_utterances:
                    # Calculate temporal distances for diagnostic purposes
                    earliest_utterance = all_user_utterances[0]
                    latest_utterance = all_user_utterances[-1]

                    earliest_days = get_time_difference_days(earliest_utterance[0].timestamp, change_point_timestamp)
                    latest_days = get_time_difference_days(latest_utterance[0].timestamp, change_point_timestamp)

                    print(f"📊 Temporal range: earliest is {earliest_days} days before, latest is {latest_days} days before")

                    # Get the 2 oldest utterances (furthest back in time)
                    historical_utterances = all_user_utterances[:2]

                    # Get utterances from 2-3 months ago if available
                    months_ago_utterances = []
                    for utt, conv in all_user_utterances:
                        days_diff = get_time_difference_days(utt.timestamp, change_point_timestamp)
                        if 60 <= days_diff <= 90:  # 2-3 months ago
                            months_ago_utterances.append((utt, conv, days_diff))

                    # If we have utterances from 2-3 months ago, use those instead
                    if months_ago_utterances:
                        # Sort by timestamp and take the 2 most recent from that period
                        months_ago_utterances.sort(key=lambda x: x[0].timestamp)
                        selected_historical = [(utt, conv) for utt, conv, _ in months_ago_utterances[-2:]]
                        print(f"📚 Using 2 utterances from 2-3 months ago (out of {len(months_ago_utterances)} available)")
                    else:
                        selected_historical = historical_utterances
                        print(f"📚 Using 2 oldest utterances (no utterances found from 2-3 months ago)")

                    # Get a few recent utterances after the change point for comparison
                    after_utterances = []
                    for conversation in corpus.iter_conversations():
                        try:
                            user_utterances_after = [
                                (utt, conversation) for utt in conversation.iter_utterances()
                                if (utt.speaker.id == user_id and
                                    utt.timestamp is not None and
                                    utt.timestamp > change_point_timestamp)
                            ]
                            after_utterances.extend(user_utterances_after)
                        except Exception as e:
                            continue

                    # Sort and take first 2 after change point
                    after_utterances.sort(key=lambda x: x[0].timestamp)
                    selected_after = after_utterances[:2] if len(after_utterances) >= 2 else after_utterances

                    # Display the selected utterances
                    all_selected = []

                    # Add historical utterances
                    for utterance, conversation in selected_historical:
                        days_diff = get_time_difference_days(utterance.timestamp, change_point_timestamp)
                        all_selected.append(("HISTORICAL", utterance, conversation, days_diff))

                    # Add the change point
                    all_selected.append(("CHANGE_POINT", change_point_utterance, change_point_conversation, 0))

                    # Add after utterances
                    for utterance, conversation in selected_after:
                        days_diff = get_time_difference_days(utterance.timestamp, change_point_timestamp)
                        all_selected.append(("AFTER", utterance, conversation, days_diff))

                    print(f"\nShowing {len(all_selected)} utterances:")
                    print("="*60)

                    for category, utterance, conversation, days_diff in all_selected:

                        if category == "HISTORICAL":
                            print(f"\n📚 **HISTORICAL** ({days_diff} days before):")
                        elif category == "CHANGE_POINT":
                            print(f"\n🔄 **CHANGE POINT**:")
                        else:  # AFTER
                            print(f"\n📈 **AFTER** ({days_diff} days after):")

                        print(f"Topic: {topic_name}")
                        print(f"Conversation: {conversation.id}")
                        print(f"Title: {conversation.meta.get('title', 'N/A')}")
                        print(f"Timestamp: {utterance.timestamp}")
                        print(f"User: {utterance.speaker.id}")
                        print(f"Utterance: {utterance.text}")
                        print(f"Stance label: {utterance.meta.get('detected_stance', 'N/A')}")

                        # Get main post content
                        try:
                            main_post_utterance = corpus.get_utterance(conversation.id)
                            print(f"Main post: {main_post_utterance.text}")
                        except KeyError:
                            print("Main post: Unable to retrieve main post content")

                        # Get the reply-to post if it exists
                        if utterance.reply_to:
                            try:
                                reply_to_utterance = corpus.get_utterance(utterance.reply_to)
                                print(f"Reply-to post: {reply_to_utterance.text}")
                            except KeyError:
                                print("Reply-to post: Referenced utterance not found in corpus")
                        else:
                            print("Reply-to post: This is a top-level comment")

                        print("-" * 40)

                    # Summary statistics
                    if len([x for x in all_selected if x[0] == "HISTORICAL"]) > 0:
                        historical_stances = [utterance.meta.get('detected_stance', 'N/A') for category, utterance, _, _ in all_selected if category == "HISTORICAL"]
                        print(f"\n📊 HISTORICAL STANCE PATTERN: {historical_stances}")

                    change_point_stance = change_point_utterance.meta.get('detected_stance', 'N/A')
                    print(f"📊 CHANGE POINT STANCE: {change_point_stance}")

                    if len([x for x in all_selected if x[0] == "AFTER"]) > 0:
                        after_stances = [utterance.meta.get('detected_stance', 'N/A') for category, utterance, _, _ in all_selected if category == "AFTER"]
                        print(f"📊 AFTER STANCE PATTERN: {after_stances}")
                else:
                    print(f"No utterances found from user {user_id} before the change point")
            else:
                print("No utterances found from this user in the entire corpus")

            print("\n" + "=" * 80)
            print()

            utterances_processed += 1
            user_change_points += 1

print(f"Processed {utterances_processed} change points from the first group")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
&gt; we haven't seen anything, so I'm inclined to believe there's no exculpatory evidence

That's not the way burden of proof works.  An accusation isn't true until someone proves it false, someone isn't guilty until they're proven innocent.
Stance label: left-leaning
Main post: [removed]
Reply-to post: &gt;Having a sexual relationship with a subordinate within the workplace is defined as sexual harassment.

But not sexual assault.

&gt;They came forward, they just didn't come forward publically.

Well, now that I've clarified what I meant, feel free to clarify any examples you think apply.

&gt;They are all examples of sexual misconduct.

But not sexual assault.

&gt;Yeah, now, after the campaign, much like the accusations against Trump will be found to be not credible after the campaign.

The Clinton accusations weren't.  A hundred thousand fervent trump supporters would be very happy to discredit those accusations and 

In [17]:
# Evaluate negatives from cusum with extended temporal context
import datetime
target_utterances = 200

# Only process the first group
first_group = cusum_groups_tuple[1]

utterances_processed = 0
target_reached = False

def get_time_difference_days(timestamp1, timestamp2):
    """Calculate difference in days between two timestamps"""
    if isinstance(timestamp1, (int, float)):
        timestamp1 = datetime.datetime.fromtimestamp(timestamp1)
    if isinstance(timestamp2, (int, float)):
        timestamp2 = datetime.datetime.fromtimestamp(timestamp2)

    return abs((timestamp1 - timestamp2).days)

for user_id, topic_timelines in first_group.items():
    if target_reached:
        break

    user_start_time = time.time()
    user_change_points = 0

    for topic_name, topic_timeline in topic_timelines.items():
        if target_reached:
            break

        for change_point_utterance_id in topic_timeline.keys():  # Each utterance/change point
            if utterances_processed >= target_utterances:
                target_reached = True
                break

            # Get the change point utterance from corpus
            try:
                change_point_utterance = corpus.get_utterance(change_point_utterance_id)
                change_point_conversation = corpus.get_conversation(change_point_utterance.conversation_id)

                print("=" * 80)
                print("POINT DETECTED:")
                print("=" * 80)
                print(f"Change point utterance ID: {change_point_utterance_id}")
                print(f"Title: {change_point_conversation.meta.get('title', 'N/A')}")
                print(f"Main post: {change_point_conversation.meta.get('selftext', 'N/A')}")
                print(f"User: {change_point_utterance.speaker.id}")
                print(f"Utterance: {change_point_utterance.text}")
                print(f"Stance label: {change_point_utterance.meta.get('stance', 'N/A')}")
                print(f"Change point timestamp: {change_point_utterance.timestamp}")

                # Get the reply-to post if it exists
                if change_point_utterance.reply_to:
                    try:
                        reply_to_utterance = corpus.get_utterance(change_point_utterance.reply_to)
                        print(f"Reply-to post: {reply_to_utterance.text}")
                    except KeyError:
                        print("Reply-to post: Referenced utterance not found in corpus")
                else:
                    print("Reply-to post: This is a top-level comment")

            except KeyError as e:
                print(f"Error: Change point utterance {change_point_utterance_id} not found in corpus: {e}")
                continue

            print("\n" + "-" * 80)
            print("TEMPORAL CONTEXT AROUND POINT:")
            print("-" * 80)

            # Collect ALL utterances from this user across the entire corpus
            change_point_timestamp = change_point_utterance.timestamp
            all_user_utterances = []

            print("Searching corpus for all user utterances...")
            for conversation in corpus.iter_conversations():
                try:
                    user_utterances = [
                        (utt, conversation) for utt in conversation.iter_utterances()
                        if (utt.speaker.id == user_id and
                            utt.timestamp is not None and
                            utt.timestamp < change_point_timestamp)  # Only utterances before change point
                    ]
                    all_user_utterances.extend(user_utterances)
                except Exception as e:
                    print(f"Error processing conversation {conversation.id}: {e}")
                    continue

            if all_user_utterances:
                # Sort all utterances by timestamp (oldest first)
                all_user_utterances.sort(key=lambda x: x[0].timestamp)

                print(f"Found {len(all_user_utterances)} utterances from user before change point")

                if all_user_utterances:
                    # Calculate temporal distances for diagnostic purposes
                    earliest_utterance = all_user_utterances[0]
                    latest_utterance = all_user_utterances[-1]

                    earliest_days = get_time_difference_days(earliest_utterance[0].timestamp, change_point_timestamp)
                    latest_days = get_time_difference_days(latest_utterance[0].timestamp, change_point_timestamp)

                    print(f"📊 Temporal range: earliest is {earliest_days} days before, latest is {latest_days} days before")

                    # Get the 2 oldest utterances (furthest back in time)
                    historical_utterances = all_user_utterances[:2]

                    # Get utterances from 2-3 months ago if available
                    months_ago_utterances = []
                    for utt, conv in all_user_utterances:
                        days_diff = get_time_difference_days(utt.timestamp, change_point_timestamp)
                        if 60 <= days_diff <= 90:  # 2-3 months ago
                            months_ago_utterances.append((utt, conv, days_diff))

                    # If we have utterances from 2-3 months ago, use those instead
                    if months_ago_utterances:
                        # Sort by timestamp and take the 2 most recent from that period
                        months_ago_utterances.sort(key=lambda x: x[0].timestamp)
                        selected_historical = [(utt, conv) for utt, conv, _ in months_ago_utterances[-2:]]
                        print(f"📚 Using 2 utterances from 2-3 months ago (out of {len(months_ago_utterances)} available)")
                    else:
                        selected_historical = historical_utterances
                        print(f"📚 Using 2 oldest utterances (no utterances found from 2-3 months ago)")

                    # Get a few recent utterances after the change point for comparison
                    after_utterances = []
                    for conversation in corpus.iter_conversations():
                        try:
                            user_utterances_after = [
                                (utt, conversation) for utt in conversation.iter_utterances()
                                if (utt.speaker.id == user_id and
                                    utt.timestamp is not None and
                                    utt.timestamp > change_point_timestamp)
                            ]
                            after_utterances.extend(user_utterances_after)
                        except Exception as e:
                            continue

                    # Sort and take first 2 after change point
                    after_utterances.sort(key=lambda x: x[0].timestamp)
                    selected_after = after_utterances[:2] if len(after_utterances) >= 2 else after_utterances

                    # Display the selected utterances
                    all_selected = []

                    # Add historical utterances
                    for utterance, conversation in selected_historical:
                        days_diff = get_time_difference_days(utterance.timestamp, change_point_timestamp)
                        all_selected.append(("HISTORICAL", utterance, conversation, days_diff))

                    # Add the change point
                    all_selected.append(("CHANGE_POINT", change_point_utterance, change_point_conversation, 0))

                    # Add after utterances
                    for utterance, conversation in selected_after:
                        days_diff = get_time_difference_days(utterance.timestamp, change_point_timestamp)
                        all_selected.append(("AFTER", utterance, conversation, days_diff))

                    print(f"\nShowing {len(all_selected)} utterances:")
                    print("="*60)

                    for category, utterance, conversation, days_diff in all_selected:

                        if category == "HISTORICAL":
                            print(f"\n📚 **HISTORICAL** ({days_diff} days before):")
                        elif category == "CHANGE_POINT":
                            print(f"\n🔄 **NOT A CHANGE POINT**:")
                        else:  # AFTER
                            print(f"\n📈 **AFTER** ({days_diff} days after):")

                        print(f"Topic: {topic_name}")
                        print(f"Conversation: {conversation.id}")
                        print(f"Title: {conversation.meta.get('title', 'N/A')}")
                        print(f"Timestamp: {utterance.timestamp}")
                        print(f"User: {utterance.speaker.id}")
                        print(f"Utterance: {utterance.text}")
                        print(f"Stance label: {utterance.meta.get('detected_stance', 'N/A')}")

                        # Get main post content
                        try:
                            main_post_utterance = corpus.get_utterance(conversation.id)
                            print(f"Main post: {main_post_utterance.text}")
                        except KeyError:
                            print("Main post: Unable to retrieve main post content")

                        # Get the reply-to post if it exists
                        if utterance.reply_to:
                            try:
                                reply_to_utterance = corpus.get_utterance(utterance.reply_to)
                                print(f"Reply-to post: {reply_to_utterance.text}")
                            except KeyError:
                                print("Reply-to post: Referenced utterance not found in corpus")
                        else:
                            print("Reply-to post: This is a top-level comment")

                        print("-" * 40)

                    # Summary statistics
                    if len([x for x in all_selected if x[0] == "HISTORICAL"]) > 0:
                        historical_stances = [utterance.meta.get('detected_stance', 'N/A') for category, utterance, _, _ in all_selected if category == "HISTORICAL"]
                        print(f"\n📊 HISTORICAL STANCE PATTERN: {historical_stances}")

                    change_point_stance = change_point_utterance.meta.get('detected_stance', 'N/A')
                    print(f"📊 STANCE: {change_point_stance}")

                    if len([x for x in all_selected if x[0] == "AFTER"]) > 0:
                        after_stances = [utterance.meta.get('detected_stance', 'N/A') for category, utterance, _, _ in all_selected if category == "AFTER"]
                        print(f"📊 AFTER STANCE PATTERN: {after_stances}")
                else:
                    print(f"No utterances found from user {user_id} before the change point")
            else:
                print("No utterances found from this user in the entire corpus")

            print("\n" + "=" * 80)
            print()

            utterances_processed += 1
            user_change_points += 1

print(f"Processed {utterances_processed} change points from the first group")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
200K + Cadillac benefits is a small price to pay for having to deal with us fussy bastards.
Stance label: right-leaning
Main post: tl;dr version:  $200,000 isn't as significant a sum of money to the super rich as it is to the average person.

I've seen a lot of submissions in r/politics claiming slashing congressional delegates' salary and/or benefits would encourage them into taking action.

I could be wrong but I would guess that a significant number of congressional delegates do not consider their salary their primary source of income.

&gt;The current salary (2011) for rank-and-file members of the House and Senate is $174,000 per year. (http://usgovinfo.about.com/od/uscongress/a/congresspay.htm)

Granted, that's a lot of money to most of us but how much money is it to them?  How much of their salary is spent on lawsuits, cost of living (likely in giant houses, mind you) and other expenses related to being a public fig

KeyboardInterrupt: 

In [1]:
# Evaluate positives from persistence with extended temporal context
import datetime
target_utterances = 200

# Only process the first group
first_group = persistence_groups_tuple[0]

utterances_processed = 0
target_reached = False

def get_time_difference_days(timestamp1, timestamp2):
    """Calculate difference in days between two timestamps"""
    if isinstance(timestamp1, (int, float)):
        timestamp1 = datetime.datetime.fromtimestamp(timestamp1)
    if isinstance(timestamp2, (int, float)):
        timestamp2 = datetime.datetime.fromtimestamp(timestamp2)

    return abs((timestamp1 - timestamp2).days)

users_processed = 0
max_users_to_show = 50  # Limit how many different users we show

for user_id, topic_timelines in first_group.items():
    if target_reached or users_processed >= max_users_to_show:
        break

    user_start_time = time.time()
    user_change_points = 0
    user_has_shown_change_point = False  # Track if we've shown a change point for this user

    for topic_name, topic_timeline in topic_timelines.items():
        if target_reached or user_has_shown_change_point:
            break

        for change_point_utterance_id in topic_timeline.keys():  # Each utterance/change point
            if utterances_processed >= target_utterances or user_has_shown_change_point:
                if user_has_shown_change_point:
                    break
                target_reached = True
                break

            # Get the change point utterance from corpus
            try:
                change_point_utterance = corpus.get_utterance(change_point_utterance_id)
                change_point_conversation = corpus.get_conversation(change_point_utterance.conversation_id)

                print("=" * 80)
                print("CHANGE POINT DETECTED:")
                print("=" * 80)
                print(f"Change point utterance ID: {change_point_utterance_id}")
                print(f"Title: {change_point_conversation.meta.get('title', 'N/A')}")
                print(f"Main post: {change_point_conversation.meta.get('selftext', 'N/A')}")
                print(f"User: {change_point_utterance.speaker.id}")
                print(f"Utterance: {change_point_utterance.text}")
                print(f"Stance label: {change_point_utterance.meta.get('stance', 'N/A')}")
                print(f"Change point timestamp: {change_point_utterance.timestamp}")

                # Get the reply-to post if it exists
                if change_point_utterance.reply_to:
                    try:
                        reply_to_utterance = corpus.get_utterance(change_point_utterance.reply_to)
                        print(f"Reply-to post: {reply_to_utterance.text}")
                    except KeyError:
                        print("Reply-to post: Referenced utterance not found in corpus")
                else:
                    print("Reply-to post: This is a top-level comment")

            except KeyError as e:
                print(f"Error: Change point utterance {change_point_utterance_id} not found in corpus: {e}")
                continue

            print("\n" + "-" * 80)
            print("TEMPORAL CONTEXT AROUND CHANGE POINT:")
            print("-" * 80)

            # Collect ALL utterances from THIS USER across the entire corpus
            change_point_timestamp = change_point_utterance.timestamp
            all_user_utterances = []

            print(f"Searching corpus for all utterances from user {user_id}...")
            for conversation in corpus.iter_conversations():
                try:
                    user_utterances = [
                        (utt, conversation) for utt in conversation.iter_utterances()
                        if (utt.speaker.id == user_id and
                            utt.timestamp is not None and
                            utt.timestamp < change_point_timestamp)  # Only utterances before change point
                    ]
                    all_user_utterances.extend(user_utterances)
                except Exception as e:
                    print(f"Error processing conversation {conversation.id}: {e}")
                    continue

            if all_user_utterances:
                # Sort all utterances by timestamp (oldest first)
                all_user_utterances.sort(key=lambda x: x[0].timestamp)

                print(f"Found {len(all_user_utterances)} utterances from user {user_id} before change point")

                if all_user_utterances:
                    # Calculate temporal distances for diagnostic purposes
                    earliest_utterance = all_user_utterances[0]
                    latest_utterance = all_user_utterances[-1]

                    earliest_days = get_time_difference_days(earliest_utterance[0].timestamp, change_point_timestamp)
                    latest_days = get_time_difference_days(latest_utterance[0].timestamp, change_point_timestamp)

                    print(f"📊 Temporal range: earliest is {earliest_days} days before, latest is {latest_days} days before")

                    # Get the 2 oldest utterances (furthest back in time)
                    historical_utterances = all_user_utterances[:2]

                    # Get utterances from 2-3 months ago if available
                    months_ago_utterances = []
                    for utt, conv in all_user_utterances:
                        days_diff = get_time_difference_days(utt.timestamp, change_point_timestamp)
                        if 60 <= days_diff <= 90:  # 2-3 months ago
                            months_ago_utterances.append((utt, conv, days_diff))

                    # If we have utterances from 2-3 months ago, use those instead
                    if months_ago_utterances:
                        # Sort by timestamp and take the 2 most recent from that period
                        months_ago_utterances.sort(key=lambda x: x[0].timestamp)
                        selected_historical = [(utt, conv) for utt, conv, _ in months_ago_utterances[-2:]]
                        print(f"📚 Using 2 utterances from 2-3 months ago (out of {len(months_ago_utterances)} available)")
                    else:
                        selected_historical = historical_utterances
                        print(f"📚 Using 2 oldest utterances (no utterances found from 2-3 months ago)")

                    # Get a few recent utterances after the change point for comparison
                    after_utterances = []
                    for conversation in corpus.iter_conversations():
                        try:
                            user_utterances_after = [
                                (utt, conversation) for utt in conversation.iter_utterances()
                                if (utt.speaker.id == user_id and
                                    utt.timestamp is not None and
                                    utt.timestamp > change_point_timestamp)
                            ]
                            after_utterances.extend(user_utterances_after)
                        except Exception as e:
                            continue

                    # Sort and take first 2 after change point
                    after_utterances.sort(key=lambda x: x[0].timestamp)
                    selected_after = after_utterances[:2] if len(after_utterances) >= 2 else after_utterances

                    # Display the selected utterances
                    all_selected = []

                    # Add historical utterances
                    for utterance, conversation in selected_historical:
                        days_diff = get_time_difference_days(utterance.timestamp, change_point_timestamp)
                        all_selected.append(("HISTORICAL", utterance, conversation, days_diff))

                    # Add the change point
                    all_selected.append(("CHANGE_POINT", change_point_utterance, change_point_conversation, 0))

                    # Add after utterances
                    for utterance, conversation in selected_after:
                        days_diff = get_time_difference_days(utterance.timestamp, change_point_timestamp)
                        all_selected.append(("AFTER", utterance, conversation, days_diff))

                    print(f"\nShowing {len(all_selected)} utterances from ALL speakers:")
                    print("="*60)

                    for category, utterance, conversation, days_diff in all_selected:

                        if category == "HISTORICAL":
                            print(f"\n📚 **HISTORICAL** ({days_diff} days before):")
                        elif category == "CHANGE_POINT":
                            print(f"\n🔄 **CHANGE POINT**:")
                        else:  # AFTER
                            print(f"\n📈 **AFTER** ({days_diff} days after):")

                        print(f"Topic: {topic_name}")
                        print(f"Conversation: {conversation.id}")
                        print(f"Title: {conversation.meta.get('title', 'N/A')}")
                        print(f"Timestamp: {utterance.timestamp}")
                        print(f"Speaker: {utterance.speaker.id}")  # Changed from "User" to "Speaker"

                        # Highlight if this is the same user as the change point or a different speaker
                        if utterance.speaker.id == user_id:
                            print(f"👤 Same user as change point")
                        else:
                            print(f"👥 Different speaker (change point user: {user_id})")

                        print(f"Utterance: {utterance.text}")
                        print(f"Stance label: {utterance.meta.get('detected_stance', 'N/A')}")

                        # Get the reply-to post if it exists
                        if utterance.reply_to:
                            try:
                                reply_to_utterance = corpus.get_utterance(utterance.reply_to)
                                print(f"Reply-to post: {reply_to_utterance.text}")
                                print(f"Reply-to speaker: {reply_to_utterance.speaker.id}")
                            except KeyError:
                                print("Reply-to post: Referenced utterance not found in corpus")
                        else:
                            print("Reply-to post: This is a top-level comment")

                        print("-" * 40)

                    # Summary statistics showing speaker diversity
                    historical_speakers = [utterance.speaker.id for category, utterance, _, _ in all_selected if category == "HISTORICAL"]
                    after_speakers = [utterance.speaker.id for category, utterance, _, _ in all_selected if category == "AFTER"]

                    print(f"\n📊 USER {user_id} ANALYSIS:")
                    print(f"Historical stances: {[utterance.meta.get('detected_stance', 'N/A') for category, utterance, _, _ in all_selected if category == 'HISTORICAL']}")
                    print(f"Change point stance: {change_point_utterance.meta.get('detected_stance', 'N/A')}")
                    print(f"After stances: {[utterance.meta.get('detected_stance', 'N/A') for category, utterance, _, _ in all_selected if category == 'AFTER']}")

                else:
                    print(f"No utterances found from user {user_id} before the change point")
            else:
                print("No utterances found from this user in the entire corpus")

            print("\n" + "=" * 80)
            print()

            utterances_processed += 1
            user_change_points += 1
            user_has_shown_change_point = True  # Mark that we've processed this user

    if user_has_shown_change_point:
        users_processed += 1
        print(f"\n🎯 COMPLETED ANALYSIS FOR USER {user_id} ({users_processed}/{max_users_to_show})")
        print("=" * 80)

print(f"Processed {utterances_processed} change points from {users_processed} different users")

NameError: name 'persistence_groups_tuple' is not defined

In [19]:
# Evaluate negatives from persistence with extended temporal context
import datetime
target_utterances = 200

# Only process the first group
first_group = persistence_groups_tuple[1]

utterances_processed = 0
target_reached = False

def get_time_difference_days(timestamp1, timestamp2):
    """Calculate difference in days between two timestamps"""
    if isinstance(timestamp1, (int, float)):
        timestamp1 = datetime.datetime.fromtimestamp(timestamp1)
    if isinstance(timestamp2, (int, float)):
        timestamp2 = datetime.datetime.fromtimestamp(timestamp2)

    return abs((timestamp1 - timestamp2).days)

for user_id, topic_timelines in first_group.items():
    if target_reached:
        break

    user_start_time = time.time()
    user_change_points = 0

    for topic_name, topic_timeline in topic_timelines.items():
        if target_reached:
            break

        for change_point_utterance_id in topic_timeline.keys():  # Each utterance/change point
            if utterances_processed >= target_utterances:
                target_reached = True
                break

            # Get the change point utterance from corpus
            try:
                change_point_utterance = corpus.get_utterance(change_point_utterance_id)
                change_point_conversation = corpus.get_conversation(change_point_utterance.conversation_id)

                print("=" * 80)
                print("POINT DETECTED:")
                print("=" * 80)
                print(f"Change point utterance ID: {change_point_utterance_id}")
                print(f"Title: {change_point_conversation.meta.get('title', 'N/A')}")
                print(f"Main post: {change_point_conversation.meta.get('selftext', 'N/A')}")
                print(f"User: {change_point_utterance.speaker.id}")
                print(f"Utterance: {change_point_utterance.text}")
                print(f"Stance label: {change_point_utterance.meta.get('stance', 'N/A')}")
                print(f"Change point timestamp: {change_point_utterance.timestamp}")

                # Get the reply-to post if it exists
                if change_point_utterance.reply_to:
                    try:
                        reply_to_utterance = corpus.get_utterance(change_point_utterance.reply_to)
                        print(f"Reply-to post: {reply_to_utterance.text}")
                    except KeyError:
                        print("Reply-to post: Referenced utterance not found in corpus")
                else:
                    print("Reply-to post: This is a top-level comment")

            except KeyError as e:
                print(f"Error: Change point utterance {change_point_utterance_id} not found in corpus: {e}")
                continue

            print("\n" + "-" * 80)
            print("TEMPORAL CONTEXT AROUND POINT:")
            print("-" * 80)

            # Collect ALL utterances from this user across the entire corpus
            change_point_timestamp = change_point_utterance.timestamp
            all_user_utterances = []

            print("Searching corpus for all user utterances...")
            for conversation in corpus.iter_conversations():
                try:
                    user_utterances = [
                        (utt, conversation) for utt in conversation.iter_utterances()
                        if (utt.speaker.id == user_id and
                            utt.timestamp is not None and
                            utt.timestamp < change_point_timestamp)  # Only utterances before change point
                    ]
                    all_user_utterances.extend(user_utterances)
                except Exception as e:
                    print(f"Error processing conversation {conversation.id}: {e}")
                    continue

            if all_user_utterances:
                # Sort all utterances by timestamp (oldest first)
                all_user_utterances.sort(key=lambda x: x[0].timestamp)

                print(f"Found {len(all_user_utterances)} utterances from user before change point")

                if all_user_utterances:
                    # Calculate temporal distances for diagnostic purposes
                    earliest_utterance = all_user_utterances[0]
                    latest_utterance = all_user_utterances[-1]

                    earliest_days = get_time_difference_days(earliest_utterance[0].timestamp, change_point_timestamp)
                    latest_days = get_time_difference_days(latest_utterance[0].timestamp, change_point_timestamp)

                    print(f"📊 Temporal range: earliest is {earliest_days} days before, latest is {latest_days} days before")

                    # Get the 2 oldest utterances (furthest back in time)
                    historical_utterances = all_user_utterances[:2]

                    # Get utterances from 2-3 months ago if available
                    months_ago_utterances = []
                    for utt, conv in all_user_utterances:
                        days_diff = get_time_difference_days(utt.timestamp, change_point_timestamp)
                        if 60 <= days_diff <= 90:  # 2-3 months ago
                            months_ago_utterances.append((utt, conv, days_diff))

                    # If we have utterances from 2-3 months ago, use those instead
                    if months_ago_utterances:
                        # Sort by timestamp and take the 2 most recent from that period
                        months_ago_utterances.sort(key=lambda x: x[0].timestamp)
                        selected_historical = [(utt, conv) for utt, conv, _ in months_ago_utterances[-2:]]
                        print(f"📚 Using 2 utterances from 2-3 months ago (out of {len(months_ago_utterances)} available)")
                    else:
                        selected_historical = historical_utterances
                        print(f"📚 Using 2 oldest utterances (no utterances found from 2-3 months ago)")

                    # Get a few recent utterances after the change point for comparison
                    after_utterances = []
                    for conversation in corpus.iter_conversations():
                        try:
                            user_utterances_after = [
                                (utt, conversation) for utt in conversation.iter_utterances()
                                if (utt.speaker.id == user_id and
                                    utt.timestamp is not None and
                                    utt.timestamp > change_point_timestamp)
                            ]
                            after_utterances.extend(user_utterances_after)
                        except Exception as e:
                            continue

                    # Sort and take first 2 after change point
                    after_utterances.sort(key=lambda x: x[0].timestamp)
                    selected_after = after_utterances[:2] if len(after_utterances) >= 2 else after_utterances

                    # Display the selected utterances
                    all_selected = []

                    # Add historical utterances
                    for utterance, conversation in selected_historical:
                        days_diff = get_time_difference_days(utterance.timestamp, change_point_timestamp)
                        all_selected.append(("HISTORICAL", utterance, conversation, days_diff))

                    # Add the change point
                    all_selected.append(("CHANGE_POINT", change_point_utterance, change_point_conversation, 0))

                    # Add after utterances
                    for utterance, conversation in selected_after:
                        days_diff = get_time_difference_days(utterance.timestamp, change_point_timestamp)
                        all_selected.append(("AFTER", utterance, conversation, days_diff))

                    print(f"\nShowing {len(all_selected)} utterances:")
                    print("="*60)

                    for category, utterance, conversation, days_diff in all_selected:

                        if category == "HISTORICAL":
                            print(f"\n📚 **HISTORICAL** ({days_diff} days before):")
                        elif category == "CHANGE_POINT":
                            print(f"\n🔄 **NOT A CHANGE POINT**:")
                        else:  # AFTER
                            print(f"\n📈 **AFTER** ({days_diff} days after):")

                        print(f"Topic: {topic_name}")
                        print(f"Conversation: {conversation.id}")
                        print(f"Title: {conversation.meta.get('title', 'N/A')}")
                        print(f"Timestamp: {utterance.timestamp}")
                        print(f"User: {utterance.speaker.id}")
                        print(f"Utterance: {utterance.text}")
                        print(f"Stance label: {utterance.meta.get('detected_stance', 'N/A')}")

                        # Get main post content
                        try:
                            main_post_utterance = corpus.get_utterance(conversation.id)
                            print(f"Main post: {main_post_utterance.text}")
                        except KeyError:
                            print("Main post: Unable to retrieve main post content")

                        # Get the reply-to post if it exists
                        if utterance.reply_to:
                            try:
                                reply_to_utterance = corpus.get_utterance(utterance.reply_to)
                                print(f"Reply-to post: {reply_to_utterance.text}")
                            except KeyError:
                                print("Reply-to post: Referenced utterance not found in corpus")
                        else:
                            print("Reply-to post: This is a top-level comment")

                        print("-" * 40)

                    # Summary statistics
                    if len([x for x in all_selected if x[0] == "HISTORICAL"]) > 0:
                        historical_stances = [utterance.meta.get('detected_stance', 'N/A') for category, utterance, _, _ in all_selected if category == "HISTORICAL"]
                        print(f"\n📊 HISTORICAL STANCE PATTERN: {historical_stances}")

                    change_point_stance = change_point_utterance.meta.get('detected_stance', 'N/A')
                    print(f"📊 STANCE: {change_point_stance}")

                    if len([x for x in all_selected if x[0] == "AFTER"]) > 0:
                        after_stances = [utterance.meta.get('detected_stance', 'N/A') for category, utterance, _, _ in all_selected if category == "AFTER"]
                        print(f"📊 AFTER STANCE PATTERN: {after_stances}")
                else:
                    print(f"No utterances found from user {user_id} before the change point")
            else:
                print("No utterances found from this user in the entire corpus")

            print("\n" + "=" * 80)
            print()

            utterances_processed += 1
            user_change_points += 1

print(f"Processed {utterances_processed} change points from the first group")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
--------------------------------------------------------------------------------
TEMPORAL CONTEXT AROUND POINT:
--------------------------------------------------------------------------------
Searching corpus for all user utterances...
Found 1 utterances from user before change point
📊 Temporal range: earliest is 1 days before, latest is 1 days before
📚 Using 2 oldest utterances (no utterances found from 2-3 months ago)

Showing 4 utterances:

📚 **HISTORICAL** (1 days before):
Topic: Abortion
Conversation: o46ly
Title: Haliburton was caught selling refining technology to Iran back in 2005. Should those at Haliburton be "indefinitely detained" for aiding and abetting terrorism?
Timestamp: 1325802328
User: DroodEdwin
Utterance: whats the relevance?? You're mad about a Obama policy so you're trying to include it in a discussion with a  company that was connected to Cheney???  
Stance label: right-leaning
Main post: Because 

KeyboardInterrupt: 