In [None]:
!git clone https://github.com/Sharp-4rth/temporal_belief_analysis.git

In [None]:
# Need to restart after:
!pip install convokit[llm]
!pip install convokit

In [1]:
import sys
import os
os.chdir('/content/temporal_belief_analysis/notebooks')
print("Changed working directory to:", os.getcwd())

# Absolute path to src directory
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

Changed working directory to: /content/temporal_belief_analysis/notebooks


In [None]:
# Add this code BEFORE your main processing loop to count change points
print("\n=== CHANGE POINT ANALYSIS ===")

for group_idx, group in enumerate(groups_tuple):
    print(f"\n📊 Group {group_idx + 1}:")
    print(f"   Total users: {len(group)}")

    group_total_change_points = 0
    user_change_points = []

    for user_id, topic_timelines in group.items():
        user_total = 0

        for topic_name, topic_timeline in topic_timelines.items():
            topic_change_points = len(topic_timeline.keys())
            user_total += topic_change_points

        user_change_points.append((user_id, user_total))
        group_total_change_points += user_total

    print(f"   Total change points: {group_total_change_points}")
    print(f"   Average per user: {group_total_change_points / len(group):.1f}")

    # Show the first few users (the ones that will actually be processed)
    print(f"   First 2 users (these will be processed):")
    for i, (user_id, count) in enumerate(user_change_points[:2]):
        print(f"     User {user_id}: {count} change points")

    # Show distribution
    user_counts = [count for _, count in user_change_points]
    if user_counts:
        print(f"   Min change points per user: {min(user_counts)}")
        print(f"   Max change points per user: {max(user_counts)}")

print("\n" + "="*50 + "\n")

In [None]:
import time
!pip install gdown
import zipfile
import nltk
from nltk.corpus import stopwords
from convokit import Corpus, download
import convokit
from temporal_belief.core.timeline_building import TimelineBuilder
from temporal_belief.core.persistence_change_detection import ChangeDetector
from temporal_belief.core.window_extraction import WindowExtractor
from temporal_belief.core.op_path_pairing import OpPathPairer
from temporal_belief.data.preprocessors import ChangeDetectorPreprocessor
from temporal_belief.data.preprocessors import PairPreprocessor
from temporal_belief.data.preprocessors import ExtractFeatures
from temporal_belief.data.preprocessors import GroupPreprocessor
from temporal_belief.core.interplay import Interplay
import numpy as np
nltk.download('stopwords')

In [3]:
# Download and unzip with python (Dataloading):
# !gdown "https://drive.google.com/file/d/1N0U_jUJlOYjdaju2FaU8p87uB22YBxJ0/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1DLFY6JLMZqNjwvNRZmhlV4-rnoQP_eyH/view?usp=sharing" -O "/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip" --fuzzy
# !gdown "https://drive.google.com/file/d/1nWaj5N8nsG7u5homv_kAh4CLPDv01M_Z/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_topics.zip" --fuzzy
!gdown "https://drive.google.com/file/d/1AIrstrzE259fcVyxJQW4-RwvAkoUyK1x/view?usp=sharing" -O "/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip" --fuzzy

# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances100000_chronological.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/merged_corpus_checkpoint_5.zip").extractall("/content/temporal_belief_analysis")
# zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_topics.zip").extractall("/content/temporal_belief_analysis")
zipfile.ZipFile("/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip").extractall("/content/temporal_belief_analysis")

Downloading...
From (original): https://drive.google.com/uc?id=1AIrstrzE259fcVyxJQW4-RwvAkoUyK1x
From (redirected): https://drive.google.com/uc?id=1AIrstrzE259fcVyxJQW4-RwvAkoUyK1x&confirm=t&uuid=cf2419fc-0025-4155-93ac-cba05fe956d3
To: /content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned.zip
100% 1.07G/1.07G [00:11<00:00, 93.0MB/s]


In [8]:
CORPUS_PATH = "/content/temporal_belief_analysis/pd_corpus_with_stances_fine_tuned"

In [None]:
corpus = Corpus(filename=CORPUS_PATH)

In [11]:
timeline_builder = TimelineBuilder(corpus)
timelines = timeline_builder.build_timelines()
change_detector = ChangeDetector()
window_extractor = WindowExtractor(corpus, timelines)
window_extractor.build_global_user_conversations_index()
groups_preprocessor = GroupPreprocessor()
op_path_pairer = OpPathPairer(corpus, timelines)
pair_preprocessor = PairPreprocessor()
feature_extractor = ExtractFeatures()
persuasion_analyzer = Interplay()

In [None]:
groups = change_detector.get_two_groups(timelines)
groups_tuple = (groups['with_changes'], groups['no_changes'])
groups = groups_preprocessor.filter_groups(groups, groups_tuple)

In [52]:
from tqdm import tqdm
from convokit import PolitenessStrategies
import re

stop_words_set = set(stopwords.words('english'))

# Convos has been set to test
# Init
i = 0
group_means = [] # Initialize as a list to append means
group_scores = []
utts_num = 0

# For each group
for group_idx, group in enumerate(tqdm(groups_tuple, desc="Processing groups")):
    # Initialize dictionary for this group's scores
    current_group_scores = {
        'interplay': [],
        'politeness': [],
        'argument_complexity': [],
        'evidence_markers': [],
        'hedging': []
    }

    count = 0
    for user_id, topic_timelines in group.items():
        # Process only 2 users for debugging
        if count < 2:

            user_start_time = time.time()
            user_change_points = 0

            for topic_timeline in topic_timelines.values():

                for change_point in topic_timeline.keys():  # Iterate through change points (keys)
                    utts_num += 1

                    user_change_points += 1

                    # TIME: Window extraction
                    start_time = time.time()
                    try:
                        candidate_convos = window_extractor.get_conversations_around_change_point(
                            change_point=change_point, corpus=corpus, test=True
                        )
                        window_time = time.time() - start_time
                        # print(f'⏱️ Window extraction: {window_time:.3f}s')
                    except ValueError as e:
                        # print(f"Skipping change point {change_point}: {e}")
                        continue


                    # TIME: Path extraction
                    start_time = time.time()
                    timeout_duration = 0.25  # 0.25 seconds
                    op_path_pairs = []

                    for candidate_convo in candidate_convos:
                        # Check if we've exceeded the timeout
                        if time.time() - start_time > timeout_duration:
                            # print(f"⏰ Path extraction timeout reached ({timeout_duration}s)")
                            break

                        try:
                            op_path_pairs.extend(op_path_pairer.extract_rooted_path_from_candidate_convos(
                                [candidate_convo], user_id
                            ))
                        except ValueError as e:
                            # print(f"Skipping conversation {candidate_convo.id}: {e}")
                            continue

                    path_time = time.time() - start_time
                    # print(f'⏱️ Path extraction: {path_time:.3f}s')


                    # TIME: Preprocessing
                    start_time = time.time()
                    preprocessed_pairs = pair_preprocessor.concatenate_path_in_all_pairs(op_path_pairs)
                    preprocess_time = time.time() - start_time
                    # print(f'⏱️ Preprocessing: {preprocess_time:.3f}s')


                    # TIME: Feature extraction (ENHANCED)
                    start_time = time.time()
                    interplay_features_list = []
                    politeness_features_list = []
                    # NEW: Feature lists for new predictors
                    argument_complexity_features_list = []
                    evidence_features_list = []
                    hedging_features_list = []

                    for op, paths in preprocessed_pairs:
                        for k, concatenated_utts in paths.items():
                            # Existing feature extraction
                            interplay_features = persuasion_analyzer.calculate_interplay_features(
                                op.text, concatenated_utts, stop_words_set
                            )
                            interplay_features_list.append(interplay_features)

                            # Fixed politeness feature extraction
                            politeness_features = feature_extractor.get_politeness_features(concatenated_utts)
                            politeness_features_list.append(politeness_features)

                            # NEW: Extract features (not scores) for new predictors
                            complexity_features = feature_extractor.extract_argument_complexity_features(concatenated_utts)
                            argument_complexity_features_list.append(complexity_features)

                            evidence_features = feature_extractor.extract_evidence_features(concatenated_utts)
                            evidence_features_list.append(evidence_features)

                            hedging_features = feature_extractor.extract_hedging_features(concatenated_utts)
                            hedging_features_list.append(hedging_features)

                    feature_time = time.time() - start_time
                    # print(f'⏱️ Feature extraction (enhanced): {feature_time:.3f}s')

                    # TIME: Score interplay (existing)
                    start_time = time.time()
                    interplay_scores = []
                    for interplay_features in interplay_features_list:
                        score = persuasion_analyzer.calculate_persuasion_score(interplay_features)
                        interplay_scores.append(score)

                    # Score politeness features (existing)
                    politeness_scores = []
                    for politeness_features in politeness_features_list:
                        politeness_total = sum(politeness_features.values())
                        politeness_scores.append(politeness_total)

                    # NEW: Score the new predictors
                    argument_complexity_scores = []
                    for complexity_features in argument_complexity_features_list:
                        score = feature_extractor.calculate_complexity_score(complexity_features)
                        argument_complexity_scores.append(score)

                    evidence_scores = []
                    for evidence_features in evidence_features_list:
                        score = feature_extractor.calculate_evidence_score(evidence_features)
                        evidence_scores.append(score)

                    hedging_scores = []
                    for hedging_features in hedging_features_list:
                        score = feature_extractor.calculate_hedging_score_from_features(hedging_features)
                        hedging_scores.append(score)

                    scoring_time = time.time() - start_time
                    # print(f'⏱️ Scoring: {scoring_time:.3f}s')

                    # Add all scores to current group (NEW STRUCTURE)
                    current_group_scores['interplay'].extend(interplay_scores)
                    current_group_scores['politeness'].extend(politeness_scores)
                    current_group_scores['argument_complexity'].extend(argument_complexity_scores)
                    current_group_scores['evidence_markers'].extend(evidence_scores)
                    current_group_scores['hedging'].extend(hedging_scores)

                    # Print total time for this change point
                    total_time = window_time + path_time + preprocess_time + feature_time + scoring_time
                    # print(f'🔥 TOTAL for change point: {total_time:.3f}s\n')

            # TIME: End timing this user
            user_total_time = time.time() - user_start_time
            # print(f'👤 USER {user_id} TOTAL: {user_total_time:.3f}s ({user_change_points} change points)')
            # print(f'📊 Average per change point: {user_total_time/max(1, user_change_points):.3f}s\n')

            count += 1

        if count >= 10:
            break

    # Calculate means for each predictor for this group (ENHANCED)
    group_mean = {}
    for predictor_name, scores in current_group_scores.items():
        if scores:  # Check if we have scores
            group_mean[predictor_name] = sum(scores) / len(scores)
        else:
            group_mean[predictor_name] = 0

    # Append this group's means
    group_means.append(group_mean)
    group_scores.append(current_group_scores)

# Print the calculated group means for each predictor (ENHANCED)
print(f'\n=== GROUP COMPARISON ===')
for group_idx, group_mean in enumerate(group_means):
    print(f'\nGroup {group_idx + 1} Means:')
    for predictor, mean_score in group_mean.items():
        print(f'  {predictor}: {mean_score:.4f}')

# Print comparison between groups
if len(group_means) >= 2:
    print(f'\n=== GROUP 1 vs GROUP 2 COMPARISON ===')
    for predictor in group_means[0].keys():
        group1_mean = group_means[0][predictor]
        group2_mean = group_means[1][predictor]
        difference = group1_mean - group2_mean
        percent_diff = (difference / group2_mean * 100) if group2_mean != 0 else 0
        print(f'{predictor}:')
        print(f'  Group 1: {group1_mean:.4f}')
        print(f'  Group 2: {group2_mean:.4f}')
        print(f'  Difference: {difference:.4f} ({percent_diff:+.1f}%)')
        print()

Processing groups:   0%|          | 0/2 [00:00<?, ?it/s]

⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.013s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction (enhanced): 0.003s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.016s

⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.047s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction (enhanced): 0.008s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.056s

⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.011s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction (enhanced): 0.003s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.014s

👤 USER seltaeb4 TOTAL: 0.087s (3 change points)
📊 Average per change point: 0.029s

⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.032s
⏱️ Preprocessing: 0.002s
⏱️ Feature extraction (enhanced): 0.110s
⏱️ Scoring: 0.001s
🔥 TOTAL for change point: 0.145s

⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.168s
⏱️ Preprocessing: 0.012s


Processing groups:  50%|█████     | 1/2 [00:00<00:00,  1.02it/s]

⏱️ Feature extraction (enhanced): 0.377s
⏱️ Scoring: 0.001s
🔥 TOTAL for change point: 0.558s

⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.011s
⏱️ Preprocessing: 0.001s
⏱️ Feature extraction (enhanced): 0.025s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.038s

⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.042s
⏱️ Preprocessing: 0.001s
⏱️ Feature extraction (enhanced): 0.021s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.064s

⏱️ Window extraction: 0.000s
Skipping conversation 1jcj4v: Conversation failed integrity check. It is either missing an utterance in the reply-to chain and/or has multiple root nodes. Run check_integrity() to diagnose issues.
⏱️ Path extraction: 0.026s
⏱️ Preprocessing: 0.001s
⏱️ Feature extraction (enhanced): 0.020s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.046s

⏱️ Window extraction: 0.000s
⏱️ Path extraction: 0.029s
⏱️ Preprocessing: 0.000s
⏱️ Feature extraction (enhanced): 0.014s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.043s

👤 USER Ha

Processing groups: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s]

⏱️ Feature extraction (enhanced): 0.024s
⏱️ Scoring: 0.000s
🔥 TOTAL for change point: 0.026s

👤 USER amade183 TOTAL: 0.026s (1 change points)
📊 Average per change point: 0.026s


=== GROUP COMPARISON ===

Group 1 Means:
  interplay: 0.8327
  politeness: 2.7514
  argument_complexity: 0.8918
  evidence_markers: 0.9904
  hedging: 0.0044

Group 2 Means:
  interplay: 0.7555
  politeness: 0.6286
  argument_complexity: 1.0112
  evidence_markers: 0.4095
  hedging: 0.0037

=== GROUP 1 vs GROUP 2 COMPARISON ===
interplay:
  Group 1: 0.8327
  Group 2: 0.7555
  Difference: 0.0772 (+10.2%)

politeness:
  Group 1: 2.7514
  Group 2: 0.6286
  Difference: 2.1229 (+337.7%)

argument_complexity:
  Group 1: 0.8918
  Group 2: 1.0112
  Difference: -0.1194 (-11.8%)

evidence_markers:
  Group 1: 0.9904
  Group 2: 0.4095
  Difference: 0.5809 (+141.9%)

hedging:
  Group 1: 0.0044
  Group 2: 0.0037
  Difference: 0.0007 (+18.5%)






In [35]:
# ========== NEW: STATISTICAL ANALYSIS SECTION ==========

def perform_statistical_tests(group1_scores, group2_scores, predictor_name, alpha=0.05):
    """
    Perform comprehensive statistical tests between two groups for a given predictor.

    Args:
        group1_scores: List of scores for group 1
        group2_scores: List of scores for group 2
        predictor_name: Name of the predictor being tested
        alpha: Significance level (default 0.05)

    Returns:
        dict: Results of all statistical tests
    """
    if not group1_scores or not group2_scores:
        return {
            'valid': False,
            'reason': 'Empty score lists'
        }

    # Convert to numpy arrays
    g1 = np.array(group1_scores)
    g2 = np.array(group2_scores)

    # Basic descriptive statistics
    results = {
        'valid': True,
        'predictor': predictor_name,
        'group1_n': len(g1),
        'group2_n': len(g2),
        'group1_mean': np.mean(g1),
        'group2_mean': np.mean(g2),
        'group1_std': np.std(g1, ddof=1),
        'group2_std': np.std(g2, ddof=1),
        'group1_median': np.median(g1),
        'group2_median': np.median(g2),
        'mean_difference': np.mean(g1) - np.mean(g2),
    }

    # Effect size (Cohen's d)
    pooled_std = np.sqrt(((len(g1) - 1) * results['group1_std']**2 +
                         (len(g2) - 1) * results['group2_std']**2) /
                        (len(g1) + len(g2) - 2))
    results['cohens_d'] = results['mean_difference'] / pooled_std if pooled_std > 0 else 0

    # Interpret effect size
    abs_d = abs(results['cohens_d'])
    if abs_d < 0.2:
        effect_size_interpretation = "negligible"
    elif abs_d < 0.5:
        effect_size_interpretation = "small"
    elif abs_d < 0.8:
        effect_size_interpretation = "medium"
    else:
        effect_size_interpretation = "large"
    results['effect_size_interpretation'] = effect_size_interpretation

    # Test for equal variances (Levene's test)
    try:
        levene_stat, levene_p = levene(g1, g2)
        results['levene_statistic'] = levene_stat
        results['levene_p_value'] = levene_p
        results['equal_variances'] = levene_p > alpha
    except Exception as e:
        results['levene_error'] = str(e)
        results['equal_variances'] = True  # Assume equal variances if test fails

    # Independent samples t-test
    try:
        # Use equal_var parameter based on Levene's test
        equal_var = results.get('equal_variances', True)
        t_stat, t_p = ttest_ind(g1, g2, equal_var=equal_var)
        results['t_statistic'] = t_stat
        results['t_p_value'] = t_p
        results['t_significant'] = t_p < alpha

        # Calculate degrees of freedom
        if equal_var:
            results['t_df'] = len(g1) + len(g2) - 2
        else:
            # Welch's t-test degrees of freedom
            s1_sq, s2_sq = results['group1_std']**2, results['group2_std']**2
            n1, n2 = len(g1), len(g2)
            results['t_df'] = ((s1_sq/n1 + s2_sq/n2)**2) / ((s1_sq/n1)**2/(n1-1) + (s2_sq/n2)**2/(n2-1))

    except Exception as e:
        results['t_test_error'] = str(e)

    # Mann-Whitney U test (non-parametric alternative)
    try:
        u_stat, u_p = mannwhitneyu(g1, g2, alternative='two-sided')
        results['mannwhitney_u_statistic'] = u_stat
        results['mannwhitney_p_value'] = u_p
        results['mannwhitney_significant'] = u_p < alpha
    except Exception as e:
        results['mannwhitney_error'] = str(e)

    # 95% Confidence interval for the difference in means
    try:
        # Pooled standard error
        n1, n2 = len(g1), len(g2)
        pooled_se = pooled_std * np.sqrt(1/n1 + 1/n2)

        # Critical t-value
        df = results.get('t_df', n1 + n2 - 2)
        t_critical = stats.t.ppf(1 - alpha/2, df)

        # Confidence interval
        margin_of_error = t_critical * pooled_se
        results['ci_lower'] = results['mean_difference'] - margin_of_error
        results['ci_upper'] = results['mean_difference'] + margin_of_error
    except Exception as e:
        results['ci_error'] = str(e)

    return results

def format_statistical_results(results):
    """Format statistical results for readable output."""
    if not results['valid']:
        return f"❌ {results['predictor']}: {results['reason']}"

    output = f"\n📊 {results['predictor'].upper().replace('_', ' ')} ANALYSIS:\n"
    output += f"{'='*50}\n"

    # Descriptive statistics
    output += f"Sample sizes: Group 1: n={results['group1_n']}, Group 2: n={results['group2_n']}\n"
    output += f"Group 1: M={results['group1_mean']:.4f} (SD={results['group1_std']:.4f}), Mdn={results['group1_median']:.4f}\n"
    output += f"Group 2: M={results['group2_mean']:.4f} (SD={results['group2_std']:.4f}), Mdn={results['group2_median']:.4f}\n"
    output += f"Mean difference: {results['mean_difference']:.4f}\n"

    # Effect size
    output += f"Effect size (Cohen's d): {results['cohens_d']:.4f} ({results['effect_size_interpretation']})\n"

    # Variance equality test
    if 'levene_p_value' in results:
        equal_var_str = "✅ Equal" if results['equal_variances'] else "❌ Unequal"
        output += f"Levene's test: F={results['levene_statistic']:.4f}, p={results['levene_p_value']:.4f} ({equal_var_str} variances)\n"

    # t-test results
    if 't_p_value' in results:
        significance = "✅ SIGNIFICANT" if results['t_significant'] else "❌ Not significant"
        test_type = "Welch's t-test" if not results.get('equal_variances', True) else "Student's t-test"
        output += f"{test_type}: t({results['t_df']:.1f})={results['t_statistic']:.4f}, p={results['t_p_value']:.4f} {significance}\n"

    # Mann-Whitney U test
    if 'mannwhitney_p_value' in results:
        significance = "✅ SIGNIFICANT" if results['mannwhitney_significant'] else "❌ Not significant"
        output += f"Mann-Whitney U: U={results['mannwhitney_u_statistic']:.1f}, p={results['mannwhitney_p_value']:.4f} {significance}\n"

    # Confidence interval
    if 'ci_lower' in results:
        output += f"95% CI for difference: [{results['ci_lower']:.4f}, {results['ci_upper']:.4f}]\n"

    return output

# Print the calculated group means for each predictor (ENHANCED)
print(f'\n=== GROUP COMPARISON ===')
for group_idx, group_mean in enumerate(group_means):
    print(f'\nGroup {group_idx + 1} Means:')
    for predictor, mean_score in group_mean.items():
        print(f'  {predictor}: {mean_score:.4f}')

# Print comparison between groups
if len(group_means) >= 2:
    print(f'\n=== GROUP 1 vs GROUP 2 COMPARISON ===')
    for predictor in group_means[0].keys():
        group1_mean = group_means[0][predictor]
        group2_mean = group_means[1][predictor]
        difference = group1_mean - group2_mean
        percent_diff = (difference / group2_mean * 100) if group2_mean != 0 else 0
        print(f'{predictor}:')
        print(f'  Group 1: {group1_mean:.4f}')
        print(f'  Group 2: {group2_mean:.4f}')
        print(f'  Difference: {difference:.4f} ({percent_diff:+.1f}%)')
        print()

# ========== NEW: COMPREHENSIVE STATISTICAL ANALYSIS ==========

if len(group_scores) >= 2:
    print(f'\n🔬 STATISTICAL SIGNIFICANCE TESTING')
    print(f'=' * 60)

    # Store all test results for summary
    all_test_results = []
    significant_predictors = []

    # Test each predictor
    for predictor in group_scores[0].keys():
        group1_scores = group_scores[0][predictor]
        group2_scores = group_scores[1][predictor]

        # Perform statistical tests
        test_results = perform_statistical_tests(group1_scores, group2_scores, predictor)
        all_test_results.append(test_results)

        # Print formatted results
        print(format_statistical_results(test_results))

        # Track significant predictors
        if test_results.get('t_significant', False):
            significant_predictors.append(predictor)

    # Multiple comparison correction (Bonferroni)
    print(f'\n🎯 MULTIPLE COMPARISON CORRECTION')
    print(f'=' * 40)
    n_tests = len([r for r in all_test_results if r['valid']])
    bonferroni_alpha = 0.05 / n_tests if n_tests > 0 else 0.05
    print(f"Number of tests: {n_tests}")
    print(f"Bonferroni-corrected α: {bonferroni_alpha:.4f}")

    bonferroni_significant = []
    for result in all_test_results:
        if result['valid'] and 't_p_value' in result:
            is_significant = result['t_p_value'] < bonferroni_alpha
            status = "✅ SIGNIFICANT" if is_significant else "❌ Not significant"
            print(f"{result['predictor']}: p={result['t_p_value']:.4f} {status}")
            if is_significant:
                bonferroni_significant.append(result['predictor'])

    # Summary of findings
    print(f'\n📋 SUMMARY OF FINDINGS')
    print(f'=' * 30)
    print(f"Total predictors tested: {n_tests}")
    print(f"Significant at α=0.05: {len(significant_predictors)} ({len(significant_predictors)/n_tests*100:.1f}%)")
    print(f"Significant after Bonferroni correction: {len(bonferroni_significant)} ({len(bonferroni_significant)/n_tests*100:.1f}%)")

    if significant_predictors:
        print(f"\nSignificant predictors (uncorrected): {', '.join(significant_predictors)}")
    if bonferroni_significant:
        print(f"Significant predictors (Bonferroni): {', '.join(bonferroni_significant)}")

    # Effect size summary
    print(f'\n📏 EFFECT SIZES SUMMARY')
    print(f'=' * 25)
    for result in all_test_results:
        if result['valid']:
            direction = "Group 1 > Group 2" if result['mean_difference'] > 0 else "Group 2 > Group 1"
            print(f"{result['predictor']}: d={result['cohens_d']:.3f} ({result['effect_size_interpretation']}, {direction})")

else:
    print("\n⚠️  Need at least 2 groups for statistical comparison")

print(f'\n🏁 Analysis completed. Total utterances processed: {utts_num}')


=== GROUP COMPARISON ===

Group 1 Means:
  interplay: 0.8327
  politeness: 0.7008
  argument_complexity: 0.8918
  evidence_markers: 0.9904
  hedging: 0.0044

Group 2 Means:
  interplay: 0.7555
  politeness: 0.4190
  argument_complexity: 1.0112
  evidence_markers: 0.4095
  hedging: 0.0037

=== GROUP 1 vs GROUP 2 COMPARISON ===
interplay:
  Group 1: 0.8327
  Group 2: 0.7555
  Difference: 0.0772 (+10.2%)

politeness:
  Group 1: 0.7008
  Group 2: 0.4190
  Difference: 0.2817 (+67.2%)

argument_complexity:
  Group 1: 0.8918
  Group 2: 1.0112
  Difference: -0.1194 (-11.8%)

evidence_markers:
  Group 1: 0.9904
  Group 2: 0.4095
  Difference: 0.5809 (+141.9%)

hedging:
  Group 1: 0.0044
  Group 2: 0.0037
  Difference: 0.0007 (+18.5%)


🔬 STATISTICAL SIGNIFICANCE TESTING

📊 INTERPLAY ANALYSIS:
Sample sizes: Group 1: n=1046, Group 2: n=105
Group 1: M=0.8327 (SD=0.0828), Mdn=0.8226
Group 2: M=0.7555 (SD=0.0779), Mdn=0.7694
Mean difference: 0.0772
Effect size (Cohen's d): 0.9369 (large)


📊 POLITE