In [1]:
!pip install polars numpy matplotlib scipy pingouin

Collecting scipy
  Downloading scipy-1.13.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m755.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pingouin
  Downloading pingouin-0.5.4-py2.py3-none-any.whl.metadata (1.1 kB)
Collecting statsmodels (from pingouin)
  Downloading statsmodels-0.14.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.5 kB)
Collecting scikit-learn (from pingouin)
  Downloading scikit_learn-1.4.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting pandas-flavor (from pingouin)
  Downloading pandas_flavor-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Collecting tabulate (from pingouin)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting xarray (from pandas-flavor->pingouin)
  Downloading xarray-2024.3.0-py3-none-any.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn->pingouin)
  Downloading threadpool

In [10]:
import pandas as pd
import pingouin as pg
from pathlib import Path
from scipy.stats import shapiro, ttest_rel, wilcoxon

# Data Preprocessing

In [11]:
# Locate the results file: the alphabetically first CSV directly inside the
# `outputs` directory.
outputs_path = Path('outputs')
csv_candidates = sorted(outputs_path.glob('*.csv'))
if not csv_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(f'No CSV files found in {outputs_path.resolve()}')
csv_file = csv_candidates[0]

In [12]:
# Load the selected results CSV. Its first column has no header, so pandas
# labels it 'Unnamed: 0' (presumably a row index saved with the file -- confirm).
df = pd.read_csv(filepath_or_buffer=csv_file)

In [13]:
# Preview the loaded results table (rendered as the cell's output).
df

Unnamed: 0,approach,generation_model,evaluation_model,from_ending_type,classified_as,num_stories
0,compressed,claude-3-opus-20240229,claude-3-opus-20240229,positive,positive,100
1,compressed,claude-3-opus-20240229,claude-3-opus-20240229,positive,negative,0
2,compressed,claude-3-opus-20240229,claude-3-opus-20240229,positive,neutral,0
3,compressed,claude-3-opus-20240229,claude-3-opus-20240229,negative,positive,11
4,compressed,claude-3-opus-20240229,claude-3-opus-20240229,negative,negative,78
...,...,...,...,...,...,...
175,compressed,chat-bison-001,claude-3-opus-20240229,positive,bittersweet,0
176,compressed,chat-bison-001,claude-3-opus-20240229,neutral,redemptive,0
177,compressed,chat-bison-001,claude-3-opus-20240229,neutral,bittersweet,0
178,compressed,chat-bison-001,claude-3-opus-20240229,negative,redemptive,0


## Pre-process data to answer the following questions:
1. Does prompt compression make the model generate last chapters with an ending of the same type as the given synopsis?

In [14]:
# Keep only the rows where the classified ending matches the ending type of
# the source synopsis (positive-positive, negative-negative, neutral-neutral).
paired_df = df.loc[df['from_ending_type'].eq(df['classified_as'])]

In [15]:
# Inspect the rows whose classified ending matches the synopsis ending.
paired_df

Unnamed: 0,approach,generation_model,evaluation_model,from_ending_type,classified_as,num_stories
0,compressed,claude-3-opus-20240229,claude-3-opus-20240229,positive,positive,100
4,compressed,claude-3-opus-20240229,claude-3-opus-20240229,negative,negative,78
8,compressed,claude-3-opus-20240229,claude-3-opus-20240229,neutral,neutral,37
10,compressed,gemini-1.0-pro,claude-3-opus-20240229,positive,positive,100
14,compressed,gemini-1.0-pro,claude-3-opus-20240229,negative,negative,79
18,compressed,gemini-1.0-pro,claude-3-opus-20240229,neutral,neutral,39
19,compressed,gpt-4-0125-preview,claude-3-opus-20240229,positive,positive,100
23,compressed,gpt-4-0125-preview,claude-3-opus-20240229,negative,negative,26
28,compressed,gpt-4-0125-preview,claude-3-opus-20240229,neutral,neutral,23
30,compressed,claude-2.1,claude-3-opus-20240229,positive,positive,100


In [21]:
# Split the matching-ending rows into one frame per generation approach,
# preserving the original row order within each frame.
baseline_paired_df, compressed_paired_df = (
    paired_df.loc[paired_df['approach'].eq(label)]
    for label in ('baseline', 'compressed')
)

In [22]:
# Inspect the baseline subset of the matching-ending rows.
baseline_paired_df

Unnamed: 0,approach,generation_model,evaluation_model,from_ending_type,classified_as,num_stories
58,baseline,claude-3-opus-20240229,claude-3-opus-20240229,positive,positive,100
62,baseline,claude-3-opus-20240229,claude-3-opus-20240229,negative,negative,73
67,baseline,claude-3-opus-20240229,claude-3-opus-20240229,neutral,neutral,30
69,baseline,gemini-1.0-pro,claude-3-opus-20240229,positive,positive,100
73,baseline,gemini-1.0-pro,claude-3-opus-20240229,negative,negative,88
78,baseline,gemini-1.0-pro,claude-3-opus-20240229,neutral,neutral,37
80,baseline,gpt-4-0125-preview,claude-3-opus-20240229,positive,positive,100
84,baseline,gpt-4-0125-preview,claude-3-opus-20240229,negative,negative,35
89,baseline,gpt-4-0125-preview,claude-3-opus-20240229,neutral,neutral,17
91,baseline,claude-2.1,claude-3-opus-20240229,positive,positive,100


# Normality Test

## Baseline

In [25]:
# Significance level: the Shapiro-Wilk p-values below are compared against it.
THRESHOLD = 0.05

In [26]:
# Shapiro-Wilk test on the baseline counts: a p-value below THRESHOLD rejects
# the hypothesis that they come from a normal distribution.
# NOTE(review): for a paired comparison the normality assumption concerns the
# pairwise differences -- consider testing those as well.
baseline_shapiro_result = shapiro(baseline_paired_df['num_stories'])
print('Not normal' if baseline_shapiro_result.pvalue < THRESHOLD else 'Normal')

Not normal


## Compressed

In [27]:
# Shapiro-Wilk test on the compressed counts: a p-value below THRESHOLD rejects
# the hypothesis that they come from a normal distribution.
# NOTE(review): for a paired comparison the normality assumption concerns the
# pairwise differences -- consider testing those as well.
compressed_shapiro_result = shapiro(compressed_paired_df['num_stories'])
print('Not normal' if compressed_shapiro_result.pvalue < THRESHOLD else 'Normal')

Not normal


Since neither setting's counts are normally distributed, a non-parametric test is used.

# Wilcoxon signed-rank test

In [29]:
# Wilcoxon signed-rank test: the non-parametric test for *paired* samples.
# (A Mann-Whitney U test would treat the two approaches as independent groups.)
# The two samples are paired by position, so first confirm that row i of each
# frame describes the same generation model, evaluation model and ending type.
pair_keys = ['generation_model', 'evaluation_model', 'from_ending_type']
baseline_keys = baseline_paired_df[pair_keys].reset_index(drop=True)
compressed_keys = compressed_paired_df[pair_keys].reset_index(drop=True)
assert baseline_keys.equals(compressed_keys), (
    'baseline and compressed rows are not aligned pair-by-pair'
)
wilcoxon_result = wilcoxon(baseline_paired_df['num_stories'], compressed_paired_df['num_stories'])

In [30]:
# Show the test statistic and p-value.
wilcoxon_result

WilcoxonResult(statistic=14.0, pvalue=0.049420750659580896)

# Cohen's D

In [31]:
# Cohen's d for baseline vs. compressed counts. The sign follows the argument
# order: a negative value means the compressed mean is the larger one.
effect_size = pg.compute_effsize(
    baseline_paired_df['num_stories'],
    compressed_paired_df['num_stories'],
    eftype='cohen',
)

In [32]:
# Show the computed Cohen's d.
effect_size

-0.1558084982618848

In [None]:
"Our analysis revealed a statistically significant difference between the baseline and compressed settings (Wilcoxon signed-rank test, W = 14.0, p = 0.049), although the effect size was small (Cohen’s d = -0.156)."