# Applying bootstrap method to perplexities

In [None]:
import os

import numpy as np
import pandas as pd
from scipy.stats import bootstrap  # see: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html
import matplotlib.pyplot as plt
from datasets import load_dataset
from tabulate import tabulate

%matplotlib inline
%load_ext dotenv
%dotenv

In [None]:
# Log in to the Hugging Face Hub (presumably required to read the dataset
# loaded in the next cell -- confirm whether it is gated/private).
# `huggingface_hub` is imported here because the notebook's import cell does
# not bring it into scope; without it every reference below is a NameError.
import huggingface_hub

# Token is read from the environment (populated from a .env file by %dotenv).
hf_token = os.getenv("HUGGINGFACE_API_KEY")

if hf_token:
    try:
        huggingface_hub.login(token=hf_token)
    except Exception as err:
        # Best-effort: a rejected token or a network failure should not abort
        # the notebook; report it and fall back to the interactive prompt.
        # `Exception` (not a bare except) lets Ctrl-C still interrupt the cell.
        print(f"Token login failed ({err!r}); falling back to interactive login.")
        huggingface_hub.login()
else:
    # No token configured: ask for one interactively.
    huggingface_hub.login()

In [None]:
# Load the 'test' split of the perplexity dataset and convert it to a pandas
# DataFrame; both `ds` and `df` are used by the cells below.
ds = load_dataset('LLMsForHepth/hep-th_perplexities', split='test')
df = ds.to_pandas()
# Last expression of the cell, so the notebook displays the summary statistics.
df.describe()

## Do bootstrapping

In [None]:
# Names of all dataset columns whose name starts with 'perplexity',
# in the order the dataset reports them.
perplexity_cols = [name for name in ds.column_names if name.startswith('perplexity')]

# Short display labels for the plots. They are paired with `perplexity_cols`
# purely by position.
# NOTE(review): this assumes the dataset lists its perplexity columns in
# exactly this order -- confirm against `ds.column_names`.
models = [
    'Llama',
    's1', 's2', 's3', 's4', 's5',
    's6', 's7', 's8', 's9', 's10',
    's1_qkv', 's2_qkv', 's3_qkv', 's4_qkv', 's5_qkv',
    's6_qkv', 's7_qkv', 's8_qkv', 's9_qkv', 's10_qkv',
]

# zip() silently stops at the shorter input, which would leave some columns
# without a label (or some labels unused). Fail loudly here instead of with a
# confusing KeyError / tick-label mismatch in the plotting cells.
if len(perplexity_cols) != len(models):
    raise ValueError(
        f"Found {len(perplexity_cols)} perplexity columns but "
        f"{len(models)} model labels; update `models` to match the dataset."
    )

# Map each perplexity column name to its display label.
col_dict = dict(zip(perplexity_cols, models))

In [None]:
# Bootstrap the mean of every perplexity column and keep the scipy result
# object (confidence interval + bootstrap distribution) keyed by column name.
bootstrap_dict = {
    col: bootstrap(
        (df[col],),  # scipy expects a sequence of samples, hence the 1-tuple
        np.mean,
        confidence_level=0.95,
        # Fixed seed for reproducibility. Per the original author's note, this
        # keyword should become `rng` on scipy releases newer than 1.15.0.
        random_state=1,
        method='percentile',
    )
    for col in perplexity_cols
}

# Last expression of the cell, so the notebook displays the results.
bootstrap_dict

In [None]:
# for plot_col in perplexity_cols:
#     fig, ax = plt.subplots()
#     ax.hist(bootstrap_dict[plot_col].bootstrap_distribution, bins=25)
#     ax.set_title(f'Bootstrap Distribution for {plot_col}')
#     ax.set_xlabel('statistic value')
#     ax.set_ylabel('frequency')
#     plt.show()

In [None]:
# Error-bar plot of the bootstrap results for every perplexity column, in the
# dataset's own column order; tick labels come from `models`.
results = [bootstrap_dict[col] for col in perplexity_cols]

x = range(1, len(perplexity_cols) + 1)
# Point estimate: mean of each column's bootstrap distribution.
y = np.array([res.bootstrap_distribution.mean() for res in results])
ci_low = np.array([res.confidence_interval.low for res in results])
ci_high = np.array([res.confidence_interval.high for res in results])
# errorbar() takes bar lengths (below, above) relative to y, not absolute bounds.
yerr = [y - ci_low, ci_high - y]

fig, ax = plt.subplots()
ax.errorbar(x, y, yerr=yerr, fmt='o', linewidth=2, capsize=6)
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=90)
ax.set_ylabel('Perplexity')
ax.set_xlabel('Model')
fig.savefig('bootstrap.jpg')
plt.show()

In [None]:
# Perplexity column names in presentation order: the Llama column first, then
# each s1..s10 "base" column immediately followed by its "qkv" counterpart.
# The suffixes (_v2, _v3, qkv2) are part of the column names and must be kept.
perp_col_nice = [
    'perplexity_Llama-3.1-8B',
    'perplexity_s1-L-3.1-8B-base', 'perplexity_s1-L-3.1-8B-qkv_v2',
    'perplexity_s2-L-3.1-8B-base', 'perplexity_s2-L-3.1-8B-qkv',
    'perplexity_s3-L-3.1-8B-base_v3', 'perplexity_s3-L-3.1-8B-qkv',
    'perplexity_s4-L-3.1-8B-base', 'perplexity_s4-L-3.1-8B-qkv',
    'perplexity_s5-L-3.1-8B-base', 'perplexity_s5-L-3.1-8B-qkv',
    'perplexity_s6-L-3.1-8B-base', 'perplexity_s6-L-3.1-8B-qkv',
    'perplexity_s7-L-3.1-8B-base', 'perplexity_s7-L-3.1-8B-qkv2',
    'perplexity_s8-L-3.1-8B-base', 'perplexity_s8-L-3.1-8B-qkv',
    'perplexity_s9-L-3.1-8B-base', 'perplexity_s9-L-3.1-8B-qkv',
    'perplexity_s10-L-3.1-8B-base', 'perplexity_s10-L-3.1-8B-qkv',
]

In [None]:
# Same error-bar plot as before, but with the columns in the `perp_col_nice`
# order; tick labels are looked up per column through `col_dict`.
models_nice = [col_dict[perp] for perp in perp_col_nice]
results = [bootstrap_dict[col] for col in perp_col_nice]

x = range(1, len(perp_col_nice) + 1)
# Point estimate: mean of each column's bootstrap distribution.
y = np.array([res.bootstrap_distribution.mean() for res in results])
ci_low = np.array([res.confidence_interval.low for res in results])
ci_high = np.array([res.confidence_interval.high for res in results])
# errorbar() takes bar lengths (below, above) relative to y, not absolute bounds.
yerr = [y - ci_low, ci_high - y]

fig, ax = plt.subplots()
ax.errorbar(x, y, yerr=yerr, fmt='o', linewidth=2, capsize=6)
ax.set_xticks(x)
ax.set_xticklabels(models_nice, rotation=90)
ax.set_ylabel('Perplexity')
ax.set_xlabel('Model')
fig.savefig('bootstrap_nice.jpg')
plt.show()