# Tables from paper
This notebook contains the code to recreate all the analyses and results presented in the tables of the paper. It's recommended to run all cells in order from the top, because later cells reuse variables defined in earlier ones.

## Table 1
Table 1 is an overview of the format and size of MultiMedQA datasets. This data is manually assessed from the paper of each respective dataset, and does not require any additional processing.

## Table 2


In [2]:
#Overview of intermediate pre-training steps used to evaluate OLMo 7B. 
# The specific revisions are noted in the paper

from huggingface_hub import list_repo_refs

# Ask the Hugging Face Hub for every git ref of the OLMo-7B repository and
# collect the branch names (the printed output below shows names such as
# 'step1000-tokens4B').
out = list_repo_refs("allenai/OLMo-7B")
branches = []
for ref in out.branches:
    branches.append(ref.name)

# Extract the step number from the branch name
def get_step_number(branch_name):
    """Return the training step encoded in a checkpoint branch name.

    The text in front of the first '-' is taken, every occurrence of the
    literal 'step' is stripped from it, and the remainder is parsed as an
    integer, e.g. 'step1000-tokens4B' -> 1000.

    Raises:
        ValueError: if the remaining text is not an integer (for example
            for the branch name 'main').
    """
    step_field, _, _ = branch_name.partition('-')
    digits = step_field.replace('step', '')
    return int(digits)

def _branch_sort_key(name):
    """Sort key: the step number of a checkpoint branch, with 'main' placed last."""
    if name == "main":
        return float('inf')
    return get_step_number(name)

# Order the checkpoints chronologically by training step
sorted_branches = sorted(branches, key=_branch_sort_key)

print(sorted_branches)

['step0-tokens0B', 'step1000-tokens4B', 'step2000-tokens9B', 'step3000-tokens13B', 'step4000-tokens18B', 'step5000-tokens22B', 'step6000-tokens27B', 'step7000-tokens31B', 'step8000-tokens35B', 'step9000-tokens40B', 'step10000-tokens44B', 'step11000-tokens49B', 'step12000-tokens53B', 'step13000-tokens58B', 'step14000-tokens62B', 'step15000-tokens66B', 'step16000-tokens71B', 'step17000-tokens75B', 'step18000-tokens80B', 'step19000-tokens84B', 'step20000-tokens88B', 'step21000-tokens93B', 'step22000-tokens97B', 'step23000-tokens102B', 'step24000-tokens106B', 'step25000-tokens111B', 'step26000-tokens115B', 'step27000-tokens119B', 'step28000-tokens124B', 'step29000-tokens128B', 'step30000-tokens133B', 'step31000-tokens137B', 'step32000-tokens142B', 'step33000-tokens146B', 'step34000-tokens150B', 'step35000-tokens155B', 'step36000-tokens159B', 'step37000-tokens164B', 'step38000-tokens168B', 'step39000-tokens173B', 'step40000-tokens177B', 'step41000-tokens181B', 'step42000-tokens186B', 'step4

## Table 3

In [21]:
#MultiMedQA evaluations across model scale. Raw eval data cleaned using data-cleaner.py in processing folder
import pandas as pd
# Cleaned accuracy results, one row per evaluated model
scale_results_csv = '../eval-results/wandb-logs/cleaned/acc_scale_results.csv'
scale_evals = pd.read_csv(scale_results_csv)
scale_evals

Unnamed: 0.1,Unnamed: 0,model_name,medmcqa/acc,medmcqa/acc_stderr,medqa_4options/acc,medqa_4options/acc_stderr,mmlu_anatomy/acc,mmlu_anatomy/acc_stderr,mmlu_clinical_knowledge/acc,mmlu_clinical_knowledge/acc_stderr,...,mmlu_college_medicine/acc,mmlu_college_medicine/acc_stderr,mmlu_medical_genetics/acc,mmlu_medical_genetics/acc_stderr,mmlu_professional_medicine/acc,mmlu_professional_medicine/acc_stderr,model_family,param_count,pubmedqa/acc,pubmedqa/acc_stderr
0,0,Mamba-1.4b,0.235477,0.006561,0.228594,0.011774,0.237037,0.036737,0.196226,0.024442,...,0.219653,0.031568,0.3,0.046057,0.25,0.026304,Mamba,1400000000,0.652,0.021324
1,1,Mamba-130m,0.320344,0.007215,0.276512,0.012541,0.185185,0.033557,0.211321,0.025126,...,0.208092,0.030953,0.3,0.046057,0.191176,0.023887,Mamba,130000000,0.53,0.022343
2,2,Mamba-2.8b,0.257471,0.006761,0.251375,0.012163,0.251852,0.037499,0.324528,0.028816,...,0.375723,0.036928,0.3,0.046057,0.393382,0.029674,Mamba,2800000000,0.734,0.019781
3,3,Mamba-370m,0.323213,0.007232,0.270228,0.012451,0.274074,0.038533,0.218868,0.025448,...,0.202312,0.030631,0.3,0.046057,0.150735,0.021734,Mamba,370000000,0.53,0.022343
4,4,Mamba-790m,0.314129,0.007178,0.2663,0.012394,0.2,0.034555,0.222642,0.025604,...,0.265896,0.033688,0.33,0.047258,0.238971,0.025905,Mamba,790000000,0.66,0.021206
5,5,OLMo-1B,0.262013,0.0068,0.274156,0.012508,0.318519,0.040248,0.2,0.024618,...,0.225434,0.031862,0.24,0.042923,0.176471,0.023157,OLMo,1000000000,0.592,0.022001
6,6,OLMo-7B,0.240258,0.006607,0.239592,0.011968,0.288889,0.039155,0.267925,0.027257,...,0.32948,0.035839,0.32,0.046883,0.216912,0.025036,OLMo,7000000000,0.69,0.020704
7,7,Qwen1.5-0.5B,0.329429,0.007268,0.315789,0.013033,0.348148,0.041153,0.403774,0.030198,...,0.346821,0.036291,0.46,0.050091,0.275735,0.027146,Qwen,500000000,0.62,0.021729
8,8,Qwen1.5-1.8B,0.368157,0.007458,0.348782,0.013363,0.4,0.042321,0.479245,0.030746,...,0.456647,0.037981,0.58,0.049604,0.485294,0.03036,Qwen,1800000000,0.52,0.022365
9,9,Qwen1.5-14B,0.531676,0.007716,0.542812,0.013968,0.651852,0.041153,0.735849,0.027134,...,0.676301,0.035676,0.77,0.042295,0.713235,0.027472,Qwen,14000000000,0.764,0.019009


## Table 4

In [9]:
#Log-Log Regression Analysis of Model Scale vs Task Accuracy
import numpy as np
import statsmodels.api as sm

# Work on an independent frame that excludes the OLMo family. The .copy() is
# taken *after* filtering so that the column assignments below write to a real
# frame rather than to a filtered slice (which triggers pandas'
# SettingWithCopyWarning), and so that scale_evals itself is never modified.
data = scale_evals[scale_evals['model_family'] != 'OLMo'].copy()

# Model scale on a log10 axis
data['log_param_count'] = np.log10(data['param_count'])

# Accuracy columns that enter the per-task regressions
tasks = ['medmcqa/acc', 'medqa_4options/acc', 'mmlu_anatomy/acc', 
         'mmlu_clinical_knowledge/acc', 'mmlu_college_biology/acc', 
         'mmlu_college_medicine/acc', 'mmlu_medical_genetics/acc', 
         'mmlu_professional_medicine/acc', 'pubmedqa/acc']

# log10 of every task accuracy, stored as 'log_<task>'.
# Note: an accuracy of exactly 0 would become -inf here.
for task in tasks:
    data[f'log_{task}'] = np.log10(data[task])

# Function to perform log-log regression and return slope, p-value, and R^2
def log_log_regression(x, y):
    """Fit an ordinary-least-squares line ``y ~ const + x`` and summarise its slope.

    No transformation is applied here; callers pass values that are already
    on the log scale.

    Args:
        x: 1-D predictor values (the callers in this notebook pass a pandas Series).
        y: 1-D response values aligned with ``x``.

    Returns:
        A tuple ``(slope, p_value, r_squared)`` where ``slope`` and ``p_value``
        belong to the coefficient of ``x`` (the second model term, after the
        intercept) and ``r_squared`` is the R^2 of the fit.
    """
    design = sm.add_constant(x)  # Adds a constant (intercept) term in front of the predictor
    model = sm.OLS(y, design).fit()
    # Pick the second coefficient strictly by position. With pandas inputs,
    # params/pvalues are label-indexed Series, and `series[1]` as a positional
    # lookup is deprecated in pandas 2.x; converting to an array keeps the
    # lookup positional for both pandas and plain numpy inputs.
    slope = np.asarray(model.params)[1]
    slope_pvalue = np.asarray(model.pvalues)[1]
    return slope, slope_pvalue, model.rsquared

# Build Table 4: one regression per (model family, task) pair
results = []

for model_family in data['model_family'].unique():
    # Rows of this family only; shared by all task regressions below
    family_rows = data[data['model_family'] == model_family]
    log_params = family_rows['log_param_count']
    for task in tasks:
        slope, p_value, r_squared = log_log_regression(log_params, family_rows[f'log_{task}'])
        # Human-readable task name, e.g. 'mmlu_anatomy/acc' -> 'Mmlu Anatomy'
        task_label = task.replace('/acc', '').replace('_', ' ').title()
        results.append([model_family, task_label, slope, p_value, r_squared])

table4_columns = ['Model Family', 'Task', 'Slope Coefficient', 'P-value', 'R^2']
results_df_table4 = pd.DataFrame(results, columns=table4_columns)


results_df_table4

Unnamed: 0,Model Family,Task,Slope Coefficient,P-value,R^2
0,Mamba,Medmcqa,-0.095008,0.124749,0.598614
1,Mamba,Medqa 4Options,-0.046384,0.167476,0.522841
2,Mamba,Mmlu Anatomy,0.067453,0.398219,0.243512
3,Mamba,Mmlu Clinical Knowledge,0.095133,0.309719,0.331503
4,Mamba,Mmlu College Biology,-0.084192,0.0068,0.937258
5,Mamba,Mmlu College Medicine,0.16358,0.1388,0.572384
6,Mamba,Mmlu Medical Genetics,0.002446,0.913208,0.004654
7,Mamba,Mmlu Professional Medicine,0.245517,0.090186,0.670091
8,Mamba,Pubmedqa,0.113127,0.022653,0.862306
9,Qwen,Medmcqa,0.153585,0.000349,0.969629


## Table 5

In [11]:
#Log-Log Regression Analysis of Model Scale vs Average Task Accuracy
# 'avg_accuracy' is the row-wise mean of the log10-transformed task accuracies
# (the 'log_<task>' columns), not the mean of the raw accuracies.
log_task_columns = [f'log_{task}' for task in tasks]
data['avg_accuracy'] = data[log_task_columns].mean(axis=1)

# Function to perform log-log regression and return slope, p-value, and R^2 for average accuracy
def log_log_avg_accuracy_regression(data):
    """Regress 'avg_accuracy' on 'log_param_count' separately for each model family.

    Args:
        data: DataFrame with the columns 'model_family', 'log_param_count'
            and 'avg_accuracy'.

    Returns:
        DataFrame with one row per family (in order of first appearance) and
        the columns 'Model Family', 'Slope Coefficient', 'P-value', 'R^2'.
    """
    output_columns = ['Model Family', 'Slope Coefficient', 'P-value', 'R^2']
    rows = []
    for family in data['model_family'].unique():
        family_rows = data[data['model_family'] == family]
        fit_summary = log_log_regression(family_rows['log_param_count'], family_rows['avg_accuracy'])
        slope, p_value, r_squared = fit_summary
        rows.append(dict(zip(output_columns, (family, slope, p_value, r_squared))))
    return pd.DataFrame(rows, columns=output_columns)

# Perform the regression analysis: one fit per model family on the frame
# prepared for Table 4 (OLMo rows already removed), then display the table.
avg_accuracy_results = log_log_avg_accuracy_regression(data)
avg_accuracy_results

Unnamed: 0,Model Family,Slope Coefficient,P-value,R^2
0,Mamba,0.051297,0.194186,0.480747
1,Qwen,0.183104,3.8e-05,0.989919
2,Pythia,0.012403,0.058374,0.378393


## Table 6

In [16]:
#MultiMedQA evaluations across Pythia model scale and intermediate checkpoints, along with OLMo 7B
#Table 6 is a combination of data from the following files (one wandb export per model):
Pythia70M_checkpoint_evals = pd.read_csv('../eval-results/wandb-logs/wandb_pythia_70m_dynamics.csv')
Pythia160M_checkpoint_evals = pd.read_csv('../eval-results/wandb-logs/wandb_pythia_160m_dynamics.csv')
Pythia410M_checkpoint_evals = pd.read_csv('../eval-results/wandb-logs/wandb_pythia_410m_dynamics.csv')
Pythia1B_checkpoint_evals = pd.read_csv('../eval-results/wandb-logs/wandb_pythia_1b_dynamics.csv')
Pythia2_8B_checkpoint_evals = pd.read_csv('../eval-results/wandb-logs/wandb_pythia_2-8b_dynamics.csv')
Pythia6_9B_checkpoint_evals = pd.read_csv('../eval-results/wandb-logs/wandb_pythia_6-9b_dynamics.csv')
OLMo7B_checkpoint_evals = pd.read_csv('../eval-results/wandb-logs/wandb_OLMo7B_dynamics.csv')

# Stack the per-model frames, smallest model first. The original row labels
# are kept, so they restart at 0 for every model.
checkpoint_frames = [
    Pythia70M_checkpoint_evals,
    Pythia160M_checkpoint_evals,
    Pythia410M_checkpoint_evals,
    Pythia1B_checkpoint_evals,
    Pythia2_8B_checkpoint_evals,
    Pythia6_9B_checkpoint_evals,
    OLMo7B_checkpoint_evals,
]
combined_checkpoint_evals = pd.concat(checkpoint_frames)

# Keep the run identifiers plus accuracy / standard error for each task
table6_columns = [
    'Name', 'cli_configs.model_args',
    'medmcqa/acc', 'medmcqa/acc_stderr',
    'medqa_4options/acc', 'medqa_4options/acc_stderr',
    'mmlu_anatomy/acc', 'mmlu_anatomy/acc_stderr',
    'mmlu_clinical_knowledge/acc', 'mmlu_clinical_knowledge/acc_stderr',
    'mmlu_college_biology/acc', 'mmlu_college_biology/acc_stderr',
    'mmlu_college_medicine/acc', 'mmlu_college_medicine/acc_stderr',
    'mmlu_medical_genetics/acc', 'mmlu_medical_genetics/acc_stderr',
    'mmlu_professional_medicine/acc', 'mmlu_professional_medicine/acc_stderr',
    'pubmedqa/acc', 'pubmedqa/acc_stderr',
]
combined_checkpoint_evals = combined_checkpoint_evals[table6_columns]
combined_checkpoint_evals

Unnamed: 0,Name,cli_configs.model_args,medmcqa/acc,medmcqa/acc_stderr,medqa_4options/acc,medqa_4options/acc_stderr,mmlu_anatomy/acc,mmlu_anatomy/acc_stderr,mmlu_clinical_knowledge/acc,mmlu_clinical_knowledge/acc_stderr,mmlu_college_biology/acc,mmlu_college_biology/acc_stderr,mmlu_college_medicine/acc,mmlu_college_medicine/acc_stderr,mmlu_medical_genetics/acc,mmlu_medical_genetics/acc_stderr,mmlu_professional_medicine/acc,mmlu_professional_medicine/acc_stderr,pubmedqa/acc,pubmedqa/acc_stderr
0,Step128000,"pretrained=EleutherAI/pythia-70m-deduped,revis...",0.318193,0.007203,0.277298,0.012552,0.185185,0.033557,0.211321,0.025126,0.263889,0.036857,0.213873,0.031265,0.31,0.046482,0.1875,0.02371,0.538,0.022318
1,Step64000,"pretrained=EleutherAI/pythia-70m-deduped,revis...",0.318671,0.007205,0.277298,0.012552,0.192593,0.034065,0.215094,0.025288,0.291667,0.03801,0.208092,0.030953,0.31,0.046482,0.1875,0.02371,0.536,0.022325
2,Step32000,"pretrained=EleutherAI/pythia-70m-deduped,revis...",0.319866,0.007213,0.278083,0.012563,0.185185,0.033557,0.211321,0.025126,0.263889,0.036857,0.208092,0.030953,0.3,0.046057,0.191176,0.023887,0.434,0.022187
3,Step16000,"pretrained=EleutherAI/pythia-70m-deduped,revis...",0.319627,0.007211,0.275727,0.01253,0.2,0.034555,0.215094,0.025288,0.256944,0.036539,0.208092,0.030953,0.3,0.046057,0.183824,0.023529,0.55,0.022271
4,Step8000,"pretrained=EleutherAI/pythia-70m-deduped,revis...",0.319388,0.00721,0.279654,0.012585,0.192593,0.034065,0.215094,0.025288,0.263889,0.036857,0.219653,0.031568,0.31,0.046482,0.1875,0.02371,0.554,0.022252
5,Step4000,"pretrained=EleutherAI/pythia-70m-deduped,revis...",0.321061,0.00722,0.277298,0.012552,0.185185,0.033557,0.215094,0.025288,0.256944,0.036539,0.213873,0.031265,0.3,0.046057,0.183824,0.023529,0.552,0.022262
6,Step2000,"pretrained=EleutherAI/pythia-70m-deduped,revis...",0.314129,0.007178,0.278083,0.012563,0.192593,0.034065,0.215094,0.025288,0.284722,0.037738,0.231214,0.032147,0.31,0.046482,0.183824,0.023529,0.538,0.022318
0,Step128000,"pretrained=EleutherAI/pythia-160m-deduped,revi...",0.320344,0.007215,0.27337,0.012497,0.192593,0.034065,0.211321,0.025126,0.256944,0.036539,0.208092,0.030953,0.3,0.046057,0.1875,0.02371,0.476,0.022357
1,Step64000,"pretrained=EleutherAI/pythia-160m-deduped,revi...",0.321301,0.007221,0.275727,0.01253,0.185185,0.033557,0.215094,0.025288,0.263889,0.036857,0.213873,0.031265,0.3,0.046057,0.194853,0.024061,0.404,0.021967
2,Step32000,"pretrained=EleutherAI/pythia-160m-deduped,revi...",0.320344,0.007215,0.279654,0.012585,0.192593,0.034065,0.211321,0.025126,0.263889,0.036857,0.208092,0.030953,0.3,0.046057,0.191176,0.023887,0.548,0.02228


## Table 7

In [20]:
import statsmodels.formula.api as smf
from statsmodels.sandbox.stats.multicomp import multipletests

# Load and prepare data
def load_and_prepare_data(file_paths, model_names, tokens_per_step=2097152):
    """Load per-model checkpoint CSVs and stack them into one frame.

    For every file the training step is parsed from the first run of digits
    in the 'Name' column (when that column exists), the model label is
    attached, and only 'Step', 'Model' and the columns whose name contains
    '/acc' are kept.

    Args:
        file_paths: CSV paths, one per model.
        model_names: Label stored in the 'Model' column for the file at the
            same position in ``file_paths``.
        tokens_per_step: Multiplier turning a step count into 'tokens_seen'.
            Defaults to 2097152 (2**21).
            NOTE(review): presumably the Pythia batch size in tokens - confirm.

    Returns:
        DataFrame with a fresh 0..n-1 index containing 'Step', 'Model', the
        accuracy columns, 'tokens_seen', 'Log10_Step' and 'Log10_tokens_seen'.
        A step of 0 yields -inf in the two log columns.

    Raises:
        ValueError: if ``file_paths`` and ``model_names`` differ in length
            (zip would otherwise silently drop the surplus entries).
    """
    file_paths = list(file_paths)
    model_names = list(model_names)
    if len(file_paths) != len(model_names):
        raise ValueError(
            f"file_paths and model_names must have the same length, "
            f"got {len(file_paths)} and {len(model_names)}"
        )

    all_data = []
    for file_path, model_name in zip(file_paths, model_names):
        df = pd.read_csv(file_path)
        if 'Name' in df.columns:
            # Raw string: '\d' in a normal string literal is an invalid escape
            # sequence. expand=False returns a Series instead of a one-column frame.
            df['Step'] = df['Name'].str.extract(r'(\d+)', expand=False).astype('int64')
        df['Model'] = model_name
        accuracy_columns = [col for col in df.columns if '/acc' in col]
        relevant_columns = ['Step', 'Model'] + accuracy_columns
        # .copy() so the assignment below does not write into a slice of the
        # full frame (avoids SettingWithCopyWarning)
        df = df[relevant_columns].copy()
        df['tokens_seen'] = df['Step'] * tokens_per_step
        all_data.append(df)
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df['Log10_Step'] = np.log10(combined_df['Step'])
    combined_df['Log10_tokens_seen'] = np.log10(combined_df['tokens_seen'])
    return combined_df

# (checkpoint CSV, display name) for each Pythia model, smallest first
pythia_runs = [
    ("../eval-results/wandb-logs/wandb_pythia_70m_dynamics.csv", "Pythia 70M"),
    ("../eval-results/wandb-logs/wandb_pythia_160m_dynamics.csv", "Pythia 160M"),
    ("../eval-results/wandb-logs/wandb_pythia_410m_dynamics.csv", "Pythia 410M"),
    ("../eval-results/wandb-logs/wandb_pythia_1b_dynamics.csv", "Pythia 1B"),
    ("../eval-results/wandb-logs/wandb_pythia_2-8b_dynamics.csv", "Pythia 2.8B"),
    ("../eval-results/wandb-logs/wandb_pythia_6-9b_dynamics.csv", "Pythia 6.9B"),
]
file_paths = [csv_path for csv_path, _ in pythia_runs]
model_names = [display_name for _, display_name in pythia_runs]

combined_df = load_and_prepare_data(file_paths, model_names)

# Reshape to long format: one row per (checkpoint, model, task) with its accuracy
task_names = [
    "medqa_4options/acc", "medmcqa/acc", "pubmedqa/acc", "mmlu_anatomy/acc", "mmlu_clinical_knowledge/acc",
    "mmlu_college_biology/acc", "mmlu_college_medicine/acc", "mmlu_medical_genetics/acc", "mmlu_professional_medicine/acc"
]
melted_df = combined_df.melt(
    id_vars=['tokens_seen', 'Model'],
    value_vars=task_names,
    var_name='Task',
    value_name='Accuracy',
)

# Make 'Model' an ordered categorical so models sort by size rather than alphabetically
model_order = ["Pythia 70M", "Pythia 160M", "Pythia 410M", "Pythia 1B", "Pythia 2.8B", "Pythia 6.9B"]
melted_df['Model'] = pd.Categorical(melted_df['Model'], categories=model_order, ordered=True)

# Parameter count of each model
param_sizes = {
    "Pythia 70M": 70000000,
    "Pythia 160M": 160000000,
    "Pythia 410M": 410000000,
    "Pythia 1B": 1000000000,
    "Pythia 2.8B": 2800000000,
    "Pythia 6.9B": 6900000000
}

# Attach the parameter count as an integer column
melted_df['ParamSize'] = melted_df['Model'].map(param_sizes).astype('int64')

# log10 versions of the regression variables: (new column, source column)
log_column_sources = [
    ('Log10_ParamSize', 'ParamSize'),
    ('Log10_TokensSeen', 'tokens_seen'),
    ('Log10_Accuracy', 'Accuracy'),
]
for log_column, source_column in log_column_sources:
    melted_df[log_column] = np.log10(melted_df[source_column])


# Fit a separate regression model for each task and extract coefficients and p-values.
# Model: Log10_Accuracy ~ Log10_ParamSize + Log10_TokensSeen + their interaction.
results_list = []

# Iterate over `task_names` (defined in this cell and used to build melted_df)
# rather than `tasks`, which is a variable left over from the Table 4 cell:
# this keeps the cell self-contained and gives the row order shown in Table 7.
for task in task_names:
    task_data = melted_df[melted_df['Task'] == task]
    ols_model = smf.ols(formula='Log10_Accuracy ~ Log10_ParamSize * Log10_TokensSeen', data=task_data)
    # Named `fit` so the `results` list built for Table 4 is not overwritten
    fit = ols_model.fit()

    # Extract coefficients and p-values (Series indexed by term name)
    coefs = fit.params
    pvals = fit.pvalues

    # Append results to list
    results_list.append({
        'Task': task,
        'Slope Coef for ParamSize': coefs['Log10_ParamSize'],
        'P-value for ParamSize': pvals['Log10_ParamSize'],
        'Slope Coef for TokensSeen': coefs['Log10_TokensSeen'],
        'P-value for TokensSeen': pvals['Log10_TokensSeen'],
        'Interaction Term Coef': coefs['Log10_ParamSize:Log10_TokensSeen'],
        'Interaction Term P-value': pvals['Log10_ParamSize:Log10_TokensSeen']
    })

# Create a DataFrame from the results list (one row per task)
results_df_table7 = pd.DataFrame(results_list)

# Correct for multiple comparisons: all raw p-values (3 per task) are pooled
# and adjusted together with the Benjamini-Hochberg FDR procedure.
raw_pvalue_columns = ['P-value for ParamSize', 'P-value for TokensSeen', 'Interaction Term P-value']
p_values_to_correct = results_df_table7[raw_pvalue_columns].values.flatten()
_, pvals_corrected, _, _ = multipletests(p_values_to_correct, method='fdr_bh')

# The flattened array is row-major (task by task), so viewing it as an
# (n_tasks, 3) matrix puts each term's corrected p-values back in one column.
corrected_by_term = pvals_corrected.reshape(-1, 3)
corrected_pvalue_columns = [
    'Corrected P-value for ParamSize',
    'Corrected P-value for TokensSeen',
    'Corrected Interaction Term P-value',
]
for term_position, corrected_column in enumerate(corrected_pvalue_columns):
    results_df_table7[corrected_column] = corrected_by_term[:, term_position]

# Final layout: each coefficient followed by its corrected p-value
table7_column_order = [
    'Task',
    'Slope Coef for ParamSize',
    'Corrected P-value for ParamSize',
    'Slope Coef for TokensSeen',
    'Corrected P-value for TokensSeen',
    'Interaction Term Coef',
    'Corrected Interaction Term P-value'
]
results_df_table7 = results_df_table7[table7_column_order]

results_df_table7


Unnamed: 0,Task,Slope Coef for ParamSize,Corrected P-value for ParamSize,Slope Coef for TokensSeen,Corrected P-value for TokensSeen,Interaction Term Coef,Corrected Interaction Term P-value
0,medqa_4options/acc,0.128872,0.222961,0.103852,0.23413,-0.013433,0.216491
1,medmcqa/acc,0.326367,0.016653,0.279611,0.016653,-0.033918,0.016653
2,pubmedqa/acc,-0.467243,0.018419,-0.40579,0.016653,0.046929,0.016653
3,mmlu_anatomy/acc,-0.22074,0.552654,-0.198917,0.527284,0.024666,0.477813
4,mmlu_clinical_knowledge/acc,-0.213527,0.222961,-0.177788,0.222961,0.022411,0.216491
5,mmlu_college_biology/acc,0.056359,0.829696,0.036012,0.865097,-0.005213,0.829696
6,mmlu_college_medicine/acc,-0.087478,0.71923,-0.084906,0.682164,0.009475,0.682164
7,mmlu_medical_genetics/acc,0.003566,0.976314,0.013637,0.92677,-0.001892,0.92677
8,mmlu_professional_medicine/acc,-0.161777,0.675603,-0.121813,0.682164,0.016103,0.675603


## Table 8

In [27]:
#Log-Log Regression for each Pythia model's accuracy on pubmedqa and medmcqa
# (uses the long-format melted_df built in the Table 7 cell)
task_of_row = melted_df['Task']
pubmedqa_data = melted_df[task_of_row == 'pubmedqa/acc']
medmcqa_data = melted_df[task_of_row == 'medmcqa/acc']


# Define a function to perform regression and extract results
def analyze_regression(data):
    """Regress Log10_Accuracy on Log10_TokensSeen separately for every model.

    Note: a later cell (Table 9) defines another function with this same
    name and a different signature, which replaces this one once it has run.

    Args:
        data: DataFrame with the columns 'Model', 'Log10_TokensSeen' and
            'Log10_Accuracy'.

    Returns:
        DataFrame with one row per model and the columns 'Model', 'Slope'
        and 'Slope_pValue'.
    """
    def _slope_and_pvalue(model_rows):
        # OLS with an explicit intercept column; only the slope term is reported
        design = sm.add_constant(model_rows['Log10_TokensSeen'])
        fitted = sm.OLS(model_rows['Log10_Accuracy'], design).fit()
        _, slope = fitted.params
        _, slope_pvalue = fitted.pvalues
        return slope, slope_pvalue

    rows = []
    for model_name in data['Model'].unique():
        slope, slope_pvalue = _slope_and_pvalue(data[data['Model'] == model_name])
        rows.append({'Model': model_name, 'Slope': slope, 'Slope_pValue': slope_pvalue})

    return pd.DataFrame(rows)

# Analyze both datasets and label each result table with its task
pubmedqa_results = analyze_regression(pubmedqa_data).assign(Task='pubmedqa')
medmcqa_results = analyze_regression(medmcqa_data).assign(Task='medmcqa')

# Stack the two tables; row labels restart at 0 for each task
table8_parts = [pubmedqa_results, medmcqa_results]
combined_results_table8 = pd.concat(table8_parts)
combined_results_table8

Unnamed: 0,Model,Slope,Slope_pValue,Task
0,Pythia 70M,-0.015609,0.559706,pubmedqa
1,Pythia 160M,-0.053154,0.108142,pubmedqa
2,Pythia 410M,0.009449,0.807181,pubmedqa
3,Pythia 1B,0.001286,0.878947,pubmedqa
4,Pythia 2.8B,0.058808,0.003295,pubmedqa
5,Pythia 6.9B,0.049303,0.013574,pubmedqa
0,Pythia 70M,0.001294,0.546643,medmcqa
1,Pythia 160M,-0.000307,0.812795,medmcqa
2,Pythia 410M,0.000684,0.890472,medmcqa
3,Pythia 1B,-0.00847,0.489431,medmcqa


## Table 9

In [38]:

file_olmo = '../eval-results/wandb-logs/wandb_OLMo7B_dynamics.csv'
file_pythia = '../eval-results/wandb-logs/wandb_pythia_6-9b_dynamics.csv'

olmo_df = pd.read_csv(file_olmo)
pythia_df = pd.read_csv(file_pythia)

# Extract the step number (first run of digits in 'Name').
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
# expand=False returns a Series instead of a one-column frame.
olmo_df['Step'] = olmo_df['Name'].str.extract(r'(\d+)', expand=False).astype('int64')
pythia_df['Step'] = pythia_df['Name'].str.extract(r'(\d+)', expand=False).astype('int64')

olmo_df['Model'] = 'OLMo 7B'
pythia_df['Model'] = 'Pythia 6.9B'

# Define the accuracy columns to be selected
accuracy_columns = [
    "medqa_4options/acc", "medmcqa/acc", "pubmedqa/acc", "mmlu_anatomy/acc", "mmlu_clinical_knowledge/acc",
    "mmlu_college_biology/acc", "mmlu_college_medicine/acc", "mmlu_medical_genetics/acc", "mmlu_professional_medicine/acc"
]

# Filter the dataframes
olmo_relevant = olmo_df[['Step', 'Model'] + accuracy_columns]
pythia_relevant = pythia_df[['Step', 'Model'] + accuracy_columns]

# Combine the dataframes
combined_df = pd.concat([olmo_relevant, pythia_relevant], ignore_index=True)

# Calculate tokens seen. The two models do not share a batch size, so the
# step -> token conversion is looked up per model.
# NOTE(review): the OLMo value is inferred from the checkpoint branch names
# printed for Table 2 (e.g. 'step10000-tokens44B' is ~4.4M tokens per step,
# consistent with 2160 * 2048 = 4423680) - confirm against the OLMo training
# config. A constant factor on tokens_seen only shifts the intercept of a
# log-log fit; slopes, p-values and R^2 are unaffected.
tokens_per_step = 2097152
tokens_per_step_by_model = {
    'OLMo 7B': 4423680,
    'Pythia 6.9B': tokens_per_step,
}
combined_df['tokens_seen'] = combined_df['Step'] * combined_df['Model'].map(tokens_per_step_by_model)

# Drop step-0 checkpoints *before* taking logs: log10(0) is -inf and raises a
# "divide by zero" RuntimeWarning, and such rows cannot enter the regression.
combined_df = combined_df[combined_df['tokens_seen'] > 0].copy()
combined_df['Log10_tokens_seen'] = np.log10(combined_df['tokens_seen'])

# Melt the dataframe for easier analysis
melted_df = combined_df.melt(id_vars=['tokens_seen', 'Log10_tokens_seen', 'Model'], value_vars=accuracy_columns, var_name='Task', value_name='Accuracy')

# Convert 'Accuracy' to log10 scale for regression
melted_df['Log10_Accuracy'] = np.log10(melted_df['Accuracy'])

# Function to perform log-log regression
def analyze_regression(df, model_name):
    """Fit ``Log10_Accuracy ~ Log10_tokens_seen`` per task for a single model.

    This definition replaces the one-argument function of the same name
    defined in the Table 8 cell.

    Args:
        df: Long-format DataFrame with the columns 'Task', 'Model',
            'Log10_Accuracy' and 'Log10_tokens_seen'.
        model_name: Value of the 'Model' column to analyse.

    Returns:
        DataFrame with one row per task that has data for ``model_name`` and
        the columns 'Task', 'Model', 'Slope Coefficient', 'P-Value' (a string
        formatted to 4 decimals) and 'R_squared'.
    """
    model_rows = df[df['Model'] == model_name]
    records = []
    for task in df['Task'].unique():
        task_data = model_rows[model_rows['Task'] == task]
        if task_data.empty:
            # Nothing to fit for this task/model combination
            continue
        fit = smf.ols(formula='Log10_Accuracy ~ Log10_tokens_seen', data=task_data).fit()
        records.append({
            'Task': task,
            'Model': model_name,
            'Slope Coefficient': fit.params['Log10_tokens_seen'],
            'P-Value': format(fit.pvalues['Log10_tokens_seen'], '.4f'),
            'R_squared': fit.rsquared
        })
    return pd.DataFrame(records)

#remove rows where tokens_seen is 0 in the melted_df (log10 of 0 is -inf)
has_seen_tokens = melted_df['tokens_seen'] != 0
melted_df = melted_df[has_seen_tokens]

# Perform regression analysis for OLMo 7B and Pythia 6.9B
olmo7b_results = analyze_regression(melted_df, 'OLMo 7B')
pythia6_9B_results = analyze_regression(melted_df, 'Pythia 6.9B')

# Combine results into a single table with a fresh index
table9_parts = [olmo7b_results, pythia6_9B_results]
final_results = pd.concat(table9_parts, ignore_index=True)

# Display the results
final_results


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,Task,Model,Slope Coefficient,P-Value,R_squared
0,medqa_4options/acc,OLMo 7B,-0.025473,0.0679,0.357467
1,medmcqa/acc,OLMo 7B,-0.030678,0.0311,0.460162
2,pubmedqa/acc,OLMo 7B,0.10156,0.0,0.971521
3,mmlu_anatomy/acc,OLMo 7B,0.068073,0.0245,0.488472
4,mmlu_clinical_knowledge/acc,OLMo 7B,0.020055,0.069,0.355323
5,mmlu_college_biology/acc,OLMo 7B,-0.017946,0.2198,0.181341
6,mmlu_college_medicine/acc,OLMo 7B,0.0513,0.0262,0.480793
7,mmlu_medical_genetics/acc,OLMo 7B,-0.01096,0.5347,0.049982
8,mmlu_professional_medicine/acc,OLMo 7B,0.012257,0.504,0.057644
9,medqa_4options/acc,Pythia 6.9B,-0.018954,0.3126,0.201303


## Table 10

In [61]:
#MultiMedQA evaluations across models pre-trained on different open-source corpora.
#Table displays select results from the following csv files:

# Pythia and OLMo rows from the cleaned scale results, one row per model name
corpus_model_pattern = 'pythia-6.9b-deduped|pythia-1b-deduped|OLMo'
corpus_evals = pd.read_csv('../eval-results/wandb-logs/cleaned/acc_scale_results.csv')
is_corpus_model = corpus_evals['model_name'].str.contains(corpus_model_pattern)
corpus_evals = corpus_evals[is_corpus_model].drop_duplicates(subset=['model_name'])

# OpenLlama rows from the raw wandb export; 'Name' is renamed to match corpus_evals
openllama_evals = pd.read_csv('../eval-results/wandb-logs/wandb_export_results_updated.csv')
is_openllama = openllama_evals['Name'].str.contains('OpenLlama')
openllama_evals = openllama_evals[is_openllama].rename(columns={'Name': 'model_name'})

# Stack both sources and keep only the columns that are complete for every row
combined_corpus_evals = pd.concat([corpus_evals, openllama_evals]).dropna(axis=1, how='any')

combined_corpus_evals

Unnamed: 0,model_name,medmcqa/acc,medmcqa/acc_stderr,medqa_4options/acc,medqa_4options/acc_stderr,mmlu_anatomy/acc,mmlu_anatomy/acc_stderr,mmlu_clinical_knowledge/acc,mmlu_clinical_knowledge/acc_stderr,mmlu_college_biology/acc,mmlu_college_biology/acc_stderr,mmlu_college_medicine/acc,mmlu_college_medicine/acc_stderr,mmlu_medical_genetics/acc,mmlu_medical_genetics/acc_stderr,mmlu_professional_medicine/acc,mmlu_professional_medicine/acc_stderr,pubmedqa/acc,pubmedqa/acc_stderr
5,OLMo-1B,0.262013,0.0068,0.274156,0.012508,0.318519,0.040248,0.2,0.024618,0.277778,0.037456,0.225434,0.031862,0.24,0.042923,0.176471,0.023157,0.592,0.022001
6,OLMo-7B,0.240258,0.006607,0.239592,0.011968,0.288889,0.039155,0.267925,0.027257,0.298611,0.038271,0.32948,0.035839,0.32,0.046883,0.216912,0.025036,0.69,0.020704
17,pythia-1b-deduped,0.304566,0.007117,0.237235,0.011927,0.207407,0.035026,0.271698,0.027378,0.25,0.03621,0.248555,0.032953,0.3,0.046057,0.1875,0.02371,0.506,0.022381
20,pythia-6.9b-deduped,0.215396,0.006357,0.21524,0.011524,0.288889,0.039155,0.245283,0.02648,0.208333,0.033961,0.242775,0.032693,0.27,0.04462,0.334559,0.028662,0.608,0.021855
1,OpenLlama7B,0.259861,0.006782,0.268657,0.012428,0.303704,0.039726,0.309434,0.02845,0.243056,0.035869,0.225434,0.031862,0.28,0.045126,0.198529,0.024231,0.736,0.019733
2,OpenLlama3B,0.306479,0.007129,0.267871,0.012417,0.185185,0.033557,0.233962,0.026055,0.256944,0.036539,0.196532,0.0303,0.34,0.04761,0.213235,0.024881,0.72,0.0201


## Table 11

In [64]:
#Table that displays total and normalized/standardized biomedical term counts across pre-training corpora

#total tokens in each corpus
total_tokens = {
    'Dolma': 3067858892487,
    'Pile': 383299322520,
    'RPJ': 1385942948192
}
# Same totals as a Series; the divisions below align on the corpus label
tokens_per_corpus = pd.Series(total_tokens)

# Loading raw counts
ner_context_counts = pd.read_csv('../termfreq-results/biomedical_ner_context_infinigram_counts_merged.csv')
ner_question_counts = pd.read_csv('../termfreq-results/biomedical_ner_infinigram_counts_merged.csv')
mesh_counts = pd.read_csv('../termfreq-results/mesh_infinigram_counts_merged.csv')


def _sum_counts(counts_df):
    """Column-wise total over every column except the first.

    NOTE(review): presumably the first column holds the term and the
    remaining columns are per-corpus counts - confirm against the CSVs.
    """
    return counts_df.iloc[:, 1:].sum()


def _per_million_tokens(corpus_totals):
    """Scale per-corpus totals to occurrences per one million corpus tokens."""
    return corpus_totals / tokens_per_corpus * 1e6


total_counts_ner_context = _sum_counts(ner_context_counts)
total_counts_ner = _sum_counts(ner_question_counts)
total_counts_mesh = _sum_counts(mesh_counts)

# Normalize the counts per million tokens
normalized_counts_per_million_ner_context = _per_million_tokens(total_counts_ner_context)
normalized_counts_per_million_ner = _per_million_tokens(total_counts_ner)
normalized_counts_per_million_mesh = _per_million_tokens(total_counts_mesh)

# Report the raw totals first, then the normalized values
table11_report = [
    ("Total counts (NER question context):", total_counts_ner_context),
    ("Total counts (NER question):", total_counts_ner),
    ("Total counts (MeSH):", total_counts_mesh),
    ("Normalized counts per million (NER question context):", normalized_counts_per_million_ner_context),
    ("Normalized counts per million (NER question):", normalized_counts_per_million_ner),
    ("Normalized counts per million (MeSH):", normalized_counts_per_million_mesh),
]
for report_label, report_values in table11_report:
    print(report_label, report_values)

Total counts (NER question context): Dolma    935014406346
Pile     115040931873
RPJ      403350594489
dtype: int64
Total counts (NER question): Dolma    379119470939
Pile      45275511089
RPJ      173028233524
dtype: int64
Total counts (MeSH): Dolma    24623505658
Pile      2402225335
RPJ      11595477221
dtype: int64
Normalized counts per million (NER question context): Dolma    304777.513932
Pile     300133.407794
RPJ      291029.724575
dtype: float64
Normalized counts per million (NER question): Dolma    123577.871155
Pile     118120.509035
RPJ      124845.134318
dtype: float64
Normalized counts per million (MeSH): Dolma    8026.283646
Pile     6267.230840
RPJ      8366.489570
dtype: float64


## Table 12

In [65]:
#This table displays MultiMedQA results across Paloma1B models, to highlight the impact of different pre-training corpora.
#The data was cleaned from wandb exports using data-cleaner.py in the processing folder
paloma_results_csv = '../eval-results/wandb-logs/cleaned/acc_data_results.csv'
paloma_evals = pd.read_csv(paloma_results_csv)
paloma_evals


Unnamed: 0.1,Unnamed: 0,Name,model_family,param_count,medmcqa/acc,medmcqa/acc_stderr,medqa_4options/acc,medqa_4options/acc_stderr,mmlu_anatomy/acc,mmlu_anatomy/acc_stderr,...,mmlu_college_biology/acc,mmlu_college_biology/acc_stderr,mmlu_college_medicine/acc,mmlu_college_medicine/acc_stderr,mmlu_medical_genetics/acc,mmlu_medical_genetics/acc_stderr,mmlu_professional_medicine/acc,mmlu_professional_medicine/acc_stderr,pubmedqa/acc,pubmedqa/acc_stderr
0,8,Paloma1b-Falcon-RefinedWeb,Paloma,1000000000,0.284963,0.00698,0.278869,0.012574,0.251852,0.037499,...,0.25,0.03621,0.202312,0.030631,0.3,0.046057,0.209559,0.024723,0.56,0.022221
1,9,Paloma1b-RedPajama,Paloma,1000000000,0.301458,0.007096,0.267086,0.012405,0.222222,0.035914,...,0.180556,0.032166,0.294798,0.034766,0.33,0.047258,0.253676,0.026431,0.598,0.021949
2,10,Paloma1b-MC4,Paloma,1000000000,0.287593,0.006999,0.272584,0.012485,0.266667,0.038202,...,0.236111,0.035514,0.225434,0.031862,0.37,0.048524,0.191176,0.023887,0.59,0.022017
3,11,Paloma1b-C4,Paloma,1000000000,0.317236,0.007197,0.277298,0.012552,0.237037,0.036737,...,0.243056,0.035869,0.208092,0.030953,0.3,0.046057,0.191176,0.023887,0.574,0.022137
4,12,Paloma1b-Pile,Paloma,1000000000,0.277313,0.006923,0.278083,0.012563,0.325926,0.040491,...,0.298611,0.038271,0.231214,0.032147,0.23,0.042295,0.143382,0.021289,0.578,0.022109
5,13,Paloma1b-Dolma,Paloma,1000000000,0.301697,0.007098,0.233307,0.011859,0.185185,0.033557,...,0.256944,0.036539,0.225434,0.031862,0.3,0.046057,0.1875,0.02371,0.6,0.021931
