# Tables from paper
This notebook contains the code to recreate all the analyses and results presented in the tables of the paper.

## Table 1
Table 1 is an overview of the format and size of the MultiMedQA datasets. These figures are taken from each dataset's own paper and do not require any additional processing.

## Table 2


In [2]:
# Overview of the intermediate pre-training steps used to evaluate OLMo 7B.
# The specific revisions are noted in the paper.

from huggingface_hub import list_repo_refs

# Ask the Hugging Face Hub for the refs of the allenai/OLMo-7B repo and keep only
# the branch names (e.g. 'step1000-tokens4B', as shown in the cell output).
# NOTE(review): this needs network access, and the list reflects whatever branches
# exist on the Hub at run time.
out = list_repo_refs("allenai/OLMo-7B")
branches = [b.name for b in out.branches]

# Extract the step number from the branch name
def get_step_number(branch_name):
    """Return the integer step encoded at the start of a branch name.

    The text before the first '-' is taken, every 'step' substring is stripped
    from it, and the remainder is parsed as an int, so 'step1000-tokens4B'
    gives 1000. Raises ValueError when the remainder is not an integer
    (for example for 'main').
    """
    step_part, _, _ = branch_name.partition('-')
    digits = step_part.replace('step', '')
    return int(digits)

def _branch_sort_key(name):
    """Sort key: 'main' goes last (infinity); any other branch sorts by its step number."""
    return float('inf') if name == "main" else get_step_number(name)


# Order the checkpoints by training step, with the 'main' branch at the end.
sorted_branches = sorted(branches, key=_branch_sort_key)

print(sorted_branches)

['step0-tokens0B', 'step1000-tokens4B', 'step2000-tokens9B', 'step3000-tokens13B', 'step4000-tokens18B', 'step5000-tokens22B', 'step6000-tokens27B', 'step7000-tokens31B', 'step8000-tokens35B', 'step9000-tokens40B', 'step10000-tokens44B', 'step11000-tokens49B', 'step12000-tokens53B', 'step13000-tokens58B', 'step14000-tokens62B', 'step15000-tokens66B', 'step16000-tokens71B', 'step17000-tokens75B', 'step18000-tokens80B', 'step19000-tokens84B', 'step20000-tokens88B', 'step21000-tokens93B', 'step22000-tokens97B', 'step23000-tokens102B', 'step24000-tokens106B', 'step25000-tokens111B', 'step26000-tokens115B', 'step27000-tokens119B', 'step28000-tokens124B', 'step29000-tokens128B', 'step30000-tokens133B', 'step31000-tokens137B', 'step32000-tokens142B', 'step33000-tokens146B', 'step34000-tokens150B', 'step35000-tokens155B', 'step36000-tokens159B', 'step37000-tokens164B', 'step38000-tokens168B', 'step39000-tokens173B', 'step40000-tokens177B', 'step41000-tokens181B', 'step42000-tokens186B', 'step4

## Table 3

In [5]:
# MultiMedQA evaluations across model scale.
# TODO: name the script/notebook used to clean the raw eval data (left blank in the original comment).
import pandas as pd
# NOTE(review): the printed frame shows an "Unnamed: 0" column that mirrors the row
# index, which suggests the CSV was written with its index; consider
# index_col=0 after confirming nothing downstream relies on that column.
# NOTE(review): rows 9 and 10 of the printed output are both Qwen1.5-14B with the
# same medmcqa accuracy and stderr. This looks like a duplicated row; please confirm,
# because the Table 4 regression copies this frame and would count that model twice.
scale_evals = pd.read_csv('../eval-results/wandb-logs/cleaned/acc_scale_results.csv')
print(scale_evals)

    Unnamed: 0           model_name  medmcqa/acc  medmcqa/acc_stderr  \
0            0           Mamba-1.4b     0.235477            0.006561   
1            1           Mamba-130m     0.320344            0.007215   
2            2           Mamba-2.8b     0.257471            0.006761   
3            3           Mamba-370m     0.323213            0.007232   
4            4           Mamba-790m     0.314129            0.007178   
5            5              OLMo-1B     0.262013            0.006800   
6            6              OLMo-7B     0.240258            0.006607   
7            7         Qwen1.5-0.5B     0.329429            0.007268   
8            8         Qwen1.5-1.8B     0.368157            0.007458   
9            9          Qwen1.5-14B     0.531676            0.007716   
10          10          Qwen1.5-14B     0.531676            0.007716   
11          11           Qwen1.5-4B     0.436768            0.007670   
12          12           Qwen1.5-7B     0.502271            0.00

## Table 4

In [9]:
#Log-Log Regression Analysis of Model Scale vs Task Accuracy
import numpy as np
import statsmodels.api as sm

# Build the regression frame from the Table 3 results without touching `scale_evals`.
# OLMo rows are removed from the analysis. The explicit .copy() after filtering makes
# `data` an independent frame, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning (assigning onto a boolean-filtered slice).
data = scale_evals.loc[scale_evals['model_family'] != 'OLMo'].copy()

# Predictor: base-10 log of the parameter count.
data['log_param_count'] = np.log10(data['param_count'])

# Accuracy columns that get a regression each.
tasks = ['medmcqa/acc', 'medqa_4options/acc', 'mmlu_anatomy/acc',
         'mmlu_clinical_knowledge/acc', 'mmlu_college_biology/acc',
         'mmlu_college_medicine/acc', 'mmlu_medical_genetics/acc',
         'mmlu_professional_medicine/acc', 'pubmedqa/acc']

# Response: base-10 log of each task accuracy, stored as 'log_<task>'.
for task in tasks:
    data[f'log_{task}'] = np.log10(data[task])

# Function to perform log-log regression and return slope, p-value, and R^2
def log_log_regression(x, y):
    """Fit an OLS line of ``y`` on ``x`` (plus an intercept) and summarise the slope.

    The function itself does no log transform; it is a log-log regression because
    the caller passes log10-transformed columns.

    Args:
        x: Predictor values (the caller passes the 'log_param_count' column).
        y: Response values aligned with ``x`` (the caller passes a 'log_<task>' column).

    Returns:
        Tuple ``(slope, p_value, r_squared)``: the coefficient of ``x``, its p-value,
        and the R^2 of the fit.
    """
    design = sm.add_constant(x)  # adds the intercept column ahead of the predictor
    fit = sm.OLS(y, design).fit()
    # With pandas inputs, params/pvalues are Series labelled by column name, so an
    # integer key like params[1] relies on the deprecated positional fallback of
    # Series.__getitem__. Convert to arrays and take position 1 (the slope; position 0
    # is the intercept) so this works for both Series and ndarray results.
    slope = np.asarray(fit.params)[1]
    p_value = np.asarray(fit.pvalues)[1]
    return slope, p_value, fit.rsquared

# Collect one row per (model family, task): slope, p-value and R^2 of the fit.
results = []

for model_family in data['model_family'].unique():
    # The rows of a family are the same for every task, so select them once.
    family_rows = data[data['model_family'] == model_family]
    log_params = family_rows['log_param_count']
    for task in tasks:
        # 'mmlu_college_biology/acc' -> 'Mmlu College Biology'
        task_label = task.replace('/acc', '').replace('_', ' ').title()
        log_accuracy = family_rows[f'log_{task}']
        slope, p_value, r_squared = log_log_regression(log_params, log_accuracy)
        results.append([model_family, task_label, slope, p_value, r_squared])

table4_columns = ['Model Family', 'Task', 'Slope Coefficient', 'P-value', 'R^2']
results_df_table4 = pd.DataFrame(results, columns=table4_columns)


results_df_table4

Unnamed: 0,Model Family,Task,Slope Coefficient,P-value,R^2
0,Mamba,Medmcqa,-0.095008,0.124749,0.598614
1,Mamba,Medqa 4Options,-0.046384,0.167476,0.522841
2,Mamba,Mmlu Anatomy,0.067453,0.398219,0.243512
3,Mamba,Mmlu Clinical Knowledge,0.095133,0.309719,0.331503
4,Mamba,Mmlu College Biology,-0.084192,0.0068,0.937258
5,Mamba,Mmlu College Medicine,0.16358,0.1388,0.572384
6,Mamba,Mmlu Medical Genetics,0.002446,0.913208,0.004654
7,Mamba,Mmlu Professional Medicine,0.245517,0.090186,0.670091
8,Mamba,Pubmedqa,0.113127,0.022653,0.862306
9,Qwen,Medmcqa,0.153585,0.000349,0.969629


## Table 5