In [1]:
import json
import pandas as pd

file_path = '../results/raw_outputs/logged-samples/outputs_allenai/pretrained=allenai__OLMo-1B,trust_remote_code=True_pubmedqa.jsonl'


# Read the entire file content
with open(file_path, 'r') as file:
    file_content = file.read().strip()  # Read and strip unnecessary whitespace

# Attempt to parse the entire content assuming it's a properly formatted JSON array

data = json.loads(file_content)



data_list = []
for document in data:
    doc_id = document['doc_id']
    question = document['doc'].get('QUESTION', "")
    context = document['doc'].get('CONTEXTS', "")
    year = document['doc'].get('YEAR', "")
    labels = ", ".join(document['doc'].get('LABELS', []))
    meshes = ", ".join(document['doc'].get('MESHES', []))
    reasoning_required = document['doc'].get('reasoning_required_pred', "")
    reasoning_free = document['doc'].get('reasoning_free_pred', "")
    final_decision = document['doc'].get('final_decision', "")
    target = document['target']
    acc = document['acc']

    # Extract responses and determine the model's predicted answer based on the highest score
    responses = document['resps']
    # Assuming answers are in the order ['yes', 'no', 'maybe'] for the scores in `resps`
    response_labels = ['yes', 'no', 'maybe']
    highest_score_index = max(range(len(responses)), key=lambda i: responses[i][0][0])
    predicted_answer = response_labels[highest_score_index]

    # Check if the model's prediction matches the final decision
    model_correct = (predicted_answer == final_decision)

    data_list.append({
        "Doc ID": doc_id,
        "Context": context, 
        "Question": question, 
        "Year": year,
        "Labels": labels,
        "MESH Terms": meshes,
        "Reasoning Required": reasoning_required,
        "Reasoning Free": reasoning_free,
        "Final Decision": final_decision,
        "Target": target,
        "Model Accuracy (Reported)": acc,
        "Model Predicted Answer": predicted_answer,
        "Model Correct": model_correct
    })

df = pd.DataFrame(data_list)




In [2]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

# Create the NER pipeline
ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")


# Function to apply NER to a question
def extract_entities(question):
    entities = ner_pipe(question)
    entity_list = [entity['word'] for entity in entities]
    entity_type_list = [entity['entity_group'] for entity in entities]
    return entity_list, entity_type_list

# Apply NER to each question and expand the results into two new columns
df[['Biomedical Entities', 'Entity Types']] = df['Question'].apply(
    lambda x: pd.Series(extract_entities(x))
)




In [4]:
#same but for question context
df['Context'] = df['Context'].astype(str)

df[['Biomedical Entities in Context', 'Entity Types in Context']] = df['Context'].apply(
    lambda x: pd.Series(extract_entities(x))
)

In [6]:
df.to_csv('pubmedqa_all_ner_counts.csv', index=False)

In [20]:
#create new df from select columns 
entity_df = df[['Doc ID', 'Biomedical Entities', 'Entity Types']]

#entity_df.to_csv('biomedical_entities_pubmedqa.csv', index=False)


# Re-create the dictionary, ensuring correct mapping of each entity to its type across all rows
entity_to_type = []

for index, row in entity_df.iterrows():
    entities = row['Biomedical Entities']
    types = row['Entity Types']
    for entity, entity_type in zip(entities, types):
        entity_to_type.append({'Biomedical Entity': entity, 'Entity Type': entity_type})

# Convert the list of dictionaries into a DataFrame
entity_type_df_expanded = pd.DataFrame(entity_to_type)



Unnamed: 0,Biomedical Entity,Entity Type
0,anorectal,Sign_symptom
1,endosonography,Diagnostic_procedure
2,sublingual,Detailed_description
3,varices,Disease_disorder
4,hypertension,Sign_symptom


In [9]:
#Get list of all unique entities 
unique_entities = set()
for entities in df['Biomedical Entities']:
    unique_entities.update(entities)

unique_entities = list(unique_entities)
unique_entities 

['shoulder',
 'counter',
 'joint',
 'helicopter',
 'varicose veins',
 'transbronchial',
 'apnea',
 'optimistic',
 'major',
 'muscle power',
 'perineal',
 'type 2 diabetes',
 'chemotherapeutices',
 'enterocolitis',
 'hypertension',
 'delivery mode',
 'confined placental mosaicism',
 'reading comprehension',
 'treatment',
 'hospice care',
 'three months',
 "hirschsprung ' s disease",
 'valvar',
 'reflux resolution',
 'ototoxic',
 'third',
 'distal ureteral diameter',
 'arginine',
 'disability',
 'elderly patients',
 'follicle assessment',
 'intracranial',
 'better',
 'exist',
 'creatinine',
 'contralateral knee',
 'tobacco',
 'remote',
 'phagocytic',
 'cell',
 'posterior longitudinal ligament',
 'motor vehicle collisions',
 'ischemic',
 'psychophysiological responses',
 'circumcision',
 'intestinal',
 'mesial',
 'defecatory symptoms',
 'compensated',
 'alexithymia',
 'difference between',
 'hypotension',
 'cone beam',
 'modal',
 'mastoidectomy',
 'dental health',
 'endotracheal',
 'coron

In [5]:
#Get list of all unique entities in context text 
unique_entities_context = set()
for entities in df['Biomedical Entities in Context']:
    unique_entities_context.update(entities)

unique_entities_context = list(unique_entities_context)
unique_entities_context

['colorectal surgery',
 'homa',
 'spectral domain enhanced depth',
 'french',
 'plasminogen activator',
 'concussion',
 '605',
 '34 %',
 'xgc',
 'problems',
 'minimum inhibitory concentration',
 'beta1gly49',
 'thirty - two',
 'teaching sessions',
 'power output',
 'care givers',
 'real',
 'intrathecally',
 'iqs',
 'overweight',
 '51 %',
 'polymorphisms',
 '< or',
 'quieter',
 'recalled',
 'occlusion',
 'twenty two',
 '40',
 'frozen',
 'cavitated',
 'ptv',
 'receptor',
 'quality control gene',
 'response',
 '- 105',
 'aquagenic',
 'operative',
 '255',
 'complete',
 'radioimmunoassays',
 'mosaicism',
 'conventional screening',
 '29 % a± 14 %',
 'improvement',
 'surgeries',
 'ostial',
 'tmj',
 '0. 840',
 'surgical procedure',
 'dependent',
 'identical',
 'maternity',
 'faculty of medicine department of pathology',
 'under 1',
 '20a \\ xa0mm',
 'drugs',
 'less than',
 'goiter',
 'men',
 'proliferation',
 '71',
 'floating',
 'evla',
 'tubulointerstitial',
 'edematous',
 'anterior to superi

In [8]:
unique_entities = list(unique_entities)
unique_entities 



In [7]:
import requests
# Base URL for the API
api_url = 'https://api.infini-gram.io/'

# Function to get count from the API
def get_count(query):
    data = {
        "corpus": "v4_piletrain_llama", #The Pile Counts with Llama tokenizer
        "query_type": "count",
        "query": query
    }
    try:
        response = requests.post(api_url, json=data)
        response.raise_for_status()
        return response.json().get('count', 0)  # Return the count or 0 if not found
    except requests.RequestException as e:
        print(f"Error querying {query}: {str(e)}")
        return 0

# DataFrame to store the results
count_df = pd.DataFrame(columns=['Term', 'Count'])

# Process each term in the list
for term in unique_entities_context:
    # Query for both original and lower case versions
    original_count = get_count(term)
    lower_count = get_count(term.lower()) if term.lower() != term else original_count
    
    # Sum the counts (avoid double-counting if the term is already in lowercase)
    total_count = original_count + lower_count if term.lower() != term else original_count
    
    # Append the result to the dataframe
    count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)


count_df.to_csv('biomed_ner_context_infinigram_counts_pile.csv')

  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  count_df = count_df.append({'Term': term, 'Count': t

In [8]:
import requests
# Base URL for the API
api_url = 'https://api.infini-gram.io/'

# Function to get count from the API
def get_count(query):
    data = {
        "corpus": "v4_dolma-v1_6_llama", #Dolma Counts with Llama tokenizer
        "query_type": "count",
        "query": query
    }
    try:
        response = requests.post(api_url, json=data)
        response.raise_for_status()
        return response.json().get('count', 0)  # Return the count or 0 if not found
    except requests.RequestException as e:
        print(f"Error querying {query}: {str(e)}")
        return 0

# DataFrame to store the results
dolma_count_df = pd.DataFrame(columns=['Term', 'Count'])

# Process each term in the list
for term in unique_entities_context:
    # Query for both original and lower case versions
    original_count = get_count(term)
    lower_count = get_count(term.lower()) if term.lower() != term else original_count
    
    # Sum the counts (avoid double-counting if the term is already in lowercase)
    total_count = original_count + lower_count if term.lower() != term else original_count
    
    # Append the result to the dataframe
    dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)


dolma_count_df.to_csv('biomed_ner_context_infinigram_counts_dolma.csv')

  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = dolma_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  dolma_count_df = d

In [9]:
import requests
# Base URL for the API
api_url = 'https://api.infini-gram.io/'

# Function to get count from the API
def get_count(query):
    data = {
        "corpus": "v4_rpj_llama_s4", #redpajama Counts with Llama tokenizer
        "query_type": "count",
        "query": query
    }
    try:
        response = requests.post(api_url, json=data)
        response.raise_for_status()
        return response.json().get('count', 0)  # Return the count or 0 if not found
    except requests.RequestException as e:
        print(f"Error querying {query}: {str(e)}")
        return 0

# DataFrame to store the results
rpj_count_df = pd.DataFrame(columns=['Term', 'Count'])

# Process each term in the list
for term in unique_entities_context:
    # Query for both original and lower case versions
    original_count = get_count(term)
    lower_count = get_count(term.lower()) if term.lower() != term else original_count
    
    # Sum the counts (avoid double-counting if the term is already in lowercase)
    total_count = original_count + lower_count if term.lower() != term else original_count
    
    # Append the result to the dataframe
    rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)


rpj_count_df.to_csv('biomed_ner_context_infinigram_counts_rpj.csv')

  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count': total_count}, ignore_index=True)
  rpj_count_df = rpj_count_df.append({'Term': term, 'Count':

To do:

Get a df that has
Entity type | Entity term | Count | Training Corpus

In [None]:
#import ner count data 
