In [8]:
import pandas as pd

In [5]:
def get_last_question(text):
    """Return the final question (with its answer options) of a multi-question prompt.

    Only the text between the last ``Question:`` marker and the
    ``Correct Answer:`` marker that follows it is kept.

    Args:
        text: Prompt text. Non-string values (for example the float NaN that
            pandas produces for empty CSV cells) are treated as containing no
            question.

    Returns:
        ``"Question: <question and options>"``, or ``""`` when ``text`` is not
        a string or holds no ``Question:`` marker.
    """
    # Empty CSV cells are read back by pandas as float NaN, which has no .split().
    if not isinstance(text, str):
        return ""

    # Everything after the last 'Question:' marker is the final question.
    _, marker, last_question = text.rpartition('Question:')
    if not marker:
        return ""

    # Drop the answer key (and anything after it) that follows the options.
    question_and_options = last_question.partition('Correct Answer:')[0].strip()
    return f"Question: {question_and_options}"


# Load the raw prompts, reduce each one to its final question, and save the
# augmented table for the summarisation step.
df = pd.read_csv('all.csv')
df = df.assign(new_prompt=df['prompt'].apply(get_last_question))
df.to_csv('new_prompt.csv', index=False)

In [9]:
import pandas as pd
import ollama
from tqdm import tqdm
import concurrent.futures

# System prompt passed with every generate request in this cell.
system_message = (
    "You are a medical expert analyzing exam questions. "
    "Describe the key medical concepts without answering the question. "
    "dont write any unrelated information. "
    "dont include introduction and start directly with the answer . "
)

def process_prompt(prompt):
    """Ask the ``llama3:8b`` model (via ollama) which medical concepts a question tests.

    Args:
        prompt: Question text to describe.

    Returns:
        The ``'response'`` field of the ollama reply, or ``""`` when ``prompt``
        is not a string or is blank (no request is sent in that case).
    """
    # Rows without a question are stored as empty CSV cells and come back from
    # pandas as NaN; formatting that into the request would ask the model
    # about the literal text "nan".
    if not isinstance(prompt, str) or not prompt.strip():
        return ""

    response = ollama.generate(
        model='llama3:8b',
        system=system_message,
        prompt=f"""describe in one sentence the medical concepts being tested in the following question. don't try to answer the question.:

{prompt}

The medical concepts being tested in this question include: """
    )
    return response['response']


# Load the trimmed prompts produced by the previous step.
df = pd.read_csv('new_prompt.csv')
prompts = df['new_prompt'].tolist()

# executor.map yields results in input order, so wrapping it in tqdm keeps the
# summaries aligned with the DataFrame rows while the progress bar is advanced
# from the main thread only.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    summaries = list(
        tqdm(
            executor.map(process_prompt, prompts),
            total=len(prompts),
            desc="Processing prompts",
        )
    )

# Attach the summaries and write the result once.
df['summary'] = summaries
df.to_csv('output_file_with_summaries.csv', index=False)

KeyboardInterrupt: 

In [12]:
import pandas as pd
import ollama
from tqdm import tqdm
import concurrent.futures

# Number of prompts processed and written to disk per loop iteration; adjust as needed.
BATCH_SIZE = 10

# System prompt passed with every generate request in this cell.
system_message = (
    "You are a medical expert analyzing exam questions. "
    "Describe the key medical concepts without answering the question. "
    "dont write any unrelated information. "
    "dont include introduction and start directly with the answer . "
)

def process_prompt(prompt):
    """Ask the ``llama3:8b`` model (via ollama) which medical concepts a question tests.

    Args:
        prompt: Question text to describe.

    Returns:
        The ``'response'`` field of the ollama reply, or ``""`` when ``prompt``
        is not a string or is blank (no request is sent in that case).
    """
    # Rows without a question are stored as empty CSV cells and come back from
    # pandas as NaN; formatting that into the request would ask the model
    # about the literal text "nan".
    if not isinstance(prompt, str) or not prompt.strip():
        return ""

    response = ollama.generate(
        model='llama3:8b',
        system=system_message,
        prompt=f"""describe in one sentence the medical concepts being tested in the following question. don't try to answer the question.:

{prompt}

The medical concepts being tested in this question include: """
    )
    return response['response']

def process_batch(batch, max_workers=3):
    """Run ``process_prompt`` over a batch of prompts using a thread pool.

    Args:
        batch: Iterable of prompts, each passed to ``process_prompt``.
        max_workers: Number of worker threads used for the batch (default 3).

    Returns:
        List of ``process_prompt`` results, in the same order as ``batch``.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map returns results in input order regardless of which
        # request finishes first, so the list lines up with `batch`.
        return list(executor.map(process_prompt, batch))

# Load the prompts to summarise
df = pd.read_csv('new_prompt.csv')
total_rows = len(df)

# Work through the table BATCH_SIZE rows at a time, flushing each finished
# batch to disk so completed work survives an interruption.
with tqdm(total=total_rows, desc="Processing prompts") as pbar:
    for start_idx in range(0, total_rows, BATCH_SIZE):
        end_idx = min(start_idx + BATCH_SIZE, total_rows)
        # .loc slices include both end points, hence the -1
        rows = slice(start_idx, end_idx - 1)

        # Summarise this batch and store the results on the matching rows
        batch = df.loc[rows, 'new_prompt'].tolist()
        summaries = process_batch(batch)
        df.loc[rows, 'summary'] = summaries

        # The first batch creates the file (with header); later ones append
        first_batch = start_idx == 0
        df.loc[rows].to_csv(
            'output_file_with_summaries.csv',
            index=False,
            mode='w' if first_batch else 'a',
            header=first_batch,
        )

        pbar.update(end_idx - start_idx)

print("Processing complete. Results saved to 'output_file_with_summaries.csv'")

Processing prompts: 100%|██████████| 10175/10175 [2:49:25<00:00,  1.00it/s] 

Processing complete. Results saved to 'output_file_with_summaries.csv'





In [20]:
# NOTE: relies on `df` (including its 'summary' column) still being in memory
# from the batch-processing cell.
df1 = df
df2 = pd.read_csv('mcq_data_with_custom_ner_tags_cleaned.csv')
print(df1.columns)
print(df2.columns)
print(len(df1), len(df2))
# 'best_model' has to be carried through the merge; without it the column
# selection below raises KeyError.
merged_df = pd.merge(df1[['biomistral', 'meditron', 'medalpaca', 'summary', 'prompt']],
                     df2[['biomistral', 'meditron', 'medalpaca', 'correct_answer', 'best_model']],
                     on=['biomistral', 'meditron', 'medalpaca'],
                     how='inner')

# Select only the required columns
result_df = merged_df[['summary', 'prompt', 'correct_answer', 'best_model']]

print(len(result_df))
print(result_df.head())

Index(['prompt', 'biomistral', 'meditron', 'medalpaca', 'new_prompt',
       'summary'],
      dtype='object')
Index(['prompt', 'biomistral', 'meditron', 'medalpaca', 'correct_answer',
       'best_model', 'highest_probability', 'processed_prompt', 'tokens'],
      dtype='object')
10175 10034
10034
                                             summary  \
0  The key medical concepts being tested in this ...   
1  The medical concepts being tested in this ques...   
2  The medical concepts being tested in this ques...   
3  The key medical concepts being tested in this ...   
4  The key medical concepts being tested in this ...   

                                              prompt correct_answer  \
0  The following are multiple choice questions (w...              B   
1  The following are multiple choice questions (w...              C   
2  The following are multiple choice questions (w...              C   
3  The following are multiple choice questions (w...              A   
4  The f

In [3]:
import pandas as pd
# Load the generated summaries and the annotated MCQ table
df1 = pd.read_csv('output_file_with_summaries.csv')
df2 = pd.read_csv('mcq_data_with_custom_ner_tags_cleaned.csv')

# The three model-output columns serve as the join key between the two tables
model_cols = ['biomistral', 'meditron', 'medalpaca']
merged_df = df1[model_cols + ['summary', 'prompt']].merge(
    df2[model_cols + ['correct_answer']],
    on=model_cols,
    how='inner',
)

# Keep the summary, prompt and answer first, followed by the model outputs
result_df = merged_df[['summary', 'prompt', 'correct_answer'] + model_cols]

result_df.to_csv('summary_models_output.csv', index=False)

In [6]:
# Load the table holding summaries, prompts, correct answers and per-model outputs.
# NOTE(review): this file name differs from 'summary_models_output.csv', the file
# written by the merge cell — confirm this is the intended input.
df = pd.read_csv('summary_models_probabilities_correct_answer.csv')
# Display the available column names
df.columns

Index(['summary', 'prompt', 'correct_answer', 'biomistral', 'meditron',
       'medalpaca'],
      dtype='object')

In [7]:
import pandas as pd
import ast

def get_highest_prob_char(model_output):
    """Return the key with the highest value in a model's output mapping.

    Args:
        model_output: Either a dict, or the Python-literal string form of one
            (e.g. ``"{'A': 0.1, 'B': 0.9}"``).

    Returns:
        The key whose value is largest; on ties, the first such key in the
        mapping's iteration order.

    Raises:
        ValueError: If the mapping is empty, or the string is not a valid
            Python literal (``ast.literal_eval`` may also raise SyntaxError).
    """
    # Parse the literal once and reuse the result for both the iteration and
    # the key lookup; an already-parsed dict is used as-is.
    probabilities = (
        model_output if isinstance(model_output, dict)
        else ast.literal_eval(model_output)
    )
    return max(probabilities, key=probabilities.get)

def check_models(row):
    """Report which models picked the correct answer for one row.

    A model counts as correct when the option returned by
    ``get_highest_prob_char`` for its column equals ``row['correct_answer']``.

    Args:
        row: Row providing the 'biomistral', 'meditron', 'medalpaca' and
            'correct_answer' entries.

    Returns:
        pd.Series with 'models_are_correct' (list of model names) and
        'ids_are_correct' (their positions in the model list).
    """
    models = ['biomistral', 'meditron', 'medalpaca']
    # (position, name) pairs of the models whose top option matches the answer
    hits = [
        (position, name)
        for position, name in enumerate(models)
        if get_highest_prob_char(row[name]) == row['correct_answer']
    ]
    return pd.Series({
        'models_are_correct': [name for _, name in hits],
        'ids_are_correct': [position for position, _ in hits]
    })

# `df` is expected to be the DataFrame loaded in an earlier cell; it must provide the
# 'biomistral', 'meditron', 'medalpaca' and 'correct_answer' columns read by check_models.
# Evaluate every row and store the two returned lists as new columns.
df[['models_are_correct', 'ids_are_correct']] = df.apply(check_models, axis=1)

In [9]:
# Save the annotated table, then preview it: a notebook cell only displays its
# last expression, so `df.head()` has to come last to be shown.
df.to_csv('summary_models_are_correct.csv', index=False)
df.head()