<a href="https://colab.research.google.com/github/philosophy-question-answerer/model-tests-automated/blob/main/model_test_results_scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations, Imports and Third-Party Services

In [None]:
! pip install cohere
! pip install ratelimit



In [None]:
import os
import pprint
import re
import time
from collections import OrderedDict

import cohere
import ratelimit
from google.colab import userdata, drive
from ratelimit import limits

In [None]:
# Read the Cohere API key from the Colab secrets store (google.colab.userdata).
COHERE_API_KEY = userdata.get('COHERE_API_KEY')
# Alternative secrets that can be swapped in by editing the line above
# (presumably to rotate keys when one runs out of quota -- TODO confirm):
# COHERE_API_KEY = userdata.get('COHERE_API_KEY_2')
# COHERE_API_KEY = userdata.get('COHERE_API_KEY_3')

In [None]:
# Mount Google Drive into the Colab filesystem so the result files can be read.
drive.mount('/content/drive')
# Folder whose entries are iterated by the QA-extraction cell below.
test_results_dir = '/content/drive/My Drive/Model Tests Results Cleaned'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Preparation

## Extract All QA Pairs From All Test Result Files on Google Drive

In [None]:
def _parse_inference_time(raw):
    """Convert the captured INFERENCE_TIME text to a number.

    Whole-number text (e.g. ``"40"``) is returned as an ``int``. Text with a
    decimal point (e.g. ``"40.5"``), which the extraction pattern also accepts,
    is returned as a ``float`` instead of raising ``ValueError`` from ``int()``.
    """
    try:
        return int(raw)
    except ValueError:
        return float(raw)


# Maps each result file's name (without extension) to its list of QA records.
all_combinations = {}

# One match per QUESTION / ANSWER / INFERENCE_TIME triple; DOTALL lets the
# question and answer text span several lines.
qa_pattern = re.compile(r"QUESTION:\s*(.*?)\s*ANSWER:\s*(.*?)\s*INFERENCE_TIME:\s*([\d.]+)( seconds)?", re.DOTALL)

for combination in os.listdir(test_results_dir):

    combination_path = os.path.join(test_results_dir, combination)

    # os.listdir also returns sub-folders, which cannot be opened as text files.
    if not os.path.isfile(combination_path):
        continue

    print(f'Processing file {combination}')

    # Only the read needs the file handle; parsing happens after it is closed.
    with open(combination_path, 'r') as file:
        file_content = file.read()

    # 'score' starts as None and is filled in by the scoring step.
    question_data = [
        {
            'question': question.strip(),
            'answer': answer.strip(),
            'inference_time': _parse_inference_time(raw_time),
            'score': None,
        }
        for question, answer, raw_time, _unit in qa_pattern.findall(file_content)
    ]

    combination_key = os.path.splitext(combination)[0]
    all_combinations[combination_key] = question_data

Processing file Llama_2_13B_chat_512_50_True.txt
Processing file Llama_2_13B_chat_2048_200_False.txt
Processing file Llama_2_7b_Chat_1500_100_False.txt
Processing file Llama_2_13B_chat_1500_100_True.txt
Processing file Llama_2_13B_chat_2048_200_True.txt
Processing file Llama_2_13B_chat_1500_300_False.txt
Processing file Llama_2_13B_chat_512_50_False.txt
Processing file Llama_2_7b_Chat_1500_100_True.txt
Processing file Llama_2_7b_Chat_1024_150_False.txt
Processing file Llama_2_13B_chat_2048_350_True.txt
Processing file Llama_2_7b_Chat_2048_200_False.txt
Processing file Llama_2_7b_Chat_1024_100_True.txt
Processing file Llama_2_13B_chat_1024_150_True.txt
Processing file Llama_2_13B_chat_1024_100_False.txt
Processing file Llama_2_13B_chat_1500_100_False.txt
Processing file Llama_2_13B_chat_2048_350_False.txt
Processing file Llama_2_7b_Chat_1024_150_True.txt
Processing file Llama_2_13B_chat_512_100_False.txt
Processing file Llama_2_7b_Chat_1500_300_False.txt
Processing file Llama_2_7b_Chat_

## Print Resulting Dictionary

In [None]:
# Printing the whole dictionary overflows the notebook's output limits, so show
# only the number of QA pairs per combination plus one sample record.
pprint.pprint({name: len(qa_pairs) for name, qa_pairs in all_combinations.items()})

for name, qa_pairs in all_combinations.items():
    if qa_pairs:
        pprint.pprint({name: qa_pairs[0]})
        break

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                                         "Wittgenstein's "
                                                         'perspective on '
                                                         'language-games '
                                                         'suggests that '
                                                         'epistemological '
                                                         'norms are not '
                                                         'inherently rational, '
                                                         'as custom often '
                                                         'dictates what '
                                                         'language-games get '
                                                         'played. The choice '
                                                         'to pursue certain '
                   

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



None},
                                  {'answer': '1. The notion of '
                                             "'language-games' challenges "
                                             'traditional epistemology by '
                                             'suggesting that our language use '
                                             'is conventional and open to '
                                             'alternative arrangements, rather '
                                             'than a fixed set of rules '
                                             'governing the acquisition and '
                                             'understanding of knowledge. This '
                                             'idea implies that there are '
                                             'multiple ways to interpret and '
                                             'use language, which in turn '
                                             'questions the universality a

# Cohere Functions

## Generate Prompt

In [None]:
def generate_prompt(question, answer):
  """Build the grading prompt sent to the scoring model.

  Args:
    question: The question text; inserted between double quotes.
    answer: The candidate answer text; inserted between double quotes.

  Returns:
    The filled-in prompt string. The template's leading newline, two-space
    indentation and trailing whitespace are part of the returned text.
  """
  prompt = f'''
  Consider the following question about Ludwig Wittgenstein's Philosophical Investigations that a philosophy student may ask his/her professor: "{question}"\n
  The following is a candidate answer to the given question provided by an AI model in training: "{answer}"\n
  Evaluate this answer based on its accuracy, thoroughness, coherency and relevancy using your own knowledge of Wittgenstein's Philosophical Investigations, and strictly return ONLY an integer score out of 100.
  '''
  return prompt

## Query Cohere

In [None]:
# Cohere client shared by every request made from this notebook.
co = cohere.Client(COHERE_API_KEY)

# Length of the rate-limit window, in seconds.
one_minute = 60


@limits(calls=99, period=one_minute)
def query_cohere(prompt):
    """Send ``prompt`` to Cohere's chat endpoint and return the reply text.

    The ``limits`` decorator is configured for 99 calls per ``one_minute``
    seconds; the scoring loop handles ``ratelimit.RateLimitException`` around
    calls to this function.
    """
    return co.chat(message=prompt, model='command', temperature=0.9).text

## Parse Score From Cohere Response

In [None]:
# First standalone integer in the range 0-100. Zero is included on purpose:
# without it a reply such as "0 out of 100" would be read as 100.
_SCORE_PATTERN = re.compile(r'\b(100|[1-9]?[0-9])\b')


def parse_response(response):
    """Extract the integer score from a model reply.

    Args:
        response: The reply text. Anything that is not a string (for example
            ``None``) is treated as containing no score.

    Returns:
        The first standalone integer between 0 and 100 found in ``response``,
        or ``None`` when there is none.
    """
    if not isinstance(response, str):
        return None

    match = _SCORE_PATTERN.search(response)
    return int(match.group()) if match else None

# Score All Model Combinations

## Collect Cohere Responses

In [None]:
# Ask Cohere to grade every QA pair and store the parsed score on the pair.
for combination, qa_pairs in all_combinations.items():

    print(f'Processing combination {combination}')

    for qa_pair in qa_pairs:

        prompt = generate_prompt(qa_pair['question'], qa_pair['answer'])

        try:
            response = query_cohere(prompt)
        except ratelimit.RateLimitException:
            # Needs the `ratelimit` module itself in scope (not just `limits`);
            # otherwise hitting the limit raises NameError instead of retrying.
            print("Rate limit exceeded.")
            # Wait out a full rate-limit window, then retry once.
            time.sleep(60)
            response = query_cohere(prompt)

        # parse_response returns None when the reply contains no usable score.
        qa_pair['score'] = parse_response(response)

Processing combination SOLAR_10p7B_Instruct_v1p0_512_100_False
Processing combination Mistral_7B_Instruct_v0p1_1500_100_True
Processing combination SOLAR_10p7B_Instruct_v1p0_512_50_True
Processing combination Mistral_7B_Instruct_v0p1_1500_100_False
Processing combination Mistral_7B_Instruct_v0p1_512_100_True
Processing combination Mistral_7B_Instruct_v0p2_1500_300_False
Processing combination orca_mini_v3_7B_2048_200_False
Processing combination Mistral_7B_Instruct_v0p1_1500_300_False
Processing combination SOLAR_10p7B_Instruct_v1p0_1500_100_False
Processing combination Mistral_7B_Instruct_v0p2_1024_100_True


In [None]:
# The full dictionary is too large to print (it trips the notebook's output
# rate limit), so list only the scores collected for each combination.
pprint.pprint(
    {
        name: [qa_pair['score'] for qa_pair in qa_pairs]
        for name, qa_pairs in all_combinations.items()
    },
    compact=True,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                                         'various purposes in '
                                                         'different contexts '
                                                         'without any single '
                                                         'common feature tying '
                                                         'them together. This '
                                                         'view challenges the '
                                                         'notion of a '
                                                         'universal framework '
                                                         'or set of conditions '
                                                         'for knowledge '
                                                         'acquisition and '
                                                         'justification. 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                              'meaning arises not just from '
                                              'mental states or inner rules '
                                              'but rather through shared '
                                              'social practices, norms, and '
                                              'psychological dispositions in '
                                              'human communities.',
                                    'inference_time': 40,
                                    'question': 'What is the relationship '
                                                'between forms of life and '
                                                'language?',
                                    'score': None},
                                   {'answer': 'The meaning of a word is its '
                                              'use in language because the us

## Find Average Score & Ratio of Scored Questions for Each Combination

In [None]:
# Maps each combination to its average score and the share of QA pairs scored.
all_combinations_avg_scores = {}

for combination, qa_pairs in all_combinations.items():

    # Pairs whose reply contained no parseable score are left out of the average.
    scores = [qa_pair['score'] for qa_pair in qa_pairs if qa_pair['score'] is not None]
    num_qa_pairs = len(qa_pairs)

    # None marks a combination with no scored pairs; rounding is applied only
    # to real averages because round(None, 1) raises TypeError.
    if scores:
        combination_avg_score = round(sum(scores) / len(scores), 1)
    else:
        combination_avg_score = None

    # A combination with no QA pairs at all counts as 0% scored rather than
    # dividing by zero.
    if num_qa_pairs:
        ratio_scored_qa_pairs = len(scores) / num_qa_pairs * 100
    else:
        ratio_scored_qa_pairs = 0.0
    ratio_str = str(round(ratio_scored_qa_pairs, 1)) + "%"

    all_combinations_avg_scores[combination] = {
        'avg_score': combination_avg_score,
        'ratio_scored_qa_pairs': ratio_str
    }

print(all_combinations_avg_scores)

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


## Sort Combinations by Score


In [None]:
def _avg_score_sort_key(item):
    """Sort key for ``(combination, stats)`` items: the average score.

    A missing average (``None``) is mapped to negative infinity so that it can
    be compared with real scores and ends up last in a descending sort.
    """
    avg_score = item[1]['avg_score']
    return float('-inf') if avg_score is None else avg_score


# Highest average first; combinations without an average go to the end.
sorted_models = OrderedDict(sorted(all_combinations_avg_scores.items(), key=_avg_score_sort_key, reverse=True))

pprint.pprint(sorted_models)

OrderedDict([('Mistral_7B_Instruct_v0p2_1500_300_False',
              {'avg_score': 87.8, 'ratio_scored_qa_pairs': '100.0%'}),
             ('Mistral_7B_Instruct_v0p2_1024_100_True',
              {'avg_score': 87.4, 'ratio_scored_qa_pairs': '90.5%'}),
             ('Mistral_7B_Instruct_v0p1_1500_300_False',
              {'avg_score': 87.2, 'ratio_scored_qa_pairs': '100.0%'}),
             ('Mistral_7B_Instruct_v0p1_1500_100_True',
              {'avg_score': 86.0, 'ratio_scored_qa_pairs': '95.2%'}),
             ('SOLAR_10p7B_Instruct_v1p0_512_100_False',
              {'avg_score': 85.3, 'ratio_scored_qa_pairs': '100.0%'}),
             ('Mistral_7B_Instruct_v0p1_1500_100_False',
              {'avg_score': 85.1, 'ratio_scored_qa_pairs': '100.0%'}),
             ('SOLAR_10p7B_Instruct_v1p0_512_50_True',
              {'avg_score': 85.0, 'ratio_scored_qa_pairs': '100.0%'}),
             ('SOLAR_10p7B_Instruct_v1p0_1500_100_False',
              {'avg_score': 84.3, 'ratio_scored_qa_p

## Save Results to a Text File

In [None]:
# Destination of the ranking report on the mounted Drive.
save_dir = '/content/drive/My Drive'
file_name = 'combinations_sorted_by_avg_score'
file_path = os.path.join(save_dir, file_name)

# One record per combination, in ranking order: name, average score, scored
# ratio, then a blank-line separator.
with open(file_path, 'w') as file:
    for combination, stats in sorted_models.items():
        file.writelines([
            combination + '\n',
            f"avg_score: {stats['avg_score']}\n",
            f"ratio_scored_qa_pairs: {stats['ratio_scored_qa_pairs']}\n",
            '\n\n',
        ])