<a href="https://colab.research.google.com/github/philosophy-question-answerer/model-tests-automated/blob/main/model_test_results_scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations, Imports and Third-Party Services

In [None]:
# Install the third-party packages imported below: the Cohere SDK and the
# `ratelimit` decorator library.
# NOTE(review): no versions are pinned here, so the installed releases may
# change between runs — consider pinning.
! pip install cohere
! pip install ratelimit

In [None]:
import os
import re
import time
import pprint
import cohere
from collections import OrderedDict
from ratelimit import limits
from google.colab import userdata, drive

In [None]:
# Read the Cohere API key from the Colab `userdata` store.
# The commented-out lines select alternate stored keys; uncomment exactly one
# to switch (presumably used when one key runs out of quota — confirm).
COHERE_API_KEY = userdata.get('COHERE_API_KEY')
# COHERE_API_KEY = userdata.get('COHERE_API_KEY_2')
# COHERE_API_KEY = userdata.get('COHERE_API_KEY_3')

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there.
drive.mount('/content/drive')
# Directory whose entries are read as model-test result files by the
# extraction cell below (one entry per model combination).
test_results_dir = '/content/drive/My Drive/Model Tests Results Cleaned'

# Data Preparation

## Extract All QA Pairs From All Test Result Files on Google Drive

In [None]:
# Maps a combination name (result file name without its extension) to the
# list of QA records parsed from that file.
all_combinations = {}

# One record looks like:
#   QUESTION: <text> ANSWER: <text> INFERENCE_TIME: <number>[ seconds]
# DOTALL lets the question and answer text span several lines. The time is
# captured as a well-formed integer or decimal so it always converts cleanly.
qa_pattern = re.compile(r"QUESTION:\s*(.*?)\s*ANSWER:\s*(.*?)\s*INFERENCE_TIME:\s*(\d+(?:\.\d+)?)( seconds)?", re.DOTALL)

# Sorted so that combinations are processed in a reproducible order
# (os.listdir returns entries in arbitrary order).
for combination in sorted(os.listdir(test_results_dir)):

    combination_path = os.path.join(test_results_dir, combination)

    # Only regular files can be parsed; skip sub-directories and the like
    # instead of crashing in open().
    if not os.path.isfile(combination_path):
        continue

    print(f'Processing file {combination}')

    with open(combination_path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    question_data = [
        {
            'question': question.strip(),
            'answer': answer.strip(),
            # Stored as float: the pattern accepts decimal times, which
            # int() would reject with a ValueError.
            'inference_time': float(inference_time),
            # Filled in later by the scoring step.
            'score': None,
        }
        for question, answer, inference_time, _unit in qa_pattern.findall(file_content)
    ]

    combination_key = os.path.splitext(combination)[0]
    all_combinations[combination_key] = question_data

## Print Resulting Dictionary

In [None]:
# Pretty-print the parsed QA records for every combination so the extraction
# can be checked by eye (scores are still None at this point).
pprint.pprint(all_combinations)

# Cohere Functions

## Generate Prompt

In [None]:
def generate_prompt(question, answer):
  """Build the grading prompt sent to the evaluator model.

  The prompt quotes the student's question and the candidate answer, then
  asks for a single integer score out of 100.

  Args:
    question: The question text to embed in the prompt.
    answer: The candidate answer text to embed in the prompt.

  Returns:
    The prompt string: three paragraphs, each indented by two spaces and
    separated by blank lines, with a leading newline and a trailing
    two-space line.
  """
  question_part = f"  Consider the following question about Ludwig Wittgenstein's Philosophical Investigations that a philosophy student may ask his/her professor: \"{question}\""
  answer_part = f"  The following is a candidate answer to the given question provided by an AI model in training: \"{answer}\""
  instruction_part = "  Evaluate this answer based on its accuracy, thoroughness, coherency and relevancy using your own knowledge of Wittgenstein's Philosophical Investigations, and strictly return ONLY an integer score out of 100."

  # Empty entries become the blank lines between paragraphs; the final
  # two-space entry is the indentation that precedes the end of the text.
  pieces = ["", question_part, "", answer_part, "", instruction_part, "  "]
  return "\n".join(pieces)

## Query Cohere

In [None]:
# Shared Cohere client used by query_cohere.
co = cohere.Client(COHERE_API_KEY)

# Length, in seconds, of the rate-limiting window.
one_minute = 60

@limits(calls=99, period=one_minute)
def query_cohere(prompt, model='command', temperature=0.9):
    """Send a prompt to the Cohere chat endpoint and return the reply text.

    The `limits` decorator caps this function at 99 calls per 60-second
    period; the scoring loop in this notebook handles the resulting
    rate-limit exception.

    Args:
        prompt: The message to send.
        model: Name of the Cohere model to query. Defaults to 'command'.
        temperature: Sampling temperature passed to the API. Defaults to
            0.9; a lower value can be passed for more repeatable scores.

    Returns:
        The `text` attribute of the chat response.
    """
    response = co.chat(message=prompt, model=model, temperature=temperature)
    return response.text

## Parse Score From Cohere Response

In [None]:
def parse_response(response):
    """Extract an integer score in the range 0-100 from a model reply.

    An explicit "N/100" or "N out of 100" takes priority, so an unrelated
    number earlier in a verbose reply (e.g. "based on 4 criteria ... 70/100")
    is not mistaken for the score. Otherwise the first standalone integer
    between 0 and 100 is used.

    Args:
        response: The reply text. Anything that is not a string (including
            None) is treated as having no score.

    Returns:
        The score as an int, or None when no score can be found.
    """
    if not isinstance(response, str):
        return None

    # The lookbehind stops the match from starting in the middle of a longer
    # or decimal number (e.g. the "5" of "85.5/100").
    explicit = re.search(
        r'(?<![\d.])(100|[1-9]?[0-9])\s*(?:/|out\s+of)\s*100\b',
        response,
        re.IGNORECASE,
    )
    if explicit:
        return int(explicit.group(1))

    # `[1-9]?[0-9]` covers 0-99, including a bare 0, without leading zeros.
    match = re.search(r'\b(100|[1-9]?[0-9])\b', response)
    if match:
        return int(match.group(1))
    return None

# Score All Model Combinations

## Collect Cohere Responses

In [None]:
# Imported here because only `limits` is imported at the top of the notebook,
# so the bare name `ratelimit` is not available to the except clause.
from ratelimit import RateLimitException

# Ask the evaluator model to grade every QA pair and store the parsed score
# back on the pair (None when no score could be parsed from the reply).
for combination, qa_pairs in all_combinations.items():

  print(f'Processing combination {combination}')

  for qa_pair in qa_pairs:

    prompt = generate_prompt(qa_pair['question'], qa_pair['answer'])

    try:
      response = query_cohere(prompt)
    except RateLimitException:
      # The local limiter refused the call: wait out a full window, then
      # retry once.
      print("Rate limit exceeded.")
      time.sleep(60)
      response = query_cohere(prompt)

    qa_pair['score'] = parse_response(response)

In [None]:
# Pretty-print the QA records again, now with the 'score' field set by the
# scoring loop above.
pprint.pprint(all_combinations)

## Find Average Score & Ratio of Scored Questions for Each Combination

In [None]:
# Maps each combination name to its average score and the percentage of its
# QA pairs that received a score.
all_combinations_avg_scores = {}

for combination, qa_pairs in all_combinations.items():

  # Keep only the pairs for which a score was parsed.
  scores = []
  for qa_pair in qa_pairs:
    print(qa_pair['score'])
    if qa_pair['score'] is not None:
      scores.append(qa_pair['score'])

  # The average stays None when nothing was scored; rounding is applied only
  # to a real number, because round(None, 1) raises TypeError.
  if scores:
    combination_avg_score = round(sum(scores) / len(scores), 1)
  else:
    combination_avg_score = None

  # A combination with no QA pairs at all is reported as 0% scored instead
  # of dividing by zero.
  if qa_pairs:
    ratio_scored_qa_pairs = len(scores) / len(qa_pairs) * 100
  else:
    ratio_scored_qa_pairs = 0.0
  ratio_str = str(round(ratio_scored_qa_pairs, 1)) + "%"

  all_combinations_avg_scores[combination] = {
    'avg_score': combination_avg_score,
    'ratio_scored_qa_pairs': ratio_str
  }

print(all_combinations_avg_scores)

## Sort Combinations by Score


In [None]:
# Order combinations from highest to lowest average score.
# An average of None (no scored answers) cannot be compared with a number,
# so the key sorts on "has a score" first: with reverse=True, scored
# combinations come first in descending order and unscored ones go last.
sorted_models = OrderedDict(
    sorted(
        all_combinations_avg_scores.items(),
        key=lambda item: (
            item[1]['avg_score'] is not None,
            item[1]['avg_score'] or 0,
        ),
        reverse=True,
    )
)

pprint.pprint(sorted_models)

## Save Results to a Text File

In [None]:
# Destination of the report on the mounted Google Drive.
save_dir = '/content/drive/My Drive'
file_name = 'combinations_sorted_by_avg_score'

file_path = os.path.join(save_dir, file_name)

# Write one entry per combination, in sorted order: the name, its average
# score and its scored-pairs ratio, followed by two blank lines.
with open(file_path, 'w') as results_file:
    for combination, stats in sorted_models.items():
        entry = (
            f"{combination}\n"
            f"avg_score: {stats['avg_score']}\n"
            f"ratio_scored_qa_pairs: {stats['ratio_scored_qa_pairs']}\n"
            "\n\n"
        )
        results_file.write(entry)