In [81]:
#Import the required libraries, lots of these are required for the LLMs we utilize for three criteria. 
import Levenshtein
import spacy
import string
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
from sentence_transformers import SentenceTransformer, util
from lexicalrichness import LexicalRichness
import re
import math
import random
import numpy as np
nlp = spacy.load('en_core_web_lg')
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

#NLTK Imports
import nltk
nltk.download('stopwords') #Needed for query wellformedness
nltk.download('punkt') #Needed for query wellformedness
nltk.download('averaged_perceptron_tagger') #Needed for query wellformedness
nltk.download('wordnet') #Another one
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize, sent_tokenize

#OpenAI, but could be replaced with Gemini, Claude, etc.
import openai
openai.api_key = 'YOUR_KEY_HERE'
model_engine = "gpt-4" #For all popular LLMS, GPT-4 has proven the best for this process 

#Libraries for Perplexity, Diversity, Grammatical Error, Complexity, Answerability
from evaluate import load
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel, GPT2LMHeadModel, GPT2TokenizerFast
from nltk import trigrams
from nltk import ngrams
import language_tool_python

#Our class used to represent multiple-choice questions
nl = '\n'
class MultipleChoiceQuestion:
    """Plain container for one multiple-choice question.

    Stores, exactly as passed in, the question stem, the list of option
    strings, the correct option text, and an optional id and quality label.
    """
    def __init__(self, stem, options, correct_option, qid = None, quality = None):
        self.stem, self.options, self.correct_option = stem, options, correct_option
        self.qid, self.quality = qid, quality

    def __str__(self):
        #Options are joined one per line between the stem and the answer/quality lines
        option_lines = nl.join(self.options)
        return "Question: {}\n {}\nCorrect option: {}\nQuality: {}".format(
            self.stem, option_lines, self.correct_option, self.quality)

## Implausible Distractors
Make all distractors plausible, as good items depend on having effective distractors.


In [82]:
#Flags distractors whose meaning is too far from the correct answer to be plausible
def implausible_distractors(question):
    """Return False when a distractor looks implausible next to the correct answer, True otherwise.

    Each distractor is embedded with MiniLM and compared to the correct answer by
    cosine similarity. Only distractors scoring below 0.15 are examined further; such
    a distractor is still accepted when neither text yields a noun, when it shares
    enough surface text with the answer (Jaccard / Levenshtein), or when the LLM
    judges the pair related.

    Changes from the earlier version:
      * the answer is compared with however many distractors exist (the answer list
        used to be hard-coded to four entries, which failed for more than four distractors);
      * an acceptable low-scoring distractor no longer ends the check for the whole
        question, so every distractor is examined;
      * only ValueError is caught when removing the correct option.

    NOTE(review): the previous code also computed spaCy entity labels for the answer
    and the distractor but never compared them, so they had no effect on the result.
    That dead computation was dropped; re-add a real comparison here if the NER
    shortcut is wanted.
    """
    correct = question.correct_option
    options = question.options.copy()

    for opt in question.correct_option.split('[SEP]'):
        try:
            options.remove(opt)
        except ValueError:
            print('error trying to remove an option, there might be an incorrect space present: ', opt)

    #No distractors left to judge
    if not options:
        return True

    #A none/all of the above option won't be similar to the answer, so ignore this criteria for such questions
    if not all_of_the_above(question) or not none_of_the_above(question):
        return True

    #MiniLM from: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
    model = SentenceTransformer('all-MiniLM-L6-v2')

    #Compute embeddings: the answer once, then every distractor
    answer_embedding = model.encode([correct], convert_to_tensor=True)
    option_embeddings = model.encode(options, convert_to_tensor=True)

    #Row 0 holds the similarity of the answer to each distractor
    cosine_scores = util.cos_sim(answer_embedding, option_embeddings)[0]

    #The answer's nouns do not change between distractors
    lemma_nouns_ans = get_lemma_nouns(correct)

    for distractor, score in zip(options, cosine_scores):
        if not score < 0.15:
            continue

        #Couldn't find a noun on either side? Unable to parse effectively to make a judgement.
        if len(lemma_nouns_ans) == 0 and len(get_lemma_nouns(distractor)) == 0:
            continue

        #Low distance like this means it likely shares some words and should not be flagged
        if jaccard_similarity(correct, distractor) > .15 or Levenshtein.distance(correct, distractor) < (len(correct)*.7):
            continue

        #Before saying a distractor is implausible, let's have GPT-4 make a judgement call
        #If GPT-4 is too generous/strict on this call, we can try using updated word embeddings from openai which might be better for the domain jargon
        if are_they_similar(question.stem, correct, distractor):
            print('LLM says they are similar: ', correct, ' -and- ', distractor)
            continue

        print("Distractor not similar enough: {} \t\t {} \t\t Score: {:.4f}".format(correct, distractor, score))
        return False

    return True

#Statistic used for gauging the similarity and diversity of text
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated word sets of two strings.

    The result is len(intersection) / len(union), a float between 0 and 1.
    When neither string contains any word the union is empty; 0.0 is returned
    in that case instead of dividing by zero.
    """
    # Convert strings to sets of words
    words1 = set(str1.split())
    words2 = set(str2.split())

    union = words1 | words2
    if not union:
        #Nothing to compare: treat as "no shared words"
        return 0.0

    return len(words1 & words2) / len(union)

#The prompt is very basic, but has proved effective through a variety of domains
def are_they_similar(stem, answer, option):
    """Ask the LLM whether `option` is related to `answer` or is a plausible distractor for `stem`.

    Returns True when the lower-cased reply contains 'yes', False when it contains
    'no', and otherwise the reply text itself ('blank' if the reply could not be read).
    The API call is retried indefinitely, sleeping 10 seconds after each failure.
    """
    sysrole = """You are an expert and an astute instructor. Given two options to a multiple-choice question, you will respond 'Yes' if they are related in some way or if option 2 is a somewhat plausible distractor to the question. You will respond with 'No' if the two options are not related or if option 2 is not a somewhat plausible distractor for the question.
    Effectively you are seeing if both options are potential plausible distractors or answers for the question, but they do not need to be perfect.
    """
    prompt = """question: {}
    option 1: {}
    option 2: {}
    """.format(stem, answer, option)
    messages = [
        {"role": "system", "content": sysrole},
        {"role": "user", "content": prompt},
    ]
    expert_reasoning = 'blank'

    #Keep asking until the API call goes through
    while True:
        try:
            completion = openai.chat.completions.create(
                model=model_engine,
                messages=messages,
                max_tokens=4096,
                temperature=0.7,
            )
            break
        except Exception as error:
            print('errored in LLM API call: ', error)
            time.sleep(10)

    try:
        expert_reasoning = completion.choices[0].message.content.lower()
    except:
        print('error with LLM: ', completion)

    #'yes' is looked for first, then 'no'
    for verdict, keyword in ((True, 'yes'), (False, 'no')):
        if keyword in expert_reasoning:
            return verdict

    return expert_reasoning

## None Of The Above
Avoid "none of the above" options, as they only really measure students' ability to detect incorrect answers.

In [83]:
def none_of_the_above(question):
    """Return False when the question uses a "none of the above" style option, True otherwise.

    An option counts as such when, after stripping and lower-casing, it mentions
    both 'none' and 'above', starts with 'none of', or is exactly 'neither'.
    The final option additionally counts when it merely contains 'none'.

    The final-option check used to run on the raw text, so "None" was missed
    while "none" was caught; it is now case-insensitive like the other checks
    and evaluated once instead of on every loop iteration.
    """
    cleaned_options = [opt.strip().lower() for opt in question.options]
    if not cleaned_options:
        return True

    #A last option mentioning 'none' is treated as a none-of-the-above variant
    if 'none' in cleaned_options[-1]:
        return False

    for cleaned_opt in cleaned_options:
        if ('none' in cleaned_opt and 'above' in cleaned_opt) or cleaned_opt.startswith('none of') or cleaned_opt == 'neither':
            return False
    return True

## All Of The Above
Avoid all of the above options as students can guess correct responses based on partial information

In [84]:
def all_of_the_above(question):
    """Return False when any option reads like "all of the above", True otherwise.

    Each option is stripped and lower-cased, then flagged when it contains both
    'all' and 'above', or one of the phrases 'all of the above', 'all if the',
    'all of the'.
    """
    phrases = ('all of the above', 'all if the', 'all of the')
    for raw_opt in question.options:
        text = raw_opt.strip().lower()
        mentions_all_and_above = 'all' in text and 'above' in text
        if mentions_all_and_above or any(phrase in text for phrase in phrases):
            return False

    return True

## Fill-In-The-Blank
Avoid omitting words in the middle of the stem that students must insert from the options provided 

In [85]:
#Programming questions might contain a single underscore, so check for multiple
def fill_in_the_blank(question):
    """Return False when the stem contains a blank ("__") or the phrase 'fill in the blank', True otherwise."""
    stem = question.stem
    has_blank = "__" in stem
    asks_to_fill = 'fill in the blank' in stem.lower()
    return not (has_blank or asks_to_fill)

## True/False
The options should not be a series of true/false statements

In [86]:
def true_or_false(question):
    """Return False when the question is a true/false (or yes/no) item, True otherwise.

    A question is treated as true/false when a single '.'-delimited sentence of the
    stem mentions both 'true' and 'false', or when any option, stripped and
    lower-cased, is exactly 'true', 'false', 'yes' or 'no'.
    """
    options = question.options.copy()

    #Check for true & false mentioned together in one sentence of the stem
    for sentence in question.stem.lower().split('.'):
        if 'true' in sentence and 'false' in sentence:
            return False

    binary_answers = ('true', 'false', 'yes', 'no')
    return not any(opt.strip().lower() in binary_answers for opt in options)

## Absolute Terms
Avoid absolute terms (e.g. never, always, all) in both the question stem, where they can be confusing, and the options, where students know that such statements are almost always false.

In [87]:
#The list of absolute terms can be different for the stem and options, but we need to be careful here, as sometimes these can be used in proper ways
absolutes = ["always", "never", "none", "all", "completely", "absolutely", "totally", "definitely", "incapable", "inevitable"]
def absolute_terms(question):
    """Return False when the stem or an option uses an absolute term in a flawed way, True otherwise.

    Stem: if any whitespace-separated word of the lower-cased stem is in
    `absolutes`, true/false questions pass outright; otherwise the LLM verifier
    decides. Options: an option containing an absolute term (substring match) is
    only considered when the question has no none/all-of-the-above option and is
    not true/false; options containing 'all' are sent to the LLM verifier, any
    other absolute term flags the question directly.

    Fix: absolute_terms_verify answers True when the term makes the text flawed,
    so its verdict is now negated for the stem as well (it already was for the
    options). Previously a flawed stem was reported as passing.
    """
    #Check for terms in the question stem, if we find any, have GPT-4 help us verify the use of it.
    stem = question.stem.lower()
    stem_words = stem.split()
    if any(word in stem_words for word in absolutes):
        if not true_or_false(question):
            return True
        #Verifier True means "flawed"; this criterion returns True for "passes"
        return not absolute_terms_verify(stem)

    #Check for terms in the options, if we find any, have GPT-4 help us verify the use of it.
    absolutes_options = ["always", "never", "none", "completely", "absolutely", "totally", "definitely", "incapable", "inevitable", "all"]
    for opt in question.options:
        cleaned_opt = opt.strip().lower()

        #Count all, which is a special case, but not in the case of "all of the above"
        if any(word in cleaned_opt for word in absolutes_options):
            if none_of_the_above(question) and all_of_the_above(question) and true_or_false(question):
                if "all" in cleaned_opt:
                    return not absolute_terms_verify(cleaned_opt)
                return False

    return True

#Prompt can be modified, specifically the five terms we provide as an example, but this one has proven quite successful. 
def absolute_terms_verify(prompt):
    """Ask the LLM whether `prompt` uses absolute terms as a blanket generalization or hyperbole.

    Returns True when the lower-cased reply contains 'yes' (the term makes the
    question flawed), False when it contains 'no', and otherwise the reply text
    itself ('blank' if the reply could not be read). The API call is retried
    indefinitely, sleeping 10 seconds after each failure.
    """
    example_terms = " ".join(['all', 'only', 'always', 'never', 'none'])
    sysrole = """You are an expert, nitpicky, and astute instructor. 
    Given a phrase that is used as part of a multiple-choice question, you will check if contains absolute terms, such as {}, that are used in a way which constitutes a blanket generalization or hyperbole. 
    It is fine if the text contains these terms, as long as they are not used in a way that might signal the phrase is clearly correct or incorrect.
    You will respond 'Yes' if they are used in this way or 'No' if they are not.
    """.format(example_terms)
    messages = [
        {"role": "system", "content": sysrole},
        {"role": "user", "content": prompt},
    ]
    expert_reasoning = 'blank'

    #Keep asking until the API call goes through
    while True:
        try:
            completion = openai.chat.completions.create(
                model=model_engine,
                messages=messages,
                max_tokens=4096,
                temperature=0.7,
            )
            break
        except Exception as error:
            print('errored in GPT-4 API call: ', error)
            time.sleep(10)

    try:
        expert_reasoning = completion.choices[0].message.content.lower()
    except:
        print('error with LLM: ', completion)

    #'yes' means an absolute term was used in a manner that makes the question flawed
    for verdict, keyword in ((True, 'yes'), (False, 'no')):
        if keyword in expert_reasoning:
            return verdict

    return expert_reasoning

## Longest Answer Correct
Often the correct option is longer and includes more detailed information, which clues students to this option

In [88]:
#If the correct answer is noticably longer (more than 25%) than the second longest answer, flag it.
def longest_answer_correct(question):
    """Return False when the correct option is markedly longer than every distractor, True otherwise.

    True/false questions and questions whose correct option contains '[SEP]' always
    pass. Otherwise the question passes when the longest distractor is at least 75%
    of the correct option's character length, or the correct option has fewer than
    four words.
    """
    #Ignore this criteria for True/False questions and multi-answer questions
    if not true_or_false(question) or '[SEP]' in question.correct_option:
        return True

    correct = question.correct_option
    distractors = question.options.copy()
    #No '[SEP]' at this point, so the correct option is a single entry
    distractors.remove(correct.strip())

    longest_distractor = max((len(opt) for opt in distractors), default=0)
    is_short_answer = len(correct.split()) < 4

    #Passes when the gap is within 25% or the answer is three words or less
    return longest_distractor >= len(correct) *.75 or is_short_answer

## Negative worded
Negatively worded stems are less likely to measure important learning outcomes and can confuse students

In [89]:
#The list of negative words can potentially cause this to be too restrictive, particularly for words such as can't and won't
def negative_worded_stem(question):
    """Return False when the stem is negatively worded, True otherwise.

    A stem is flagged when one of its words (lower-cased, surrounding punctuation
    trimmed) is in the negatives list, or when a '.'-delimited sentence contains
    'which' or 'what' together with 'false', 'incorrect', 'except', or the whole
    word 'not' / 'cannot'.

    Fixes: 'not' is matched as a whole word, so stems such as "Which is another
    name for ..." are no longer flagged because "another" contains "not"; and
    tokens with attached punctuation ("not," / "never?") are now recognised.
    """
    negatives = ["none", "never", "without", "exclude", "deny", "refuse", "oppose", "dispute", "can't", "won't", "not"]

    stem = question.stem.lower()
    #Trim punctuation around each token; the apostrophe inside "can't" is kept
    stem_words = {token.strip(string.punctuation) for token in stem.split()}
    if any(word in stem_words for word in negatives):
        return False

    for sent in stem.split('.'):
        asks_which_or_what = 'which' in sent or 'what' in sent
        negated = ('false' in sent or 'incorrect' in sent or 'except' in sent
                   or re.search(r'\b(?:not|cannot)\b', sent) is not None)
        if asks_which_or_what and negated:
            return False

    return True

## Word Repeats
Avoid similarly worded stems and correct responses or words repeated in the stem and correct response

In [90]:
#Original plan: find the nouns in question.correct_option and question.stem --> stem them --> compare cosine similarity (using a sentence transformer)
#Also check for the synonyms, compare them. However, if the word(s) are used in the other options, then it's fine.
#Nouns: NN noun, singular ('desk'); NNS noun, plural ('desks'); NNP proper noun, singular ('Harrison'); NNPS proper noun, plural ('Americans')
#Shared WordNet lemmatizer and the list of POS tags treated as nouns; both are read by get_lemma_nouns
lemmatizer = WordNetLemmatizer()
nouns = ['NN', 'NNS', 'NNP', 'NNPS']

def word_repeats_in_stem_and_correct_answer(question):
    """Return False when a stem word repeated in the correct answer cues it, True otherwise.

    Candidate cue words are stem words (punctuation stripped, stop words removed)
    that occur in the correct option text but in none of the distractors. The
    question is only flagged when at least one candidate has four or more
    characters and at least one candidate is tagged as a noun or verb.

    For multi-answer questions ('[SEP]' in the correct option) the question is
    flagged when some candidate occurs in both of the first two correct parts.
    Otherwise it is flagged unless every option text appears verbatim in the stem.

    Fix: the '[SEP]' branch used to return from inside its loop on the first
    candidate, whichever way the test went, so later candidates were never
    inspected. All candidates are now checked. The unused joined-options string
    was removed.
    """
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    #This code checks for matching words, specifically nouns and verbs, between the correct answer and stem
    word_types = ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBG', 'VBN', 'VBP', 'VBZ']
    stem = strip_punctuation(question.stem)

    #Stem words found in the correct answer but in no distractor
    cue_words = []
    for wrd in set(stem.split()):
        if wrd in stop_words or wrd not in question.correct_option:
            continue
        if any(wrd in opt for opt in options):
            continue
        cue_words.append(wrd)

    #Only words of 4+ characters matter, because shorter non-matching verbs/nouns typically are not cues
    if not any(len(wrd) >= 4 for wrd in cue_words):
        return True

    tagged = nltk.pos_tag(cue_words)
    if not any(tag in word_types for _, tag in tagged):
        return True

    if '[SEP]' in question.correct_option:
        correct_parts = question.correct_option.split('[SEP]')
        for wrd in cue_words:
            if wrd in correct_parts[0] and wrd in correct_parts[1]:
                print('*** SEP')
                return False
        return True

    #There's the potential false positive where all answer choices are repeated in the question's stem
    stem_lower = question.stem.lower()
    if all(opt.lower() in stem_lower for opt in question.options):
        return True

    return False

def strip_punctuation(text):
    """Return `text` with every character found in string.punctuation removed."""
    kept = []
    for char in text:
        if char in string.punctuation:
            continue
        kept.append(char)
    return ''.join(kept)

#This is now used for Logical Cue
def get_lemma_nouns(text):
    """Return the lower-cased, lemmatized nouns found in `text`, in order of appearance.

    The text is split into sentences, each sentence into tokens; stop words are
    dropped, the remaining tokens are POS-tagged, and tokens whose tag is listed
    in `nouns` are lemmatized as nouns.
    """
    lemmatized_nouns = []
    for sentence in sent_tokenize(text):
        #Word tokenizer finds the words and punctuation; stop words are removed before tagging
        tokens = [tok for tok in nltk.word_tokenize(sentence) if tok not in stop_words]

        #POS-tag the remaining tokens and keep only the nouns
        for token, tag in nltk.pos_tag(tokens):
            if tag not in nouns:
                continue
            lemma = lemmatizer.lemmatize(token.lower(), pos="n")
            lemmatized_nouns.append(lemma.lower())

    return lemmatized_nouns

## Logical Cue - This one is challenging, requires domain knowledge.

In [91]:
#An example of a logical cue is asking students to select the most appropriate pharmaceutical intervention for a problem while only one or two options are of that kind.
#Using NER, if the question asks for a certain type of noun, like a <person>, then the options should all be <people> too.

def avoid_logical_cues(question):
    """Return False when the correct answer stands out by entity type or by being numeric, True otherwise.

    Distractors are the options left after removing the correct option(s).
    With fewer than two distractors the question passes. The question is
    flagged when
      * a noun of the correct answer gets a spaCy entity label that none of the
        distractors' nouns receive, or
      * the stem contains a number and exactly one distractor contains a number.

    Fix: noun lists are built for any number of distractors. They used to be
    built only for 2, 3 or 4 distractors, so a question with more failed on an
    unassigned variable. Unused counters were removed.
    """
    options = question.options.copy()

    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    if len(options) < 2:
        return True

    lemma_nouns_answ = get_lemma_nouns(question.correct_option)
    lemma_nouns_options = [get_lemma_nouns(opt) for opt in options]

    #Entity labels seen on any noun of any distractor
    entities_in_options = set()
    for opt_nouns in lemma_nouns_options:
        for noun in opt_nouns:
            doc = nlp(noun)
            if doc.ents:
                entities_in_options.add(doc.ents[0].label_)

    #If a noun in the answer can be tagged with an entity, some distractor must share that entity type
    for noun in lemma_nouns_answ:
        doc = nlp(noun)
        if doc.ents and doc.ents[0].label_ not in entities_in_options:
            return False

    #If the stem has a number and only one distractor has a number
    if len(extract_all_numerical_values(question.stem)) > 0:
        numbers_in_options = sum(1 for opt in options if len(extract_all_numerical_values(opt)) > 0)
        if numbers_in_options == 1:
            return False

    return True

## Lost Sequence
If options are numerical, they should go lowest to highest or vice versa, not in a random arrangement.

In [92]:
#If answer choices are numeric, sort them, compare to current order
#If all but one are numerical, the numeric ones must still be in order.

def lost_sequence(question):
    """Return False when numeric options are not in ascending or descending order, True otherwise.

    An option is numeric when it contains a fraction, or exactly one numerical
    value (thousands separators are removed before conversion). The question
    passes without an order check when two or more options are non-numeric.
    With at most one non-numeric option, the numeric values must be sorted in
    either direction. The position of the non-numeric option is not checked.

    Fix: a fraction that evaluates to 0.0 (e.g. "0/5") is now kept as a numeric
    value; the previous truthiness test treated it as "no fraction found".
    """
    options = question.options.copy()
    numeric_values = []
    non_numerical_option = 0
    for opt in options:
        #First check for fractions
        fraction = extract_fraction_to_float(opt)
        if fraction is not None:
            numeric_values.append(float(fraction))
            continue

        values = extract_all_numerical_values(opt)
        if len(values) == 1:
            numeric_values.append(float(values[0].replace(',', '')))
        else:
            non_numerical_option = non_numerical_option + 1

    #Every option is either numeric or not, so "more than one non-numeric" is the only mixed case that is skipped
    if non_numerical_option > 1:
        return True

    ascending = sorted(numeric_values)
    descending = sorted(numeric_values, reverse=True)

    #Sorted in either direction is fine; reverse order might make sense for the question
    return numeric_values == ascending or numeric_values == descending

def extract_all_numerical_values(s):
    """Return every numeric substring of `s`, as strings, in order of appearance.

    Two shapes are recognised, each with an optional leading minus sign:
    a decimal (digits and comma-separated groups of three, then '.' and digits)
    or a whole number with optional comma-separated groups of three.
    """
    decimal_number = r'-?\d*(?:,\d{3})*\.\d+'
    whole_number = r'-?\d+(?:,\d{3})*'
    number_regex = re.compile(decimal_number + '|' + whole_number)
    return number_regex.findall(s)

# Regex pattern to match fractions with optional decimal numerator and/or denominator
def extract_fraction_to_float(s):
    """Return the value of the first fraction found in `s` as a float, or None.

    Only the first "numerator/denominator" match is used. None is returned when
    no fraction is present, and also when the denominator is zero (this used to
    raise ZeroDivisionError).
    """
    pattern = r'-?\b\d+(\.\d+)?/\d+(\.\d+)?\b'
    match = re.search(pattern, s)
    if match is None:
        return None

    numerator, denominator = match.group().split("/")
    divisor = float(denominator)
    if divisor == 0:
        #Not a usable numeric value
        return None
    return float(numerator) / divisor

## More Than One Correct
There are two ways to do this. One is to present the options of the MCQ along with the question to the LLM and ask if more than one is correct. This produces a ton of false positives, however, so we're going with a more basic approach of "can the LLM correctly answer the problem". This does not necessarily tell us if more than one option is correct, but the assumption is that if the LLM gets it, then that is likely the singular correct answer. However, the LLM answering incorrectly may just mean the question is difficult and requires a higher level of Bloom's taxonomy. At some point this criterion should be refined.

In [93]:
#Using GPT-4 for QA, the model can confirm if the correct answer is correct, however it is not that accurate.
#In the future, utilize a better QA model as improvements are made.

def more_than_one_correct(question):
    """Ask the LLM whether the keyed answer is a possible correct answer to the stem.

    Returns True when the lower-cased reply contains 'yes', False otherwise
    (including when the reply could not be read). The API call is retried
    indefinitely, sleeping 10 seconds after each failure.
    """
    #Can adjust this prompt to be much more domain specific, but it works fine for the 15 test questions I'm using.
    sysrole = "You are an expert and an astute instructor. Given a multiple-choice quesiton and answer, you will confirm if it is a possible correct answer to the question or not. If it is a possible correct answer, respond with 'Yes' and if it is not then respond with 'No'"
    prompt = """
        question: {}
        
        answer: {}
    """.format(question.stem, question.correct_option)
    messages = [
        {"role": "system", "content": sysrole},
        {"role": "user", "content": prompt},
    ]
    expert_reasoning = 'blank'

    #Keep asking until the API call goes through
    while True:
        try:
            completion = openai.chat.completions.create(
                model=model_engine,
                messages=messages,
                max_tokens=4096,
                temperature=0.7,
            )
            break
        except Exception as error:
            print('errored in LLM API call: ', error)
            time.sleep(10)

    try:
        expert_reasoning = completion.choices[0].message.content.lower()
    except:
        print('error with LLM: ', completion)

    return 'yes' in expert_reasoning.lower()

## Complex or K-type
Avoid questions that have a range of correct responses, that ask students to select from a number of possible combinations of the responses

In [94]:
#If the answer options share the same words between one another and there are commas present then it's k type
def complex_k_type(question):
    """Return False when the options look like a complex / K-type question, True otherwise.

    Questions with an all/none-of-the-above option, true/false questions, and
    questions with fewer than three distractors pass outright, as do questions
    whose distractors each contain the word 'yes' or 'no'. A question is flagged
    when every distractor contains a comma and some distractor shares a noun
    with the correct answer, or when all distractors reduce to the same set of
    words once list notation is removed. Anything else is left to the LLM verifier.

    Fixes:
      * 'yes' / 'no' are matched as whole words; substring matching let any
        distractor containing e.g. "not" or "know" count as a yes/no option;
      * noun sharing and word-set comparison cover every distractor instead of
        only the first three.
    """
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    if not all_of_the_above(question) or not none_of_the_above(question) or not true_or_false(question):
        return True

    if len(options) < 3:
        return True

    #Yes or No options are fine and might contain repeat noun, so ignore those if all options are effectively yes/no + reason
    if all(re.search(r'\b(?:yes|no)\b', opt.lower()) for opt in options):
        return True

    #check if every option contains a comma
    every_option_has_comma = all(',' in opt for opt in options)

    lemma_nouns_answ = get_lemma_nouns(question.correct_option)
    shares_noun_with_answer = any(
        set(get_lemma_nouns(opt)).intersection(lemma_nouns_answ) for opt in options
    )

    #Options share a key word and they all have a comma, suggesting it might be a k-type question
    if shares_noun_with_answer and every_option_has_comma:
        return False

    #After removing any list notation in the answer choices, see if they contain the same words
    word_sets = [set(clean_string(opt).split()) for opt in options]
    if all(words == word_sets[0] for words in word_sets[1:]):
        return False

    return complex_k_type_verify(question)

def clean_string(string):
    """Return `string` stripped, without punctuation, and without list notation.

    List notation means standalone lower-case roman numerals i through xx and
    standalone capital letters A through F. Surrounding spaces are left in place.
    """
    roman_numerals = r'\b(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv|xvi|xvii|xviii|xix|xx)\b'
    option_letters = r'\b(A|B|C|D|E|F)\b'

    #Trim outer whitespace, then drop everything that is neither a word character nor whitespace
    cleaned_string = re.sub(r'[^\w\s]', '', string.strip())

    #Remove list notation, numerals first and letters second
    for list_marker in (roman_numerals, option_letters):
        cleaned_string = re.sub(list_marker, '', cleaned_string)
    return cleaned_string


def complex_k_type_verify(question):
    """Ask the LLM whether the distractors are combinations of one another (K-type).

    Returns False when the lower-cased reply contains 'yes' (K-type, flag it),
    True when it contains 'no', and otherwise the reply text itself ('blank' if
    the reply could not be read). The API call is retried indefinitely, sleeping
    10 seconds after each failure.
    """
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    sysrole = """Answer only with "Yes" or "No"."""

    #They also might have options that refer to other options in the list. --> This was on the very end of the statement below.
    prompt = """
    Are some of these options to a multiple-choice questions partially different combinations of one another, similar to being a K-type question? They may have values separated by "and", "or", "only" or divided by commas or semicolons.
    
    {}
    """.format(options)
    messages = [
        {"role": "system", "content": sysrole},
        {"role": "user", "content": prompt},
    ]
    expert_reasoning = 'blank'

    #Keep asking until the API call goes through
    while True:
        try:
            completion = openai.chat.completions.create(
                model=model_engine,
                messages=messages,
                max_tokens=4096,
                temperature=0.7,
            )
            break
        except Exception as error:
            print('errored in GPT-4 API call: ', error)
            time.sleep(10)

    try:
        expert_reasoning = completion.choices[0].message.content.lower()
    except:
        print('error with LLM: ', completion)

    #'yes' means it is used in the bad way, so the question fails
    for verdict, keyword in ((False, 'yes'), (True, 'no')):
        if keyword in expert_reasoning:
            return verdict

    return expert_reasoning

## Ambiguous or Unclear Information
Questions and all options should be written in clear, unambiguous language

In [95]:
#Roberta model from: https://huggingface.co/cointegrated/roberta-large-cola-krishna2020
#In addition to cola, added a query wellformedness score metric from https://huggingface.co/Ashishkr/query_wellformedness_score 
#These three objects are created once at module level; ambiguous_unclear_information reads cola, tokenizer and model as globals.
cola = pipeline('text-classification', model='cointegrated/roberta-large-cola-krishna2020',truncation=True)
#NOTE(review): ignore_mismatched_parameters is passed to the tokenizer loader here; confirm it is a recognised option
tokenizer = AutoTokenizer.from_pretrained("Ashishkr/query_wellformedness_score", ignore_mismatched_parameters=True)
model = AutoModelForSequenceClassification.from_pretrained("Ashishkr/query_wellformedness_score")
def check_string(s):
    """Return True when `s` contains '<' or '>', at least two ':' or at least two '='; otherwise False."""
    # Any angle bracket is enough on its own
    if '<' in s or '>' in s:
        return True

    # Otherwise look for two or more ':' or two or more '='
    return s.count(':') >= 2 or s.count('=') >= 2

def ambiguous_unclear_information(question):
    """Return False when the stem or an option looks unclear or ill-formed, True otherwise.

    The stem is first scored by the query-wellformedness model: a score of 0.3 or
    less fails the question unless check_string accepts the stem, and a score of
    1.0 or more passes it outright. In between, the cola classifier must give the
    stem a score of at least 0.7 and every option longer than 10 characters a
    score above 0.7; if so the final decision is delegated to the LLM verifier.
    """
    features = tokenizer([question.stem], padding=True, truncation=True, return_tensors="pt")
    model.eval()
    with torch.no_grad():
        wellformedness = model(**features).logits.item()

    if wellformedness <= .3:
        #A low score is tolerated only when check_string accepts the stem
        return bool(check_string(question.stem))
    if wellformedness >= 1.0:
        return True

    stem_score = cola(question.stem)[0]['score']
    if stem_score < .7:
        return False

    for opt in question.options:
        opt_score = cola(opt)[0]['score']
        #This parameter can be modified, depending on how strict we want to be.
        if len(opt) > 10 and opt_score <= .7:
            return False

    return ambiguous_unclear_information_verify(question)
    
#This method has a tendency to be overly critical, so we have reduced the prompt down to this.
def ambiguous_unclear_information_verify(question):
    """Ask the LLM whether the question is unclear or ambiguous.

    Returns False when the lower-cased reply contains 'yes' (confusing, flag it),
    True when it contains 'no', and otherwise the reply text itself ('blank' if
    the reply could not be read). The API call is retried indefinitely, sleeping
    10 seconds after each failure.
    """
    sysrole = """Respond only with "Yes" or "No"."""

    prompt = """Is the following multiple-choice question unclear or ambiguous in a way that would make it difficult for a student that is knowledgable in the content to answer it given a set of choices?
    The question text should be akin to a typical multiple-choice question in terms of clarity and ambiguity. 

    question: {}
    answer: {}
    options: {}
    """.format(question.stem, question.correct_option, question.options)
    messages = [
        {"role": "system", "content": sysrole},
        {"role": "user", "content": prompt},
    ]
    expert_reasoning = 'blank'

    #Keep asking until the API call goes through
    while True:
        try:
            completion = openai.chat.completions.create(
                model=model_engine,
                messages=messages,
                max_tokens=4096,
                temperature=0.7,
            )
            break
        except Exception as error:
            print('errored in LLM API call: ', error)
            time.sleep(10)

    try:
        expert_reasoning = completion.choices[0].message.content.lower()
    except:
        print('error with LLM: ', completion)

    #'yes' means it is confusing, so the question fails
    for verdict, keyword in ((False, 'yes'), (True, 'no')):
        if keyword in expert_reasoning:
            return verdict

    return expert_reasoning

Some weights of the model checkpoint at Ashishkr/query_wellformedness_score were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Gratuitous Information
Avoid unnecessary information in the stem that is not required to answer the question

In [96]:
def gratuitous_information_in_stem(question):
    """Screen the stem for gratuitous information using a lexical richness score.

    The corrected type-token ratio (CTTR) of the stem is computed with
    LexicalRichness (https://github.com/LSYS/LexicalRichness); see "How effective
    are lexical richness measures for differentiations of vocabulary proficiency?
    A comprehensive examination with clustering analysis".

    Returns False straight away when the CTTR is above 4.5. Otherwise the
    decision is delegated to gratuitous_information_in_stem_verify and its
    return value is passed through unchanged.
    """
    cttr_score = LexicalRichness(question.stem).cttr
    return False if cttr_score > 4.5 else gratuitous_information_in_stem_verify(question)


def gratuitous_information_in_stem_verify(question):
    """Ask the LLM whether the stem contains confusing, unnecessary detail.

    Only the stem and the correct answer are sent to the model; the prompt has
    no slot for the distractors, so no distractor list is built here.

    The API call is retried indefinitely, sleeping ten seconds after every
    failure, until it succeeds.

    Returns:
        False if the (lowercased) reply contains 'yes', i.e. the question is flagged.
        True if it contains 'no'.
        Otherwise the lowercased reply itself, or 'blank' when the reply could
        not be read. Both are truthy strings.
    """
    sysrole = """Answer only with "Yes" or "No"."""
    prompt = """
    Does this question contain gratuitous information, such that a student may get confused due to the extra unnecessary details?
    The question does not need to contain perfect wording and it does not have to be overly concise, it is fine to have a little extraneous information.
    It should contain a typical amount of information that is commonly used in multiple-choice questions.

    question: {}
    answer: {}
    """.format(question.stem, question.correct_option)

    #Generate a response, retrying until the API call goes through
    while True:
        try:
            completion = openai.chat.completions.create(
                model=model_engine,
                messages=[
                    {"role": "system", "content": sysrole},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=4096,
                temperature=0.7,
            )
            break
        except Exception as error:
            print('errored in LLM API call: ', error)
            time.sleep(10)

    #Read the response; keep the 'blank' placeholder if it has no usable text
    expert_reasoning = 'blank'
    try:
        expert_reasoning = completion.choices[0].message.content.lower()
    except Exception:
        print('error with LLM: ', completion)

    if 'yes' in expert_reasoning: #Means it is used in the bad way
        return False
    elif 'no' in expert_reasoning:
        return True

    return expert_reasoning

## Convergence Cues
Avoid convergence cues in options where there are different combinations of multiple components to the answer

In [97]:
#Check for synonyms, because they'll know it's the word they've most recently come across in the text
#The correct option is likely to be used more (when in pairs, etc.) --> k-type (super similar by description)

def avoid_convergence_cues(question):
    """Heuristically look for convergence cues among the distractors.

    WordNet synonyms of the nouns in the correct answer are collected. If one
    of them shows up in some distractors but not in every distractor, the
    question is handed to avoid_convergence_cues_verify for the final call.
    Every distractor is inspected, not just the first three.

    get_lemma_nouns is defined elsewhere in this file; it is presumably
    returning the lemmatised nouns of a text as a collection of strings.

    Returns:
        True when there are fewer than three distractors or nothing suspicious
        is found; otherwise whatever avoid_convergence_cues_verify returns.

    Raises:
        ValueError: if a correct option (split on '[SEP]') is not in question.options.
    """
    #Work on the distractors only: drop each correct option from a copy of the list
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    #Too few distractors to form combinations of one another
    if len(options) < 3:
        return True

    lemma_nouns_answ = get_lemma_nouns(question.correct_option)
    lemma_nouns_options = [get_lemma_nouns(opt) for opt in options]

    #Checking for synonyms
    synonyms = set()
    for noun in lemma_nouns_answ:
        for syn in wn.synsets(noun):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().lower().replace('_', ' '))

    for opt_nouns in lemma_nouns_options:
        for repeated in synonyms.intersection(opt_nouns):
            #if the repeat is not in every answer choice, flag it.
            if any(repeated not in other_nouns for other_nouns in lemma_nouns_options):
                return avoid_convergence_cues_verify(question)

    return True

def avoid_convergence_cues_verify(question):
    """Ask the LLM whether the options are partial recombinations of one another.

    The full option list (correct answer included) is sent to the model. The
    API call is retried forever, with a ten second pause after each failure.

    Returns False when the lowercased reply contains 'yes', True when it
    contains 'no', and otherwise the lowercased reply itself ('blank' if the
    reply could not be read).
    """
    system_message = """Answer only with "Yes" or "No"."""
    user_message = """
    Are some of these options to a multiple-choice question partially different combinations of one another, potentially signaling convergence cues? They might have options that refer to other options in the list.
    Ignore options that are similar to "Yes" or "No" followed by an explanation.
    
    {}
    """.format(question.options.copy())

    #Loop until the request succeeds
    while True:
        try:
            completion = openai.chat.completions.create(
                model=model_engine,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ],
                max_tokens=4096,
                temperature=0.7,
            )
            break
        except Exception as error:
            print('errored in LLM API call: ', error)
            time.sleep(10)

    #Fall back to the placeholder when the reply text cannot be extracted
    verdict = 'blank'
    try:
        verdict = completion.choices[0].message.content.lower()
    except:
        print('error with LLM: ', completion)

    if 'yes' in verdict: # Means it is used in the bad way
        return False
    if 'no' in verdict:
        return True
    return verdict

## Grammatical Cues
All options should be grammatically consistent with the stem and should be parallel in style and form

In [98]:
#If verb exists in answer choice, ensure it's the same tense as verb in other options
#We want the stem to be the same, but as long as all the answers are the same, then it's fine, to avoid false positive.
#https://huggingface.co/Unbabel/gec-t5_small
def grammatical_cues_in_stem(question):
    """Check that the distractors use the same verb tense as the correct answer.

    Tenses come from get_verb_tense, which yields 'past', 'present' or 'none'.
    A distractor only counts as inconsistent when both its tense and the
    answer's tense are known (not 'none') and they differ.

    Returns:
        False as soon as one distractor has a conflicting tense, True otherwise.

    Raises:
        ValueError: if a correct option (split on '[SEP]') is not in question.options.
    """
    answer_tense = get_verb_tense(question.correct_option)

    #The simplest option is to ensure the answer and other options are in the same tense, everything else was too high on false positives
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    #With no single tense in the answer nothing can conflict, so skip parsing the distractors
    if answer_tense == 'none':
        return True

    for opt in options:
        opt_tense = get_verb_tense(opt)
        #Compare by value; identity comparison of strings only works by accident of interning
        if opt_tense != 'none' and opt_tense != answer_tense:
            return False

    return True

#Longer options might contain verbs of different tenses.
#We want options that specifically have a single tense (past or present) and for it to be consistent with all other options.
def get_verb_tense(text):
    """Return the single tense shared by every verb in `text`.

    Each token that spaCy tags with the VERB part of speech is classified by
    its fine-grained tag: VBP/VBZ count as 'present', VBD/VBN as 'past', and
    anything else as 'none'.

    Returns 'past' or 'present' only when every verb falls in that one class;
    returns 'none' for mixed tenses, unclassified verbs, or no verbs at all.
    """
    tense_by_tag = {'VBP': 'present', 'VBZ': 'present', 'VBD': 'past', 'VBN': 'past'}
    tenses_seen = {
        tense_by_tag.get(token.tag_, 'none')
        for token in nlp(text)
        if token.pos_ == 'VERB'
    }

    if tenses_seen == {'past'}:
        return 'past'
    if tenses_seen == {'present'}:
        return 'present'
    return 'none'

## Vague Terms
Avoid the use of vague terms (e.g. frequently, occasionally) in the options as there is seldom agreement on their actual meaning

In [99]:
#Like the other criteria that use a list of terms, these can be modified
def vague_terms(question, terms=None):
    """Check the options and the stem for vague frequency/degree terms.

    Terms are matched case-insensitively as whole words or phrases, so a term
    embedded in a longer word does not count (e.g. "nearly" inside "linearly",
    or "usually" inside "unusually").

    Args:
        question: object exposing `options` (list of str) and `stem` (str).
        terms: optional iterable of vague words/phrases to look for; the
            built-in list is used when omitted.

    Returns:
        False when any option or the stem contains a vague term, True otherwise.
    """
    if terms is None:
        terms = ["frequently", "occasionally", "rarely", "seldom", "sometimes", "usually", "regularly", "periodically", "infrequently", "generally", "nearly", "more or less", "somewhat", "partly"]
    terms = list(terms)

    #An empty alternation would match everywhere, so handle "no terms" explicitly
    if not terms:
        return True

    #Lookarounds rather than \b so the boundaries also hold for terms that start or end with punctuation
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(term) for term in terms) + r')(?!\w)',
        re.IGNORECASE,
    )

    #check the options then check the stem
    if any(pattern.search(opt) for opt in question.options):
        return False

    #In particular, these words can sometimes be used in the stem in a way that is not a flaw, but more likely than not, it is
    if pattern.search(question.stem):
        return False

    return True

## Unfocused Stem
The stem should present a clear and focused question that can be understood and answered without looking at the options

In [100]:
def unfocused_stem(question):
    """Heuristically decide whether the stem poses a clear, focused question.

    Returns:
        True (stem accepted) when
          - any of true_or_false, all_of_the_above, none_of_the_above or
            fill_in_the_blank returns a falsy value for the question, or
          - the stem contains '?' or ':', or
          - the stem ends with sentence punctuation, starts with a root verb,
            and at least one of its sentences looks like a question.
        False (stem flagged) otherwise.

    NOTE(review): unfocused_stem_verify is not consulted on any path; every
    outcome is decided by the heuristics below. Confirm whether flagged stems
    should be double-checked by the LLM the way other criteria do.
    """
    #Evaluated lazily and in this order, stopping at the first criterion that returns a falsy value
    other_criteria = (true_or_false, all_of_the_above, none_of_the_above, fill_in_the_blank)
    if any(not criterion(question) for criterion in other_criteria):
        return True

    stem = question.stem

    #A stem containing '?' or ':' is accepted as is
    if '?' in stem or ':' in stem:
        return True

    #Traits of an unfocused question (not being a question, etc.)
    if not stem.endswith(('.', ':', '?', ';')):
        return False

    if not check_if_first_word_is_a_verb(stem):
        return False

    #Accept the stem only if at least one of its sentences reads as a question
    return any(is_question(sent.text.strip()) for sent in nlp(stem).sents)

def check_if_first_word_is_a_verb(sent):
    """Return True when the first token of `sent` is a verb and the root of the parse."""
    first_token = nlp(sent)[0]
    return first_token.pos_ == "VERB" and first_token.dep_ == "ROOT"


#From https://stackoverflow.com/questions/4083060/determine-if-a-sentence-is-an-inquiry
def is_question(sent):
    """Return True when `sent` looks like a question or an imperative prompt.

    A sentence qualifies when its first token is a verb that is the parse root,
    or when any token is a WH-word (tags WDT, WP, WP$, WRB) or a literal '?'.
    Text that produces no tokens (e.g. an empty string) is not a question.
    """
    d = nlp(sent)

    #Nothing to inspect; indexing the first token of an empty parse would raise IndexError
    if len(d) == 0:
        return False

    first_token = d[0]
    if first_token.pos_ == "VERB" and first_token.dep_ == "ROOT": # checks if the first token is a verb and root or not
        return True

    # look through the sentence for WH tokens or a question mark
    wh_tags = ("WDT", "WP", "WP$", "WRB")
    return any(token.tag_ in wh_tags or token.text == '?' for token in d)


#Teachers should avoid using MCQs with unfocused stems which do not ask a clear question or state a clear problem in the sentence completion format
#The stem should present a clear and focused question that can be understood and answered without looking at the options
def unfocused_stem_verify(question):
    """Ask the LLM whether the question has an unfocused stem.

    The stem, the correct answer and the distractors are sent to the model.
    The API call is retried indefinitely, sleeping ten seconds after every
    failure, until it succeeds.

    The reply is lowercased and searched for 'yes'/'no', the same way the other
    *_verify helpers in this file do, so replies such as "Yes." or "yes" are
    recognised rather than falling through.

    Returns:
        False if the reply contains 'yes' (stem flagged), True if it contains
        'no'; otherwise the lowercased reply, or 'blank' when the reply could
        not be read.

    Raises:
        ValueError: if a correct option (split on '[SEP]') is not in question.options.
    """
    #Only the distractors go into the "options" slot of the prompt
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    sysrole = """Answer only with "Yes" or "No"."""
    prompt = """
    Does the following multiple-choice question have an unfocused stem which does not ask a clear question or state a clear problem in the sentence completion format?
    Ignore questions with options that are similar to "yes"/"no" or "true"/"false" followed by an explanation.
    
    question: {}
    answer: {}
    options: {}
    """.format(question.stem, question.correct_option, options)

    #Generate a response, retrying until the API call goes through
    while True:
        try:
            completion = openai.chat.completions.create(
                model=model_engine,
                messages=[
                    {"role": "system", "content": sysrole},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=4096,
                temperature=0.7,
            )
            break
        except Exception as error:
            print('errored in LLM API call: ', error)
            time.sleep(10)

    #Read the response; keep the 'blank' placeholder if it has no usable text
    expert_reasoning = 'blank'
    try:
        expert_reasoning = completion.choices[0].message.content.lower()
    except Exception:
        print('error with LLM: ', completion)

    if 'yes' in expert_reasoning: #Means it is used in the bad way
        return False
    elif 'no' in expert_reasoning:
        return True

    return expert_reasoning

# Other Metrics (Perplexity, Diversity, Grammatical Error, Cognitive Complexity)

In addition to the IWF criteria, we calculate these other commonly used metrics to see how they evaluate the questions. Answerability is a fifth metric that I am currently leaving out. Just like BLEU, METEOR, ROUGE, etc., these metrics often do not correlate with human judgements and are not indicators of flawed/bad educational multiple-choice questions like the IWF criteria are. You can still compute them because it's easy enough, but you should put little faith in them.

Note, these metrics aren't great indicators, you can read about it here: https://arxiv.org/pdf/2405.20529

## Perplexity 
This assesses a language model's ability to predict question and answer text based on its training data. Lower scores suggest more coherent questions and answers with predictable language patterns, whereas higher scores indicate complexity or atypical text, suggesting the questions could be unclear or poorly structured. <br/>
<b>NOTE</b>: This will be very slow for a large number of questions; anything greater than 30 questions will take quite some time.

In [101]:
def perplexity(questions):
    """Compute a GPT-2 (gpt2-large) perplexity score for every question.

    Each row of `questions` is flattened into one string: the stripped 'text'
    column, a space, then the non-empty values of columns 'a'-'d' joined by
    ', '. The strings are scored with the `evaluate` "perplexity" metric.

    Returns the 'perplexities' entry of the metric result (one score per row).
    """
    option_columns = ['a', 'b', 'c', 'd']
    flattened = []
    for _, row in questions.iterrows():
        stem_text = row['text'].strip()
        option_texts = [row[col].strip() for col in option_columns if row[col].strip()]
        flattened.append(stem_text + ' ' + ', '.join(option_texts))

    metric = load("perplexity", module_type="metric")
    scores = metric.compute(predictions=flattened, add_start_token=False, model_id='gpt2-large')
    return scores['perplexities']

## Diversity
Using Distinct-3, this evaluates the range in vocabulary, structure, and content across generated texts, ensuring a variety of questions and answers and reducing repetition. A higher diversity score indicates greater uniqueness among MCQs, avoiding repetitive phrases and templated patterns. 

In [102]:
def diversity(questions):
    """Compute the Distinct-3 score of every question and print aggregate diversity.

    Each row is flattened into one string: the stripped 'text' column, a space,
    then the non-empty values of columns 'a'-'d' joined by ', '.

    Prints the mean per-question Distinct-3, the corpus-level n-gram diversity
    and the number of questions. For an empty question set only the length is
    printed, since the averages are undefined.

    Returns:
        A list with one Distinct-3 score per row (empty for an empty input).
    """
    predictions = []
    for index, row in questions.iterrows():
        stem = row['text'].strip()
        non_empty_values = [row[col].strip() for col in ['a','b','c','d'] if row[col].strip()]
        predictions.append(stem + ' ' + ', '.join(non_empty_values))

    #Nothing to average; bail out before dividing by zero
    if not predictions:
        print('length: ', 0)
        return []

    per_question = [calculate_distinct_3(text) for text in predictions]

    print('distinct_3_total: ', sum(per_question) / len(predictions))
    print('ngram_diversity_total: ', ngram_diversity(predictions))
    print('length: ', len(predictions))

    return per_question

def calculate_distinct_3(text):
    """Return the Distinct-3 score of `text`: unique trigrams divided by total trigrams.

    The text is split with word_tokenize. When it is too short to contain any
    trigram the score is 0.
    """
    all_trigrams = list(trigrams(word_tokenize(text)))
    if not all_trigrams:
        return 0
    return len(set(all_trigrams)) / len(all_trigrams)

def ngram_diversity(options, n=3):
    """Return the share of unique n-grams across all texts in `options`.

    Args:
        options: iterable of strings; each one is split with word_tokenize.
        n: n-gram size (default 3).

    Returns:
        unique n-grams / total n-grams over the whole collection, or 0 when the
        texts contain no n-gram at all (same convention as calculate_distinct_3).
    """
    all_ngrams = [ngram for option in options for ngram in ngrams(word_tokenize(option), n)]

    #Empty input, or every text shorter than n tokens: avoid dividing by zero
    if not all_ngrams:
        return 0

    return len(set(all_ngrams)) / len(all_ngrams)

## Grammatical Error
This uses a Python wrapper for https://languagetool.org/. Currently we are using the free API endpoint, so if it's used excessively we might get IP blocked. Grammatical errors pinpoint grammar violations, such as incorrect verb tense or spelling, quantified for each MCQ.

In [103]:
def check_grammar(text, tool=None):
    """Run LanguageTool on `text`.

    Args:
        text: the string to check.
        tool: optional LanguageTool client to reuse. When omitted, a new
            en-US public-API client is created for this one call.

    Returns:
        (number of matches, list of matches) as reported by tool.check.
    """
    if tool is None:
        tool = language_tool_python.LanguageToolPublicAPI('en-US')
    matches = tool.check(text)
    return len(matches), matches

predictions = []
ques = {}
def grammatical_error(questions):
    """Count LanguageTool matches in the stem ('text' column) of every question.

    One client is created and shared by all stems instead of building a new one
    per row; presumably this also spares the free public endpoint some extra
    requests (TODO confirm against language_tool_python).

    Prints the mean number of matches per question (0.0 for an empty set), the
    number of questions and the number of results.

    Returns:
        A list with the match count of each row, in row order.
    """
    #Only build a client when there is something to check
    tool = language_tool_python.LanguageToolPublicAPI('en-US') if len(questions) else None

    errorsList = []
    for index, row in questions.iterrows():
        num_errors, errors = check_grammar(row['text'].strip(), tool)
        errorsList.append(num_errors)

    #Guard the average against an empty question set
    average_errors = sum(errorsList) / len(errorsList) if errorsList else 0.0

    print('total_errors: ', average_errors)
    print('length: ', len(questions))
    print('length of errors: ', len(errorsList))

    return errorsList

## Cognitive Complexity
This is measured by Bloom's Taxonomy. Some research has instead used the "difficulty" of the question, which an LLM can assess, but Bloom's is a better fit. Additionally, this might be redundant since the Bloom's label is included in the question's construction.

In [104]:
def cognitive_complexity(questions):
    """Ask the LLM for the Bloom's Revised Taxonomy level of every question.

    Each row is flattened into the stripped 'text' column followed by the
    non-empty values of columns 'a'-'d', one per line, and sent to the model.
    Every API call is retried indefinitely, sleeping ten seconds after each
    failure.

    Returns:
        A list with exactly one entry per row, in row order: the model's reply,
        or None when the reply could not be read. Keeping a placeholder keeps
        the labels aligned with the questions and with the other per-question
        metric lists.
    """
    predictions = []
    for index, row in questions.iterrows():
        stem = row['text'].strip()
        non_empty_values = [row[col].strip() for col in ['a','b','c','d'] if row[col].strip()]
        predictions.append(stem + '\n' + '\n'.join(non_empty_values))

    sysrole = "You are an expert in pedagogy and an astute instructor here to classify a multiple-choice questions provided to you with one of the six level's of Bloom's Taxonomy"
    prompt = """Given the multiple-choice question below, please respond with that level of Bloom's Revised Taxonomy it falls into and nothing else.
        {}
        """

    bloom_labels = []
    for q in predictions:
        p = prompt.format(q)

        #Retry until the API call goes through
        while True:
            try:
                completion = openai.chat.completions.create(
                    model=model_engine,
                    messages=[
                        {"role": "system", "content": sysrole},
                        {"role": "user", "content": p},
                    ],
                    max_tokens=4096,
                    temperature=0.7,
                )
                break
            except Exception as error:
                print('errored in LLM API call: ', error)
                time.sleep(10)

        try:
            label = completion.choices[0].message.content
        except Exception:
            print('error with LLM: ', completion)
            label = None #Placeholder so this question still occupies its slot

        bloom_labels.append(label)

    return bloom_labels

# Formatting Your CSV of MCQs

| id | text | answer | a | b | c | d |
|----------|----------|----------|----------|----------|----------|----------|
| Data 1   | Data 2   | Data 3   | Data 4   | Data 5   | Data 6   | Data 7   |
| Data 8   | Data 9   | Data 10  | Data 11  | Data 12  | Data 13  | Data 14 |


### id: A unique number
### text: The question's stem
### answer: The text of the correct response, this should match the text in one of the a/b/c/d columns
### a-e: The text for the corresponding option

# 19 Item-Writing Flaws Criteria - Running the code

In [105]:
#Each entry is both the column name in RESULTS.csv and the name of the function that checks it
all_criteria = ['ambiguous_unclear_information',
    'implausible_distractors',
    'none_of_the_above',
    'longest_answer_correct',
    'gratuitous_information_in_stem',
    'true_or_false',
    'avoid_convergence_cues',
    'avoid_logical_cues',
    'all_of_the_above',
    'fill_in_the_blank',
    'absolute_terms',
    'word_repeats_in_stem_and_correct_answer',
    'unfocused_stem',
    'complex_k_type',
    'grammatical_cues_in_stem',
    'lost_sequence',
    'vague_terms',
    'more_than_one_correct',
    'negative_worded_stem']

files = ['TODO.csv'] #Update to the file path containing your CSV
option_columns = ['a', 'b', 'c', 'd', 'e'] #Option columns; any that the CSV does not have are skipped

#Read every CSV once up front instead of once per criterion
iwf_frames = {file: pd.read_csv(file).fillna('') for file in files}

all_data = {}
for criteria in all_criteria:
    criterion_check = globals()[criteria]
    auto_iwf_results = [] #One 0/1 entry per question, accumulated across all files
    match_total = 0
    for file in files:
        print('----- ', criteria, ' ----- ', file)
        data = iwf_frames[file]
        present_columns = [col for col in option_columns if col in data.columns]

        #Build fresh question objects for each criterion so one check cannot affect another
        questions = []
        for index, row in data.iterrows():
            question = MultipleChoiceQuestion(
                stem=row['text'],
                options=[row[col].strip() for col in present_columns if row[col].strip()],
                correct_option=row['answer'].strip(),
                qid=row['id'],
                quality=0
            )
            questions.append(question)

        #A truthy result means the question passes the criterion (0); a falsy one records the flaw (1)
        matches = []
        for q in questions:
            if criterion_check(q):
                auto_iwf_results.append(0)
            else:
                matches.append(q.stem)
                auto_iwf_results.append(1)

        match_total = match_total + len(matches)

    all_data[criteria] = auto_iwf_results
    print(criteria, ' matches: ', match_total)

df = pd.DataFrame(all_data)
df.to_csv('RESULTS.csv', index=False)

-----  ambiguous_unclear_information  -----  tufts\mcqs.csv
ambiguous_unclear_information  matches:  1
-----  implausible_distractors  -----  tufts\mcqs.csv
-- GPT-4 says they are similar:  Determining the return to play timeline  -and-  Initiating the rehabilitation process
-- GPT-4 says they are similar:  The StARRT framework  -and-  Evidence-based Practice Model
-- GPT-4 says they are similar:  Younger age, male gender, and positive psychological outlook  -and-  Significant financial resources
-- GPT-4 says they are similar:  The injury’s nature, demands of the sport, and affected body area  -and-  A consensus on predetermined criteria
Distractor not similar enough: A blend of physical health and psychological readiness 		 Endorsement from the entire team 		 Score: 0.0320
implausible_distractors  matches:  1
-----  none_of_the_above  -----  tufts\mcqs.csv
none_of_the_above  matches:  0
-----  longest_answer_correct  -----  tufts\mcqs.csv
longest_answer_correct  matches:  7
-----  gr

# Other Metrics (Perplexity, Diversity, Grammatical Error, Cognitive Complexity) - Running the code

In [106]:
## Calc the other metrics
#Each entry is both the column name in RESULTS_OTHER.csv and the name of the function that computes it
other_metrics = ['perplexity',
                 'diversity',
                 'grammatical_error',
                 'cognitive_complexity']

files = ['TODO_OTHER.csv']

#Read every CSV once up front instead of once per metric
metric_frames = {file: pd.read_csv(file).fillna('') for file in files}

metric_data = {}
for metric in other_metrics:
    metric_fn = globals()[metric]
    metric_data[metric] = []
    for file in files:
        print('----- ', metric, ' ----- ', file)
        data = metric_frames[file]

        #Append rather than overwrite so results from several files are all kept, in file order
        result = metric_fn(data)
        metric_data[metric].extend(result)

df = pd.DataFrame(metric_data)
df.to_csv('RESULTS_OTHER.csv', index=False)

-----  perplexity  -----  tufts\mcqs.csv


Using pad_token, but it is not set yet.
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.46s/it]


-----  diversity  -----  tufts\mcqs.csv
distinct_3_total:  0.9972020446596718
ngram_diversity_total:  0.9743589743589743
length:  14
-----  grammatical_error  -----  tufts\mcqs.csv
total_errors:  0.0
length:  14
length of errors:  14
-----  cognitive_complexity  -----  tufts\mcqs.csv
