## Setup

In [1]:
#Import the required libraries; many of them are needed by the language models that three of the criteria rely on.
import math
import random
import re
import time

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn.functional as F
from lexicalrichness import LexicalRichness
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import pipeline, AutoTokenizer, AutoModel
nlp = spacy.load('en_core_web_lg')
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize, sent_tokenize
import openai
openai.api_key = 'REPLACE_WITH_YOUR_KEY'
model_engine = "text-davinci-003"

In [2]:
#Our class used to represent multiple-choice questions
nl = '\n'


class MultipleChoiceQuestion:
    """Container for one multiple-choice question.

    Attributes:
        stem: the question text.
        options: list with the text of every answer choice.
        correct_option: the text of the keyed (correct) answer choice.
        qid, courseid, quality: optional metadata, stored as given.
    """

    def __init__(self, stem, options, correct_option, qid = None, courseid = None, quality = None):
        # Question content
        self.stem, self.options, self.correct_option = stem, options, correct_option
        # Optional metadata
        self.qid, self.courseid, self.quality = qid, courseid, quality

    def __str__(self):
        """Render the stem, one option per line, the answer and the quality."""
        option_lines = nl.join(self.options)
        template = "Question: {}\n {}\nCorrect option: {}\nQuality: {}"
        return template.format(self.stem, option_lines, self.correct_option, self.quality)

## 19 Item-Writing Flaw Criteria 

### Ambiguous or unclear information 
    Questions and all options should be written in clear, unambiguous language

In [378]:
#Roberta model from: https://huggingface.co/cointegrated/roberta-large-cola-krishna2020
cola = pipeline('text-classification', model='cointegrated/roberta-large-cola-krishna2020',truncation=True)

def ambiguous_unclear_information(question):
    """Check the stem with the module-level ``cola`` classification pipeline.

    Returns True when the score of the first prediction for ``question.stem``
    is at least 0.7; otherwise prints a diagnostic and returns False.
    """
    # NOTE(review): only the score is inspected, never the predicted label --
    # confirm that a high score always means "acceptable" for this model.
    first_prediction = cola(question.stem)[0]
    is_clear = first_prediction['score'] >= 0.7
    if not is_clear:
        print('--- Question stem is unclear')
        return False
    return True

### Implausible distractors
    Make all distractors plausible as good items depend on having effective distractors

In [336]:
#MiniLM from: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
model = SentenceTransformer('all-MiniLM-L6-v2')

#Uses NER: a distractor whose similarity to the answer is too low is still accepted
#when it has the same entity type as the answer (e.g. both are people).
def implausible_distractors(question, threshold=0.15):
    """Check that every distractor is semantically related to the correct answer.

    Each distractor is embedded with the module-level sentence-transformer
    ``model`` and compared to the correct option by cosine similarity. A
    distractor scoring below ``threshold`` is excused when

    * its first named entity has the same label as the answer's first entity, or
    * neither it nor the answer contains a noun (nothing to judge on).

    If the question contains an all/none-of-the-above option, the criterion is
    not applied at all, since such an option is never similar to the answer.

    Args:
        question: object with ``options`` (list of str) and ``correct_option``.
        threshold: minimum cosine similarity for a distractor to count as
            plausible (previously hard-coded to 0.15).

    Returns:
        True when no implausible distractor is found, False otherwise.

    Raises:
        ValueError: if ``correct_option`` is not one of ``options``.
    """
    correct = question.correct_option
    distractors = question.options.copy()
    distractors.remove(correct)
    if not distractors:
        return True

    # Encode the answer once and compare it to however many distractors exist
    # (the earlier version assumed exactly three).
    answer_embedding = model.encode([correct], convert_to_tensor=True)
    distractor_embeddings = model.encode(distractors, convert_to_tensor=True)
    scores = util.cos_sim(answer_embedding, distractor_embeddings)[0]

    # The answer is only parsed if a low-scoring distractor shows up.
    answer_parsed = False
    answer_label = None
    answer_nouns = []

    for distractor, score in zip(distractors, scores):
        if not (score < threshold): #Was .2
            continue

        if not answer_parsed:
            answer_label = _first_entity_label(correct)
            answer_nouns = get_lemma_nouns(correct)
            answer_parsed = True

        #Low score, but the distractor is the same kind of entity as the answer
        distractor_label = _first_entity_label(distractor)
        if answer_label is not None and answer_label == distractor_label:
            continue

        #Couldn't find a noun nor an entity? Unable to parse effectively to make a judgement.
        if len(answer_nouns) == 0 and len(get_lemma_nouns(distractor)) == 0:
            continue

        #Both helpers return False when such an option is PRESENT; in that case
        #the option won't be similar to the answer, so ignore this criterion.
        if not all_of_the_above(question) or not none_of_the_above(question):
            return True

        print("--- Distractor not similar enough: {} \t\t {} \t\t Score: {:.4f}".format(correct, distractor, float(score)))
        return False

    return True


def _first_entity_label(text):
    """Return the label of the first named entity spaCy finds in ``text``, or None."""
    entities = nlp(text).ents
    return entities[0].label_ if entities else None

### None of the above
    Avoid "none of the above", as it only really measures students' ability to detect incorrect answers

In [303]:
def none_of_the_above(question):
    """Check that no option is a "none of the above" style choice.

    An option is flagged when, after stripping and lower-casing, it contains
    both "none" and "above", or is exactly "neither" or "none". In addition the
    fourth option (when there is one) is flagged if it contains the lower-case
    substring "none".

    Args:
        question: object with an ``options`` list of strings.

    Returns:
        False (after printing a diagnostic) when such an option is present,
        True otherwise.
    """
    options = question.options

    for opt in options:
        cleaned_opt = opt.strip().lower()
        # 'none of the above' is covered by the "none ... above" test.
        if ('none' in cleaned_opt and 'above' in cleaned_opt) or cleaned_opt in ('neither', 'none'):
            print('--- None of the above')
            return False

    # Positional rule kept from the earlier version, now guarded so questions
    # with fewer than four options no longer raise IndexError.
    if len(options) > 3 and 'none' in options[3]:
        print('--- None of the above')
        return False

    return True

### Longest option correct
    Often the correct option is longer and includes more detailed information, which clues students to this option

In [6]:
#If the correct answer is noticeably longer (20% or more) than the longest of the other options, flag it.

def longest_answer_correct(question):
    """Check that the keyed answer is not conspicuously longer than the rest.

    Passes (True) when the longest distractor has at least 80% of the answer's
    character length, or when the answer is a single whitespace-separated
    token. Otherwise prints a diagnostic and returns False.
    """
    answer = question.correct_option
    distractors = question.options.copy()
    distractors.remove(answer)

    # Character length of the longest remaining option (0 when there is none)
    longest_distractor = max((len(text) for text in distractors), default=0)

    if longest_distractor >= len(answer) * 0.8 or len(answer.split()) == 1:
        return True

    print('--- longest option is correct')
    return False

### Gratuitous information
    Avoid unnecessary information in the stem that is not required to answer the question

In [4]:
def gratuitous_information_in_stem(question):
    """Flag stems whose corrected type-token ratio (CTTR) exceeds 4.5.

    Returns True when the CTTR of ``question.stem`` is not above 4.5;
    otherwise prints the value and returns False.
    """
    #Threshold idea from: How effective are lexical richness measures for differentiations of vocabulary proficiency? A comprehensive examination with clustering analysis
    #Implementation from: https://github.com/LSYS/LexicalRichness
    richness = LexicalRichness(question.stem)

    if not richness.cttr > 4.5:
        return True

    print("--- CTTR above 4.5, text is too complex and extraneous: ", richness.cttr)
    return False

### True/False question
    The options should not be a series of true/false statements

In [7]:
#Question should not be a series of true/false statements, so we can look for "which" and "true" or "false" in the stem

def true_or_false(question):
    """Check that the question is not a true/false (or yes/no) item.

    Fails (False) when a sentence of the stem contains "which" together with
    "true" or "false", or when any option other than the keyed answer is
    exactly true/false/yes/no (ignoring case and surrounding whitespace).
    """
    distractors = question.options.copy()
    distractors.remove(question.correct_option)

    #Stem asks which statement is true/false (this also covers negatively worded stems)
    for sentence in question.stem.split('.'):
        lowered = sentence.lower()
        mentions_truth_value = 'false' in lowered or 'true' in lowered
        if 'which' in lowered and mentions_truth_value:
            print('--- true/false or yes/no answer choice')
            return False

    #A distractor that is itself a truth value
    truth_values = ('true', 'false', 'yes', 'no')
    for distractor in distractors:
        if distractor.strip().lower() in truth_values:
            print('--- true/false or yes/no answer choice')
            return False

    return True

### Convergence cues
    Avoid convergence cues in options where there are different combinations of multiple components to the answer

In [8]:
#Check for synonyms, because they'll know it's the word they've most recently come across in the text
#The correct option is likely to be used more (when in pairs, etc.) --> k-type (super similar by description)

def avoid_convergence_cues(question):
    """Look for answer-noun synonyms reused in some, but not all, distractors.

    WordNet lemma names of every noun in the correct option are collected.
    If one of them appears among the nouns of a distractor but is missing
    from another of the first three distractors, the question fails.
    Questions with fewer than three distractors pass automatically.
    """
    distractors = question.options.copy()
    distractors.remove(question.correct_option)
    if len(distractors) < 3:
        return True

    answer_nouns = get_lemma_nouns(question.correct_option)
    #Only the first three distractors are inspected
    distractor_nouns = [get_lemma_nouns(text) for text in distractors[:3]]

    #Every WordNet lemma name of every noun in the answer
    answer_synonyms = []
    for noun in answer_nouns:
        for synset in wn.synsets(noun):
            answer_synonyms.extend(
                lemma.name().lower().replace('_', ' ') for lemma in synset.lemmas()
            )

    for nouns_in_option in distractor_nouns:
        repeated = list(set(answer_synonyms).intersection(nouns_in_option))
        print("repeating_nouns_synonyms: ", repeated)

        #A repeated synonym must show up in every inspected distractor
        for synonym in repeated:
            for other_nouns in distractor_nouns:
                if synonym not in other_nouns:
                    print('--- we have a synonym of the answer being used in other answer choices, but not all of them: ', synonym)
                    return False

    return True

### Logical cues
    Avoid clues in the stem and the correct option that can help the test-wise student to identify the correct option

In [9]:
#An example of a logical cue is asking students to select the most appropriate pharmaceutical intervention for a problem
#while only one or two of the options are actually pharmaceutical interventions.
#Using NER: if the question asks for a certain type of noun, such as a <person>, then the options should all be <people> too.

def avoid_logical_cues(question):
    """Check that the answer's entity types also occur among the distractors.

    Every noun of the first three distractors is run through spaCy and the
    label of its first entity (if any) is recorded. The question fails when a
    noun of the correct option carries an entity label that none of those
    distractor nouns has. Fewer than three distractors pass automatically.
    """
    distractors = question.options.copy()
    distractors.remove(question.correct_option)
    if len(distractors) < 3:
        return True

    answer_nouns = get_lemma_nouns(question.correct_option)

    #Entity labels seen on the nouns of the first three distractors
    distractor_labels = set()
    for distractor in distractors[:3]:
        for noun in get_lemma_nouns(distractor):
            entities = nlp(noun).ents
            if entities:
                distractor_labels.add(entities[0].label_)

    for noun in answer_nouns:
        entities = nlp(noun).ents
        #Nouns that cannot be tagged with an entity are ignored
        if not entities:
            continue
        answer_label = entities[0].label_
        if answer_label not in distractor_labels:
            print('--- The answer entity is not found in any other options: ', answer_label)
            return False

    return True

### All of the above
    Avoid all of the above options as students can guess correct responses based on partial information

In [10]:
def all_of_the_above(question):
    """Check that no option is an "all of the above" style choice.

    Matching is case-insensitive and word-based: an option is flagged when it
    contains the words "all" and "above" (which covers "All of the above"),
    or the common typo "all if the". Earlier the comparison was case-sensitive,
    so the usual capitalised "All of the above" slipped through, and bare
    substring matching flagged options such as "small values above zero".

    Args:
        question: object with an ``options`` list of strings.

    Returns:
        False (after printing a diagnostic) when such an option is present,
        True otherwise.
    """
    for opt in question.options:
        cleaned_opt = opt.strip().lower()
        has_all_and_above = (
            re.search(r'\ball\b', cleaned_opt) is not None
            and re.search(r'\babove\b', cleaned_opt) is not None
        )
        if has_all_and_above or 'all if the' in cleaned_opt:
            print('--- all of the above')
            return False

    return True

### Fill-in-blank
    Avoid omitting words in the middle of the stem that students must insert from the options provided 

In [11]:
def fill_in_the_blank(question):
    """Return False (with a diagnostic) when the stem contains an underscore."""
    has_blank = "_" in question.stem
    if has_blank:
        print('--- fill in the blank')
    return not has_blank

### Absolute terms
    Avoid the use of absolute terms (e.g. never, always, all) in the options as students are aware that they are almost 
    always false 

In [12]:
def absolute_terms(question):
    """Check that no option uses an absolute term.

    The terms are always, never, every (including everyone, everything, ...),
    none, only and all. They are matched case-insensitively as whole words;
    earlier a case-sensitive substring test missed "Always" and flagged words
    such as "commonly" (contains "only") or "small" (contains "all").

    "all" is not counted inside an option that also mentions "above", nor when
    ``all_of_the_above(question)`` reports that the question has an
    all-of-the-above option (that is a separate criterion).

    Args:
        question: object with an ``options`` list of strings.

    Returns:
        False (after printing a diagnostic) when an absolute term is found,
        True otherwise.
    """
    absolute_pattern = re.compile(r'\b(?:always|never|every\w*|none|only)\b', re.IGNORECASE)
    all_pattern = re.compile(r'\ball\b', re.IGNORECASE)

    for opt in question.options:
        if absolute_pattern.search(opt):
            # NOTE: the message says "stem" but it is the options that are checked.
            print('--- absolute word in question stem')
            return False

        #Count "all", but not in the case of "all of the above".
        #all_of_the_above() returns True when NO such option exists.
        if all_pattern.search(opt) and 'above' not in opt.lower() and all_of_the_above(question):
            print('--- absolute word in question stem')
            return False

    return True

### Word repeats
    Avoid similarly worded stems and correct responses or words repeated in the stem and correct response

In [14]:
#Find the nouns in question.correct_option and question.stem --> lemmatize them --> check whether any of them repeat
#Also check for synonyms and compare them. However, if the word(s) are used in the other options too, then it's fine.

#Nouns: NN noun, singular – ‘desk’, NNS noun, plural – ‘desks’, NNP proper noun, singular – ‘Harrison’, NNPS proper noun, plural – ‘Americans’
lemmatizer = WordNetLemmatizer()
nouns = ['NN', 'NNS', 'NNP', 'NNPS']

def word_repeats_in_stem_and_correct_answer(question):
    """Check that words of the stem are not repeated only in the correct answer.

    Step 1: if a lemmatized noun of the stem, or a WordNet synonym of one,
    also occurs among the nouns of the correct option, the question passes
    only when every distractor shares a noun with the correct option, or when
    the question has an all/none-of-the-above option.

    Step 2 (no noun repeat found): the question fails when any noun, adverb
    or verb of the correct option also occurs in the stem.

    Args:
        question: object with ``stem``, ``options`` and ``correct_option``.

    Returns:
        True when no tell-tale repeat is found, False otherwise.

    Raises:
        ValueError: if ``correct_option`` is not one of ``options``.
    """
    lemma_nouns_stem = get_lemma_nouns(question.stem)
    answer_nouns = set(get_lemma_nouns(question.correct_option))

    repeating_nouns = answer_nouns.intersection(lemma_nouns_stem)

    #Check for synonyms of the stem's nouns in the answer choice
    stem_synonyms = {
        lemma.name().lower().replace('_', ' ')
        for noun in lemma_nouns_stem
        for synset in wn.synsets(noun)
        for lemma in synset.lemmas()
    }
    repeating_nouns_synonyms = stem_synonyms & answer_nouns

    #If we get a repeat, then it should also repeat in the other answer choices, not just the correct one
    if repeating_nouns or repeating_nouns_synonyms:
        distractors = question.options.copy()
        distractors.remove(question.correct_option)
        options_that_share_noun = sum(
            1 for opt in distractors if answer_nouns.intersection(get_lemma_nouns(opt))
        )

        #If the word is shared between all options, then it is fine
        if options_that_share_noun == len(distractors):
            return True

        #Both helpers return False when such an option is PRESENT. The earlier
        #version tested them without `not`, which made this branch pass for
        #nearly every question.
        if not all_of_the_above(question) or not none_of_the_above(question):
            return True

        print('--- The noun is only shared in certain words')
        return False

    #Check for a word (noun, adverb, verb) that repeats in the stem and the answer
    stem_words = _tagged_content_words(question.stem)
    answer_words = _tagged_content_words(question.correct_option)
    if stem_words & answer_words:
        return False

    return True


def _tagged_content_words(text):
    """Return the lower-cased non-stop-word tokens of ``text`` tagged as noun, adverb or verb."""
    word_types = {'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS', 'VB', 'VBG', 'VBN', 'VBP', 'VBZ'}
    words = set()
    for sentence in sent_tokenize(text):
        tokens = [w for w in nltk.word_tokenize(sentence) if w not in stop_words]
        words.update(token.lower() for token, tag in nltk.pos_tag(tokens) if tag in word_types)
    return words

def get_lemma_nouns(text):
    """Return the lower-cased, lemmatized nouns of ``text`` in order of appearance.

    The text is split into sentences and tokens with NLTK, tokens found in
    ``stop_words`` are dropped, the rest are POS-tagged, and every token whose
    tag is listed in ``nouns`` is lemmatized as a noun.
    """
    found_nouns = []
    for sentence in sent_tokenize(text):
        # Tokenize into words and punctuation, then drop stop words before tagging
        tokens = [tok for tok in nltk.word_tokenize(sentence) if tok not in stop_words]

        # Keep only the tokens the POS tagger marks as nouns
        found_nouns.extend(
            token.lower() for token, tag in nltk.pos_tag(tokens) if tag in nouns
        )

    return [lemmatizer.lemmatize(noun, pos="n").lower() for noun in found_nouns]

### Unfocused stem
    The stem should present a clear and focused question that can be understood and answered without looking at the options

In [17]:
def unfocused_stem(question):
    """Return True when at least one sentence of the stem looks like a question.

    The stem is split into sentences with spaCy and each one is passed to
    ``is_question``. A diagnostic is printed when none qualifies.
    """
    # Evaluate every sentence (no short-circuit), then combine the verdicts
    verdicts = [is_question(sentence.text.strip()) for sentence in nlp(question.stem).sents]
    has_question = any(verdicts)

    if not has_question:
        print("--- Question stem does not contain a question")

    return has_question

#From https://stackoverflow.com/questions/4083060/determine-if-a-sentence-is-an-inquiry
def is_question(sent):
    """Heuristically decide whether ``sent`` is a question.

    A sentence counts as a question when its first token is a verb that is
    also the dependency root, or when any token is a WH-word (tags WDT, WP,
    WP$, WRB) or a question mark.

    Args:
        sent: the sentence text.

    Returns:
        True or False; False for text that yields no tokens (previously an
        empty sentence raised IndexError).
    """
    d = nlp(sent)
    if len(d) == 0:
        return False

    first_token = d[0]
    if first_token.pos_ == "VERB" and first_token.dep_ == "ROOT":
        return True

    wh_tags = ("WDT", "WP", "WP$", "WRB")
    return any(token.tag_ in wh_tags or token.text == '?' for token in d)

### Complex or K-type
    Avoid questions that have a range of correct responses, that ask students to select from a number of possible 
    combinations of the responses

In [18]:
#If the answer options share the same words between one another and there are commas present then it's k type

def complex_k_type(question):
    """Check that the question is not a complex / K-type item.

    The question fails when either

    * every distractor contains a comma and at least one distractor shares a
      lemmatized noun with the correct option, or
    * after ``clean_string`` removes punctuation and list notation, all
      distractors consist of the same set of words.

    Questions with fewer than three distractors pass automatically. All
    distractors are now inspected; earlier only the first three were used for
    the noun and word-set checks (and one comparison was a tautology).

    Args:
        question: object with ``options`` and ``correct_option``.

    Returns:
        True when the question does not look K-type, False otherwise.

    Raises:
        ValueError: if ``correct_option`` is not one of ``options``.
    """
    distractors = question.options.copy()
    distractors.remove(question.correct_option)
    if len(distractors) < 3:
        return True

    every_distractor_has_comma = all(',' in opt for opt in distractors)

    answer_nouns = set(get_lemma_nouns(question.correct_option))
    shares_noun_with_answer = any(
        answer_nouns.intersection(get_lemma_nouns(opt)) for opt in distractors
    )

    #Options share a key word with the answer and all have a comma,
    #suggesting it might be a k-type question
    if shares_noun_with_answer and every_distractor_has_comma:
        print("--- This is a K-type question")
        return False

    #After removing any list notation in the answer choices, see if they contain the same words
    word_sets = [set(clean_string(opt).split()) for opt in distractors]
    if all(words == word_sets[0] for words in word_sets[1:]):
        return False

    return True

def clean_string(string):
    """Strip list notation from an option so that only its wording remains.

    Surrounding whitespace and all punctuation are removed, then stand-alone
    lower-case roman numerals (i to xx) and stand-alone capital letters A-F.
    The result is printed for debugging and returned.
    """
    roman_numerals = r'\b(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv|xvi|xvii|xviii|xix|xx)\b'
    option_letters = r'\b(A|B|C|D|E|F)\b'

    # Trim, then drop everything that is neither a word character nor whitespace
    without_punctuation = re.sub(r'[^\w\s]', '', string.strip())

    # Remove the list markers: numerals first, then letters
    without_numerals = re.sub(roman_numerals, '', without_punctuation)
    cleaned_string = re.sub(option_letters, '', without_numerals)

    print('cleaned_string: ', cleaned_string)
    return cleaned_string

### Grammatical cues
    All options should be grammatically consistent with the stem and should be parallel in style and form

In [19]:
#If verb exists in answer choice, ensure it's the same tense as verb in other options
#We want the stem to be the same, but as long as all the answers are the same, then it's fine, to avoid false positive.
def grammatical_cues_in_stem(question):
    """Check that the distractors use the same verb tense as the correct answer.

    The tense of each option comes from ``get_verb_tense``. A distractor with
    no verb (tense 'none') is ignored; any other distractor whose tense
    differs from the answer's makes the question fail. The stem's tense is
    deliberately not compared, to avoid false positives.

    Args:
        question: object with ``options`` and ``correct_option``.

    Returns:
        True when the tenses are consistent, False otherwise.

    Raises:
        ValueError: if ``correct_option`` is not one of ``options``.
    """
    answer_tense = get_verb_tense(question.correct_option)

    distractors = question.options.copy()
    distractors.remove(question.correct_option)
    for opt in distractors:
        opt_tense = get_verb_tense(opt)
        # Compare by value; `is not` only worked by accident of string interning.
        if opt_tense != 'none' and answer_tense != opt_tense:
            print("--- Verb tense doesn't align between answer other options")
            return False

    return True
    
def get_verb_tense(text):
    """Classify the tense of the first token spaCy tags as VERB in ``text``.

    Returns 'present' for tags VBP/VBZ, 'past' for VBD/VBN, 'other' for any
    other verb tag, and 'none' when the text has no VERB token.
    """
    tense_by_tag = {
        'VBP': 'present',
        'VBZ': 'present',
        'VBD': 'past',
        'VBN': 'past',
    }
    # Only the first verb decides the result
    for token in nlp(text):
        if token.pos_ == 'VERB':
            return tense_by_tag.get(token.tag_, 'other')
    return 'none'

### Lost sequence
    All options should be arranged in chronological or numerical order

In [20]:
#If answer choices are numeric, sort them, compare to current order
def lost_sequence(question):
    """Check that purely numeric options are listed in ascending order.

    The characters $ % ° F C and commas are removed from each option before
    it is parsed as a float. If any option is not numeric the criterion does
    not apply and True is returned. Otherwise True is returned only when the
    values are already sorted ascending; if not, a diagnostic is printed.
    """
    values = []
    for raw_option in question.options.copy():
        stripped = re.sub(r'[$%°FC,]', '', raw_option)
        try:
            values.append(float(stripped))
        except ValueError:
            # A non-numeric option: nothing to order
            return True

    if values == sorted(values):
        #Numeric options are sorted
        return True

    print('--- Options are numeric and not sorted')
    return False

### Vague terms
    Avoid the use of vague terms (e.g. frequently, occasionally) in the options as there is seldom agreement on their actual meaning 

In [21]:
def vague_terms(question):
    """Check that no option uses a vague frequency or quantity term.

    Terms are matched case-insensitively as whole words or phrases. Earlier a
    case-sensitive substring test missed "Often" and flagged words such as
    "softener" (contains "often").

    Args:
        question: object with an ``options`` list of strings.

    Returns:
        False (after printing a diagnostic) when a vague term is found,
        True otherwise.
    """
    vagues = ["often", "sometimes", "rarely", "typically", "usually", "normally", "generally", "nearly", "approximately", "more or less", "somewhat"]
    vague_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in vagues) + r')\b',
        re.IGNORECASE,
    )

    for opt in question.options:
        if vague_pattern.search(opt):
            # NOTE: the message says "stem" but it is the options that are checked.
            print('--- vague word in question stem')
            return False

    return True

### More than one correct
    In single best-answer form, questions should have 1, and only 1, best answer 

In [24]:
#Using GPT-4 for QA, the model can confirm if the correct answer is correct, however it is not that accurate.
#In the future, utilize a better QA model as improvements are made.

def more_than_one_correct(question, retry_delay=150, max_retries=None):
    """Ask GPT-4 to answer the question and compare its choice with the key.

    The stem and the first four options are sent to the chat completion API.
    The reply is expected to start with A, B, C or D (optionally followed by
    ")"); that letter is mapped back to the option text.

    Fixes over the earlier version: the retry flag was read before it was
    assigned (UnboundLocalError on every call), the caller's
    ``question.options`` list was modified in place, a bare ``except`` also
    swallowed KeyboardInterrupt, and an unparsable reply raised NameError.

    Args:
        question: object with ``stem``, ``options`` and ``correct_option``.
        retry_delay: seconds to wait after a failed API call before retrying.
        max_retries: maximum number of retries; None retries indefinitely.

    Returns:
        True when the model picks ``question.correct_option``, False otherwise.

    Raises:
        Exception: the last API error, once ``max_retries`` is exhausted.
    """
    # Work on a private copy padded to four entries; blank C/D slots are shown
    # to the model as 'None', exactly as before.
    options = list(question.options[:4])
    options.extend([''] * (4 - len(options)))
    for position in (2, 3):
        if not options[position]:
            options[position] = 'None'

    # Define the prompt
    prompt = """
    Answer the multiple-choice question below by responding with A, B, C, or D.
    
    {}

    A) {}
    B) {}
    C) {}
    D) {}
    """.format(question.stem, options[0], options[1], options[2], options[3])

    # Generate a response, waiting and retrying when the API is unavailable
    failures = 0
    while True:
        try:
            completion = openai.ChatCompletion.create(
              model="gpt-4",
              messages=[
                {"role": "user", "content": prompt},
              ]
             )
            break
        except Exception:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(retry_delay)

    # Translate the reply into the text of the chosen option
    cleaned_response = None
    try:
        letter = completion.choices[0].message.content.split(')')[0].strip()
    except (AttributeError, IndexError, KeyError, TypeError):
        print('error in GPT-4: ', completion)
    else:
        # Anything other than a bare letter is kept verbatim and will not match
        cleaned_response = dict(zip('ABCD', options)).get(letter, letter)

    if cleaned_response == question.correct_option:
        return True

    print('--- GPT-4 believes the answer is incorrect: ', cleaned_response, ' ', question.correct_option)
    return False

### Negative worded
    Negatively worded stems are less likely to measure important learning
    outcomes and can confuse students

In [25]:
def negative_worded_stem(question):
    """Check that the stem is not negatively worded.

    The stem fails when it contains one of the negative words below as a whole
    word (case-insensitive), or when a sentence containing "which" or "what"
    also contains "false", "not", "incorrect" or "except".

    The earlier version looped over the stem one character at a time, so the
    negative-word list could never match, and it used substring tests that
    treated e.g. "another" as containing "not".

    Args:
        question: object with a ``stem`` string.

    Returns:
        True when the stem is not negatively worded, False otherwise.
    """
    negatives = ["no", "none", "never", "without", "exclude", "avoid", "deny", "refuse", "oppose", "dispute"]
    negative_pattern = re.compile(r'\b(?:' + '|'.join(negatives) + r')\b', re.IGNORECASE)
    if negative_pattern.search(question.stem):
        print('--- Negative word in question stem')
        return False

    #Check for question words combined with a negation cue, sentence by sentence
    question_words = {'which', 'what'}
    negation_cues = {'false', 'not', 'incorrect', 'except'}
    for sent in question.stem.split('.'):
        words = set(re.findall(r"[a-z']+", sent.lower()))
        if words & question_words and words & negation_cues:
            return False

    return True

## Example of how to use this rule-based method

In [None]:
#The CSV should contain 5 columns in total: the question's text and its four answer choices
    #question: question text
    #altA, altB, altC, altD: answer choices (altA is used as the correct answer below)
#Read every cell as text and switch off pandas' default NA parsing: otherwise an
#option that literally reads "None", "NA" or "null" is loaded as a missing value,
#and numeric options become floats that the string-based checks cannot handle.
qs = pd.read_csv('some_csv.csv', dtype=str, keep_default_na=False)
qs = qs.fillna('')

questions = []
for index, row in qs.iterrows():
    question = MultipleChoiceQuestion(
        stem=row['question'],
        options=[row['altA'], row['altB'], row['altC'], row['altD']],
        correct_option= row['altA']
    )
    questions.append(question)

#The criteria, in the order of the output columns. Each function takes a question
#and returns True when the question passes that criterion.
criteria = [
    ambiguous_unclear_information,
    implausible_distractors,
    none_of_the_above,
    longest_answer_correct,
    gratuitous_information_in_stem,
    true_or_false,
    avoid_convergence_cues,
    avoid_logical_cues,
    all_of_the_above,
    fill_in_the_blank,
    absolute_terms,
    word_repeats_in_stem_and_correct_answer,
    unfocused_stem,
    complex_k_type,
    grammatical_cues_in_stem,
    lost_sequence,
    vague_terms,
    more_than_one_correct,
    negative_worded_stem,
]

#Call each criterion on each question; every question becomes one row of the dataframe
rows = [[check(q) for check in criteria] for q in questions]

#The column names are the function names, so the two can never drift apart
columns = [check.__name__ for check in criteria]

results = pd.DataFrame(rows, columns=columns)
results.to_csv("results.csv")

## Applying the IWF Rubric with GPT-4

In [None]:
results = []
#Placeholder: set this to your own list of MultipleChoiceQuestion objects
questions = data_structure_of_questions
iwfs = [{"criteria": "ambiguous or unclear information ",
     "definition": "questions and all options should be written in clear, unambiguous language"}, 
     {"criteria": "implausible distractors",
     "definition": "questions should make all alternative answer choices plausible, as good items depend on having effective distractors"}, 
     {"criteria": "none of the above",
     "definition": "questions should avoid using none of the above as an answer choice as it only really measures students ability to detect incorrect answers"},
     {"criteria": "longest option is correct",
     "definition": "all question options should be similar in length and the amount of detail provided in each option, however this can be ignored if the options are only a few words in length"}, 
     {"criteria": "gratuitous information in the stem",
     "definition": "questions should avoid using gratuitous or unnecessary information in the stem that is not required to answer the question"},
    {"criteria": "true/false question",
     "definition": "question options should not be a series of true or false statements"},
    {"criteria": "convergence cues",
     "definition": "questions should avoid convergence cues in options where there are different combinations of multiple components to the answer"}, 
    {"criteria": "logical cues",
     "definition": "questions should avoid clues in the stem and the correct option that can help the test-wise student to identify the correct option"}, 
    {"criteria": "all of the above",
     "definition": "question options should not contain 'all of the above' or something similar, as students can guess correct responses based on partial information. If the question options do not contain it, then the question satisfies this criteria."},
    {"criteria": "fill-in-the-blank",
     "definition": "questions should avoid omitting words in the middle of the stem that students must insert from the options provided"}, 
    {"criteria": "absolute terms",
     "definition": "questions should avoid the use of absolute terms (e.g. never, always, only, all) in the options as students are aware that they are almost always false"},
    {"criteria": "word repeats",
     "definition": "questions should avoid repeating words between just the stem and the correct option, if there are repeated words, they should also be included among other options"}, 
    {"criteria": "unfocused stem",
     "definition": "the question stem should present a clear and focused question that can be understood and answered without looking at the options"},
    {"criteria": "complex or k-type",
     "definition": "questions should avoid questions that have a range of correct responses, that ask students to select from a number of possible combinations of the responses"}, 
    {"criteria": "grammatical cues",
     "definition": "all question options should be grammatically consistent with the stem and should be parallel in style and form"}, 
    {"criteria": "lost sequence",
     "definition": "all question options should be arranged in chronological or numerical order"},
    {"criteria": "vague terms",
     "definition": "questions should avoid the use of vague terms (e.g. frequently, occasionally, rarely, usually, commonly) in the options, as these terms lack precision and there is seldom agreement on their actual meaning"},
     {"criteria": "more than one correct",
     "definition": "questions should only have one correct answer from the question options"},
     {"criteria": "negative worded",
     "definition": "questions should avoid the use of negative words (e.g., not, except, incorrect) in the stem"}, 
    ]

#Loop over each question, then for each question, call the IWF criteria one at a time on it.
#`results` ends up as a flat list: a question's stem followed by one API response per criterion.
for q in questions:
    results.append(q.stem)
    for i in iwfs:
        #Sometimes the GPT-4 API goes down and returns an error, in which case we wait and
        #retry the call. `except Exception` (not a bare except) keeps Ctrl-C working.
        while True:
            try:
                o = openai.ChatCompletion.create(
                  model="gpt-4", 
                  messages=[
                    {"role": "user", "content": f'Begin your response with yes or no, does this multiple-choice question satisfy the criteria relating to {i["criteria"]}: {i["definition"]}? Explain why. \n' + q.stem + " \n" + nl.join(q.options)},
                  ]
                 )
                break
            except Exception:
                time.sleep(150)
        results.append(o)


#Turn every entry into plain text: stems are already strings, responses carry their text
#in choices[0].message.content
cells = []
for res in results:
    try:
        cells.append(res.choices[0].message.content)
    except (AttributeError, IndexError, KeyError, TypeError):
        cells.append(res)

#Each row holds the question text followed by one answer per criterion
row_width = len(iwfs) + 1
rows = [cells[start:start + row_width] for start in range(0, len(cells), row_width)]


columns = [
    'question',
    'ambiguous_unclear_information',
    'implausible_distractors',
    'none_of_the_above',
    'longest_answer_correct',
    'gratuitous_information_in_stem',
    'true_or_false',
    'avoid_convergence_cues',
    'avoid_logical_cues',
    'all_of_the_above',
    'fill_in_the_blank',
    'absolute_terms',
    'word_repeats_in_stem_and_correct_answer',
    'unfocused_stem',
    'complex_k_type',
    'grammatical_cues_in_stem',
    'lost_sequence',
    'vague_terms',
    'more_than_one_correct',
    'negative_worded_stem'
]

pd_results = pd.DataFrame(rows, columns=columns)
pd_results.to_csv("gpt-4_results.csv")