In [None]:
#Step 1- Import the text file/article that has to be used for MCQ generation

# Load the article that the questions will be generated from.
# The r"..." prefix makes this a raw string, so the backslashes in the Windows
# path are not interpreted as escape sequences; open() defaults to read mode.
# NOTE(review): this absolute path is machine-specific - consider a path
# relative to the project's data directory.
with open(r"C:\Users\Victor Umunna\projects\nlp-mcq-generator\data\ahistoryofdepression.txt", encoding="utf8") as file:
    # The context manager closes the handle as soon as the text has been read.
    text = file.read().strip()

In [None]:
# Display the loaded article text to confirm the file was read correctly.
text

'When is sorrow sickness? So begins Jonathan Sadowsky’s The Empire of Depression, a history riven with professional turf wars around where to draw the line between normal sadness and something more serious — now, across much of the world, called depression. He argues against reductionism and dogma. Instead of getting stuck in old disagreements about whether depression is caused by a chemical imbalance or by social inequality, Sadowsky urges that depression can be psychological, biological and social, just as it can be a real illness even if it is cultural.\n\nGiven that the World Health Organization names depression as a major contributor to the global burden of disease, tracing its history is a significant task. And it is an important one, given the mental-health crisis attending the COVID-19 pandemic. It is no mean feat to characterize something that has ever-shifting and contested boundaries dependent on time and place. Sadowsky, a historian of medicine, offers three possible reason

In [None]:
#Importing the needed files and packages

import nltk
from typing_extensions import TypeAlias
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('popular')

In [None]:
#Step 2- Extract the important words (keywords) from the text article that can be used to create MCQs, using PKE (Python Keyphrase Extraction)

import pke
from nltk.corpus import stopwords
import string

def extract_keywords(article: str, n: int = 25, pos=None) -> list[str]:
  """Extracts keywords from an article using the MultipartiteRank algorithm.

  Args:
    article: The article text.
    n: Maximum number of keyphrases to request from the extractor.
      Defaults to 25.
    pos: Set of part-of-speech tags that candidate phrases may consist of.
      Defaults to {'PROPN'} (proper nouns only).

  Returns:
    A list of extracted keywords, in the order returned by the extractor's
    get_n_best().
  """

  if pos is None:
    pos = {'PROPN'}

  extractor = pke.unsupervised.MultipartiteRank()

  # Stoplist: punctuation, bracket placeholder tokens and English stopwords.
  stop_words = list(string.punctuation)
  stop_words += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
  stop_words += stopwords.words('english')

  extractor.load_document(input=article, stoplist=stop_words)

  # Select as candidates the longest sequences of words whose POS tag is in
  # `pos` and that do not contain punctuation marks or stopwords.
  extractor.candidate_selection(pos=pos)

  # Build the Multipartite graph and rank the candidates.
  extractor.candidate_weighting()

  # Each entry returned by get_n_best() is indexed as (keyphrase, ...);
  # only the keyphrase text is kept.
  keyphrases = extractor.get_n_best(n=n)
  return [keyphrase[0] for keyphrase in keyphrases]


# Extract the keywords from the loaded article and show them.
words = extract_keywords(text)
print(words)

['jonathan sadowsky', 'depression', 'empire', 'emily martin', 'bipolar expeditions', 'sadowsky', 'panic diaries', 'jackie orr', 'united states', 'black power', 'ian marsh', 'north america', 'britain', 'africans', 'sigmund freud', 'prozac', 'middle ages', 'renaissance europe', 'world health organization', 'imperial bedlam']


In [None]:
#Step 3- Split the whole text article into an array/list of individual sentences. This will help us fetch the sentences related to the keywords easily

from nltk.tokenize import sent_tokenize


def split_text_to_sentences(article: str, min_length: int = 15) -> list[str]:
  """Splits a text article into a list of individual sentences.

  Args:
    article: The text article.
    min_length: Sentences whose stripped length is not greater than this
      many characters are discarded. Defaults to 15.

  Returns:
    A list of whitespace-stripped sentences longer than `min_length`
    characters, in the order produced by the tokenizer.
  """

  # Strip first so that surrounding whitespace/newlines neither end up in the
  # question text nor count towards the minimum length.
  stripped_sentences = (sentence.strip() for sentence in sent_tokenize(article))

  return [sentence for sentence in stripped_sentences if len(sentence) > min_length]


# Split the loaded article into sentences and show them.
sentences = split_text_to_sentences(text)
print(sentences)

['When is sorrow sickness?', 'So begins Jonathan Sadowsky’s The Empire of Depression, a history riven with professional turf wars around where to draw the line between normal sadness and something more serious — now, across much of the world, called depression.', 'He argues against reductionism and dogma.', 'Instead of getting stuck in old disagreements about whether depression is caused by a chemical imbalance or by social inequality, Sadowsky urges that depression can be psychological, biological and social, just as it can be a real illness even if it is cultural.', 'Given that the World Health Organization names depression as a major contributor to the global burden of disease, tracing its history is a significant task.', 'And it is an important one, given the mental-health crisis attending the COVID-19 pandemic.', 'It is no mean feat to characterize something that has ever-shifting and contested boundaries dependent on time and place.', 'Sadowsky, a historian of medicine, offers th

In [None]:
#Step 4- Map the sentences which contain the keywords to the related keywords so that we can easily lookup the sentences related to the keywords

from flashtext import KeywordProcessor

def map_sentences_to_keywords(words, sentences):
    """
    Maps the sentences which contain the keywords to the related keywords.

    Args:
        words: A list of keywords.
        sentences: A list of sentences.

    Returns:
        A dictionary mapping each keyword to a list of sentences that contain
        the keyword, longest sentence first. Keywords that occur in no
        sentence map to an empty list.
    """

    keyword_processor = KeywordProcessor()  # Finds the registered keywords inside a sentence
    key_sentences = {}
    for word in words:
        key_sentences[word] = []
        keyword_processor.add_keyword(word)  # Register the keyword with the processor

    for sentence in sentences:
        found = keyword_processor.extract_keywords(sentence)
        # A keyword can occur several times in one sentence; dict.fromkeys()
        # de-duplicates the matches (keeping their order) so the sentence is
        # recorded only once per keyword.
        for keyword in dict.fromkeys(found):
            key_sentences[keyword].append(sentence)

    # Longest sentences first: callers take element [0] as the question text.
    return {
        keyword: sorted(matched, key=len, reverse=True)
        for keyword, matched in key_sentences.items()
    }

# Retrieve the sentences that contain each keyword and map them to that keyword.
mapped_sentences = map_sentences_to_keywords(words, sentences)

# Print the mapped sentences
print(mapped_sentences)

{'jonathan sadowsky': ['So begins Jonathan Sadowsky’s The Empire of Depression, a history riven with professional turf wars around where to draw the line between normal sadness and something more serious — now, across much of the world, called depression.'], 'depression': ['The power to say who’s rational and who isn’t, and to detain people or treat them without consent, is perhaps the starkest reminder of why treating depression is not just like administering insulin for diabetes, and of why stigma looms large despite (or because of) the understandable appeals to biochemistry.', 'Sadowsky, a historian of medicine, offers three possible reasons for the current boom in diagnoses: that there really is more depression; that the amount is the same but we’re better at detecting it; or that emotional states not previously seen as illness are now being labelled as such.', 'Instead of getting stuck in old disagreements about whether depression is caused by a chemical imbalance or by social ine

In [None]:
#Step 5- Get the sense of the word. In order to attain a quality set of distractors we need to get the right sense of the keyword. This is explained in detail in the separate algorithm documentation

from pywsd.similarity import max_similarity
from pywsd.lesk import adapted_lesk
from pywsd.lesk import simple_lesk
from pywsd.lesk import cosine_lesk
from nltk.corpus import wordnet

def get_word_sense(sentence, word):
  """
  Returns the most likely noun sense of a word in a given context.

  Args:
    sentence: The sentence in which the word appears.
    word: The word (or space-separated phrase) to get the sense for.

  Returns:
    A synset representing the most likely sense of the word, or None if
    WordNet has no noun synset for it. If neither disambiguation algorithm
    returns one of the word's noun synsets, the first noun synset is used.
  """

  # WordNet joins the parts of multi-word entries with underscores, so
  # "united states" has to be looked up as "united_states".
  word = word.lower().replace(" ", "_")

  synsets = wordnet.synsets(word, 'n')
  if not synsets:
    return None

  # Use two different WSD algorithms, Wu-Palmer (WUP) similarity and Adapted
  # Lesk, to determine the most likely sense of the word.
  wu_palmer_output = max_similarity(sentence, word, 'wup', pos='n')
  adapted_lesk_output = adapted_lesk(sentence, word, pos='n')

  # Either algorithm may yield nothing, or a synset that is not among the
  # noun synsets; only results that can be located in `synsets` are used
  # (a plain synsets.index() would raise for the others).
  candidate_indices = [
      synsets.index(candidate)
      for candidate in (wu_palmer_output, adapted_lesk_output)
      if candidate is not None and candidate in synsets
  ]
  if not candidate_indices:
    return synsets[0]

  # Of the two proposed senses, return the one listed earliest by WordNet.
  return synsets[min(candidate_indices)]

Warming up PyWSD (takes ~10 secs)... took 2.764868974685669 secs.


In [None]:
def get_distractors_wordnet(syn, word):
    """Gets distractors for a word from WordNet.

    The distractors are the co-hyponyms of `syn`: the hyponyms of its first
    hypernym, excluding the word itself.

    Args:
        syn: A WordNet synset.
        word: A string; the correct answer that must not appear among the
            distractors. Words may be separated by spaces or underscores.

    Returns:
        A list of distractor words with every word capitalized and without
        duplicates; empty if `syn` has no hypernym.
    """

    distractors = []

    # Normalise the answer the same way the lemma names are normalised below
    # (lower case, spaces instead of underscores) so that multi-word and
    # capitalised lemmas such as "United_States" are recognised as the answer.
    actual_word = word.lower().replace("_", " ")

    hypernyms = syn.hypernyms()
    if not hypernyms:
        return distractors

    # Iterate over the hyponyms of the first hypernym and add them to the list
    # of distractors, if they are not the actual word and not already listed.
    for hypo in hypernyms[0].hyponyms():
        name = hypo.lemmas()[0].name().replace("_", " ")
        if name.lower() == actual_word:
            continue

        name = " ".join(w.capitalize() for w in name.split())
        if name not in distractors:
            distractors.append(name)

    return distractors


In [None]:
import requests
import json


def _conceptnet_edges(url, timeout):
    """Fetches a ConceptNet query URL and returns its list of edges ([] if absent)."""
    response = requests.get(url, timeout=timeout)
    # Error payloads carry no "edges" key; treat them as "no results".
    return response.json().get("edges", [])


def get_distractors_conceptnet(word, timeout=10):
    """Gets distractors for a word from ConceptNet.

    Looks up what the word is a part of (PartOf edges starting at the word)
    and collects the other things that are part of the same wholes.

    Args:
        word: A string. Words may be separated by spaces or underscores.
        timeout: Seconds to wait for each HTTP request. Defaults to 10.

    Returns:
        A list of distractor labels without duplicates, excluding any label
        that contains the word itself. Empty if ConceptNet has no matches.

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
    """

    # The answer as it appears in edge labels: lower case, space-separated.
    actual_word = word.lower().replace("_", " ")

    # ConceptNet URIs join the parts of multi-word terms with underscores
    # (e.g. /c/en/united_states), so spaces must become underscores.
    term = word.lower().replace(" ", "_")

    distractors = []

    # First query: PartOf edges that start at the word, i.e. the wholes it
    # belongs to.
    url = (
        "http://api.conceptnet.io/query?node=/c/en/%s/n&rel=/r/PartOf&start=/c/en/%s&limit=5"
        % (term, term)
    )

    for whole_edge in _conceptnet_edges(url, timeout):

        # The term this word is a part of.
        link = whole_edge["end"]["term"]

        # Second query: everything else that is a part of that same term.
        url2 = (
            "http://api.conceptnet.io/query?node=%s&rel=/r/PartOf&end=%s&limit=10"
            % (link, link)
        )

        for part_edge in _conceptnet_edges(url2, timeout):
            word2 = part_edge["start"]["label"]

            # Skip duplicates and any label that contains the answer itself.
            if word2 not in distractors and actual_word not in word2.lower():
                distractors.append(word2)

    return distractors


In [None]:
# Step 8: Find and map the distractors to the keywords

mapped_distractors = {}
for keyword, keyword_sentences in mapped_sentences.items():
    # A keyword that was not found in any sentence cannot become a question
    # (and indexing [0] below would raise IndexError), so skip it.
    if not keyword_sentences:
        continue

    # Get the word sense of the keyword from its first mapped sentence.
    word_sense = get_word_sense(keyword_sentences[0], keyword)

    # If there is a word sense, try the WordNet distractors first.
    distractors = get_distractors_wordnet(word_sense, keyword) if word_sense else []

    # With no word sense or no WordNet distractors, fall back to ConceptNet.
    if not distractors:
        distractors = get_distractors_conceptnet(keyword)

    # Only keywords that actually have distractors are mapped.
    if distractors:
        mapped_distractors[keyword] = distractors

# Print the mapped distractors.
print(mapped_distractors)


{'depression': ['Aquifer', 'Beach', 'Cave', 'Cliff', 'Delta', 'Diapir', 'Folium', 'Foreshore', 'Ice Mass', 'Lakefront', 'Massif', 'Monocline', 'Mouth', 'Natural Depression', 'Natural Elevation', 'Oceanfront', 'Range', 'Relict', 'Ridge', 'Shore', 'Slope', 'Spring', 'Talus', 'Vein', 'Volcanic Crater', 'Wall', 'Water Table'], 'empire': ['Archduchy', 'Barony', 'Duchy', 'Earldom', 'Emirate', 'Fiefdom', 'Grand Duchy', 'Khanate', 'Kingdom', 'Principality', 'Sheikdom', 'Suzerainty', 'Viscounty'], 'united states': ['United States Government'], 'africans': ['Abator', 'Abjurer', 'Abomination', 'Abstainer', 'Achiever', 'Acquaintance', 'Acquirer', 'Active', 'Actor', 'Adjudicator', 'Admirer', 'Adoptee', 'Adult', 'Adventurer', 'Adversary', 'Advisee', 'Advocate', 'Affiant', 'African', 'Agnostic', 'Amateur', 'Amerindian', 'Ancient', 'Anomaly', 'Anti-american', 'Anti', 'Applicant', 'Appointee', 'Appreciator', 'Apprehender', 'Aquarius', 'Archaist', 'Aries', 'Arrogator', 'Assessee', 'Asthmatic', 'Authorit

In [None]:
# Step 9: Present the multiple choice questions in a nice and readable manner.

import random
import re

print("**************************************        Multiple Choice Questions        *******************************")
print()

opts = ['a', 'b', 'c', 'd']  # Labels for the (at most four) answer options

# `iterator` is the 1-based question number.
for iterator, keyword in enumerate(mapped_distractors, start=1):
    # Use the first sentence mapped to the keyword as the question text.
    sentence = mapped_sentences[keyword][0]

    # Blank out every case-insensitive occurrence of the keyword. re.escape()
    # makes the keyword match literally even if it contains regex
    # metacharacters such as "." or "(".
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    option_string = pattern.sub("________", sentence)

    # Prints the question along with a question number
    print(f"Question {iterator}: {option_string}")

    # Capitalize every word of the answer, the same way the WordNet
    # distractors are formatted, so its casing does not give it away
    # ("United States" rather than "United states").
    answer = " ".join(part.capitalize() for part in keyword.split())

    # The answer plus at most three distractors.
    options = [answer] + mapped_distractors[keyword][:3]

    # Shuffles the options so that order is not always same
    random.shuffle(options)

    # Prints the options
    for label, option in zip(opts, options):
        print(f"\t{label}) {option}")

    print()


**************************************        Multiple Choice Questions        *******************************

Question 1: The power to say who’s rational and who isn’t, and to detain people or treat them without consent, is perhaps the starkest reminder of why treating ________ is not just like administering insulin for diabetes, and of why stigma looms large despite (or because of) the understandable appeals to biochemistry.
	a) Aquifer
	b) Beach
	c) Depression
	d) Cave

Question 2: It is alarming, then, that apart from using “________” as an analogy for the global dominance of Western psychiatry in interpreting distress, Sadowsky devotes little attention here to power and politics — especially given his previous work on colonialism (in the 1999 book Imperial Bedlam).
	a) Barony
	b) Duchy
	c) Empire
	d) Archduchy

Question 3: In fact, many forms of resistance have been deemed symptoms of mental illness, from enslaved Africans fleeing brutality in the nineteenth-century ________ to t