In [7]:

#Step 1- Import the text file/article that has to be used for MCQ generation

# The r"" prefix marks a raw string, so the backslashes in the Windows path are
# not interpreted as escape sequences. The file is opened in the default read mode.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
with open(r"C:\Users\Victor Umunna\projects\nlp-mcq-generator\data\foundationofcomputerscience.txt", encoding="utf8") as file:
    # The context manager closes the handle as soon as the content has been read.
    text = file.read().strip()

In [8]:
# Show the loaded article text as the cell output.
text

'The earliest foundations of what would become computer science predate the invention of the modern digital computer. Machines for calculating fixed numerical tasks such as the abacus have existed since antiquity, aiding in computations such as multiplication and division. Algorithms for performing computations have existed since antiquity, even before the development of sophisticated computing equipment.\n\nWilhelm Schickard designed and constructed the first working mechanical calculator in 1623. In 1673, Gottfried Leibniz demonstrated a digital mechanical calculator, called the Stepped Reckoner. Leibniz may be considered the first computer scientist and information theorist, because of various reasons, including the fact that he documented the binary number system. In 1820, Thomas de Colmar launched the mechanical calculator industry when he invented his simplified arithmometer, the first calculating machine strong enough and reliable enough to be used daily in an office environment

In [3]:
#Importing the needed files and packages

import nltk
from typing_extensions import TypeAlias
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('popular')

In [4]:
#Step 2- Extract the important words(keywords) from the text article that can be used to create MCQ using PKE (Python Keyword Extraction)

import pke
from nltk.corpus import stopwords
import string

def extract_keywords(article: str, top_n: int = 25) -> list[str]:
  """Extracts keywords from an article using the MultipartiteRank algorithm.

  Args:
    article: The article text.
    top_n: Number of highest-scored keyphrases to request from the extractor.
      Defaults to 25, the value that was previously hard-coded.

  Returns:
    A list of extracted keywords, in the order the extractor ranks them.
  """

  extractor = pke.unsupervised.MultipartiteRank()

  # Stoplist: punctuation, Penn-Treebank-style bracket placeholder tokens and
  # the NLTK English stopwords.
  stop_words = list(string.punctuation)
  stop_words += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
  stop_words += stopwords.words('english')

  extractor.load_document(input=article, stoplist=stop_words)

  # Candidates are sequences of nouns, proper nouns and adjectives.
  extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})

  # Score the candidates.
  extractor.candidate_weighting()

  # Each entry returned by get_n_best starts with the keyphrase itself;
  # only that first element is kept.
  return [entry[0] for entry in extractor.get_n_best(n=top_n)]


# Extract the keyphrases from the loaded article and print them.
words = extract_keywords(text)
print(words)

['waste management', 'nigeria', 'major environmental problems', 'crude oil', 'industrial', 'deforestation', 'delta region', 'soil degradation', 'year', 'hectares', 'illegal oil refineries', 'problems', 'sewage treatment', 'country', 'municipal councils', 'water flows', 'global warming', 'environmental damage', 'climate change', 'megacity', 'local operators', 'major reasons', 'lighter fuel components', 'world', 'processes']


In [5]:
#Step 3- Split the whole text article into an array/list of individual sentences. This will help us fetch the sentences related to the keywords easily

from nltk.tokenize import sent_tokenize


def split_text_to_sentences(article: str) -> list[str]:
  """Breaks an article into sentences, discarding very short fragments.

  Args:
    article: The full article text.

  Returns:
    The sentences whose tokenized length exceeds 15 characters, each with
    leading and trailing whitespace removed.
  """

  kept = []
  for raw_sentence in sent_tokenize(article):
    # The length check is applied to the sentence as tokenized (before stripping).
    if len(raw_sentence) <= 15:
      continue
    # strip() removes surrounding whitespace/newlines left over from the article.
    kept.append(raw_sentence.strip())

  return kept


# Split the loaded article into sentences and print them.
sentences = split_text_to_sentences(text)
print(sentences)

['Waste management including sewage treatment, the linked processes of deforestation and soil degradation, and climate change or global warming are the major environmental problems in Nigeria.', 'Waste management presents problems in a megacity like Lagos and other major Nigerian cities which are linked with economic development, population growth and the inability of municipal councils to manage the resulting rise in industrial and domestic waste.', 'This waste management problem is also attributable to unsustainable environmental management lifestyles of Kubwa community in the Federal Capital Territory, where there are habits of indiscriminate disposal of waste, dumping of waste along or into the canals, sewerage systems that are channels for water flows, and the like.', 'Haphazard industrial planning, increased urbanisation, poverty and lack of competence of the municipal government are seen as the major reasons for high levels of waste pollution in major cities of the country.', 'S

In [6]:
#Step 4- Map the sentences which contain the keywords to the related keywords so that we can easily lookup the sentences related to the keywords

from flashtext import KeywordProcessor

def map_sentences_to_keywords(words, sentences):
    """
    Maps each keyword to the sentences that contain it.

    Args:
        words: A list of keywords.
        sentences: A list of sentences.

    Returns:
        A dictionary mapping each keyword to a list of sentences that contain
        the keyword, longest sentence first. Keywords that occur in no sentence
        map to an empty list. A sentence is listed at most once per keyword for
        each time it appears in ``sentences``, even if the keyword occurs in it
        several times.
    """

    keyword_processor = KeywordProcessor()
    key_sentences = {}
    for word in words:
        key_sentences[word] = []
        keyword_processor.add_keyword(word)  # Register the keyword for lookup

    for sentence in sentences:
        # The processor reports a keyword once per occurrence; de-duplicate
        # (keeping order) so a sentence is not appended twice for one keyword.
        for found in dict.fromkeys(keyword_processor.extract_keywords(sentence)):
            key_sentences[found].append(sentence)

    # Longest sentences first; list.sort is stable, so ties keep document order.
    for matched in key_sentences.values():
        matched.sort(key=len, reverse=True)

    return key_sentences

# Find the sentences that contain each keyword and map those sentences to the keywords using this function
mapped_sentences = map_sentences_to_keywords(words, sentences)

# Print the mapped sentences
print(mapped_sentences)

{'waste management': ['This waste management problem is also attributable to unsustainable environmental management lifestyles of Kubwa community in the Federal Capital Territory, where there are habits of indiscriminate disposal of waste, dumping of waste along or into the canals, sewerage systems that are channels for water flows, and the like.', 'Waste management presents problems in a megacity like Lagos and other major Nigerian cities which are linked with economic development, population growth and the inability of municipal councils to manage the resulting rise in industrial and domestic waste.', 'Waste management including sewage treatment, the linked processes of deforestation and soil degradation, and climate change or global warming are the major environmental problems in Nigeria.'], 'nigeria': ['The lighter components of crude oil (methane to butane, isobutane), on the other hand, always mean a certain risk of explosion, which often leads to disasters at illegal plants.In 2

In [None]:
#Step 5- Get the sense of the word. In order to obtain a quality set of distractors we need to get the right sense of the keyword. This is explained in detail in the separate algorithm documentation

from pywsd.similarity import max_similarity
from pywsd.lesk import adapted_lesk
from pywsd.lesk import simple_lesk
from pywsd.lesk import cosine_lesk
from nltk.corpus import wordnet

def get_word_sense(sentence, word):
  """
  Returns the most likely noun sense of a word in a given context.

  Args:
    sentence: The sentence in which the word appears.
    word: The keyword (a single word or a space-separated phrase) to
      disambiguate.

  Returns:
    A synset representing the most likely sense of the word, or None if
    WordNet has no noun synset for it.
  """

  # WordNet lemma names join multi-word terms with underscores, so convert
  # spaces before the lookup (a no-op for single words).
  word = word.lower().replace(" ", "_")

  synsets = wordnet.synsets(word, 'n')
  if not synsets:
    return None

  # Run two WSD algorithms (max similarity with the 'wup' measure, and
  # Adapted Lesk) and keep whichever proposed sense comes first in WordNet's
  # own ordering of the noun synsets.
  proposals = (
      max_similarity(sentence, word, 'wup', pos='n'),
      adapted_lesk(sentence, word, pos='n'),
  )

  # A proposal may be None or a synset outside the noun list; ignore those
  # instead of letting list.index raise ValueError.
  indices = [synsets.index(sense) for sense in proposals if sense in synsets]
  if not indices:
    # Neither algorithm produced a usable sense: fall back to the first synset.
    return synsets[0]
  return synsets[min(indices)]

In [None]:
def get_distractors_wordnet(syn, word):
    """Gets distractors for a word from WordNet.

    The distractors are the hyponyms of the synset's first hypernym, i.e. the
    siblings of ``syn``, excluding the keyword itself.

    Args:
        syn: A WordNet synset for the keyword.
        word: The keyword; words may be separated by spaces or underscores.

    Returns:
        A list of unique distractor strings with every word capitalized and
        underscores replaced by spaces. Empty if the synset has no hypernym.
    """

    # Normalize the keyword the same way lemma names are normalized below, so
    # multi-word ("crude_oil") and capitalized ("Nigeria") lemmas are still
    # recognized as the answer and excluded.
    actual_word = word.lower().replace("_", " ")

    hypernyms = syn.hypernyms()
    if not hypernyms:
        return []

    distractors = []
    # Siblings: every hyponym of the first hypernym.
    for hypo in hypernyms[0].hyponyms():
        name = hypo.lemmas()[0].name().replace("_", " ")
        if name.lower() == actual_word:
            continue

        name = " ".join(part.capitalize() for part in name.split())
        if name not in distractors:
            distractors.append(name)

    return distractors



In [None]:
import requests
import json


def get_distractors_conceptnet(word, timeout=10):
    """Gets distractors for a word from ConceptNet.

    Looks up what the word is a part of (``/r/PartOf`` edges starting at the
    word), then collects the other terms that are part of the same things.

    Args:
        word: The keyword; words may be separated by spaces or underscores.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of unique distractor labels that do not contain the keyword.
    """

    # Labels in the responses are compared case-insensitively against the
    # space-separated, lower-case form of the keyword.
    actual_word = word.lower().strip().replace("_", " ")

    # ConceptNet term URIs join multi-word terms with underscores
    # (/c/en/waste_management), so the URI form must not contain spaces.
    term = actual_word.replace(" ", "_")

    distractors = []

    # First query: the things this term is part of.
    url = (
        "http://api.conceptnet.io/query?node=/c/en/%s/n&rel=/r/PartOf&start=/c/en/%s&limit=5"
        % (term, term)
    )
    data = requests.get(url, timeout=timeout).json()

    # A response without an "edges" key is treated as having no edges.
    for edge in data.get("edges", []):

        # The whole that the keyword is a part of.
        link = edge["end"]["term"]

        # Second query: everything else that is part of that same whole.
        url2 = (
            "http://api.conceptnet.io/query?node=%s&rel=/r/PartOf&end=%s&limit=10"
            % (link, link)
        )
        data2 = requests.get(url2, timeout=timeout).json()

        for sibling_edge in data2.get("edges", []):
            word2 = sibling_edge["start"]["label"]

            # Skip duplicates and any label that contains the keyword itself.
            if word2 not in distractors and actual_word not in word2.lower():
                distractors.append(word2)

    return distractors


In [None]:
def map_distractors(mapped_sentence, words_sense, distractors_wordnet, distractors_conceptnet):
    """Builds the distractor list for every keyword that has a usable sentence.

    Args:
        mapped_sentence: A dictionary mapping each keyword to the list of
            sentences that contain it.
        words_sense: Callable ``(sentence, keyword)`` returning the keyword's
            word sense, or a falsy value when none is found.
        distractors_wordnet: Callable ``(word_sense, keyword)`` returning a list
            of distractors.
        distractors_conceptnet: Callable ``(keyword)`` returning a list of
            distractors; used when there is no word sense or the first source
            returns nothing.

    Returns:
        A dictionary mapping keywords to their non-empty distractor lists.
        Keywords without sentences or without distractors are omitted.
    """
    mapped_distractors = {}
    for keyword, keyword_sentences in mapped_sentence.items():
        # A keyword that matched no sentence cannot be turned into a question;
        # skip it instead of failing on the [0] lookup.
        if not keyword_sentences:
            continue

        # Disambiguate the keyword using its first mapped sentence.
        word_sense = words_sense(keyword_sentences[0], keyword)

        # Prefer the sense-based source when a sense is available.
        distractors = distractors_wordnet(word_sense, keyword) if word_sense else []

        # Otherwise, or when it found nothing, fall back to the second source.
        if not distractors:
            distractors = distractors_conceptnet(keyword)

        if distractors:
            mapped_distractors[keyword] = distractors

    return mapped_distractors

# Build the keyword -> distractors mapping from the mapped sentences, passing in
# the word-sense and distractor functions defined in this notebook, then print it.
m_d = map_distractors(mapped_sentences, get_word_sense, get_distractors_wordnet, get_distractors_conceptnet)
print(m_d)

In [None]:
def print_result(mapped_distractors, mapped_sentences):
  """Prints the multiple choice questions to the console.

  For every keyword the first mapped sentence is turned into a fill-in-the-blank
  question; the keyword plus up to three distractors are printed as options
  a) to d) in random order.

  NOTE(review): a later cell in this notebook defines print_result again; that
  later definition replaces this one once it is executed.

  Args:
    mapped_distractors: A dictionary mapping keywords to distractors.
    mapped_sentences: A dictionary mapping keywords to sentences that contain the keywords.
  """

  import re
  import random

  opts = ['a', 'b', 'c', 'd']

  # Build all questions first.
  questions = []
  for keyword in mapped_distractors:
    # Use the first sentence mapped to the keyword.
    sentence = mapped_sentences[keyword][0]

    # Escape the keyword so characters such as "+" or "(" are matched literally.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    option_string = pattern.sub("________", sentence)  # Replace the keyword with a blank

    answer = keyword.capitalize()
    # Trim to the printable number of options BEFORE shuffling; otherwise the
    # correct answer can be shuffled past the last printed option and vanish.
    chosen_distractors = list(mapped_distractors[keyword])[:len(opts) - 1]

    questions.append({
      "question": option_string,
      "options": [answer] + chosen_distractors,
      "answer": answer
    })

  print("************************************** Multiple Choice Questions *******************************")

  # Print the questions, numbered from 1.
  for iterator, question in enumerate(questions, start=1):
    print(f"Question {iterator}: {question['question']}")

    # Shuffle so the answer is not always option a).
    random.shuffle(question['options'])
    for label, option in zip(opts, question['options']):
      print(f"\t{label}) {option}")

    print()

# Print the questions built from the distractor and sentence mappings.
print_result(m_d, mapped_sentences)

In [None]:
# Step 9: Present the multiple choice questions in a nice and readable manner.

def print_result(mapped_distractors, mapped_sentences):
    """Prints the multiple choice questions in a readable layout.

    For every keyword the first mapped sentence is turned into a
    fill-in-the-blank question, followed by the keyword and up to three
    distractors as options a) to d) in random order.

    NOTE(review): this redefines print_result, replacing the definition from
    the earlier cell of this notebook.

    Args:
        mapped_distractors: A dictionary mapping keywords to distractors.
        mapped_sentences: A dictionary mapping keywords to sentences that
            contain the keywords.
    """

    import re
    import random

    opts = ['a', 'b', 'c', 'd']

    for iterator, keyword in enumerate(mapped_distractors, start=1):
        # Use the first sentence mapped to the keyword.
        sentence = mapped_sentences[keyword][0]

        # Escape the keyword so characters such as "+" or "(" are matched literally.
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        option_string = pattern.sub("________", sentence)  # Replace the keyword with a blank

        # The header is printed once, just before the first question.
        if iterator == 1:
            print("************************************** Multiple Choice Questions **************************************")

        print(f"Question {iterator}: {option_string}")

        # The answer plus at most three distractors, taken from the argument
        # (not from a notebook-level global).
        options = [keyword.capitalize()]
        options.extend(list(mapped_distractors[keyword])[:len(opts) - 1])

        # Shuffle so the answer is not always option a).
        random.shuffle(options)

        for label, option in zip(opts, options):
            print(f"\t{label}) {option}")

        print()



# Print the questions using the print_result defined in this cell.
print_result(m_d, mapped_sentences)
