Download the NLTK data required for tokenization and stopword removal

In [1]:
import nltk
# Sentence/word tokenizer models used by nltk.word_tokenize and nltk.sent_tokenize.
# Older NLTK releases load 'punkt'; recent releases (3.9 and later) look up
# 'punkt_tab' instead and raise LookupError when it is missing, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stopword lists read by stopwords.words("english").
# Kept as the last expression so the cell still displays the download status.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/razvansavin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/razvansavin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Import required libraries and set constants

In [2]:
import nltk
import os
import math
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Number of top-ranked files main() requests from top_files() for each query
FILE_MATCHES = 1
# Number of top-ranked sentences main() requests from top_sentences() for each query
SENTENCE_MATCHES = 1

Define a function to load text files from a given directory

In [3]:
def load_files(directory):
    """
    Load all text files from a given directory.

    Only regular files whose names end in ".txt" are read (as UTF-8).
    Filenames are processed in sorted order so the returned dictionary has a
    deterministic insertion order; os.listdir() itself returns entries in
    arbitrary order, which would otherwise make downstream tie-breaking
    depend on the filesystem.

    Parameters:
    directory (str): The directory containing the text files.

    Returns:
    dict: A dictionary mapping filenames to their contents.
    """
    files = {}
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        # A sub-directory may also be named "*.txt"; open() would fail on it
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as file:
            files[filename] = file.read()
    return files


Define a function to tokenize documents: lowercase the text and keep only alphabetic tokens that are not stopwords (punctuation and numbers are dropped)

In [4]:
def _english_stop_words():
    """Return the English stopword set, loading it from NLTK only on first use."""
    cached = getattr(_english_stop_words, "_cached", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cached = cached
    return cached


def tokenize(document):
    """
    Tokenize a document into words, converting to lowercase and removing punctuation and stopwords.

    Only purely alphabetic tokens are kept, so punctuation marks and any token
    containing digits or symbols (e.g. "3.0") are dropped as well.

    Parameters:
    document (str): The document to tokenize.

    Returns:
    list: A list of words in the document, in their original order.
    """
    # The stopword set is cached: this function is called once per sentence,
    # and re-reading the stopword list on every call is wasted work
    stop_words = _english_stop_words()
    # isalpha() already rejects punctuation tokens, so no separate check is needed
    return [
        token
        for token in nltk.tokenize.word_tokenize(document.lower())
        if token.isalpha() and token not in stop_words
    ]

Define a function to compute Inverse Document Frequency (IDF) values for each word in the corpus

In [5]:
def compute_idfs(documents):
    """
    Compute Inverse Document Frequency (IDF) values for each word in the corpus.

    The IDF of a word is the natural logarithm of the total number of documents
    divided by the number of documents that contain the word.

    Parameters:
    documents (dict): A dictionary mapping document names to a list of their words.

    Returns:
    dict: A dictionary mapping words to their IDF values. Empty if there are no documents.
    """
    total_documents = len(documents)
    # Count, for every word, how many documents contain it. Each document is
    # reduced to a set first so a word is counted once per document and no
    # repeated list scans are needed.
    document_frequency = {}
    for words in documents.values():
        for word in set(words):
            document_frequency[word] = document_frequency.get(word, 0) + 1
    return {
        word: math.log(total_documents / count)
        for word, count in document_frequency.items()
    }

Define a function to find the top file matches for a query based on TF-IDF scores

In [6]:
def top_files(query, files, idfs, n):
    """
    Rank files against a query by summed TF-IDF and return the best n filenames.

    For every file, each query word contributes (occurrences of the word in the
    file) * (the word's IDF, or 0 when the word has no IDF entry). Files with
    equal scores keep the order in which they appear in `files`.

    Parameters:
    query (set): A set of query words.
    files (dict): A dictionary mapping filenames to a list of their words.
    idfs (dict): A dictionary mapping words to their IDF values.
    n (int): The number of top files to return.

    Returns:
    list: A list of filenames of the top n matching files.
    """
    # Score every file: sum of term frequency * inverse document frequency
    scores = {
        name: sum(contents.count(term) * idfs.get(term, 0) for term in query)
        for name, contents in files.items()
    }
    # Highest score first; the sort is stable, so ties stay in input order
    ranked = sorted(files, key=scores.__getitem__, reverse=True)
    return ranked[:n]

Define a function to find the top sentence matches for a query based on matching word measure and query term density

In [7]:
def top_sentences(query, sentences, idfs, n):
    """
    Find the top sentence matches for a query based on matching word measure and query term density.

    Sentences are ranked first by matching word measure (the sum of IDF values
    of the query words that occur in the sentence) and then, to break ties, by
    query term density (the fraction of the sentence's words that are query
    words). A sentence with an empty word list gets a density of 0 instead of
    raising ZeroDivisionError. Empty sentence strings are skipped.

    Parameters:
    query (set): A set of query words.
    sentences (dict): A dictionary mapping sentences to a list of their words.
    idfs (dict): A dictionary mapping words to their IDF values.
    n (int): The number of top sentences to return.

    Returns:
    list: A list of the top n matching sentences.
    """
    sentence_scores = []
    for sentence, words in sentences.items():
        # Nothing useful can be returned for an empty sentence string
        if not sentence:
            continue
        # Set membership avoids rescanning the word list for every query word
        unique_words = set(words)
        matching_word_measure = sum(
            idfs.get(word, 0) for word in query if word in unique_words
        )
        # Guard the division: a sentence may have no words left after tokenizing
        if words:
            query_term_density = sum(word in query for word in words) / len(words)
        else:
            query_term_density = 0.0
        sentence_scores.append((sentence, matching_word_measure, query_term_density))
    # Highest matching word measure first, query term density as the tie-breaker
    sentence_scores.sort(key=lambda score: (score[1], score[2]), reverse=True)
    return [sentence for sentence, _, _ in sentence_scores[:n]]

Main function to execute the question-answering system

In [8]:
def main(corpus_dir, queries):
    """
    Run the question-answering pipeline and print an answer for every query.

    Steps:
    - Load and tokenize the corpus files, then compute file-level IDF values.
    - For each query, pick the best-matching files by TF-IDF.
    - Split those files into sentences and compute sentence-level IDF values.
    - Print the best-matching sentences together with their source file.

    Parameters:
    corpus_dir (str): The directory containing the text files.
    queries (list): A list of query strings.
    """
    documents = load_files(corpus_dir)
    file_words = {}
    for name, text in documents.items():
        file_words[name] = tokenize(text)
    file_idfs = compute_idfs(file_words)

    for query in queries:
        print(f"Query: {query}")
        query_set = set(tokenize(query))

        filenames = top_files(query_set, file_words, file_idfs, n=FILE_MATCHES)

        # Map each sentence to (its tokens, the file it came from)
        sentences = {}
        for filename in filenames:
            for passage in documents[filename].split("\n"):
                for sentence in nltk.sent_tokenize(passage):
                    tokens = tokenize(sentence)
                    # Sentences with no usable tokens cannot match anything
                    if not tokens:
                        continue
                    sentences[sentence] = (tokens, filename)

        # Token-only view of the sentences, used for both IDF computation and ranking
        sentence_words = {text: words for text, (words, _) in sentences.items()}
        idfs = compute_idfs(sentence_words)

        matches = top_sentences(query_set, sentence_words, idfs, n=SENTENCE_MATCHES)
        for match in matches:
            print(f"Answer: {match}")
            print(f"Source: {sentences[match][1]}")
        print("\n")

Define queries for the question-answering system

In [9]:
# Natural-language questions; main() prints one "Query:" block per entry.
# Note: tokenize() keeps only purely alphabetic tokens, so a token containing
# digits (such as "3.0" in the last question) cannot contribute to matching.
queries = [
    "What are the types of supervised learning?",
    "How do neurons connect in a neural network?",
    "When was Python 3.0 released?"
]

Specify the corpus directory and run the question-answering system on the queries

In [10]:
# Directory holding the .txt corpus files; a relative path is resolved
# against the notebook's current working directory
path_to_corpus_directory = 'corpus'
# Alternative corpus directory — uncomment to use it instead (confirm it exists first):
# path_to_corpus_directory = 'corpus2'

# Run the pipeline: prints the query, best answer sentence and source file for each query
main(path_to_corpus_directory, queries)

Query: What are the types of supervised learning?
Answer: Types of supervised learning algorithms include Active learning , classification and regression.
Source: machine_learning.txt


Query: How do neurons connect in a neural network?
Answer: Neurons of one layer connect only to neurons of the immediately preceding and immediately following layers.
Source: neural_network.txt


Query: When was Python 3.0 released?
Answer: Python 3.0 was released on 3 December 2008.
Source: python.txt


