#### Import Libraries



In [1]:
import nltk
# Fetch NLTK's "punkt" data package at notebook start-up.
# NOTE(review): presumably required by `word_tokenize`, which is imported in the next cell — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.tokenize import MWETokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [3]:
import numpy as np
import pandas as pd
import pathlib
import os
import re
import string
import time
from collections import defaultdict
from typing import List, Union
import pprint
import json

#### Mount to Google Drive

In [4]:
from google.colab import drive

# Attach Google Drive to the Colab VM so the corpus folder is reachable as a normal path.
drive.mount('/content/drive')

# Folder (inside "My Drive") holding the corpus, its absolute path, and a listing of its entries.
simpsons_dir = 'Simpsons2022'
drive_path = f"/content/drive/My Drive/{simpsons_dir}"
files = os.listdir(drive_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Implementation of Inverted Index

In [5]:
from IPython.utils import process

from inverted_index import InvertedIndex

#### Demonstration

In [6]:
def main():
    """Build the inverted index and run the interactive two-part demo.

    Part I asks whether to look up one typed query (``q``) or every line of a
    terms file (``p``) and passes the term(s) to ``index.dump``.
    Part II asks for two terms and a window size and passes them to
    ``index.proximity_search``.

    Returns:
        The populated ``InvertedIndex``. The index is always returned, so the
        caller never has to rebuild it after a mistyped option.

    Raises:
        ValueError: if the window size is not an integer or is not positive.
        OSError: if the terms file given in Part I cannot be opened.
    """
    index = InvertedIndex()  # initialise the index
    corpus_dir = "/content/drive/My Drive/Simpsons2022"
    corpus = index.read_data(corpus_dir)  # directory in which the corpus files are located

    index.index_corpus(corpus)  # index documents/corpus

    # Demo: check entered search terms against the inverted index
    print('--------------- PART I ---------------')
    print('Enter the way you would like to run Part I:')
    # Re-prompt on a bad choice instead of bailing out: building the index is the
    # expensive step, and returning early would discard it.
    while True:
        option = input("Type q - (query) or p - (path):").strip().lower()
        if option in ('q', 'p'):
            break
        print('Wrong option. Try again.')

    if option == 'q':  # Run a single term query
        search_term = input("Enter your query: ")
        docs = index.dump(examples=[search_term])  # includes preprocessing of query
    else:  # Run on a file containing a term per line
        file_path = input("Specify the relative path to the file: ")
        terms_path = "/content/drive/My Drive/" + file_path  # e.g. development-examples.txt
        with open(terms_path) as f_open:
            # One term per line; drop the line ending and skip blank lines so that
            # a trailing newline in the file does not turn into an empty query.
            search_terms = [line.rstrip('\n') for line in f_open if line.strip()]
        docs = index.dump(examples=search_terms)
    print(docs)
    print()

    print('--------------- PART II ---------------')
    print('You will be asked to enter two terms and a window size.')
    print('Each of these must be provided one per line.')
    search_term1 = input("Enter term 1: ")
    search_term2 = input("Enter term 2: ")
    window_size = int(input("Enter proximity window size: "))
    # Explicit check rather than `assert`: assertions are removed under `python -O`.
    if window_size <= 0:
        raise ValueError(f"Proximity window size must be a positive integer, got {window_size}")
    cooccurrences = index.proximity_search(term1=search_term1, term2=search_term2, window=window_size)
    print(f'"{search_term1}" and "{search_term2}" appear within a pre-defined window of {window_size} in {len(cooccurrences.keys())} documents. These are:')
    print(list(cooccurrences.keys()))

    return index
    
# Run the interactive demo; `index` receives whatever main() returns.
index = main()

Size of index is: 11100
Total time to create the inverted index: 16.5426 sec
--------------- PART I ---------------
Enter the way you would like to run Part I:
Type q - (query) or p - (path):q
Enter your query: river
"river" is processed as "river"
Document frequency: 3
Document IDs: ['5.11', '5.20', '7.5']

['5.11', '5.20', '7.5']

--------------- PART II ---------------
You will be asked to enter two terms and a window size.
Each of these must be provided one per line.
Enter term 1: episode
Enter term 2: 11
Enter proximity window size: 1
In document 4.11 pre-processed terms: "episod" & "11" are within a window of 1 at positions: [[16, 17]]
In document 5.11 pre-processed terms: "episod" & "11" are within a window of 1 at positions: [[14, 15]]
In document 6.11 pre-processed terms: "episod" & "11" are within a window of 1 at positions: [[15, 16]]
In document 7.11 pre-processed terms: "episod" & "11" are within a window of 1 at positions: [[16, 17]]
"episode" and "11" appear within a pre

#### Testing Input

In [7]:
# Prepared single-term queries, one list entry per query (multi-word entries stay one string).
prepared_test_samples_task1 = [
    'Bart',
    'first',
    'image',
    'montage',
    'well',
    'top',
    'arguably',
    'best',
    'number',
    'humor',
    'dollarydoos',
    'Bart Simpson',
    'Gordie Howe',
    'recalled',
    'Bart the Lover',
    'cents',
    'won',
    'voice-overs',
    'Simpsonovi',
]

# Personal samples for task 1: each multi-word name is submitted as ONE query string
# (test() feeds this list to run_tests_with_data_task1 with is_prepared=False).
my_test_samples_task1_SINGLE_TERM = [
    'United States', 
    'Springfield Elementary School',
    'Krusty the Clown',
    'Supreme Court',
    'Saint Sebastian',
    'Steven Spielberg',
    'John Swartzwelder',
    'Episode 10',
]

# Personal samples for task 1: each inner list is one query made of several separate terms
# (test() feeds this list to run_tests_with_MULTI_data_task1, which passes each inner list to index.dump).
my_test_samples_task1_MULTI_TERM = [
    ['United', 'States'], 
    ['Bart', 'Simpson'],
    ['Bart', 'Lover'],
    ['Springfield', 'Elementary', 'School'],
    ['Krusty', 'the', 'Clown'],
    ['Supreme', 'Court'],
    ['Saint', 'Sebastian'],
    ['Steven', 'Spielberg'],
    ['John', 'Swartzwelder'],
    ['Episode', '10'],
]

# Personal samples for task 2: [term1, term2, window] triples
# (run_tests_with_data_task2 unpacks each triple into a proximity_search call).
my_test_samples_task2 = [
    ['United', 'States', 1],
    ['Bart', 'Simpson', 1],
    ['Bart', 'Lover', 2], 
    ['Springfield', 'Elementary', 1],
    ['Krusty', 'Clown', 2],
    ['Supreme', 'Court', 1],
    ['Saint', 'Sebastian', 1],
    ['Steven', 'Spielberg', 1],
    ['John', 'Swartzwelder', 1],
    ['Episode', '10', 1],
]

#### Testing methods

In [8]:
def run_tests_with_data_task1(index, data: List[str], is_prepared: bool = True):
    """Look up every term of ``data`` on its own and print the per-term hit counts.

    Each term is sent to ``index.dump`` as a one-element ``examples`` list. The
    number of truthy entries in what comes back is recorded under that term, and
    the counts are printed as indented JSON ordered by term, below a banner that
    says whether the samples are the prepared or the personal ones.

    Args:
        index: object exposing ``dump(examples=...)``.
        data: the query strings to try.
        is_prepared: selects the "Prepared" banner when true, "Personal" otherwise.
    """
    hit_counts = {}
    for term in data:
        matches = index.dump(examples=[term])  # the index preprocesses the query itself
        # Falsy entries (e.g. None) stand for "nothing found" and are not counted
        hit_counts[term] = sum(1 for doc in matches if doc)

    report = json.dumps({term: hit_counts[term] for term in sorted(hit_counts)}, indent=4)
    banner = (
        '--------------- Prepared SINGLE-TERM test data ---------------'
        if is_prepared
        else '--------------- Personal SINGLE-TERM test data ---------------'
    )
    print(banner)
    print(report)


def run_tests_with_MULTI_data_task1(index, data, is_prepared: bool = True):
    """Run every multi-term query of ``data`` and print the per-query hit counts.

    Each element of ``data`` (a list of terms) is passed unchanged to
    ``index.dump`` as ``examples``. The number of truthy entries in the result is
    recorded under a label made of the terms joined by single spaces, and the
    counts are printed as indented JSON ordered by label, below a banner chosen
    by ``is_prepared``.

    Args:
        index: object exposing ``dump(examples=...)``.
        data: iterable of queries, each a list of term strings.
        is_prepared: selects the "Prepared" banner when true, "Personal" otherwise.
    """
    test_results = {}
    for search_terms in data:
        docs = index.dump(examples=search_terms)
        # Build the label from the terms themselves. Stripping brackets/quotes/commas
        # from the list repr corrupts any term that contains one of those characters
        # (e.g. "Bart's" came out as '"Barts"').
        if isinstance(search_terms, str):
            label = search_terms
        else:
            label = ' '.join(str(term) for term in search_terms)
        test_results[label] = len([d for d in docs if d])  # None is returned when nothing is found
    output = json.dumps(dict(sorted(test_results.items(), key=lambda item: item[0])), indent=4)
    if is_prepared:
        print('--------------- Prepared SET-OF-TERMS test data ---------------')
    else:
        print('--------------- Personal SET-OF-TERMS test data ---------------')
    print(output)


def run_tests_with_data_task2(index, data, is_prepared: bool = True):
    """Run every proximity query of ``data`` and print the per-query hit counts.

    Each element of ``data`` is unpacked as ``(term1, term2, window)`` and handed
    to ``index.proximity_search``. The number of truthy items obtained by
    iterating the result is stored under the label ``"term1 term2 window"``, and
    the counts are printed as indented JSON ordered by label, below a banner
    chosen by ``is_prepared``.

    Args:
        index: object exposing ``proximity_search(term1=..., term2=..., window=...)``.
        data: iterable of ``(term1, term2, window)`` triples.
        is_prepared: selects the "Prepared" banner when true, "Personal" otherwise.
    """
    tally = {}
    for first, second, span in data:
        found = index.proximity_search(term1=first, term2=second, window=span)
        # Falsy items are treated as "nothing found" and left out of the count
        hits = sum(1 for doc in found if doc)
        tally[' '.join((first, second, f"{span}"))] = hits

    report = json.dumps({label: tally[label] for label in sorted(tally)}, indent=4)
    banner = (
        '--------------- Prepared TWO-TERMS test data ---------------'
        if is_prepared
        else '--------------- Personal TWO-TERMS test data ---------------'
    )
    print(banner)
    print(report)

In [9]:
def test():
    """Build a fresh index from ``drive_path`` and run all sample suites against it.

    Part I runs the prepared and the personal single-term samples, then the
    personal multi-term samples; Part II runs the personal proximity samples.

    Returns:
        The populated ``InvertedIndex`` that the suites were run against.
    """
    inverted = InvertedIndex()  # initialise the index
    # Read the files found under the corpus directory and index them
    inverted.index_corpus(inverted.read_data(drive_path))

    print('--------------- PART I ---------------')
    single_term_suites = (
        (prepared_test_samples_task1, True),
        (my_test_samples_task1_SINGLE_TERM, False),
    )
    for samples, prepared in single_term_suites:
        run_tests_with_data_task1(index=inverted, data=samples, is_prepared=prepared)
    run_tests_with_MULTI_data_task1(
        index=inverted, data=my_test_samples_task1_MULTI_TERM, is_prepared=False
    )

    print('--------------- PART II ---------------')
    run_tests_with_data_task2(index=inverted, data=my_test_samples_task2, is_prepared=False)

    return inverted
    
# Rebuild the index, run every sample suite, and keep the returned index as `index` for the cells below.
index = test()

Size of index is: 11100
Total time to create the inverted index: 7.3152 sec
--------------- PART I ---------------
"Bart" is processed as "bart"
Document frequency: 113
Document IDs: ['3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14', '3.15', '3.16', '3.17', '3.18', '3.19', '3.20', '3.21', '3.22', '3.24', '3.23', '4.1', '4.2', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9', '4.10', '4.11', '4.12', '4.13', '4.14', '4.15', '4.16', '4.18', '4.19', '4.20', '4.21', '4.22', '5.1', '5.2', '5.4', '5.5', '5.6', '5.7', '5.8', '5.9', '5.10', '5.11', '5.12', '5.13', '5.14', '5.15', '5.16', '5.17', '5.18', '5.19', '5.20', '5.21', '5.22', '6.1', '6.2', '6.3', '6.4', '6.5', '6.6', '6.7', '6.8', '6.9', '6.10', '6.12', '6.13', '6.14', '6.15', '6.16', '6.17', '6.18', '6.19', '6.20', '6.21', '6.22', '6.24', '6.25', '5.3', '7.1', '7.3', '7.4', '7.5', '7.6', '7.7', '7.8', '7.9', '7.10', '7.11', '7.12', '7.13', '7.14', '7.15', '7.16', '7.18', '7.19', '7.20', '7.

#### Report Examples & Demo

In [10]:
# Report examples: show the token list that the index's preprocessing produces for each query.
blackboard_set = ['Bart Simpson', 'Bart the Lover', 'Simpsonovi']
blackboard_set_processed = list(map(index.process_document, blackboard_set))
blackboard_set_processed

[['bart_simpson'], ['bart', 'the', 'lover'], ['simpsonovi']]

In [11]:
# Submit the whole title as ONE example, i.e. a single query string.
print(index.dump(['Bart the Lover']))

"Bart the Lover" is processed as "bart_the_lover"
"bart_the_lover" does not occur in the inverted index

These terms are not in the inverted index: ['bart_the_lover']
Input is not in the inverted index
[None]


In [12]:
# Same title split on spaces, so every word is submitted as its own example.
print(index.dump('Bart the Lover'.split(' ')))

"Bart" is processed as "bart"
Document frequency: 113
Document IDs: ['3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12', '3.13', '3.14', '3.15', '3.16', '3.17', '3.18', '3.19', '3.20', '3.21', '3.22', '3.24', '3.23', '4.1', '4.2', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9', '4.10', '4.11', '4.12', '4.13', '4.14', '4.15', '4.16', '4.18', '4.19', '4.20', '4.21', '4.22', '5.1', '5.2', '5.4', '5.5', '5.6', '5.7', '5.8', '5.9', '5.10', '5.11', '5.12', '5.13', '5.14', '5.15', '5.16', '5.17', '5.18', '5.19', '5.20', '5.21', '5.22', '6.1', '6.2', '6.3', '6.4', '6.5', '6.6', '6.7', '6.8', '6.9', '6.10', '6.12', '6.13', '6.14', '6.15', '6.16', '6.17', '6.18', '6.19', '6.20', '6.21', '6.22', '6.24', '6.25', '5.3', '7.1', '7.3', '7.4', '7.5', '7.6', '7.7', '7.8', '7.9', '7.10', '7.11', '7.12', '7.13', '7.14', '7.15', '7.16', '7.18', '7.19', '7.20', '7.21', '7.22', '7.23', '7.24', '7.25', '7.2']

"the" is processed as "the"
Document frequency: 118
Document IDs: ['3.

In [13]:
# Two separate examples: a real word plus a made-up filler word, to show how an unknown term is reported.
print(index.dump('Lover nonexistentfiller'.split(' ')))

"Lover" is processed as "lover"
Document frequency: 7
Document IDs: ['3.15', '3.16', '3.17', '5.20', '5.21', '5.22', '6.3']

"nonexistentfiller" is processed as "nonexistentfil"
"nonexistentfil" does not occur in the inverted index

These terms are not in the inverted index: ['nonexistentfil']
['3.15', '3.16', '3.17', '5.20', '5.21', '5.22', '6.3']


In [14]:
# Proximity demo: which documents contain "across" and "run" within a window of 2?
search_term1 = 'across'
search_term2 = 'run'
window_size = 2
print(f'Searching for ("{search_term1}" AND "{search_term2}")')
coexistance_dict = index.proximity_search(term1=search_term1, term2=search_term2, window=window_size)
# Materialise the matching document names once and reuse them for the count and the listing
matching_docs = list(coexistance_dict.keys())
print(f'"{search_term1}" and "{search_term2}" appear within a pre-defined window of {window_size} in {len(matching_docs)} documents. These are:')
print(matching_docs)

Searching for ("across" AND "run")
In document 6.23 pre-processed terms: "across" & "run" are within a window of 2 at positions: [[470, 472]]
In document 6.25 pre-processed terms: "across" & "run" are within a window of 2 at positions: [[77, 76]]
In document 5.3 pre-processed terms: "across" & "run" are within a window of 2 at positions: [[77, 76]]
In document 7.1 pre-processed terms: "across" & "run" are within a window of 2 at positions: [[77, 76]]
"across" and "run" appear within a pre-defined window of 2 in 4 documents. These are:
['6.23', '6.25', '5.3', '7.1']


In [15]:
# Look at the raw index entry stored under the token "voice-ov".
# NOTE(review): the entry appears to map an internal document id to [count, [positions]] — confirm against InvertedIndex.
index.inverted_index["voice-ov"]

{0: [1, [2509]],
 5: [2, [339, 1414]],
 7: [1, [1422]],
 11: [1, [1458]],
 13: [1, [1532]],
 15: [2, [1416, 1623]],
 17: [1, [1004]],
 32: [2, [217, 1894]],
 57: [1, [1146]]}

In [16]:
# Show what the preprocessing returns for a three-word proper name.
index.process_document("Springfield Elementary School")

['springfield_elementari_school']

In [17]:
# First of two phrasings of the same idea; compare the tokens with the next cell.
index.process_document("operational functionality")

['oper', 'function']

In [18]:
# Second phrasing; compare the tokens with the previous cell.
index.process_document("operative functioning")

['oper', 'function']

In [19]:
# Pass the keys of this token's index entry to get_doc_filenames_from_ids and print the result.
# NOTE(review): the keys look like internal document ids — confirm against InvertedIndex.
print(index.get_doc_filenames_from_ids(index.inverted_index['springfield_elementari_school'].keys()))

['3.16', '3.18', '3.23', '4.4', '4.6', '4.7', '4.15', '5.1', '5.19', '6.1', '6.2', '6.8', '6.22', '6.25', '5.3', '7.1', '7.6', '7.18', '7.22']


In [20]:
# Query the three words as separate examples instead of as one multi-word string.
index.dump(['Springfield', 'Elementary', 'School'])

"Springfield" is processed as "springfield"
Document frequency: 81
Document IDs: ['3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.10', '3.13', '3.16', '3.17', '3.19', '3.20', '3.22', '3.24', '4.1', '4.2', '4.3', '4.4', '4.5', '4.6', '4.9', '4.12', '4.14', '4.16', '4.17', '4.18', '4.19', '4.20', '4.21', '4.22', '5.4', '5.6', '5.7', '5.8', '5.10', '5.11', '5.13', '5.15', '5.16', '5.17', '5.20', '5.22', '6.1', '6.2', '6.5', '6.7', '6.8', '6.11', '6.12', '6.13', '6.14', '6.15', '6.18', '6.19', '6.20', '6.21', '6.22', '6.23', '6.24', '6.25', '5.3', '7.1', '7.3', '7.4', '7.6', '7.7', '7.8', '7.9', '7.13', '7.14', '7.15', '7.16', '7.17', '7.20', '7.21', '7.22', '7.23', '7.24', '7.25', '7.2']

"Elementary" is processed as "elementari"
Document frequency: 6
Document IDs: ['3.22', '4.5', '4.14', '6.2', '6.21', '7.20']

"School" is processed as "school"
Document frequency: 47
Document IDs: ['3.3', '3.4', '3.6', '3.8', '3.10', '3.13', '3.16', '3.18', '3.19', '3.22', '3.23', '4.2', '4.4', '4.5'

['3.22', '4.5', '6.2', '6.21', '7.20']

In [21]:
# A two-word name submitted as ONE example (a single query string).
print(index.dump(["Gordie Howe"]))

"Gordie Howe" is processed as "gordi_howe"
"gordi_howe" does not occur in the inverted index

These terms are not in the inverted index: ['gordi_howe']
Input is not in the inverted index
[None]


In [22]:
# Two separate examples in one query; compare with the three-term query above.
index.dump(['Springfield', 'Elementary',])

"Springfield" is processed as "springfield"
Document frequency: 81
Document IDs: ['3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.10', '3.13', '3.16', '3.17', '3.19', '3.20', '3.22', '3.24', '4.1', '4.2', '4.3', '4.4', '4.5', '4.6', '4.9', '4.12', '4.14', '4.16', '4.17', '4.18', '4.19', '4.20', '4.21', '4.22', '5.4', '5.6', '5.7', '5.8', '5.10', '5.11', '5.13', '5.15', '5.16', '5.17', '5.20', '5.22', '6.1', '6.2', '6.5', '6.7', '6.8', '6.11', '6.12', '6.13', '6.14', '6.15', '6.18', '6.19', '6.20', '6.21', '6.22', '6.23', '6.24', '6.25', '5.3', '7.1', '7.3', '7.4', '7.6', '7.7', '7.8', '7.9', '7.13', '7.14', '7.15', '7.16', '7.17', '7.20', '7.21', '7.22', '7.23', '7.24', '7.25', '7.2']

"Elementary" is processed as "elementari"
Document frequency: 6
Document IDs: ['3.22', '4.5', '4.14', '6.2', '6.21', '7.20']

['springfield', 'elementari'] are in ['3.22', '4.5', '4.14', '6.2', '6.21', '7.20']


['3.22', '4.5', '4.14', '6.2', '6.21', '7.20']