In [None]:
import os
import nltk
import string
import numpy as np
import pandas as pd
from natsort import natsorted

In [None]:
import zipfile

# Unpack the uploaded corpus archive into /content/ so the individual
# text files can be read from disk by the loading cell.
with zipfile.ZipFile(file='/content/Humor,Hist,Media,Food.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='/content/')

In [None]:
# Read every file of the extracted corpus into one record per document.
# natsorted gives a stable, human-friendly ordering of the file names; the
# row position of each record becomes the document id of the DataFrame.
corpus_dir = '/content/Humor,Hist,Media,Food'
file_names = natsorted(os.listdir(corpus_dir))
data = []

for file_name in file_names:
    path = os.path.join(corpus_dir, file_name)
    if not os.path.isfile(path):
        # os.listdir also returns sub-directories; open() would fail on them.
        continue
    # errors='ignore' silently drops bytes that are not valid UTF-8, so
    # files in other encodings are still loaded (minus those bytes).
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        data.append({'name': file_name, 'text': f.read()})

df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,text
0,1st_aid.txt,HERBALHERB1ST AIDCALENDULACOMFREYREMEDIESSICKM...
1,a-team,From uunet!cs.utexas.edu!usc!ucsd!ucbvax!CAE.W...
2,a_fish_c.apo,From: murph@buscard.fidonet.org (Brian Murphy)...
3,a_tv_t-p.com,____________________________________________\n...
4,abbott.txt,\n Abbott & Coste...


In [None]:
import nltk

# Fetch the NLTK resources the preprocessing cells rely on.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('stopwords')  # English stop-word list read via nltk.corpus.stopwords
nltk.download('wordnet')    # data used by WordNetLemmatizer
nltk.download('punkt')      # tokenizer models used by word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import re
import numpy as np
from tqdm import tqdm
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# English stop-word list, loaded once here and reused by remove_stop_words.
cachedStopWords = stopwords.words("english")

In [None]:
# References: https://williamscott701.medium.com/information-retrieval-unigram-postings-and-positional-postings-a28b907c4e8
def convert_lower_case(data):
    """Return ``data`` with every character lower-cased.

    The conversion goes through NumPy's element-wise string routine, so the
    result is a NumPy string array (zero-dimensional for a single string).
    """
    lowered = np.char.lower(data)
    return lowered

def remove_punctuation(data):
    """Replace punctuation and whitespace characters in ``data`` with spaces.

    Each character listed in ``symbols`` is substituted by a single space,
    and after every substitution one pass of double-space squeezing is run.
    A final pass deletes any comma. The result is a NumPy string array.
    """
    symbols = """˛şË›ÃºÅŸ§ż±ŕőíä°üß!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
    cleaned = data
    for symbol in symbols:
        cleaned = np.char.replace(cleaned, symbol, ' ')
        # Squeeze the double space the substitution above may have created.
        cleaned = np.char.replace(cleaned, "  ", " ")
    return np.char.replace(cleaned, ',', '')

def remove_stop_words(data):
    """Drop stop words from ``data`` and return the remaining text.

    ``data`` is converted to ``str``, tokenized with ``word_tokenize`` and
    every token found in ``cachedStopWords`` is discarded. The surviving
    tokens are joined with single spaces and returned as a stripped NumPy
    string array.
    """
    # Set membership is constant-time; testing against the list directly
    # scans it once per token.
    stop_words = set(cachedStopWords)
    kept = [word for word in word_tokenize(str(data)) if word not in stop_words]
    return np.char.strip(' '.join(kept))

def lemmatization(data):
    """Lemmatize every token of ``data`` with WordNet.

    ``data`` is converted to ``str`` and tokenized; each token is passed
    through ``WordNetLemmatizer.lemmatize`` and the results are joined with
    single spaces. Returns a stripped NumPy string array.
    """
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(token) for token in word_tokenize(str(data))]
    return np.char.strip(' '.join(lemmas))

def preprocess(data):
    """Run the complete text-normalisation pipeline on ``data``.

    The steps are applied in this order: lower-casing, punctuation removal,
    stop-word removal, lemmatization. The output of the last step is
    returned unchanged.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,
        remove_stop_words,
        lemmatization,
    )
    for step in pipeline:
        data = step(data)
    return data

In [None]:
def clean_text(data_df):
    """Preprocess the ``text`` column of ``data_df`` and return the frame.

    Every value of ``data_df['text']`` is passed through ``preprocess`` and
    stored back as a plain ``str``. The frame is modified in place; pass a
    copy if the original must be kept. A frame with no rows is returned
    untouched.

    Args:
        data_df: DataFrame with a ``text`` column.

    Returns:
        The same DataFrame object, with ``text`` replaced by cleaned text.
    """
    if data_df.shape[0] == 0:
        return data_df

    # Build the whole column first and assign it once: this is positional,
    # so it stays correct even when index labels are duplicated, and it
    # avoids one label lookup + write per row.
    cleaned = [
        str(preprocess(sample))
        for sample in tqdm(data_df['text'], total=data_df.shape[0])
    ]
    data_df['text'] = cleaned
    return data_df

# clean_text modifies the frame it receives, so hand it a copy and rebind
# `df` to the cleaned result. Re-running this cell preprocesses text that
# has already been cleaned.
df = clean_text(df.copy())
df.head()

100%|██████████| 1133/1133 [00:39<00:00, 28.66it/s]


Unnamed: 0,name,text
0,1st_aid.txt,herbalherb1st aidcalendulacomfreyremediessickm...
1,a-team,uunet c utexas edu usc ucsd ucbvax cae wisc ed...
2,a_fish_c.apo,murph buscard fidonet org brian murphy subject...
3,a_tv_t-p.com,survey result computer use fan alt tv twin pea...
4,abbott.txt,abbott costello first abbott well costello goi...


In [None]:
# Persist the cleaned corpus (name, text) without the index column.
df.to_csv('cleaned_data.csv', index=False)

In [None]:
# Inverted index: token -> set of DataFrame index labels of the documents
# whose cleaned text contains that token.
postings = {}

for doc_id, record in tqdm(df.iterrows(), total=df.shape[0]):
    for term in word_tokenize(str(record['text'])):
        # Create the posting set on first sight of the term, then add the id.
        postings.setdefault(term, set()).add(doc_id)

100%|██████████| 1133/1133 [00:06<00:00, 186.49it/s]


In [None]:
# Number of distinct tokens in the inverted index.
len(postings)

64226

In [None]:
def not_func(word):
    """Return the documents that do NOT contain ``word``.

    The universe of document ids is taken from ``df.index`` (the same labels
    the inverted index was built from) instead of a hard-coded corpus size,
    so the result stays correct if the corpus changes. A term that is absent
    from ``postings`` matches no document, hence its complement is the whole
    corpus.

    Args:
        word: Preprocessed query term.

    Returns:
        Tuple ``(complement, n)`` where ``complement`` is the set of ids of
        documents without ``word`` and ``n`` is the length of ``word``'s
        posting set.
    """
    word_postings = postings.get(word, set())
    all_docs = set(df.index)
    return all_docs.difference(word_postings), len(word_postings)

def and_func(a, b):
    """Boolean AND: return the ids present in both ``a`` and ``b``."""
    common = a.intersection(b)
    return common

def or_func(a, b):
    """Boolean OR: return the ids present in ``a``, in ``b``, or in both."""
    merged = a.union(b)
    return merged

def rem_not_query(query_words, commands):
    """Resolve every NOT operator before the AND/OR pass.

    A ``'not'`` found at position ``i`` of ``commands`` applies to
    ``query_words[i]``. For each one, the complement posting set is computed
    with ``not_func``, the ``'not'`` is removed from ``commands`` and the
    word is replaced by ``None`` as a placeholder. Both input lists are
    modified in place; callers rely on that.

    Args:
        query_words: List of preprocessed query terms (mutated).
        commands: Flat list of lower-case operators (mutated).

    Returns:
        Tuple ``(tup, comparisons)``: the complement sets in left-to-right
        order, and the summed counts reported by ``not_func``.
    """
    tup = []
    comparisons = 0
    while 'not' in commands:
        i = commands.index('not')
        word_postings, comparison = not_func(query_words[i])
        tup.append(word_postings)
        # Accumulate the per-term count; without this the total is always 0.
        comparisons += comparison
        commands.pop(i)
        query_words[i] = None
    return tup, comparisons

def process_query(query_words, commands, tup):
    """Evaluate the AND/OR chain of a query strictly left to right.

    ``None`` entries in ``query_words`` stand for terms whose NOT was already
    resolved; their posting sets are taken from ``tup`` in order. Terms that
    are not in ``postings`` contribute an empty set. Operators other than
    ``'and'`` / ``'or'`` are ignored. The input lists are not modified.

    Args:
        query_words: Preprocessed query terms, with ``None`` placeholders.
        commands: Binary operators; ``commands[i]`` joins the running result
            with the term at position ``i + 1``.
        tup: Complement posting sets for the ``None`` placeholders.

    Returns:
        Tuple ``(doc_ids, comparisons)`` where ``doc_ids`` is a sorted list
        of matching document ids and ``comparisons`` sums
        ``min(len(left), len(right))`` over the applied operators.

    Raises:
        IndexError: If ``commands`` has more operators than there are terms
            after the first one.
    """
    if not query_words:
        # Nothing left after preprocessing (e.g. a query of stop words only).
        return [], 0

    pending_negations = list(tup)

    def resolve(word):
        # None marks a negated term: consume its precomputed complement.
        if word is None:
            return pending_negations.pop(0)
        return postings.get(word, set())

    # The first term may itself be negated, so it goes through resolve() too;
    # otherwise a leading NOT would be dropped and later NOTs misaligned.
    a = resolve(query_words[0])
    remaining = query_words[1:]
    if len(remaining) == 0:
        return sorted(a), 0

    comparisons = 0
    for i in range(len(commands)):
        b = resolve(remaining[i])

        # Count from the operand sizes *before* `a` is overwritten by the
        # merged result.
        if commands[i] == 'and':
            comparisons += min(len(a), len(b))
            a = and_func(a, b)
        elif commands[i] == 'or':
            comparisons += min(len(a), len(b))
            a = or_func(a, b)

    return sorted(a), comparisons

def execute_query(query, operation_seq):
    """Run a boolean query against the inverted index and print a summary.

    The query text goes through the same ``preprocess`` pipeline as the
    documents and is then tokenized. Operators are lower-cased and flattened
    so that an entry such as ``'OR NOT'`` becomes ``['or', 'not']``.

    Args:
        query: Raw query text.
        operation_seq: Iterable of operator strings, e.g. ``['OR', 'AND NOT']``.

    Returns:
        Sorted list of matching document ids.
    """
    query_words = word_tokenize(str(preprocess(query)))

    commands = []
    for operation in operation_seq:
        # split() without a separator discards empty pieces, so stray or
        # repeated blanks cannot shift the operator positions that
        # rem_not_query uses to pair each NOT with its term.
        commands.extend(operation.lower().split())

    tup, comparison1 = rem_not_query(query_words, commands)
    final_set, comparison2 = process_query(query_words, commands, tup)
    # Normalise to a sorted list so printed and returned ids are ordered.
    final_set = sorted(final_set)

    print(f'Documents: {final_set}')
    print(f'Number of documents matched: {len(final_set)}')
    print(f'No. of comparisons required: {comparison1+comparison2}')
    return list(final_set)

In [None]:
# Example: OR together the terms of the query.
# Presumably "for" and "a" are removed as stop words during preprocessing,
# leaving four terms for the three operators — verify against the stop list.
query = "lion stood thoughtfully for a moment"
command = ['OR', 'OR', 'OR']
lists = execute_query(query, command)

Documents: [3, 20, 31, 33, 37, 39, 40, 56, 67, 72, 83, 88, 91, 105, 116, 127, 135, 137, 169, 171, 173, 176, 177, 180, 184, 190, 191, 192, 209, 212, 218, 222, 223, 225, 231, 241, 246, 248, 250, 252, 256, 265, 269, 278, 285, 286, 287, 291, 295, 299, 308, 315, 318, 321, 322, 328, 337, 339, 344, 348, 351, 352, 353, 354, 361, 368, 376, 379, 391, 403, 413, 418, 424, 429, 437, 440, 441, 450, 451, 461, 479, 501, 506, 510, 511, 512, 519, 520, 526, 527, 529, 533, 540, 541, 555, 556, 576, 579, 582, 583, 589, 590, 592, 595, 596, 597, 602, 610, 616, 617, 619, 623, 624, 630, 634, 635, 640, 643, 644, 655, 656, 658, 663, 669, 671, 672, 682, 685, 688, 689, 692, 694, 695, 704, 705, 707, 708, 710, 713, 714, 724, 745, 748, 749, 752, 753, 762, 771, 772, 777, 781, 782, 783, 785, 791, 799, 806, 812, 813, 814, 815, 816, 817, 822, 824, 836, 837, 838, 842, 846, 850, 854, 855, 859, 867, 889, 911, 921, 925, 931, 947, 949, 952, 955, 956, 975, 978, 981, 982, 985, 988, 1010, 1020, 1028, 1029, 1031, 1032, 1034, 1036,

In [None]:
# Look up the file names of the matched documents by row position.
np.array(df.iloc[lists]['name'])

array(['a_tv_t-p.com', 'aeonint.txt', 'allusion', 'ambrose.bie',
       'anim_lif.txt', 'anime.lif', 'annoy.fascist', 'art-fart.hum',
       'b-2.jok', 'badday.hum', 'barney.txt', 'bbh_intv.txt', 'beauty.tm',
       'beesherb.txt', 'bitnet.txt', 'bmdn01.txt', 'boneles2.txt',
       'booze1.fun', 'butwrong.hum', 'bw-phwan.hat', 'bw.txt',
       'cabbage.txt', 'caesardr.sal', 'calculus.txt', 'candy.txt',
       'cartoon.law', 'cartoon.laws', 'cartoon_.txt',
       'chickenheadbbs.txt', 'childhoo.jok', 'christop.int', 'clancy.txt',
       'classicm.hum', 'cmu.share', 'cogdis.txt', 'collected_quotes.txt',
       'commutin.jok', 'computer.txt', 'conan.txt', 'consp.txt',
       'cookie.1', 'coyote.txt', 'cuchy.hum', 'cybrtrsh.txt', 'dead3.txt',
       'dead4.txt', 'dead5.txt', 'deep.txt', 'devils.jok', 'dingding.hum',
       'doggun.sto', 'drinks.gui', 'dromes.txt', 'drunk.txt',
       'dthought.txt', 'econridl.fun', 'engineer.hum', 'english.txt',
       'epi_.txt', 'epi_tton.txt', 'episimp2

In [None]:
# Example with negation: each operator string is split into single
# operators, giving "telephone OR NOT paved AND NOT roads", which is
# evaluated left to right (no operator precedence).
query = "telephone,paved, roads"
command = ['OR NOT', 'AND NOT']
lists = execute_query(query, command)

Documents: [0, 2, 3, 4, 6, 8, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 31, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 142, 143, 144, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 2

In [None]:
# Look up the file names of the matched documents by row position.
np.array(df.iloc[lists]['name'])

array(['1st_aid.txt', 'a_fish_c.apo', 'a_tv_t-p.com', 'abbott.txt',
       'acetab1.txt', 'acne1.txt', 'acronym.txt', 'adameve.hum',
       'adcopy.hum', 'addrmeri.txt', 'admin.txt', 'adrian_e.faq',
       'ads.txt', 'adt_miam.txt', 'advrtize.txt', 'aeonint.txt',
       'age.txt', 'aggie.txt', 'airlines', 'alabama.txt', 'alcatax.txt',
       'alcohol.hum', 'alflog.txt', 'all_grai', 'allusion', 'ambrose.bie',
       'amchap2.txt', 'analogy.hum', 'aniherb.txt', 'anim_lif.txt',
       'anime.lif', 'annoy.fascist', 'anorexia.txt', 'answers',
       'anthropo.stu', 'antibiot.txt', 'antimead.bev', 'aphrodis.txt',
       'appbred.brd', 'appetiz.rcp', 'applepie.des', 'apsaucke.des',
       'apsnet.txt', 'arab.dic', 'arcadian.txt', 'argotdic.txt',
       'arnold.txt', 'art-fart.hum', 'arthriti.txt', 'atherosc.txt',
       'atombomb.hum', 'att.txt', 'aussie.lng', 'avengers.lis',
       'awespinh.sal', 'ayurved.txt', 'b12.txt', 'b-2.jok', 'back1.txt',
       'bad', 'bad-d', 'bad.jok', 'badday.hum

In [None]:
# Interactive query: read a count, a query phrase and a space-separated
# operator sequence from stdin, then run the query once.
# NOTE(review): N is read but never used below — presumably the number of
# queries to run; confirm the intended input format.
N = int(input())
phrase = str(input())
command = str(input()).split(' ')
lists = execute_query(phrase, command)
np.array(df.iloc[lists]['name'])

1
metaphysical

Documents: {537, 1071, 630, 391}
Number of documents matched: 4
No. of comparisons required: 0


array(['jason.fun', 'vegkill.txt', 'manners.txt', 'flux_fix.txt'],
      dtype=object)