In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import en_core_web_sm
from pattern.en import suggest
import pandas as pd

In [21]:
# Uncomment on a fresh environment to fetch the NLTK resources
# (the stop-word list is read via stopwords.words further down).
#nltk.download('stopwords')
#nltk.download('punkt')
# spaCy pipeline called by lemmatize_text.
# NOTE(review): the parse/tag/entity keyword arguments are presumably only
# accepted by older spaCy releases - confirm against the installed version.
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
# Tokenizer shared by remove_stopwords and the index-building code.
tokenizer = ToktokTokenizer()

In [22]:
#file = open("A Boy's Will - Robert Frost/1.txt", "r")

In [23]:
#txt = file.read()

In [24]:
#list_new = init_process(txt)

In [25]:
# Number of documents in the corpus; not_list enumerates ids 1..DOCS_SIZE.
DOCS_SIZE = 50

In [26]:
# English stop-word set from NLTK; remove_stopwords filters tokens against it.
stop_words = set(stopwords.words('english'))

In [27]:
def init_process(txt):
    """Flatten raw text so that only word characters and spaces remain.

    Newlines and the character 'ñ' are replaced by a space first; after that
    every character the regex engine does not treat as a word character is
    replaced by a space as well. Runs of spaces are NOT collapsed.

    Args:
        txt: The raw text to clean.

    Returns:
        The cleaned string, the same length as ``txt``.
    """
    cleaned = txt
    # Apply the three substitutions in the same order as before; 'ñ' has to
    # be handled explicitly because it counts as a word character.
    for pattern in ('\n', 'ñ', r'[^\w]'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

In [28]:
def stemming(text):
    """Stem every whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

In [29]:
def lemmatize_text(text):
    """Replace each token of ``text`` with its lemma as reported by ``nlp``.

    Tokens whose lemma is the literal marker '-PRON-' (presumably spaCy's
    pronoun placeholder) keep their original surface text instead.

    Args:
        text: Input string to run through the ``nlp`` pipeline.

    Returns:
        The per-token results joined with single spaces.
    """
    doc = nlp(text)
    pieces = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

In [30]:
def reduce_lengthening(text):
    """Shorten any run of three or more identical characters to exactly two.

    Args:
        text: Input string.

    Returns:
        ``text`` with elongated runs (e.g. "sooooo") cut down to a pair ("soo").
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

In [31]:
def correct_spelling(w):
    """Return the first spelling suggestion for a single word.

    The word is first passed through ``reduce_lengthening`` and then handed
    to ``suggest``; the word part of the first returned entry is used.

    Args:
        w: The word to correct.

    Returns:
        The first element of the first suggestion entry.
    """
    candidates = suggest(reduce_lengthening(w))
    top_entry = candidates[0]
    return top_entry[0]

In [32]:
def spelling_correction(words):
    """Apply ``correct_spelling`` to every word and return the results in order.

    Args:
        words: Iterable of words.

    Returns:
        A new list with one corrected word per input word.
    """
    corrected = []
    for word in words:
        corrected.append(correct_spelling(word))
    return corrected

In [33]:
def remove_stopwords(text, is_lower_case=False):
    """Tokenize ``text`` and drop every token found in ``stop_words``.

    Args:
        text: Input string; tokenized with the module-level ``tokenizer``.
        is_lower_case: When true, tokens are compared to the stop-word set
            as they are; otherwise they are lower-cased for the comparison
            only (the returned tokens keep their original casing).

    Returns:
        A list of the stripped tokens that are not stop words.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        probe = token if is_lower_case else token.lower()
        if probe not in stop_words:
            kept.append(token)
    return kept

In [34]:
def normalise(file_name):
    """Read a text file and run it through the whole normalisation pipeline.

    The stages run in this order: ``init_process`` -> ``stemming`` ->
    ``lemmatize_text`` -> ``remove_stopwords`` -> ``spelling_correction``.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The list of normalised words produced by ``spelling_correction``.
    """
    # Context manager so the handle is closed even if reading fails
    # (the file used to be left open).
    with open(file_name, "r") as file:
        read_txt = file.read()
    list_new = init_process(read_txt)
    stemmed = stemming(list_new)
    lemma = lemmatize_text(stemmed)
    new_words = remove_stopwords(lemma)
    final = spelling_correction(new_words)
    return final

In [35]:
def refine_text():
    """Normalise every poem and write the result under ``refined/``.

    For each document id from 1 to ``DOCS_SIZE`` the file ``poems/<id>.txt``
    is normalised with ``normalise`` and the resulting words, joined by
    single spaces, are written to ``refined/<id>.txt``.

    Returns:
        None.
    """
    # Use the shared corpus-size constant rather than a hard-coded 51 so
    # this loop stays in step with not_list.
    for i in range(1, DOCS_SIZE + 1):
        new_text_file = str(i) + ".txt"
        refined_list = normalise("poems/" + new_text_file)
        refined_text = ' '.join(refined_list)
        # Context manager guarantees the output file is flushed and closed
        # even if the write raises.
        with open("refined/" + new_text_file, "w") as text_file:
            text_file.write(refined_text)
    return

In [36]:
#refine_text() # Don't refine every time

In [18]:
# dictionary = {}
# for i in range(1,51):
#     file_name = ("refined/"+str(i)+".txt")
#     text = open(file_name, "r").read()
#     tokens = tokenizer.tokenize(text)
#     for t in tokens:
#         if t in dictionary:
#             dictionary.get(t).append(i)
#             dictionary[t] = list(set(dictionary.get(t)))
#         else:
#             dictionary[t] = [i]

In [37]:
# Biword inverted index: "<token> <next token>" -> sorted list of document ids.
dictionary = {}
def biwordindexing():
    """Populate ``dictionary`` with a biword (adjacent token pair) index.

    Each file ``refined/<id>.txt`` for ids 1..``DOCS_SIZE`` is tokenized and
    every pair of neighbouring tokens, joined by one space, becomes a key.
    The value is the list of document ids containing that pair, kept unique
    and in ascending order because ``and_intersect`` / ``or_intersect``
    compare ids with ``>`` while walking both lists in step.

    Returns:
        None; the module-level ``dictionary`` is updated in place.
    """
    found = {}
    for i in range(1, DOCS_SIZE + 1):
        file_name = "refined/" + str(i) + ".txt"
        # Close each input file promptly instead of leaking the handle.
        with open(file_name, "r") as doc_file:
            text = doc_file.read()
        tokens = tokenizer.tokenize(text)
        for first, second in zip(tokens, tokens[1:]):
            found.setdefault(first + " " + second, set()).add(i)
    # Merge with whatever is already indexed so a repeated call stays
    # duplicate-free, and sort explicitly: list(set(...)) gives no
    # ordering guarantee.
    for t, doc_ids in found.items():
        dictionary[t] = sorted(doc_ids.union(dictionary.get(t, [])))
biwordindexing()

In [None]:
# dictionary = {}
# for i in range(1, (DOCSIZE+1)):
#     file_name = ("refined/"+str(i)+".txt")
#     text = open(file_name, "r").read()
#     tokens = tokenizer.tokenize(text)
#     for t in tokens:
#         if t in dictionary:
#             dictionary.get(t).append(i)
#             dictionary[t] = list(set(dictionary.get(t)))
#         else:
#             dictionary[t] = [i]

In [64]:
# Don't do this every time
# text_file = open("inverted_list.csv", "w")
# text_file.write("Words, Inverted Index \n")
# for item in dictionary.keys():
#     join_items = ", ".join(str(d) for d in dictionary.get(item))
#     text = item+", "+str(join_items)
#     text_file.write(text+" \n")
# text_file.close()

In [25]:
def and_intersect(list1, list2):
    """Intersect two posting lists with a two-pointer merge.

    Both cursors advance together on a match; otherwise the cursor that
    points at the smaller value advances. The walk stops as soon as either
    list is exhausted. Complete results rely on both inputs being in
    ascending order.

    Args:
        list1: First posting list.
        list2: Second posting list.

    Returns:
        A new list with the values found at matching cursor positions.
    """
    common = []
    a = b = 0
    while a < len(list1) and b < len(list2):
        left, right = list1[a], list2[b]
        if left == right:
            common.append(left)
            a += 1
            b += 1
        elif left > right:
            b += 1
        else:
            a += 1
    return common

In [26]:
def or_intersect(list1, list2):
    """Return the union of two ascending posting lists via a two-pointer merge.

    Equal values are emitted once; otherwise the smaller of the two current
    values is emitted and its cursor advances.

    Args:
        list1: First posting list, in ascending order.
        list2: Second posting list, in ascending order.

    Returns:
        A new ascending list containing every value from either input.
    """
    mer_list = []
    i = 0
    j = 0
    while i < len(list1) and j < len(list2):
        if list1[i] == list2[j]:
            mer_list.append(list1[i])
            i += 1
            j += 1
        elif list1[i] > list2[j]:
            mer_list.append(list2[j])
            j += 1
        else:
            mer_list.append(list1[i])
            i += 1
    # The loop stops when ONE list runs out; whatever remains in the other
    # list still belongs to the union (it used to be silently dropped).
    mer_list.extend(list1[i:])
    mer_list.extend(list2[j:])
    return mer_list

In [30]:
def not_list(list1):
    """Return the complement of a posting list within the corpus.

    Args:
        list1: Posting list of document ids.

    Returns:
        Every id from 1 to ``DOCS_SIZE`` (inclusive), in ascending order,
        that does not appear in ``list1``.
    """
    return [doc_id for doc_id in range(1, DOCS_SIZE + 1) if doc_id not in list1]

In [31]:
def perform_binary_operations(word1, word2):
    """Write the Boolean combinations of two terms' postings to a CSV file.

    The file ``<word1>_<word2>_operation.csv`` gets a header line followed
    by six rows: the postings of each term, their AND, their OR, and the
    NOT of each term. Every row has the form ``label, id, id, ...``.

    Args:
        word1: First index key to look up in ``dictionary``.
        word2: Second index key to look up in ``dictionary``.

    Returns:
        None.
    """
    # A key missing from the index is treated as an empty posting list;
    # the bare .get() result (None) used to crash inside the merges.
    list_1 = dictionary.get(word1) or []
    list_2 = dictionary.get(word2) or []
    rows = [
        (word1, list_1),
        (word2, list_2),
        (word1+" AND "+word2, and_intersect(list_1, list_2)),
        (word1+" OR "+word2, or_intersect(list_1, list_2)),
        ("NOT "+word1, not_list(list_1)),
        ("NOT "+word2, not_list(list_2)),
    ]
    # Context manager closes the file even if a write fails.
    with open(word1+"_"+word2+"_operation.csv", "w") as text_file:
        text_file.write("Words, Inverted Index \n")
        for label, postings in rows:
            join_items = ", ".join(str(d) for d in postings)
            text_file.write(label+", "+join_items+" \n")

In [33]:
# Writes know_far_operation.csv with the postings and their AND / OR / NOT rows.
# NOTE(review): the active indexing cell stores two-word keys, so single-word
# lookups such as these presumably only succeed against the commented-out
# single-word index - confirm which index was loaded when this ran.
perform_binary_operations("know", "far")

In [34]:
def multiple_and(list1):
    """Intersect any number of posting lists.

    Args:
        list1: A sequence of posting lists (each in ascending order).

    Returns:
        A list of the ids present in every posting list. An empty sequence
        yields ``[]`` and a single posting list yields a copy of itself;
        both cases used to raise IndexError.
    """
    if len(list1) == 0:
        return []
    if len(list1) == 1:
        return list(list1[0])
    and_list = and_intersect(list1[0], list1[1])
    for i in range(2, len(list1)):
        and_list = and_intersect(list1[i], and_list)
    return and_list

In [39]:
def multiple_or(list1):
    """Combine any number of posting lists with ``or_intersect``.

    Args:
        list1: A sequence of posting lists (each in ascending order).

    Returns:
        The result of folding ``or_intersect`` over all posting lists. An
        empty sequence yields ``[]`` and a single posting list yields a copy
        of itself; both cases used to raise IndexError.
    """
    if len(list1) == 0:
        return []
    if len(list1) == 1:
        return list(list1[0])
    or_list = or_intersect(list1[0], list1[1])
    for i in range(2, len(list1)):
        or_list = or_intersect(list1[i], or_list)
    return or_list

In [44]:
# Example query: documents whose postings contain all four terms.
# NOTE(review): dictionary.get returns None for a key that is not indexed,
# which and_intersect cannot handle - confirm these keys exist in the index
# that is currently loaded.
new_list = [dictionary.get('come'), dictionary.get('leave'), dictionary.get('know'), dictionary.get('far')]
multiple_and(new_list)

[2]

In [55]:
### Code to make inverted index - not working
# import InvertedIndex
# import InvertedIndexQuery

# i = InvertedIndex.Index()

# filename = '1.txt'
# file_to_index = open(filename).read() 
# document_key = filename

#     # index the document, using document_key as the document's
#     # id.
# i.index(file_to_index, document_key)

# filename = '2.txt'
# file_to_index = open(filename).read()
# document_key = filename

# i.index(file_to_index, document_key)

# search_results = InvertedIndexQuery.query('Python and spam', i)
# search_results.sort()

# cnt = 0
# for document in search_results:
#     cnt = cnt + 1
#     print ('%d) %s'.format(cnt, document[1])) 

In [38]:
# Look up the posting list stored under one two-word key (None if absent).
dictionary.get("vanish adobe")