In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import en_core_web_sm
from pattern.en import suggest
import pandas as pd
from nltk import ngrams

In [2]:
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
def init_process(txt):
    no_new = re.sub('\n', ' ', txt)
    no_spl = re.sub('Ã±', ' ', no_new)
    first_parse = re.sub(r'[^\w]', ' ', no_spl)
    #word_tokens = word_tokenize(first_parse)
    #filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return remove_nums(first_parse)

In [5]:
def stemming(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [6]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [7]:
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [8]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stop_words]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    #filtered_text = ' '.join(filtered_tokens)    
    return filtered_tokens

In [9]:
def process(file_name):
    file = open(file_name, "r")
    read_txt = file.read()
    list_new = init_process(read_txt)
    #stemmed = stemming(list_new)
    #lemma = lemmatize_text(stemmed)
    new_words = remove_stopwords(list_new)
    return new_words

In [10]:
def remove_nums(text):
    no_new = re.sub('[0-9]', '', text)
    return no_new

In [11]:
def refine_text():
    file_name = "india.txt"
    refined_list = process(file_name)
    refined_text = ' '.join(refined_list)
    text_file = open("india_ref_1.txt", "w")
    text_file.write(refined_text)
    text_file.close()
    return

In [12]:
refine_text()

In [13]:
def prodBigrams(text):
    spaced = ''
    n = 2
    for ch in text:
        spaced = spaced + ch + ' '
    tokenized = spaced.split(" ")
    myList = list(ngrams(tokenized, n))
    Bigrams = []
    for i in myList:
        Bigrams.append((''.join([w + '' for w in i])).strip())
    return Bigrams

In [14]:
dictionary = {}
file_name = ("india_ref_1.txt")
text = open(file_name, "r").read()
tokens = tokenizer.tokenize(text)
for i in range(0,len(tokens)):
    t = tokens[i]
    new_tok = prodBigrams(t)
    for nt in new_tok:
        if nt in dictionary:
            dictionary.get(nt).append(i+1)
            dictionary[nt] = list(set(dictionary.get(nt)))
        else:
            dictionary[nt] = [i+1]

In [15]:
for key, value in dictionary.items():
    dictionary[key] = sorted(value)

In [16]:
# Dont do everytime
# text_file = open("list.csv", "w")
# text_file.write("Bigrams, Inverted Index \n")
# for item in dictionary.keys():
#     join_items = ", ".join(str(d) for d in dictionary.get(item))
#     text = item+", "+str(join_items)
#     text_file.write(text+" \n")
# text_file.close()

In [17]:
# Spelling Correction

In [18]:
def and_intersect(list1, list2):
    mer_list = []
    i = 0
    j = 0
    while (i<len(list1) and j<len(list2)):
        if (list1[i] == list2[j]):
            mer_list.append(list1[i])
            i = i+1
            j = j+1
        else:
            if (list1[i] > list2[j]):
                j = j+1
            else:
                i = i+1
    return mer_list

In [19]:
# mer_list = []
# for i in range(0,(len(kgrams))):
#     for j in range(i,(len(kgrams))):
#         mer_list.append(and_intersect(dictionary.get(kgrams[i]), dictionary.get(kgrams[j])))

In [45]:
def check_term(word):
    word = 'Indya'
    kgrams = prodBigrams(word)
    count_list = {}
    for i in range(0,len(kgrams)):
        if (kgrams[i] in dictionary):
            for t in dictionary.get(kgrams[i]):
                if t in count_list:
                    count_list[t] = (count_list.get(t)+1)
                else:
                    count_list[t] = 1
    return get_suggestions(count_list)

In [46]:
def get_suggestions(count_list):
    sorted_list = sorted(count_list.items(), key=lambda x: x[1], reverse=True)
    max_val = list(sorted_list)[0][1]
    file_name = ("india_ref_1.txt")
    text = open(file_name, "r").read()
    tokens = tokenizer.tokenize(text)
    suggestions = []
    temp = max_val
    for i in iter(sorted_list):
        temp = i[1]
        if (temp == max_val):
            suggestions.append(tokens[(i[0]-1)])
        else:
            break
    return set(suggestions)

In [48]:
check_term('Indiya')

{'India', 'Indonesia', 'Pandyas'}