In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import en_core_web_sm
from pattern.en import suggest
import pandas as pd
from nltk import ngrams

In [2]:
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
def init_process(txt):
    no_new = re.sub('\n', ' ', txt)
    no_spl = re.sub('ñ', ' ', no_new)
    first_parse = re.sub(r'[^\w]', ' ', no_spl)
    #word_tokens = word_tokenize(first_parse)
    #filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return remove_nums(first_parse)

In [5]:
def stemming(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [6]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [7]:
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [8]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stop_words]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    #filtered_text = ' '.join(filtered_tokens)    
    return filtered_tokens

In [9]:
def process(file_name):
    file = open(file_name, "r")
    read_txt = file.read()
    list_new = init_process(read_txt)
    #stemmed = stemming(list_new)
    #lemma = lemmatize_text(stemmed)
    new_words = remove_stopwords(list_new)
    return new_words

In [10]:
def remove_nums(text):
    no_new = re.sub('[0-9]', '', text)
    return no_new

In [11]:
def refine_text():
    file_name = "india.txt"
    refined_list = process(file_name)
    refined_text = ' '.join(refined_list)
    text_file = open("india_ref_1.txt", "w")
    text_file.write(refined_text)
    text_file.close()
    return

In [12]:
refine_text()

In [52]:
def prodBigrams(text):
    spaced = ''
    n = 4
    for ch in text:
        spaced = spaced + ch + ' '
    tokenized = spaced.split(" ")
    myList = list(ngrams(tokenized, n))
    Bigrams = []
    for i in myList:
        Bigrams.append((''.join([w + '' for w in i])).strip())
    return Bigrams

In [53]:
dictionary = {}
file_name = ("india_ref_1.txt")
text = open(file_name, "r").read()
tokens = tokenizer.tokenize(text)
for i in range(0,len(tokens)):
    t = tokens[i]
    new_tok = prodBigrams(t)
    for nt in new_tok:
        if nt in dictionary:
            dictionary.get(nt).append(i+1)
            dictionary[nt] = list(set(dictionary.get(nt)))
        else:
            dictionary[nt] = [i+1]

  import sys


In [54]:
for key, value in dictionary.items():
    dictionary[key] = sorted(value)

In [55]:
def and_intersect(list1, list2):
    mer_list = []
    i = 0
    j = 0
    while (i<len(list1) and j<len(list2)):
        if (list1[i] == list2[j]):
            mer_list.append(list1[i])
            i = i+1
            j = j+1
        else:
            if (list1[i] > list2[j]):
                j = j+1
            else:
                i = i+1
    return mer_list

In [56]:
word = "Indya"
kgrams = prodBigrams(word)
kgrams

['Indy', 'ndya', 'dya']

In [57]:
mer_list = []
for i in range(0,(len(kgrams))):
    for j in range(i,(len(kgrams))):
        mer_list.append(and_intersect(dictionary.get(kgrams[i]), dictionary.get(kgrams[j])))

TypeError: object of type 'NoneType' has no len()

In [58]:
count_list = {}
for i in range(0,len(kgrams)):
    for t in dictionary.get(kgrams[i]):
        if t in count_list:
            count_list[t] = (count_list.get(t)+1)
        else:
            count_list[t] = 1

TypeError: 'NoneType' object is not iterable

In [49]:
sorted_by_value = sorted(count_list.items(), key=lambda kv: kv[1], reverse=True)
max_val = count_list.get(list(count_list.keys())[0]) 
file_name = ("india_ref_1.txt")
text = open(file_name, "r").read()
tokens = tokenizer.tokenize(text)
suggestions = []
temp = max_val
for i in iter(count_list.keys()):
    temp = count_list.get(i)
    if (temp == max_val):
        suggestions.append(tokens[(i-1)])
    else:
        break

In [51]:
set(suggestions)

{'India', 'Indian', 'Indians', 'Indo', 'Indoi', 'Indonesia', 'Indus'}