# Imports

In [1]:
import glob
import nltk
import gensim
from collections import Counter
import numpy as np
from config import Config
from itertools import chain
import spacy
from spacy import displacy

In [2]:
def input_data(text_file_directory=Config.TEXT_DATA_PATH):
    """
    Reads the text documents contained in text_file_directory
    :param text_file_directory: Path to directory (String)
    :return: List where one element in a word in the text. The text order is kept. (List of String)
    """
    list_of_words = []
    for file_path in glob.glob(text_file_directory):
        with open(file_path, 'r', errors="ignore") as f:
            content = ''.join(f.readlines())
            content.replace('  ', ' ')
            tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
            current_list_of_words = [x.lower() for x in tokenizer.tokenize(content)]
            try:
                pos_tagged_words = nltk.pos_tag(current_list_of_words)
            except LookupError:
                nltk.download('averaged_perceptron_tagger')
            # list_of_nouns = [word for (word, tag) in pos_tagged_words if 'NN' in tag]
            list_of_words.append(current_list_of_words)
    return list_of_words

In [3]:
data = input_data()

In [4]:
' '.join(data[0])

'analysis china dominates airwaves with vat and lower prices the market has been awash with speculation over the imminent announcement of vat on fertilizer sales in china details are elusive still but most believe and certainly hope that the impact on export sales will be negligible in the meantime signs of greater chinese export availabilility for august shipment has led to some weaker prices to be agreed for prompt shipment with reports of sales concluded in non indian markets in the 450s mt fob for the first time since august last year suppliers looking to place lighter coloured product or product that is not guaranteed 18 46 0 are having to concede lower prices indian buyers meanwhile are putting pressure on suppliers to lower prices further sensing the weakness creeping into the market for prompt shipment and looking for new prices closer to 470 mt cfr as yet these lower prices have not been agreed in either india or pakistan and it remains to be seen how much if at all suppliers 

In [5]:
print(f"We have {len(data)} documents. The average document has {np.mean([len(x) for x in data])} words.")

We have 100 documents. The average document has 7771.3 words.


# Flatten words list and analysis

In [6]:
raw_words = list(chain(*data))

In [7]:
count_words = Counter(raw_words)
count_words.most_common(20)

[('to', 28355),
 ('mt', 27832),
 ('the', 26343),
 ('in', 16969),
 ('and', 12598),
 ('of', 12294),
 ('dap', 11794),
 ('for', 11104),
 ('from', 10669),
 ('at', 8923),
 ('a', 8875),
 ('000', 8138),
 ('year', 6617),
 ('is', 6441),
 ('on', 6058),
 ('fob', 6037),
 ('map', 5749),
 ('cfr', 5349),
 ('with', 5139),
 ('are', 5098)]

In [8]:
def least_common_values(array, to_find=None):
    from operator import itemgetter
    counter = Counter(array)
    if to_find is None:
        return sorted(counter.items(), key=itemgetter(1), reverse=False)
    return heapq.nsmallest(to_find, counter.items(), key=itemgetter(1))

In [9]:
min_frequency = 10
rare_words = [word for (word, count) in least_common_values(raw_words) if count < min_frequency]

In [None]:
blacklist = ["mt", "com", "crugroup", "'s", "analysis"]
words = list(filter(lambda x: (x not in blacklist) and (x not in rare_words), raw_words))

In [None]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(' '.join(data[0]))
displacy.serve(doc, style='ent')


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer

