In [None]:
import itertools
import operator
import re
import sys
from collections import Counter
from os import listdir
from os import path

import matplotlib.pyplot as plt
import textract
from matplotlib.pyplot import figure
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

def plot_hist(top_50):
    """Draw a bar chart of word frequencies on a new matplotlib figure.

    Args:
        top_50: Mapping of word -> count. One bar is drawn per entry, in the
            mapping's iteration order, and each bar is labelled with its key.

    Returns:
        None. The chart is drawn on a freshly created figure; this function
        does not call ``plt.show()`` itself.
    """
    # Wide, short canvas so the vertically rotated tick labels stay legible.
    figure(num=None, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')

    bar_positions = range(len(top_50))
    bar_heights = list(top_50.values())
    bar_labels = list(top_50.keys())

    plt.bar(bar_positions, bar_heights, align='center')
    plt.xticks(bar_positions, bar_labels, fontsize=18, rotation='vertical')
    
def remove_punctuation_and_stop_words(extracted_text):
    """Strip punctuation from the text, tokenize it, and drop English stop words.

    Args:
        extracted_text: The raw text to clean, as a single string.

    Returns:
        A list of tokens in their original order and original casing, with
        every token whose lower-cased form is an NLTK English stop word removed.
    """
    print('Removing punctuations and stop words')

    # Delete everything that is not a word character, an apostrophe or
    # whitespace. The apostrophe is kept so contractions survive this step.
    # The pattern is a raw string: in a normal string literal "\w" and "\s"
    # are invalid escape sequences and trigger a warning on modern Python.
    remove_punctuations = re.sub(r"[^\w'\s]", '', extracted_text)

    # For word tokenization contractions are usually split in two because,
    # meaning-wise, they are two words. TweetTokenizer is used to avoid that
    # split, ref:
    # https://stackoverflow.com/questions/34714162/preventing-splitting-at-apostrophies-when-tokenizing-words-using-nltk
    tweet_tokenizer = TweetTokenizer()
    extracted_tokens = tweet_tokenizer.tokenize(remove_punctuations)

    # Build the stop-word set once so each membership test is O(1).
    stop_words = set(stopwords.words('english'))

    # Compare case-insensitively, but return the tokens with their original case.
    filtered_tokens = [
        token for token in extracted_tokens if token.lower() not in stop_words
    ]
    return filtered_tokens


def select_top_50(filtered_tokens):
    """Return the 50 most frequent tokens together with their counts.

    Args:
        filtered_tokens: Iterable of hashable tokens (e.g. a list of words).

    Returns:
        A dict mapping token -> occurrence count, ordered from most to least
        frequent. It holds at most 50 entries (fewer when the input has fewer
        distinct tokens, and it is empty for empty input). Tokens with equal
        counts keep the order in which they were first encountered.
    """
    print('Selecting the top 50 words')

    # Count how often each token occurs.
    word_to_freq = Counter(filtered_tokens)

    # most_common(50) sorts by descending count and keeps exactly 50 entries.
    # The earlier islice(..., 0, 49) stopped one short and returned only 49.
    top_50 = dict(word_to_freq.most_common(50))
    return top_50


def read_files(d):
    """Extract the text of every entry in directory ``d`` and concatenate it.

    Each directory entry is passed to ``textract.process`` and the returned
    bytes are decoded as UTF-8. Progress is printed for every file.

    Args:
        d: Path of the directory whose entries should be read.

    Returns:
        One string holding the extracted text of all entries, concatenated in
        the order ``listdir`` returns them (which is arbitrary).
    """
    extracted_text = ""
    # List the directory that was actually requested. The earlier code listed
    # the hard-coded 'data' folder while joining the names onto ``d``, so any
    # other argument produced paths that did not exist.
    for x in listdir(d):
        print("Reading file .... ", x)
        # The text is accumulated in place (rather than joined at the end)
        # because the progress line below reports the running total.
        extracted_text += textract.process(path.join(d, x)).decode('utf-8')
        # sys.getsizeof gives the in-memory size of the accumulated str object.
        print("The file size loaded currently ...", sys.getsizeof(extracted_text))
    return extracted_text
    
#### MAIN #####

# Pipeline: read the documents, clean the text, count the words, plot them.

# Extract and concatenate the text of every file in the 'data' directory.
text = read_files('data')

# Strip punctuation, tokenize, and drop English stop words.
tokens = remove_punctuation_and_stop_words(text)

# Keep only the most frequent tokens and their counts.
top_50 = select_top_50(tokens)

# Draw the frequencies as a bar chart.
plot_hist(top_50)