# In [None]:

############################################ Language Processing ###################################################

# Write a function that takes text as a string and counts the words

def count_words(text):
    """
    Count the number of times each word occurs in text (a string).

    The text is lower-cased first so that the same word is not counted under
    several keys depending on capitalisation, and the punctuation characters
    . , ; : ? ' " are removed before counting.

    Parameters:
        text: the string to analyse.

    Returns:
        A dictionary whose keys are the unique words and whose values are the
        word counts. Empty or whitespace-only text gives an empty dictionary.
    """
    text = text.lower()
    skips = [".", ",", ";", ":", "?", "'", '"']
    for char in skips:
        text = text.replace(char, "")
    word_counts = {}
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and line breaks never yield an empty-string "word"
    # (split(" ") would count "" once per extra space).
    for word in text.split():
        word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts

# Now check the function on a short sample sentence
text = "This is test text. We're keeping this text short to keep things manageable"
count_words(text)

# An alternative way to do the same is as follows

from collections import Counter
def count_words_fast(text):
    """
    Count the number of times each word occurs in text (a string), using
    collections.Counter to do the tallying.

    The text is lower-cased first so that the same word is not counted under
    several keys depending on capitalisation, and the punctuation characters
    . , ; : ? ' " are removed before counting.

    Parameters:
        text: the string to analyse.

    Returns:
        A Counter (a dict subclass) whose keys are the unique words and whose
        values are the word counts. Empty or whitespace-only text gives an
        empty Counter.
    """
    text = text.lower()
    skips = [".", ",", ";", ":", "?", "'", '"']
    for char in skips:
        text = text.replace(char, "")
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and line breaks never yield an empty-string "word"
    # (split(" ") would count "" once per extra space).
    word_counts = Counter(text.split())
    return word_counts

# Try the Counter-based version on the same sample sentence
text = "This is test text. We're keeping this text short to keep things manageable"
count_words_fast(text)

# let's check whether both implementations give the same result

count_words(text) == count_words_fast(text)
# expected outcome is True

###################################################################################################################

# define a function to read a book

def read_book(title_path):
    """
    Load the book stored at title_path (read as UTF-8 text) and return its
    whole content as one string with all line breaks removed.
    """
    with open(title_path, mode="r", encoding="utf8") as book_file:
        raw = book_file.read()
    # Delete every line-feed and carriage-return character. Nothing is put in
    # their place, so a word ending one line and a word starting the next are
    # joined unless the file has whitespace around the line break.
    line_breaks = {ord("\n"): None, ord("\r"): None}
    return raw.translate(line_breaks)

# now let's check our function

# Forward slashes are accepted by open() on Windows as well as on POSIX
# systems, and they avoid the invalid escape sequences (\B, \E, \s, \R) that
# a backslash-separated path produces inside a normal string literal.
text = read_book("./Books/Books_EngFr/Books_EngFr/English/shakespeare/Romeo and Juliet.txt")
# change the directory according to your file location
# here we collect books from gutenberg.org, which is an open-source digital library
print(len(text))
# find() returns -1 when the phrase is absent
ind = text.find("What's in a name?")
print(ind)
# show the 1000 characters starting at the match
sample_text = text[ind: ind+1000]
print(sample_text)

# it is expected to have the following output
# 169275
# 42757
# What's in a name? That which we call a rose    By any other name would smell as sweet.    So Romeo would, were he 
# not Romeo call'd,    Retain that dear perfection which he owes    Without that title. Romeo, doff thy name;    
# And for that name, which is no part of thee,    Take all myself.  Rom. I take thee at thy word.    
# Call me but love, and I'll be new baptiz'd;    Henceforth I never will be Romeo.  Jul. What man art thou that, 
# thus bescreen'd in night,    So stumblest on my counsel?  Rom. By a name    I know not how to tell thee who I am.    
# My name, dear saint, is hateful to myself,    Because it is an enemy to thee.    Had I it written, I would tear the 
# word.  Jul. My ears have yet not drunk a hundred words    Of that tongue's utterance, yet I know the sound.    
# Art thou not Romeo, and a Montague?  Rom. Neither, fair saint, if either thee dislike.  Jul. How cam'st thou hither, 
# tell me, and wherefore?    The orchard walls are high and hard to climb,    And the place death, conside

#######################################################################################################################

# Now define a function that computes word frequency statistics

def word_stats(word_counts):
    """
    Summarise a word-count mapping.

    Returns a tuple (number of unique words, the per-word frequencies), where
    the frequencies are the values view of word_counts.
    """
    frequencies = word_counts.values()
    return (len(word_counts), frequencies)

# check the function
# Paths use forward slashes: they work on Windows as well as POSIX systems and
# avoid the invalid escape sequences (\B, \E, \G, \s, \R) that backslashes
# produce inside a normal string literal.
text = read_book("./Books/Books_EngFr/Books_EngFr/English/shakespeare/Romeo and Juliet.txt")
word_counts = count_words(text)
(num_unique, counts) = word_stats(word_counts)
print(num_unique)
print(sum(counts))

# Now let's compare the English and the German translated versions of Romeo and Juliet
# first read the German translated version
text_ger = read_book("./Books/Books_GerPort/German/shakespeare/Romeo und Julia.txt")
word_counts_ger = count_words(text_ger)
(num_unique_ger, counts_ger) = word_stats(word_counts_ger)

# Now compare

print('Unique words in English Version:', num_unique)
print('Total words used in English Version:',sum(counts))
print('\n')

print('Unique words in German Version:', num_unique_ger)
print('Total words used in German Version:',sum(counts_ger))

#################################################################################################################

# Read all books and make a table
import os
# Build the path with os.path.join so it is valid on every platform and
# contains no invalid "\B" escape sequence.
# NOTE(review): the loop below expects book_dir/<language>/<author>/<title>;
# the paths used earlier in this file contain extra directory levels
# (e.g. Books_EngFr) - confirm book_dir points at the right level.
book_dir = os.path.join(".", "Books")
import pandas as pd
stats = pd.DataFrame(columns=("Language","Author","Title","Length", "Unique")) # creating a table
title_num = 1  # row label of the next book; rows are numbered from 1
for language in os.listdir(book_dir):
    language_dir = os.path.join(book_dir, language)
    # skip stray files sitting next to the language folders
    if not os.path.isdir(language_dir):
        continue
    for author in os.listdir(language_dir):
        author_dir = os.path.join(language_dir, author)
        # skip stray files sitting next to the author folders
        if not os.path.isdir(author_dir):
            continue
        for title in os.listdir(author_dir):
            inputfile = os.path.join(author_dir, title)
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words(text))
            # one row per book: total word count as Length, distinct words as Unique
            stats.loc[title_num] = language, author.capitalize(), title.replace('.txt',''), sum(counts), num_unique
            title_num += 1

###################################################################################################


# plot the result
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
# One log-log series per language: book length against number of unique words.
# Each name is compared with the Language column of stats, so it must match the
# stored value exactly; the original filtered on the misspelling 'Frech', which
# left the French series empty.
# NOTE(review): assumes the Language column holds exactly these four names -
# confirm against the folder names under book_dir.
language_colors = (
    ('English', 'crimson'),
    ('French', 'forestgreen'),
    ('German', 'orange'),
    ('Portuguese', 'blueviolet'),
)
for lang_name, lang_color in language_colors:
    subset = stats[stats.Language == lang_name]
    plt.loglog(subset.Length, subset.Unique, 'o', label=lang_name, color=lang_color)
plt.legend()
plt.xlabel('Book Length')
plt.ylabel('Number of Unique Words')
plt.savefig('lang-plot.pdf')

############################################# THE END ####################################################