## Coding Exercise #0708

In [None]:
# Install WordCloud once.
# !pip install wordcloud

In [None]:
from wordcloud import WordCloud
import numpy as np
import nltk
import re
import os
import matplotlib.pyplot as plt
from PIL import Image                         # From the Pillow library import the Image module.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
%matplotlib inline

### 1. Visualize the text data as a WordCloud:

In [None]:
# Go to the directory where the data file is located. 
# os.chdir(r'~~')                # Please, replace the path with your own.   

In [None]:
# Read the story as a list of lines (readlines() keeps each trailing newline).
# The context manager closes the file even if reading or decoding raises.
with open("./three_little_pigs.txt", 'r', encoding='UTF8') as f:
    my_book = f.readlines()

#### 1.1. Pre-processing:

In [None]:
# Turn the raw lines in `my_book` into a flat list of cleaned, lemmatized tokens (`corpus`).
n_min = 4                                                           # Length threshold: only tokens with MORE than n_min characters are kept.
corpus = []
lemmatizer = WordNetLemmatizer()
# Build the stopword collection once, outside the loop, instead of calling
# stopwords.words('english') for every token; a set also gives O(1) membership tests.
stop_words = set(stopwords.words('english'))
for a_line in my_book:
    pre = re.sub(r'\W', ' ', a_line)                                # Replace every non-word character (not a letter, digit or underscore) with a space.
    pre = re.sub(r'\d+', '', pre)                                   # Remove runs of digits.
    pre = nltk.word_tokenize(pre)                                   # Tokenize into words.
    pre = [x for x in pre if len(x) > n_min]                        # Drop short tokens (length <= n_min).
    pre = [x.lower() for x in pre]                                  # Convert to lowercase.
    pre = [x for x in pre if x not in stop_words]                   # Remove stopwords.
    pre = [lemmatizer.lemmatize(x) for x in pre]                    # Lemmatize.
    corpus += pre                                                   # Append this line's tokens to the corpus.

In [None]:
# Number of tokens collected in `corpus` (shown as the cell's output).
len(corpus)

#### 1.2. Generate a basic wordcloud:

In [None]:
# WordCloud requires that the input data be a single long string,
# so join the tokens with single spaces.
a_long_sentence = ' '.join(corpus)

In [None]:
# Build a word cloud on a white background, limited to at most 30 words.
wc = WordCloud(background_color='white', max_words=30)              # Customize the output.
wc.generate(a_long_sentence)                                        # Compute the layout from the joined text.
# wc.words_                                                          # Uncomment to check the top-ranking words.

In [None]:
# Render the word cloud as an image on a 10x10 inch figure.
plt.figure(figsize=(10,10))
plt.imshow(wc, interpolation='bilinear')           # Bilinear interpolation when drawing the image.
plt.axis("off")                                    # Turn off the axes.
plt.show()

#### 1.3. Generate wordcloud using a mask:

In [None]:
# Pick a background mask (the alternatives are left commented out).
#img = Image.open('background_1.png')                    # Ellipse.
#img = Image.open('background_2.png')                   # Speech bubble.
#img = Image.open('background_3.png')                    # Heart.
img = Image.open('map.jpeg')                    # NOTE(review): originally labelled "Circle" - confirm what shape map.jpeg actually contains.
back_mask = np.array(img)                       # Convert the image to a NumPy array to pass as the WordCloud mask.

In [None]:
# Same settings as the basic cloud, but words are laid out using `back_mask`.
wc = WordCloud(background_color='white', max_words=30, mask=back_mask)            # Customize the output.
wc.generate(a_long_sentence)

In [None]:
# Render the masked word cloud and also write the figure to disk.
plt.figure(figsize=(10,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")                                    # Turn off the axes.
plt.savefig("out.png")                             # Save to an external file; done before show() so the current figure is still populated.
plt.show()

In [None]:
# Denser word cloud shaped by map.jpeg: up to 2000 words, fonts capped at
# 256, and a fixed random_state so the layout is reproducible between runs.
mask = np.array(Image.open('map.jpeg'))
# width/height are intentionally not passed: when a mask is supplied,
# WordCloud ignores them and sizes the canvas from the mask's shape.
wc = WordCloud(mask=mask, background_color="white",
               max_words=2000, max_font_size=256,
               random_state=42)
wc.generate(a_long_sentence)
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')                                    # Turn off the axes.
plt.show()