# Doing things with text 2

## Making word clouds from a preprocessed text

Install wordcloud package:

In [None]:
pip install wordcloud

### Import packages

In [None]:
from bs4 import BeautifulSoup
import unicodedata
import re
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from wordcloud import WordCloud
import matplotlib.pyplot as plt 

### Import and read text file

In [None]:
# Placeholder directories: point these at the folder holding the input text
# and at the folder where the output image should be written.
# Keep the trailing slash: file names are appended to these paths with plain
# string concatenation.
indir = '/path/to/indir/'
outdir = '/path/to/outdir/'

In [None]:
# Full path of the text file to analyse.
file = indir + 'infile.txt'

In [None]:
# Read the whole file into a single string, decoding it as UTF-8.
with open(file, encoding='utf8') as f:
    text = f.read()

In [None]:
# Sanity check: show the first 100 characters of the raw text.
print(text[:100])

## Preprocessing

In [None]:
def remove_html(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    The input is parsed with BeautifulSoup (bs4) using the "lxml" parser, and
    the text content of the parsed document is returned.
    """
    return BeautifulSoup(text, "lxml").get_text()

def remove_short_words(words, n=3):
    """Return the words from ``words`` that are at least ``n`` characters long.

    The input list is left untouched; a new list is returned.
    """
    return [token for token in words if len(token) >= n]

def remove_non_ascii(words):
    """Strip non-ASCII characters from every word in a list of tokenized words.

    Each word is NFKD-normalised first, so an accented letter is decomposed
    and keeps its ASCII base letter (an "e" with an acute accent becomes "e").
    Whatever still cannot be encoded as ASCII is dropped. A word that consists
    solely of such characters ends up as an empty string; it is not removed
    from the list.
    """
    def to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [to_ascii(token) for token in words]

def remove_punctuation(words):
    """Strip punctuation from every word in a list of tokenized words.

    Every character that is neither a word character nor whitespace is
    deleted. Words that are empty after this (tokens made up of punctuation
    only) are left out of the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_numbers(words):
    """Drop every word that consists of digits only from a list of words.

    Words that merely contain digits next to other characters (such as "2nd"
    or "4.5") are kept.
    """
    return [token for token in words if not token.isdigit()]

def to_lowercase(words):
    """Return a new list with every word of ``words`` converted to lowercase."""
    return [token.lower() for token in words]

def remove_stop_words_languages(words, languages=('dutch', 'french')):
    """Remove the NLTK stop words of the given languages from a list of words.

    Args:
        words: list of tokenized words.
        languages: iterable of language names accepted by
            ``stopwords.words``. Defaults to Dutch and French. The default is
            a tuple so that it cannot be mutated between calls.

    Returns:
        A new list without the words that are a stop word in any of the
        given languages.
    """
    # A set removes duplicates across languages and keeps each membership
    # test in remove_stop_words constant-time instead of a scan of a list.
    stop_words = set()
    for language in languages:
        stop_words.update(stopwords.words(language))
    return remove_stop_words(words, stop_words)

def remove_stop_words(words, stop_words):
    """Return the words from ``words`` that do not occur in ``stop_words``.

    ``stop_words`` can be any container that supports the ``in`` operator.
    The input list is left untouched; a new list is returned.
    """
    return [token for token in words if token not in stop_words]

In [None]:
def all_operations(words):
    """Run the complete cleaning pipeline on a list of tokenized words.

    The steps are applied in a fixed order, each one working on the output of
    the previous one: strip non-ASCII characters, strip punctuation, drop
    digit-only words, lowercase, remove stop words (default languages) and
    finally drop short words (default minimum length).
    """
    pipeline = (
        remove_non_ascii,
        remove_punctuation,
        remove_numbers,
        to_lowercase,
        remove_stop_words_languages,
        remove_short_words,
    )
    for step in pipeline:
        words = step(words)
    return words

#### Call functions

In [None]:
# Strip the HTML markup from the raw text before tokenizing it.
clean_text = remove_html(text)

In [None]:
# Split the cleaned text into a list of word tokens.
words = word_tokenize(clean_text) # N.B. Tokenization can take a (very) long time

In [None]:
# Run the full cleaning pipeline on the token list.
input_as_list = all_operations(words)

In [None]:
# Sanity check: show the first 100 cleaned tokens.
print(input_as_list[:100])

Create a new list, `long_input_as_list`, containing only the words that are at least n letters long.

In [None]:
# Minimum length (in letters) a word must have to be kept for the word cloud.
n = 6

# Reuse the preprocessing helper rather than repeating its length filter here.
long_input_as_list = remove_short_words(input_as_list, n)

#### Turn long_input_as_list into a string

In [None]:
# Join the remaining words with single spaces: the word cloud below is
# generated from one string rather than from a list.
input_as_string = " ".join(long_input_as_list)

In [None]:
# Sanity check: show the first 100 characters of the joined string.
print(input_as_string[:100])

## Analysis

### Word clouds

For more visualisation options, see: https://towardsdatascience.com/how-to-create-beautiful-word-clouds-in-python-cfcf85141214

In [None]:
# Extra words to leave out of the cloud, passed to WordCloud as its stop words.
sw = [
    'you', 'all', 'him', 'int', 'ext', 'and', 'the', 'what', 'that', 'with',
    'said', 'this', 'when', 'them', 'were', 'from', 'will', 'there', 'they',
    'then', 'their', 'your', 'would', 'only', 'even', 'know', 'could', 'have',
    'where', 'come', 'been', 'made', 'well',
]

# Build the cloud from the space-separated words; random_state is set to a
# fixed integer (a seed, per the wordcloud documentation).
text_cloud = WordCloud(
    background_color='black',
    stopwords=sw,
    colormap='Set2',
    random_state=5,
).generate(input_as_string)

In [None]:
# Value passed as the dpi argument when the figure is saved.
my_dpi = 300

# Draw the word cloud as an image in a 15 x 8 figure.
fig = plt.figure(figsize = (15,8))
plt.imshow(text_cloud, interpolation='bilinear')
# Hide the axes so that only the word cloud image itself is shown.
plt.axis('off')
# Write the figure to outdir.
# NOTE(review): the file name is hard-coded; adjust it to match the input text.
plt.savefig(outdir + 'starwars_2005.png', dpi=my_dpi, bbox_inches='tight')
plt.show()