# Doing things with text 3

## Counting words from a preprocessed text

### Import packages

In [None]:
from bs4 import BeautifulSoup
import unicodedata
import re
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
from collections import Counter

### Import and read text file

In [None]:
# Input and output directories (placeholders: adjust to your own paths).
# Keep the trailing slash: file names are appended by plain string concatenation.
indir = '/path/to/indir/'
outdir = '/path/to/outdir/'

In [None]:
# Full path of the text file to analyse.
file = indir + 'infile.txt'

In [None]:
# Read the whole file into memory as a single string, decoded as UTF-8.
with open(file, encoding='utf8') as f:
    text = f.read()

In [None]:
# Preview the first 100 characters of the raw text.
print(text[:100])

## Preprocessing

In [None]:
def remove_html(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    The markup is parsed with BeautifulSoup (bs4) using the "lxml" parser,
    and the text content of the parsed document is returned.
    """
    return BeautifulSoup(text, "lxml").get_text()

def remove_short_words(words, n=5):
    """Keep only the words that are at least ``n`` characters long.

    Returns a new list; the order of the remaining words is preserved.
    """
    return [token for token in words if len(token) >= n]

def remove_non_ascii(words):
    """Reduce every token in ``words`` to its ASCII-only form.

    Each token is NFKD-normalised, so accented letters split into a base
    letter plus combining marks, and every character that cannot be encoded
    as ASCII is then dropped. A token without any ASCII-representable
    character becomes an empty string; it is kept in the result, so the
    output has the same length as the input.
    """
    ascii_words = []
    for token in words:
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        ascii_words.append(ascii_bytes.decode('utf-8', 'ignore'))
    return ascii_words

def remove_punctuation(words):
    """Strip punctuation from every token and drop tokens left empty.

    Any character that is neither a word character (letter, digit or
    underscore) nor whitespace is deleted from each token.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_numbers(words):
    """Drop every token that consists solely of digit characters.

    Tokens that mix digits with anything else (for example "12a" or "1.5")
    are kept, as are empty strings.
    """
    return [token for token in words if not token.isdigit()]

def to_lowercase(words):
    """Return a new list with every token in ``words`` lower-cased."""
    return [token.lower() for token in words]

def remove_stop_words_languages(words, languages=('dutch', 'french')):
    """Remove the NLTK stop words of the given languages from ``words``.

    Args:
        words: list of tokens to filter.
        languages: iterable of language names passed to
            ``stopwords.words``. Defaults to Dutch and French. The default
            is a tuple rather than a list so that it cannot be mutated
            between calls.

    Returns:
        A new list with the tokens that are not stop words in any of the
        requested languages, in their original order.
    """
    # Collect the stop words of all languages in one set: duplicates across
    # languages collapse automatically and membership tests stay cheap.
    stop_words = set()
    for language in languages:
        stop_words.update(stopwords.words(language))
    return remove_stop_words(words, stop_words)

def remove_stop_words(words, stop_words):
    """Return the tokens of ``words`` that do not occur in ``stop_words``.

    Args:
        words: iterable of tokens (strings) to filter.
        stop_words: iterable of tokens to exclude. Any iterable is accepted
            (list, tuple, set, generator).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # Build the lookup set once. Testing membership against a list would
    # scan the whole stop-word list for every token, and a one-shot iterator
    # would be exhausted after the first few lookups.
    excluded = set(stop_words)
    return [word for word in words if word not in excluded]

In [None]:
def all_operations(words):
    """Run the complete cleaning pipeline on a list of tokens.

    Steps, in order: lower-case, remove stop words, reduce to ASCII, strip
    punctuation, drop purely numeric tokens, remove stop words again, and
    finally drop short words.

    Args:
        words: list of tokens, e.g. the output of ``word_tokenize``.

    Returns:
        A new list with the cleaned tokens.
    """
    words = to_lowercase(words)
    # Stop words must be removed BEFORE the ASCII reduction: the stop-word
    # lists contain accented forms (e.g. French "était"), which would no
    # longer match once the token has been reduced to "etait".
    words = remove_stop_words_languages(words)
    words = remove_non_ascii(words)
    words = remove_punctuation(words)
    words = remove_numbers(words)
    # Second pass: catches stop words that only become recognisable after
    # punctuation has been stripped from the token (e.g. "worden-").
    words = remove_stop_words_languages(words)
    words = remove_short_words(words)
    return words

#### Call functions

In [None]:
# Strip the HTML markup from the raw text.
clean_text = remove_html(text)

In [None]:
# Split the cleaned text into a list of tokens with NLTK's word_tokenize.
words = word_tokenize(clean_text)

In [None]:
# Run the full cleaning pipeline on the tokens.
input_as_list = all_operations(words)

In [None]:
# Preview the first 100 cleaned tokens.
print(input_as_list[:100])

#### Turn input_as_list into a string

In [None]:
# Join the cleaned tokens into one space-separated string.
input_as_string = " ".join(input_as_list)

In [None]:
# Preview the first 100 characters of the joined string.
print(input_as_string[:100])

## Analysis

### Word clouds

In [None]:
# Extra (English) words to leave out of the word cloud; they are handed to
# WordCloud through its `stopwords` argument.
sw = ['that', 'with', 'said', 'this', 'when', 'them', 'were', 'from', 'will', 'there', 'they', 'then', 'their', 'your', 'would', 'only', 'even', 'know', 'could', 'have', 'where', 'come', 'been', 'made', 'well']
# Build the word cloud on a white background from the cleaned, joined text.
text_cloud = WordCloud(background_color='white', stopwords=sw).generate(input_as_string)

In [None]:
# Render the word cloud as an image, without axes.
plt.imshow(text_cloud, interpolation='bilinear')
plt.axis('off')
# Save to the output directory; kept before plt.show(), since saving after
# the figure has been shown can yield an empty image.
plt.savefig(outdir + '2015.png', dpi=300, bbox_inches='tight')
plt.show()

### Word counts

In [None]:
# Count how often each cleaned token occurs.
word_counts = Counter(input_as_list)

In [None]:
# The 50 most frequent tokens as (word, count) pairs, most frequent first.
most_common_words = word_counts.most_common(50)

In [None]:
# Print every (word, count) pair, with the count right-aligned in a
# ten-character column.
print('Most common words:')
for word, count in most_common_words:
    print(f'{word}: {count:10d}')

In [None]:
#### From https://stackoverflow.com/questions/63018726/counter-and-plot-the-most-common-word-in-a-text ####

# Bar chart of the most common words: one bar per word, bar height = count.
y = [count for word, count in most_common_words]
x = [word for word, count in most_common_words]

# Default figure size (width, height) used for the chart.
plt.rcParams["figure.figsize"] = (20,10)
plt.bar(x, y, color='crimson')
plt.title("Term frequencies in text")
plt.ylabel("Counts")
#plt.yscale('log') # optionally set a log scale for the y-axis
plt.xticks(rotation=45)
# Write the count at the top of each bar. The first ten bars get a white label
# anchored by its top edge (so it hangs down into the bar); the remaining bars
# get a black label anchored by its bottom edge (so it sits above the bar).
for i, (word, count) in enumerate(most_common_words):
    plt.text(i, count, f' {count} ', rotation=45,
             ha='center', va='top' if i < 10 else 'bottom', color='white' if i < 10 else 'black')
plt.xlim(-0.6, len(x)-0.4) # optionally set tighter x lims
plt.tight_layout() # change the whitespace such that all labels fit nicely
plt.show()