# Doing things with text 4

## Importing (and analyzing) multiple texts as one corpus

### Import packages

In [2]:
pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.2-cp311-cp311-macosx_10_9_x86_64.whl (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.8/159.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from bs4 import BeautifulSoup
import unicodedata
import re
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
from collections import Counter

#### Define in- and out-directories

`indir` is a folder on your computer that contains multiple text files. `outdir` is a folder (created if it does not exist yet) in which the cleaned versions of those files are stored.

In [None]:
# Folder with the input text files.
indir = r'/Users/huijn001/surfdrive/data_lokaal/medisch_contact_txt/'
# Folder that receives the cleaned version of each input file.
outdir = r'/Users/huijn001/surfdrive/data_lokaal/medisch_contact_txt_clean/'
# Create outdir itself if it doesn't exist already. Going through
# os.path.dirname() would create only the parent folder whenever the path is
# written without a trailing slash.
os.makedirs(outdir, exist_ok=True)

#### User defined stopwords (for wordcloud and Counter). Change if needed!

In [None]:
# User-defined stop words, each listed once (the earlier version repeated
# 'would', 'their', 'could' and 'there'). Extend or shorten as needed.
stopword_list = [
    'that', 'with', 'said', 'this', 'when', 'them', 'were', 'from', 'will',
    'there', 'they', 'then', 'their', 'your', 'would', 'only', 'even', 'know',
    'could', 'have', 'where', 'come', 'been', 'made', 'well',
]

## Preprocessing

In [None]:
def remove_html(text):
    """Strip HTML tags from ``text`` and return the remaining plain text.

    The markup is parsed by BeautifulSoup (bs4) using the "lxml" parser.
    """
    return BeautifulSoup(text, "lxml").get_text()

def remove_short_words(words, n=5):
    """Return the words that are at least ``n`` characters long, order preserved."""
    return [token for token in words if not len(token) < n]

def remove_non_ascii(words):
    """Strip the non-ASCII characters from every word in a tokenized list.

    Each word is NFKD-normalized first, so an accented letter is split into
    its base letter plus a combining mark and only the mark is dropped.
    A word without any ASCII-representable character becomes an empty string.
    """
    def to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [to_ascii(token) for token in words]

def remove_punctuation(words):
    """Delete punctuation characters from every word in a tokenized list.

    Any character that is neither a word character nor whitespace is removed;
    words that end up empty are left out of the result.
    """
    non_word_chars = re.compile(r'[^\w\s]')
    stripped_words = (non_word_chars.sub('', token) for token in words)
    return [token for token in stripped_words if token != '']

def remove_numbers(words):
    """Drop every word that consists of digits only from a tokenized list."""
    return [token for token in words if not token.isdigit()]

def to_lowercase(words):
    """Return a lowercased copy of every word in a tokenized list."""
    return [token.lower() for token in words]

def remove_stop_words_languages(words, languages=('dutch', 'french')):
    """Remove the NLTK stop words of the given languages from a list of words.

    Args:
        words: list of tokenized words.
        languages: iterable of NLTK stop word corpus names
            (default: Dutch and French).

    Returns:
        A new list holding the words that are not stop words, order preserved.

    Besides every stop word as NLTK lists it, its ASCII-folded form is
    filtered as well. all_operations() runs remove_non_ascii() before this
    function, so an accented stop word (for example French "était") arrives
    here without its accent and would otherwise slip through.
    """
    stop_words = set()
    for language in languages:
        for stop_word in stopwords.words(language):
            stop_words.add(stop_word)
            # Same folding as remove_non_ascii(): decompose, then drop what is not ASCII.
            folded = unicodedata.normalize('NFKD', stop_word).encode('ascii', 'ignore').decode('ascii')
            if folded:
                stop_words.add(folded)
    # Hand over the set itself: membership tests stay constant-time.
    return remove_stop_words(words, stop_words)

def remove_stop_words(words, stop_words):
    """Remove stop words from a list of words.

    Args:
        words: list of tokenized words.
        stop_words: iterable of words to drop (list, tuple or set).

    Returns:
        A new list with every word that is not a stop word, order preserved.
    """
    # Build the lookup set once so each membership test is constant-time
    # instead of a scan over the whole stop word list.
    stop_word_set = frozenset(stop_words)
    return [word for word in words if word not in stop_word_set]

In [None]:
def all_operations(words):
    """Run the complete cleaning pipeline on a list of tokenized words.

    Steps, in order: strip non-ASCII characters, strip punctuation, drop
    all-digit tokens, lowercase, drop short words, drop Dutch/French stop words.

    Args:
        words: list of tokenized words.

    Returns:
        The cleaned list of words.
    """
    words = remove_non_ascii(words)
    words = remove_punctuation(words)
    words = remove_numbers(words)
    words = to_lowercase(words)
    # Filter on length only after the characters have been stripped: a raw token
    # such as "d'une" is 5 characters long but shrinks to "dune" once its
    # punctuation is gone, so filtering first would let it through.
    words = remove_short_words(words)
    words = remove_stop_words_languages(words)
    return words

#### Call functions

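N.B. The cell below also writes the cleaned text of every file to outdir, under the same file name. Comment out the `with open(...)` block that writes to outdir if you do not want to save the cleaned files.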

In [None]:
# Collects the cleaned words of all files; reset here so the cell can be re-run.
all_texts_list = []

# list all files in a given directory (sorted, so every run handles them in the same order)
files = sorted(os.listdir(indir))

for infile in files:
    # avoid opening files such as .DS_Store
    if infile.startswith('.'):
        continue
    inpath = os.path.join(indir, infile)
    # skip sub-folders: only regular files can be read as text
    if not os.path.isfile(inpath):
        continue
    # open the file and read it, close when done
    try:
        with open(inpath, "r", encoding="utf-8") as f:
            text = f.read()
    except (OSError, UnicodeError) as err:
        # Report the actual error and move on to the next file. Without the
        # `continue`, the code below would run on the previous file's text
        # (or fail with a NameError on the very first file).
        print(f"Skipping {infile}: {err}")
        continue
    # remove html
    clean_text = remove_html(text)
    # tokenize to words (needed for subsequent operations)
    words = word_tokenize(clean_text)
    words = all_operations(words)
    # open output file for writing, create it if it doesn't exist
    with open(os.path.join(outdir, infile), "w", encoding="utf-8") as f:
        # write out all words (converting the list to a string with spaces)
        f.write(" ".join(words))
    all_texts_list.extend(words)

## Analysis

### Word clouds

In [None]:
# Glue all cleaned words together, separated by single spaces, so the corpus
# can be handed to WordCloud.generate() as one string.
all_texts_string = str.join(" ", all_texts_list)

In [None]:
# Build the word cloud from the joined corpus text: white background, with the
# user-defined stop words passed in as the words to leave out.
cloud_options = {'background_color': 'white', 'stopwords': stopword_list}
text_cloud = WordCloud(**cloud_options).generate(all_texts_string)

In [None]:
# Draw the generated word cloud as an image, rendered with bilinear interpolation.
plt.imshow(text_cloud, interpolation='bilinear')
# Hide the axes: they carry no information for a word cloud.
plt.axis('off')
# Uncomment the next line (and adjust the path) to save the word cloud to your computer.
#plt.savefig('/Users/huijn001/Desktop/got.png', dpi=300, bbox_inches='tight') # To save word cloud to your computer
plt.show()

### Word counts

In [None]:
# Show the corpus size and a short sample instead of dumping every token of
# the corpus into the notebook output.
print(f'{len(all_texts_list)} tokens in total; first 50:')
print(all_texts_list[:50])

In [None]:
# Tally how often each cleaned word occurs across all texts.
word_counts = Counter()
word_counts.update(all_texts_list)

In [None]:
# Keep the highest-ranked words as (word, count) pairs, most frequent first.
top_n = 100
most_common_words = word_counts.most_common(top_n)

In [None]:
# Summarize the counts instead of dumping the whole Counter; the ranked list
# itself is printed by the 'Most common words' cell below.
print(f'{len(word_counts)} distinct words, {sum(word_counts.values())} tokens in total')

In [None]:
# List every (word, count) pair, with the count right-aligned in 7 columns.
print('Most common words:')
for pair in most_common_words:
    print('%s: %7d' % pair)

#### Add user defined stop words based on the Counter output

N.B. If you want to rerun Counter, make sure to use all_texts_list_clean instead of all_texts_list!

In [None]:
def remove_user_defined_stopword_list(words, stop_words=None):
    """Remove the user-defined stop words from a list of words.

    Args:
        words: list of tokenized words.
        stop_words: iterable of words to drop. When None (the default), the
            notebook-level ``stopword_list`` is used.

    Returns:
        A new list with every word that is not a stop word, order preserved.
    """
    if stop_words is None:
        # Looked up at call time, so later edits to stopword_list are picked up.
        stop_words = stopword_list
    unwanted = frozenset(stop_words)
    return [word for word in words if word not in unwanted]

In [None]:
# Corpus word list with the user-defined stop words filtered out; the original
# all_texts_list is left untouched.
all_texts_list_clean = remove_user_defined_stopword_list(words=all_texts_list)

#### Visualize word counts in all texts in a bar chart

In [None]:
#### From https://stackoverflow.com/questions/63018726/counter-and-plot-the-most-common-word-in-a-text ####

# Count again on the list without the user-defined stop words. Plotting
# most_common_words would show the counts from before that filtering, so the
# stop word removal above would have no effect on the chart.
top_words = Counter(all_texts_list_clean).most_common(100)

y = [count for word, count in top_words]
x = [word for word, count in top_words]

# Size this figure only: assigning plt.rcParams["figure.figsize"] would also
# resize every figure drawn afterwards (e.g. the word cloud when it is re-run).
plt.figure(figsize=(20, 10))
plt.bar(x, y, color='crimson')
plt.title("Term frequencies in text")
plt.ylabel("Counts")
#plt.yscale('log') # optionally set a log scale for the y-axis
plt.xticks(rotation=45)
# Label every bar with its count: in white below the bar top for the first 10
# bars, in black above the bar for the rest.
for i, (word, count) in enumerate(top_words):
    plt.text(i, count, f' {count} ', rotation=45,
             ha='center', va='top' if i < 10 else 'bottom', color='white' if i < 10 else 'black')
plt.xlim(-0.6, len(x)-0.4) # optionally set tighter x lims
plt.tight_layout() # change the whitespace such that all labels fit nicely
plt.show()