# Doing things with text

The **first time** you use NLTK, make sure to download the data packages this notebook needs (the tokenizer models used by `word_tokenize` and the `stopwords` corpus) with the following command:

In [None]:
import nltk
# Fetch NLTK data; the tokenizer and stop-word corpus imported in this
# notebook have to be installed this way before they can be used.
nltk.download()

### Import packages

In [None]:
from bs4 import BeautifulSoup
import unicodedata
import re
import os
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()

### Import and read text file

In [None]:
# Folder the input file is read from and folder the result is written to.
indir = '/path/to/indir/'
outdir = '/path/to/outdir/'
# Make outdir (and any missing parent folders) if it doesn't exist already.
# outdir is passed directly instead of os.path.dirname(outdir): dirname only
# returns outdir itself when the path ends with a separator, otherwise it
# returns the parent folder and outdir would never be created.
os.makedirs(outdir, exist_ok=True)

In [None]:
# os.path.join inserts the path separator only when indir does not already
# end with one, so indir works with or without a trailing slash.
file = os.path.join(indir, 'infile.txt') # change 'infile' for actual file name

In [None]:
# Read the whole input file into memory as one string, decoded as UTF-8.
with open(file, encoding='utf8') as f:
    text = f.read()

In [None]:
# Preview the first 400 characters of the raw text.
print(text[:400])

### Preprocessing

In [None]:
def remove_html(text):
    """Strip HTML tags from ``text`` and return the remaining plain text.

    The markup is parsed with BeautifulSoup (bs4) using the "lxml" parser.
    """
    return BeautifulSoup(text, "lxml").get_text()

def remove_short_words(words, n=3):
    """Keep only the words that are at least ``n`` characters long.

    Returns a new list; the order of the remaining words is unchanged.
    """
    return [token for token in words if len(token) >= n]

def remove_non_ascii(words):
    """Reduce every word in a list of tokenized words to its ASCII characters.

    Each word is NFKD-normalized first, so an accented letter is split into
    its base letter plus combining marks; only the ASCII part survives the
    encode/decode round trip. A word without any ASCII character becomes an
    empty string, so the result always has as many items as the input.
    """
    def to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [to_ascii(token) for token in words]

def remove_punctuation(words):
    """Strip punctuation from every word in a list of tokenized words.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted. Words that end up empty are
    dropped from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_numbers(words):
    """Drop the words that consist of digits only from a list of tokenized words.

    Words mixing digits with other characters (e.g. "12a", "1.5", "-3") are
    kept, because ``str.isdigit`` is false for them.
    """
    return [token for token in words if not token.isdigit()]

def to_lowercase(words):
    """Return a new list with every tokenized word converted to lowercase."""
    return [token.lower() for token in words]

def remove_stop_words_languages(words, languages=('dutch', 'french')):
    """Remove the NLTK stop words of the specified languages from ``words``.

    Args:
        words: List of tokenized words.
        languages: Iterable of language names, each passed to
            ``stopwords.words``. Defaults to Dutch and French. The default
            is a tuple so that no mutable object is shared between calls.

    Returns:
        A new list with the words that are not a stop word in any of the
        given languages, in their original order.
    """
    # Collect everything in one set: duplicates across languages disappear
    # and the membership tests done by remove_stop_words are hash lookups
    # instead of scans over a list.
    stop_words = set()
    for language in languages:
        stop_words.update(stopwords.words(language))
    return remove_stop_words(words, stop_words)

def remove_stop_words(words, stop_words):
    """Return the words that do not occur in ``stop_words``.

    Args:
        words: List of tokenized words.
        stop_words: Iterable of words to remove. The comparison is exact
            (case-sensitive).

    Returns:
        A new list with the remaining words in their original order.
    """
    # Build the lookup set once. Testing membership against a list costs a
    # full scan per word, and a one-shot iterator would be exhausted after
    # the first test.
    excluded = frozenset(stop_words)
    return [word for word in words if word not in excluded]

In [None]:
def all_operations(words):
    """Run the complete preprocessing pipeline on a list of tokenized words.

    The steps are applied in this order, each one receiving the output of
    the previous one: ASCII folding, punctuation removal, number removal,
    lowercasing, stop-word removal and short-word removal (the last two
    with their default settings).
    """
    # NOTE(review): tokens are ASCII-folded before the stop-word step, so stop
    # words containing accented characters presumably no longer match their
    # folded tokens -- confirm whether this order is intended.
    pipeline = (
        remove_non_ascii,
        remove_punctuation,
        remove_numbers,
        to_lowercase,
        remove_stop_words_languages,
        remove_short_words,
    )
    for step in pipeline:
        words = step(words)
    return words

#### Call functions

In [None]:
# Strip the HTML markup from the raw text.
clean_text = remove_html(text)

In [None]:
# Split the cleaned text into a list of tokens.
words = word_tokenize(clean_text)

In [None]:
# Preview the first 100 tokens.
print(words[:100])

In [None]:
# Run the full preprocessing pipeline on the tokens.
input_as_list = all_operations(words)

In [None]:
# Preview the first 100 tokens that remain after preprocessing.
print(input_as_list[:100])

## Count total number of words

Function to count words in a string by splitting on whitespace

In [None]:
def word_count(string):
    """Return how many whitespace-separated words ``string`` contains.

    Runs of consecutive whitespace count as a single separator, so an empty
    or whitespace-only string yields 0.
    """
    return len(string.split())

**Count total number of tokens (words) in raw text from 'file' before preprocessing**

In [None]:
# Report the whitespace-based word count of the raw, unprocessed text.
print("The total number of words in \'%s\' before preprocessing is: %s" 
      %(str(file), word_count(text))) # Call function for 'text'

**Count total number of tokens (words) in text from 'file' after preprocessing**

In [None]:
# Report how many tokens are left after preprocessing.
print("The total number of words in \'%s\' after preprocessing is: %s" 
      %(str(file), len(input_as_list))) # Calculate length of list 'input_as_list'

**Calculate number of tokens removed by preprocessing**

In [None]:
# Difference between the whitespace-based word count of the raw text and the
# number of tokens kept after preprocessing.
print("The total number of tokens removed by preprocessing is: %s" 
      %(word_count(text) - len(input_as_list)))

#### Write input_as_list to file

In [None]:
# Write the preprocessed tokens to outfile.txt as one space-separated line.
# The encoding is set explicitly so the output matches the UTF-8 used for
# reading the input instead of depending on the platform default, and
# os.path.join keeps the path valid when outdir has no trailing separator.
with open(os.path.join(outdir, 'outfile.txt'), "w", encoding='utf8') as f: # If outfile.txt does not exist, it will be made
    f.write(" ".join(input_as_list))