# Clean and write text data

This script preprocesses all texts in indir. Option 1 keeps each text as a list of tokens so you can do further processing in memory; option 2 writes the cleaned tokens to files, either back to indir (overwriting the existing files) or to new files in a separate directory, outdir.

In [None]:
import os
import unicodedata
import copy
import re
from collections import Counter
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from nltk.stem import LancasterStemmer, PorterStemmer, RegexpStemmer, SnowballStemmer

In [None]:
# Directory that holds the raw input texts.
indir = r'/path/to/files/'
# Directory that receives the cleaned texts (option 2).
outdir = r'/path/to/outdir/'
# Create outdir itself if it doesn't exist already. Going through
# os.path.dirname(outdir) would only create the parent directory whenever
# outdir is given without a trailing slash.
os.makedirs(outdir, exist_ok=True)
# Extra words to drop; consulted by remove_user_defined_list.
stopword_list = []

### Option 1

Loop through directory function without writing

In [None]:
def loop_through_directory(indir, stopwordfile=None, stem=False, encoding=None):
    """Clean and tokenize every visible file in indir without writing anything.

    Args:
        indir: directory holding the input texts.
        stopwordfile: optional path to a file with extra words to remove;
            handed on to all_operations.
        stem: when true, all_operations also stems the tokens.
        encoding: text encoding used to read the files. None keeps the
            platform default of open().

    Returns:
        None. The cleaned tokens of each file are available in the local
        ``words`` list; add your own processing at the end of the loop body.
    """
    # list all files in a given directory
    for infile in os.listdir(indir):
        # avoid opening files such as .DS_Store
        if infile.startswith('.'):
            continue
        # os.path.join works whether or not indir ends with a separator
        path = os.path.join(indir, infile)
        # sub-directories cannot be read as text
        if not os.path.isfile(path):
            continue
        # open the file and do something with it, close when done
        with open(path, "r", encoding=encoding) as f:
            # try / except clause to catch encoding errors
            try:
                text = f.read()
            except (OSError, UnicodeError) as err:
                # Report the actual error and skip this file; otherwise
                # `text` would be unbound or still hold the previous file.
                print("Could not read", path, "-", err)
                continue
        # remove html
        clean_text = remove_html(text)
        # tokenize to words (needed for subsequent operations)
        words = word_tokenize(clean_text)
        words = all_operations(words, stopwordfile, stem)

### Option 2

Loop-through-directory function that writes the cleaned text either 1) back to indir (overwriting the original files) or 2) to outdir (new files).

In [None]:
def loop_through_directory(indir, stopwordfile=None, stem=False, encoding=None):
    """Clean every visible file in indir and write the result to outdir.

    Each output file gets the same name as its input file and contains the
    cleaned tokens joined by single spaces.

    Args:
        indir: directory holding the input texts.
        stopwordfile: optional path to a file with extra words to remove;
            handed on to all_operations.
        stem: when true, all_operations also stems the tokens.
        encoding: text encoding used for reading and writing. None keeps the
            platform default of open().

    Returns:
        None.
    """
    # list all files in a given directory
    for infile in os.listdir(indir):
        # avoid opening files such as .DS_Store
        if infile.startswith('.'):
            continue
        # os.path.join works whether or not indir ends with a separator
        path = os.path.join(indir, infile)
        # sub-directories cannot be read as text
        if not os.path.isfile(path):
            continue
        # open the file and do something with it, close when done
        with open(path, "r", encoding=encoding) as f:
            # try / except clause to catch encoding errors
            try:
                text = f.read()
            except (OSError, UnicodeError) as err:
                # Report the actual error and skip this file; otherwise the
                # previous file's text would be written under this name.
                print("Could not read", path, "-", err)
                continue
        # remove html
        clean_text = remove_html(text)
        # tokenize to words (needed for subsequent operations)
        words = word_tokenize(clean_text)
        words = all_operations(words, stopwordfile, stem)
        # open output file for writing, create it if it doesn't exist
        # (replace outdir with indir here to overwrite the original files)
        with open(os.path.join(outdir, infile), "w", encoding=encoding) as f:
            # write out all words (converting the list to a string with spaces)
            f.write(" ".join(words))

In [None]:
def all_operations(words, stopwordfile, stem):
    """Run the complete cleaning pipeline over a list of tokens.

    The steps in ``fixed_steps`` are always applied, in the order listed.
    Afterwards the words from stopwordfile are removed if a file is given,
    and the tokens are stemmed if stem is true.
    """
    fixed_steps = (
        remove_short_words,
        remove_non_ascii,
        remove_punctuation,
        remove_numbers,
        to_lowercase,
        remove_stop_words_languages,
        remove_user_defined_list,
        remove_unique_words,
    )
    cleaned = words
    for step in fixed_steps:
        cleaned = step(cleaned)
    # optional steps
    if stopwordfile:
        cleaned = remove_user_defined_words(cleaned, stopwordfile)
    return stem_words(cleaned) if stem else cleaned

In [None]:
def remove_unique_words(words):
    """Drop every token that occurs exactly once in words.

    Tokens that appear only once are mostly noise, so only tokens with at
    least two occurrences are kept. Order and duplicates of the remaining
    tokens are preserved.
    """
    word_counts = Counter(words)
    # Look the count up directly instead of scanning a list of unique words
    # for every token, which was quadratic on large texts.
    return [word for word in words if word_counts[word] > 1]

In [None]:
def remove_html(text):
    """ Strip html tags from text and return the remaining plain text.

    Parsing is delegated to the library BeautifulSoup (bs4) with the
    "lxml" parser.
    """
    return BeautifulSoup(text, "lxml").get_text()

def remove_non_ascii(words):
    """Return the tokenized words with every non-ASCII character removed.

    Each word is NFKD-normalized first, so an accented letter is split into
    its base letter plus combining marks and only the marks are dropped.
    A word without any ASCII character becomes an empty string.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]
 
def to_lowercase(words):
    """Return a new list with every tokenized word converted to lowercase"""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each tokenized word.

    Every character that is neither a word character nor whitespace is
    deleted; words that end up empty are left out of the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token]

def remove_numbers(words):
    """Drop the tokenized words that consist of digits only.

    Words that merely contain digits, or that include a sign or a decimal
    point, are kept.
    """
    return [token for token in words if not token.isdigit()]

def remove_short_words(words, n=4):
    """ Keep only the words that are at least n characters long """
    return [token for token in words if not len(token) < n]

def remove_stop_words_languages(words, languages=('english',)):
    """ Remove stop words from specified languages

    Args:
        words: list of tokenized words.
        languages: a language name or an iterable of language names known
            to nltk.corpus.stopwords. The default is an immutable tuple so
            it cannot be modified between calls.

    Returns:
        A new list without the stop words of the given languages.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(languages, str):
        languages = (languages,)
    # Collect into a set: duplicates across languages disappear and the
    # membership test per word is constant-time.
    stop_words = set()
    for language in languages:
        stop_words.update(stopwords.words(language))
    return remove_stop_words(words, stop_words)

def remove_user_defined_words(words, word_file):
    """ Remove every word listed by the user in word_file.

    The file is read completely and split with word_tokenize; the
    resulting tokens are used as the stop word list.
    """
    with open(word_file) as handle:
        user_stop_words = word_tokenize(handle.read())
    return remove_stop_words(words, user_stop_words)

def remove_user_defined_list(words):
    """ Remove the words found in the hardcoded module-level stopword_list """
    return [token for token in words if token not in stopword_list]

def remove_stop_words(words, stop_words):
    """ Return the words that do not occur in stop_words, in original order """
    return [token for token in words if token not in stop_words]

def stem_words(words, stemmer=None):
    """Stem words in list of tokenized words

    Args:
        words: list of tokenized words.
        stemmer: object with a ``stem(word)`` method, e.g. one of the nltk
            stemmers. When omitted, a LancasterStemmer is created for the
            call, so no stemmer is built at definition time or shared
            between calls.

    Returns:
        A new list holding the stem of each word, in the same order.
    """
    if stemmer is None:
        stemmer = LancasterStemmer()
    return [stemmer.stem(word) for word in words]

In [None]:
# Run the pipeline on every file in indir, using whichever
# loop_through_directory definition was executed last. Neither variant has a
# return statement, so run_script ends up as None.
run_script = loop_through_directory(indir)