In [1]:
# Imports
import re
import nltk
import gensim
from nltk.stem import SnowballStemmer   
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from spellchecker import SpellChecker

In [2]:
# Custom Jupyter Progress Bar

def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating a notebook progress widget.

    This is a generator: nothing (including the widget imports) runs until
    the first item is requested.

    Parameters
    ----------
    sequence : iterable
        Items to pass through unchanged.
    every : int, optional
        Refresh the widget on the first item and then on every ``every``-th
        item. When omitted and the size is known, it defaults to 1 for up to
        200 items and ``int(size / 200)`` otherwise. It must be given when
        the size cannot be determined.
    size : int, optional
        Total number of items. When omitted, ``len(sequence)`` is tried.
    name : str
        Label prefix shown next to the counter.

    Raises
    ------
    AssertionError
        If the size is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Fall back to "unknown length" mode when the sequence has no len().
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total available: show a full bar in 'info' style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            if count == 1 or count % every == 0:
                if unknown_length:
                    caption.value = '{name}: {index} / ?'.format(
                        name=name, index=count)
                else:
                    bar.value = count
                    caption.value = u'{name}: {index} / {size}'.format(
                        name=name, index=count, size=size)
            yield item
    except BaseException:
        # Anything raised while iterating (or thrown into the generator)
        # turns the bar red before being propagated unchanged.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(
            name=name, index=str(count or '?'))

In [3]:
# Function to get text in selective lowercase form

def abbr_or_lower(word):
    """Return ``word`` unchanged if it starts like an abbreviation, else lowercased.

    A word is kept as-is when, from its first character, it contains at
    least two consecutive chunks of "one or more uppercase ASCII letters
    followed by zero or more lowercase ASCII letters" (e.g. ``"USA"``,
    ``"McDonald"``). Everything else is returned via ``str.lower()``.
    """
    keeps_case = re.match('([A-Z]+[a-z]*){2,}', word)
    return word if keeps_case else word.lower()

In [4]:
# Function for different forms of tokenization

def tokenize(words, modulation, lowercase='basic'):
    """Split text into filtered tokens, optionally stemmed or lemmatized.

    The text is split on runs of non-word characters. Tokens that appear in
    the combined gensim + NLTK English stop-word lists, or that contain no
    ASCII letter, are dropped.

    Parameters
    ----------
    words : str
        Raw text to tokenize.
    modulation : int
        0 - keep the (case-normalised) token as is;
        1 - apply the English Snowball stemmer;
        2 - apply the WordNet lemmatizer.
        Any other value yields an empty list.
    lowercase : str
        ``'basic'`` lowercases every token; any other value delegates to
        ``abbr_or_lower`` so abbreviation-like tokens keep their case.

    Returns
    -------
    list of str
        The surviving tokens, in input order.
    """
    tokens = re.split(r'\W+', words)

    # Get comprehensive set of stopwords
    stop_words = STOPWORDS.union(stopwords.words('english'))

    # Build the stemmer / lemmatizer once per call instead of once per token.
    if modulation == 0:
        normalise = None
    elif modulation == 1:
        normalise = SnowballStemmer("english").stem
    elif modulation == 2:
        normalise = WordNetLemmatizer().lemmatize
    else:
        # Unrecognised modulation has always produced no tokens.
        return []

    stems = []
    for token in tokens:
        if lowercase == 'basic':
            # All text as lowercase
            lowers = token.lower()
        else:
            # Custom lowercase to allow all caps text for emphasis
            lowers = abbr_or_lower(token)

        # NOTE(review): the stop-word test is case-sensitive, so tokens left
        # capitalised by abbr_or_lower are never treated as stop words -
        # confirm this is intended.
        if lowers in stop_words:
            continue
        # Filter out tokens not containing letters (numbers, empty strings)
        if not re.search('[a-zA-Z]', lowers):
            continue

        stems.append(normalise(lowers) if normalise is not None else lowers)

    return stems

In [5]:
# Function to get list of unique words in text

def unique_str(list_of_strings):
    """Return the distinct elements found across all items of ``list_of_strings``.

    Every item is itself iterated, so a list of tokens contributes its
    tokens while a plain string contributes its individual characters.

    Returns
    -------
    list
        The unique elements in unspecified order. An empty input gives an
        empty list (previously an empty tuple), so the return type no longer
        depends on the input.
    """
    # Accumulate into a single set rather than rebuilding a set and a list
    # on every iteration.
    unique = set()
    for item in list_of_strings:
        unique.update(item)
    return list(unique)

In [6]:
# Function to get features using key words

def get_feature(scraped_text, feature_list):
    """Flag which entries of ``scraped_text`` share an element with ``feature_list``.

    Each entry is converted to a set and intersected with ``feature_list``.

    Returns
    -------
    list of int
        One value per entry, in order: 1 when the intersection is non-empty,
        0 otherwise.
    """
    return [
        1 if set(entry).intersection(feature_list) else 0
        for entry in scraped_text
    ]

In [7]:
# Function to find common words

# text = cleaned text format of multiple corpora
# hint = text to look for, string
# method = text starting with hint, ends with hint, or hint in text

def find_words(text, hint, method):
    """Collect the distinct words in ``text`` that match ``hint``.

    Parameters
    ----------
    text : iterable of iterables of str
        Documents, each an iterable of words.
    hint : str
        Text to look for.
    method : str
        ``'startswith'`` - word begins with ``hint``;
        ``'endswith'`` - word ends with ``hint``;
        ``'in'`` - ``hint`` occurs anywhere in the word.
        Any other value matches nothing.

    Returns
    -------
    set
        The matching words.
    """
    # Pick the predicate once instead of re-testing `method` for every word.
    if method == 'startswith':
        matches = lambda word: word.startswith(hint)
    elif method == 'endswith':
        matches = lambda word: word.endswith(hint)
    elif method == 'in':
        matches = lambda word: hint in word
    else:
        matches = None

    return {
        word
        for document in text
        for word in document
        if matches is not None and matches(word)
    }

In [8]:
# Spell Check

def spell_check(text):
    """Return ``text`` with misspelled words replaced by suggested corrections.

    The text is split on non-word characters, the tokens are passed to a
    ``SpellChecker`` and every token it reports as unknown is replaced,
    as a whole word, by ``SpellChecker.correction`` of that token.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    str
        The corrected text. Words with no available correction are left
        untouched.
    """
    # Instantiate Spell Checker and register extra terms with its word list
    # (presumably so they are not flagged - confirm against the data).
    spell = SpellChecker()
    for extra_term in ('km', 'vrt', 'dm'):
        spell.word_frequency.add(extra_term)

    # re.split yields '' at both ends when the text starts or ends with a
    # non-word character; drop every empty token, not just the first one.
    list_of_words = [
        abbr_or_lower(token)
        for token in re.split(r'\W+', text)
        if token
    ]

    # Identify misspelled text
    misspelled = spell.unknown(list_of_words)

    # Switch misspelled words with corrections
    for word in misspelled:
        if not word:
            # Replacing '' would insert text between every character.
            continue
        corr = spell.correction(word)
        if corr is None or corr == word:
            # No suggestion available: keep the original word rather than
            # writing the literal string "None" into the text.
            continue
        # Whole-word match so that a misspelling which happens to be a
        # substring of a longer word does not corrupt that word. A callable
        # replacement avoids backslash interpretation in the correction.
        pattern = r'\b' + re.escape(str(word)) + r'\b'
        text = re.sub(pattern, lambda _match, _fix=str(corr): _fix, text)

    return text