This notebook outlines several methods for tokenizing text into words (and sentences), including:

* whitespace
* nltk (Penn Treebank tokenizer)
* nltk (Twitter-aware)
* spaCy
* custom regular expressions

highlighting differences between them.

In [1]:
import nltk, re, json
import spacy
from collections import Counter

In [2]:
# If you haven't downloaded the NLTK sentence segmentation model before, do so here
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/wesley/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# spaCy lemmatization needs tagger but disable the rest
nlp = spacy.load('en', disable=['tagger,ner,parser'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('ner')
nlp.remove_pipe('parser');

In [4]:
def read_tweets_from_json(filename):
    tweets=[]
    with open(filename, encoding="utf-8") as file:
        data=json.load(file)
        for tweet in data:
            tweets.append(tweet["text"])
    return tweets        

trump_tweets.json comes from the Trump Twitter collection here (downloaded 1/19/19)
http://www.trumptwitterarchive.com/archive

In [5]:
filename="../data/trump_tweets.json"

In [6]:
tweets=read_tweets_from_json(filename)

In [7]:
whitespace_tokens=[]
for tweet in tweets:
    whitespace_tokens.append(tweet.split())

In [8]:
nltk_tokens=[]
for tweet in tweets:
    nltk_tokens.append(nltk.word_tokenize(tweet, language="english"))

In [9]:
nltk_casual_tokens=[]
for tweet in tweets:
    nltk_casual_tokens.append(nltk.casual_tokenize(tweet))

In [10]:
spacy_tokens=[]
for tweet in tweets:
    spacy_tokens.append([token.text for token in nlp(tweet)])

In [11]:
# Shorter version of http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py

# The order here is important (match from first to last)

# Keep usernames together (any token starting with @, followed by A-Z, a-z, 0-9)
regexes=(r"(?:@[\w_]+)",

# Keep hashtags together (any token starting with #, followed by A-Z, a-z, 0-9, _, or -)
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",

# Keep words with apostrophes, hyphens and underscores together
r"(?:[a-z][a-z’'\-_]+[a-z])",

# Keep all other sequences of A-Z, a-z, 0-9, _ together
r"(?:[\w_]+)",

# Everything else that's not whitespace
r"(?:\S)"
)

big_regex="|".join(regexes)

my_extensible_tokenizer = re.compile(big_regex, re.VERBOSE | re.I | re.UNICODE)

def my_extensible_tokenize(text):
    return my_extensible_tokenizer.findall(text)

In [12]:
extensible_tokens=[]
for tweet in tweets:
    extensible_tokens.append(my_extensible_tokenize(tweet))

Q1: Write a function to print out the first 5 tokenized tweets in each of the five tokenizers above. Examine those tweets; how would you characterize the differences?



In [17]:
' '.join(2)

TypeError: can only join an iterable

In [20]:
for idx, (one, two, three, four, five) in enumerate(zip(nltk_tokens, nltk_casual_tokens, spacy_tokens, whitespace_tokens, extensible_tokens)):
    if idx >= 5:
        break
    print("NLTK      :\t%s" % ' '.join(one))
    print("CASUAL    :\t%s" % ' '.join(two))
    print("SPACY     :\t%s" % ' '.join(three))
    print("WHITESPACE:\t%s" % ' '.join(four))
    print("EXTENSIBLE:\t%s" % ' '.join(five))


    print()


NLTK      :	Mexico is doing NOTHING to stop the Caravan which is now fully formed and heading to the United States . We stopped the last two - many are still in Mexico but can ’ t get through our Wall , but it takes a lot of Border Agents if there is no Wall . Not easy !
CASUAL    :	Mexico is doing NOTHING to stop the Caravan which is now fully formed and heading to the United States . We stopped the last two - many are still in Mexico but can ’ t get through our Wall , but it takes a lot of Border Agents if there is no Wall . Not easy !
SPACY     :	Mexico is doing NOTHING to stop the Caravan which is now fully formed and heading to the United States . We stopped the last two - many are still in Mexico but ca n’t get through our Wall , but it takes a lot of Border Agents if there is no Wall . Not easy !
WHITESPACE:	Mexico is doing NOTHING to stop the Caravan which is now fully formed and heading to the United States. We stopped the last two - many are still in Mexico but can’t get thro

In [13]:
print(whitespace_tokens[5], nltk_tokens[5], nltk_casual_tokens[5], spacy_tokens[5], extensible_tokens[5])

['Fake', 'News', 'is', 'truly', 'the', 'ENEMY', 'OF', 'THE', 'PEOPLE!'] ['Fake', 'News', 'is', 'truly', 'the', 'ENEMY', 'OF', 'THE', 'PEOPLE', '!'] ['Fake', 'News', 'is', 'truly', 'the', 'ENEMY', 'OF', 'THE', 'PEOPLE', '!'] ['Fake', 'News', 'is', 'truly', 'the', 'ENEMY', 'OF', 'THE', 'PEOPLE', '!'] ['Fake', 'News', 'is', 'truly', 'the', 'ENEMY', 'OF', 'THE', 'PEOPLE', '!']


Q2: Write a function `compare(tokenization_one, tokenization_two)` that compares two tokenizations of the same text and finds the 20 most frequent tokens that don't appear in the other.



In [24]:
def counter_helper(tokens, counter):
    
    for tweets in tokens:
        for t in tweets:
            counter[t] += 1


def compare(one, two):
    
    tokens_one_counter, tokens_two_counter = Counter(), Counter()
    
    counter_helper(one, tokens_one_counter)
    counter_helper(two, tokens_two_counter)
    
    for key in 
    
    return tokens_one_counter, tokens_two_counter

In [25]:
compare(nltk_casual_tokens, nltk_tokens)

(Counter({'Mexico': 210,
          'is': 9824,
          'doing': 646,
          'NOTHING': 48,
          'to': 14827,
          'stop': 339,
          'the': 21240,
          'Caravan': 13,
          'which': 441,
          'now': 1116,
          'fully': 91,
          'formed': 9,
          'and': 11181,
          'heading': 41,
          'United': 360,
          'States': 326,
          '.': 37167,
          'We': 1643,
          'stopped': 46,
          'last': 692,
          'two': 298,
          '-': 5674,
          'many': 891,
          'are': 4098,
          'still': 241,
          'in': 8785,
          'but': 1417,
          'can': 1354,
          '’': 3284,
          't': 965,
          'get': 1314,
          'through': 197,
          'our': 2833,
          'Wall': 211,
          ',': 20341,
          'it': 3649,
          'takes': 74,
          'a': 10659,
          'lot': 222,
          'of': 9778,
          'Border': 319,
          'Agents': 13,
          'if': 806,
     

Q3: Use one of the NLTK tokenizers; write code to determine how many sentences are in this dataset, and what the average number of words per sentence is.



Q4 (check-plus): modify the extensible tokenizer above to keep urls together (e.g., www.google.com or http://www.google.com)

In [None]:
# Keep usernames together (any token starting with @, followed by A-Z, a-z, 0-9)
regexes=(r"(?:@[\w_]+)",

# Keep hashtags together (any token starting with #, followed by A-Z, a-z, 0-9, _, or -)
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",

# Keep urls together
# FILL IN HERE
         
# Keep words with apostrophes, hyphens and underscores together
r"(?:[a-z][a-z’'\-_]+[a-z])",

# Keep all other sequences of A-Z, a-z, 0-9, _ together
r"(?:[\w_]+)",

# Everything else that's not whitespace
r"(?:\S)"
)

big_regex="|".join(regexes)

my_url_extensible_tokenizer = re.compile(big_regex, re.VERBOSE | re.I | re.UNICODE)

def my_extensible_tokenize_with_urls(text):
    return my_url_extensible_tokenizer.findall(text)

In [None]:
print ('\n'.join(my_extensible_tokenize_with_urls("The course website is http://people.ischool.berkeley.edu/~dbamman/info256.html")))