This notebook evaluates different methods for tokenization and stemming/lemmatization
and assesses the impact on binary sentiment classification, using a train/dev dataset of sample of 1000 reviews from the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/).  Each tokenization method is evaluated on the same learning algorithm ($\ell_2$-regularized logistic regression); the only difference is the tokenization process. For more, see: http://sentiment.christopherpotts.net/tokenizing.html

In [1]:
import nltk
import spacy
from nltk.stem.porter import *
from TokenizationTest import TokenizationTest
from happyfuntokenizing import Tokenizer as potts

In [3]:
# spaCy lemmatization needs tagger but disable the rest
nlp = spacy.load('en', disable=['ner,parser'])
nlp.remove_pipe('ner')
nlp.remove_pipe('parser')

# load NLTK porter stemmer
stemmer = PorterStemmer()

# load Potts sentiment tokenizer
potts_tokenizer=potts()

In [4]:
def spacy_tokenizer(data):
    spacy_tokens=nlp(data)
    return [token.text for token in spacy_tokens]

def spacy_lemmatizer(data):
    spacy_tokens=nlp(data)
    return [token.lemma_ for token in spacy_tokens]

In [5]:
tester=TokenizationTest("../data/sentiment.1000.train.txt", "../data/sentiment.1000.dev.txt")

In [6]:
tester.evaluate(str.split)

Function 'split' Accuracy: 0.858


In [7]:
tester.evaluate(stemmer.stem)

Function 'stem' Accuracy: 0.825




In [8]:
tester.evaluate(nltk.word_tokenize)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/wesley/nltk_data'
    - '/Users/wesley/anaconda3/nltk_data'
    - '/Users/wesley/anaconda3/share/nltk_data'
    - '/Users/wesley/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [9]:
tester.evaluate(spacy_tokenizer)

Function 'spacy_tokenizer' Accuracy: 0.871


In [10]:
tester.evaluate(spacy_lemmatizer)

Function 'spacy_lemmatizer' Accuracy: 0.872


In [11]:
tester.evaluate(potts_tokenizer.tokenize)

Function 'tokenize' Accuracy: 0.885
