# AllenNLP in 100 Line
- Stanford Sentiment TreeBank

### Tokenisation (Lemma + Stem)

In [27]:
import nltk
from nltk.tokenize import word_tokenize

In [36]:
s = '''Good muffins cost $3.88\nin New York.  and this is &&&, !@£ %£$£$%, ; //,/.,/,  very much the best of this sun to high low ground. Please buy me two of them.\n\nThanks.'''

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Ocean/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [28]:
# word_tokenize() uses TreebankWordTokenizer internally
word_tokenize(s)

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [6]:
from nltk.tokenize import sent_tokenize

In [7]:
# sent_tokenizer() uses PunktSentenceTokenizer internally
sent_tokenize(s)

['Good muffins cost $3.88\nin New York.',
 'Please buy me two of them.',
 'Thanks.']

In [8]:
import spacy

In [9]:
nlp = spacy.load('en_core_web_sm')

In [10]:
doc = nlp(s)

In [11]:
[token.text for token in doc]

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 '\n',
 'in',
 'New',
 'York',
 '.',
 ' ',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 '\n\n',
 'Thanks',
 '.']

In [12]:
# https://github.com/explosion/spaCy/issues/93
[sent.string.strip() for sent in  doc.sents]

['Good muffins cost $3.88\nin New York.',
 'Please buy me two of them.',
 'Thanks.']

In [13]:
from nltk.stem.porter import PorterStemmer

In [14]:
stemmer = PorterStemmer()

In [15]:
words = ['caresses', 'flies', 'dies', 'mules', 'denied',
           'died', 'agreed', 'owned', 'humbled', 'sized',
           'meetings', 'stating', 'siezing', 'itemization',
           'sensational', 'traditional', 'reference', 'colonizer',
           'plotted']

In [16]:
[stemmer.stem(word) for word in words]

['caress',
 'fli',
 'die',
 'mule',
 'deni',
 'die',
 'agre',
 'own',
 'humbl',
 'size',
 'meet',
 'state',
 'siez',
 'item',
 'sensat',
 'tradit',
 'refer',
 'colon',
 'plot']

In [17]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [19]:
[lemmatizer.lemmatize(plural) for plural in words]

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/Users/Ocean/nltk_data'
    - '/Users/Ocean/.virtualenvs/general/bin/../nltk_data'
    - '/Users/Ocean/.virtualenvs/general/bin/../share/nltk_data'
    - '/Users/Ocean/.virtualenvs/general/bin/../lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [20]:
doc = nlp(' '.join(words))

In [21]:
[token.lemma_ for token in doc]

['caress',
 'fly',
 'die',
 'mule',
 'deny',
 'die',
 'agree',
 'own',
 'humble',
 'sized',
 'meeting',
 'state',
 'sieze',
 'itemization',
 'sensational',
 'traditional',
 'reference',
 'colonizer',
 'plot']

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_non_alphanum

alphanumeric = '[A-Za-z0-9]+(?=\\s+)'
vec_alphanumeric = CountVectorizer(strip_accents='unicode', token_pattern=alphanumeric)
def token_cleaner(sent):
    wip = remove_stopwords(sent)
    wip = strip_non_alphanum(wip)
    wip = strip_punctuation(wip)
    return wip

In [50]:
token_cleaner(s)

'Good muffins cost  3 88 New York                              best sun high low ground  Please buy them  Thanks '

In [48]:
s

'Good muffins cost $3.88\nin New York.  and this is &&&, !@£ %£$£$%, ; //,/.,/,  very much the best of this sun to high low ground. Please buy me two of them.\n\nThanks.'

In [39]:
remove_stopwords(s)

'Good muffins cost $3.88 New York. &&&, !@£ %£$£$%, ; //,/.,/, best sun high low ground. Please buy them. Thanks.'

In [40]:
strip_non_alphanum(s)

'Good muffins cost  3 88 in New York   and this is                              very much the best of this sun to high low ground  Please buy me two of them   Thanks '

In [41]:
strip_punctuation(s)

'Good muffins cost  3 88\nin New York   and this is    £  £ £       very much the best of this sun to high low ground  Please buy me two of them \n\nThanks '

In [42]:
strip_punctuation2(s)

'Good muffins cost  3 88\nin New York   and this is    £  £ £       very much the best of this sun to high low ground  Please buy me two of them \n\nThanks '

In [43]:
preprocess_string(s)

['good',
 'muffin',
 'cost',
 'new',
 'york',
 'best',
 'sun',
 'high',
 'low',
 'ground',
 'bui',
 'thank']