In [2]:
import pandas as pd
import numpy as np

In [3]:
# Sample sentence with accented Hungarian characters; every later cleaning step operates on it.
# Adjacent string literals are concatenated by the parser, so no newline is embedded.
original = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

In [4]:
# Display the raw sample text before any cleaning step is applied.
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

### 1. Lowercase everything

In [13]:
# Lowercase the whole string; `original` is rebound to the lowercased copy.
original = original.lower()

### 2. Normalize accented characters and remove non-ASCII characters

In [14]:
import unicodedata

# NFKD splits each accented letter into its base letter plus combining marks;
# encoding to ASCII with errors='ignore' then drops the marks and any other non-ASCII character.
original = (
    unicodedata.normalize('NFKD', original)
    .encode('ascii', 'ignore')
    .decode('utf-8')
)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 3. Remove special characters

In [15]:
import re

In [22]:
# Keep only lowercase letters, digits, apostrophes and whitespace; delete everything else.
# The pattern is a raw string so that `\s` reaches the regex engine untouched instead of
# being parsed as an (invalid) Python string escape, which triggers a warning on modern Python.
original = re.sub(r"[^a-z0-9'\s]", '', original)

### 4. Tokenize

In [23]:
import nltk

In [24]:
# Instantiate NLTK's Toktok tokenizer; the bare name on the last line just displays the object.
tokenize = nltk.tokenize.ToktokTokenizer()
tokenize

<nltk.tokenize.toktok.ToktokTokenizer at 0x14ed8b430>

In [26]:
# With return_str=False the tokens come back as a list (preview only; `original` is not modified here).
tokenize.tokenize(original, return_str=False)

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematicians',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdos',
 "'",
 's',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'as',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [39]:
# With return_str=True the tokens come back joined into one space-separated string,
# which replaces `original` for the following steps.
original = tokenize.tokenize(original, return_str=True)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 5. Stemming or Lemmatizing

#### Stemming

In [40]:
# Create a Porter stemmer through the documented `nltk.stem` package path
# (the same package the lemmatizer below is taken from) rather than the
# incidental top-level `nltk.porter` alias.
ps = nltk.stem.PorterStemmer()
ps

<PorterStemmer>

In [41]:
# stem() handles one word per call; a second positional argument is interpreted as the
# `to_lowercase` flag, not as another word, so every form gets its own call.
ps.stem('calling'), ps.stem('calls'), ps.stem('called'), ps.stem('caller')

('call', 'call', 'call', 'caller')

In [42]:
# Stem three related words to compare the resulting stems side by side.
tuple(ps.stem(word) for word in ('house', 'housing', 'home'))

('hous', 'hous', 'home')

In [43]:
# Passing the whole text treats it as ONE word, so only the suffix of the final word changes.
# The text has to be split into words and stemmed word by word instead.
ps.stem(original)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necess"

In [44]:
# A single word passed on its own is stemmed.
ps.stem('contributed')

'contribut'

In [47]:
# Stem the text word by word: split on whitespace and apply the stemmer to each piece.
stems = list(map(ps.stem, original.split()))
stems

['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'were',
 'influenti',
 'hungarian',
 'mathematician',
 'who',
 'contribut',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdo',
 "'",
 's',
 'name',
 'contain',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'doubl',
 'acut',
 'accent',
 'but',
 'is',
 'often',
 'incorrectli',
 'written',
 'as',
 'erdo',
 'or',
 'erdo',
 'either',
 'by',
 'mistak',
 'or',
 'out',
 'of',
 'typograph',
 'necess']

In [46]:
# Stem every whitespace-separated word, then glue the stems back into one string.
stems = list(map(ps.stem, original.split()))
' '.join(stems)

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

#### Lemmatizing

In [48]:
# Show the tokenized (unstemmed) text that the lemmatizer is applied to below.
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [50]:
# Run this once per machine (like a pip install); 'all' downloads every NLTK package.
# nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    | Downloading pac

[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping misc/perluniprops.zip.
[nltk_data]    | Downloading package pil to /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping corpora/pil.zip.
[nltk_data]    | Downloading package pl196x to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping corpora/pl196x.zip.
[nltk_data]    | Downloading package porter_test to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping stemmers/porter_test.zip.
[nltk_data]    | Downloading package ppattach to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping corpora/ppattach.zip.
[nltk_data]    | Downloading package problem_reports to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unzipping corpora/problem_reports.zip.
[nltk_data]    | Downloading package product_reviews_1 to
[nltk_data]    |     /Users/qmcbt/nltk_data...
[nltk_data]    |   Unz

True

In [53]:
# Create the WordNet lemmatizer; the bare name on the last line just displays the object.
# NOTE(review): presumably requires the WordNet data fetched via nltk.download — confirm.
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [56]:
# Lemmatize four forms of "call" to compare with the stemmer's results.
tuple(wnl.lemmatize(word) for word in ('calling', 'calls', 'called', 'caller'))

('calling', 'call', 'called', 'caller')

In [57]:
# Lemmatize two related words to compare with the stemmer's results.
tuple(wnl.lemmatize(word) for word in ('house', 'housing'))

('house', 'housing')

In [58]:
# Irregular plural: lemmatize the singular and the plural form.
tuple(wnl.lemmatize(word) for word in ('mouse', 'mice'))

('mouse', 'mouse')

In [59]:
# Same two words through the stemmer, for contrast with the lemmatizer.
tuple(map(ps.stem, ('mouse', 'mice')))

('mous', 'mice')

In [61]:
# Lemmatize the text word by word: split on whitespace and lemmatize each piece.
lemmas = list(map(wnl.lemmatize, original.split()))
lemmas

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdos',
 "'",
 's',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [62]:
# Lemmatize every whitespace-separated word, then join the lemmas back into one string.
lemmas = list(map(wnl.lemmatize, original.split()))
' '.join(lemmas)

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### 6. Remove Stopwords

In [63]:
from nltk.corpus import stopwords

In [64]:
# Run this once per machine (like a pip install).
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/qmcbt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
# Load NLTK's English stopword list and preview its first ten entries.
stopwords_english = stopwords.words('english')
stopwords_english[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [66]:
# Size of the stopword list before any custom additions.
len(stopwords_english)

179

In [67]:
# Show the tokenized text that the stopword filter is applied to below.
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [68]:
# Treat the stand-alone letter 'o' (left over from the quoted letter in the text) as a stopword.
# NOTE(review): append() is not idempotent — re-running this cell adds a duplicate entry;
# also verify that 'o' is not already present in the list before appending.
stopwords_english.append('o')

In [69]:
# Confirm the list grew by one entry after the append.
len(stopwords_english)

180

In [70]:
# Treat the stand-alone apostrophe token produced by the tokenizer as a stopword.
# NOTE(review): append() is not idempotent — re-running this cell adds a duplicate entry.
stopwords_english.append("'")

In [71]:
# Confirm the list grew by one more entry after the second append.
len(stopwords_english)

181

In [72]:
# Tokens that survive once every entry of stopwords_english has been filtered out.
list(filter(lambda token: token not in stopwords_english, original.split()))

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematicians',
 'contributed',
 'lot',
 'field',
 'erdos',
 'name',
 'contains',
 'hungarian',
 'letter',
 'double',
 'acute',
 'accent',
 'often',
 'incorrectly',
 'written',
 'erdos',
 'erdos',
 'either',
 'mistake',
 'typographical',
 'necessity']