# Text Cleaning

In [1]:
url = 'http://www.gutenberg.org/ebooks/1661.txt.utf-8'
file_name = 'sherlock.txt'

In [2]:
import urllib.request
# Download the file from `url` and save it locally under `file_name`:

with urllib.request.urlopen(url) as response:
    with open(file_name, 'wb') as out_file:
        data = response.read() # a `bytes` object
        out_file.write(data)

HTTPError: HTTP Error 406: Not Acceptable

In [3]:
!ls {*.txt}

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
!head -2 sherlock.txt

'head' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
!sed -i 1,33d sherlock.txt

In [6]:
!head -5 sherlock.txt

THE ADVENTURES OF SHERLOCK HOLMES

by

SIR ARTHUR CONAN DOYLE


## Load Data

In [5]:
#let's the load data to RAM
text = open(file_name, 'r', encoding='utf-8').read()  # note that I add an encoding='utf-8' parameter to preserve information
print(text[:5])

THE A


In [6]:
print(f'The file is loaded as datatype: {type(text)} and has {len(text)} characters in it')

The file is loaded as datatype: <class 'str'> and has 581204 characters in it


### Exploring Loaded Data

In [7]:
# how many unique characters do we see? 
# For reference, ASCII has 127 characters in it - so we expect this to have at most 127 characters
unique_chars = list(set(text))
unique_chars.sort()
print(unique_chars)
print(f'There are {len(unique_chars)} unique characters, including both ASCII and Unicode character')

['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'è', 'é']
There are 85 unique characters, including both ASCII and Unicode character


## Tokenization 

### Split by Whitespace

In [8]:
words = text.split()
print(len(words))

107431


In [9]:
print(words[90:200])  #start with the first chapeter, ignoring the index for now

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler.', 'All', 'emotions,', 'and', 'that', 'one', 'particularly,', 'were', 'abhorrent', 'to', 'his', 'cold,', 'precise', 'but', 'admirably', 'balanced', 'mind.', 'He', 'was,', 'I', 'take', 'it,', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen,', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions,', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer.', 'They', 'were', 'admirable', 'things', 'for']


In [10]:
# Let's look at another example: 
'red-headed woman on the street'.split()

['red-headed', 'woman', 'on', 'the', 'street']

### Split by Word Extraction
**Introducing Regex**

In [11]:
import re
re.split('\W+', 'Words, words, words.')

['Words', 'words', 'words', '']

In [12]:
words_alphanumeric = re.split('\W+', text)

In [13]:
len(words_alphanumeric), len(words)

(109111, 107431)

In [16]:
print(words_alphanumeric[90:200])

['BOHEMIA', 'I', 'To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', 'All', 'emotions', 'and', 'that', 'one', 'particularly', 'were', 'abhorrent', 'to', 'his', 'cold', 'precise', 'but', 'admirably', 'balanced', 'mind', 'He', 'was', 'I', 'take', 'it', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', 'They', 'were', 'admirable']


In [17]:
words_break = re.split('\W+', "Isn't he coming home for dinner with the red-headed girl?")
print(words_break)

['Isn', 't', 'he', 'coming', 'home', 'for', 'dinner', 'with', 'the', 'red', 'headed', 'girl', '']


### spaCy for Tokenization

Tokenization is essentially splitting a phrase, sentence, paragraph, or an entire text document into smaller units, such as individual words or terms. Each of these smaller units are called tokens.

In [18]:
%%time
import spacy
nlp = spacy.load('en')

Wall time: 2.46 s


In [19]:
doc = nlp(text)

In [20]:
print(list(doc)[150:200])

[whole, of, her, sex, ., It, was, not, that, he, felt, 
, any, emotion, akin, to, love, for, Irene, Adler, ., All, emotions, ,, and, that, 
, one, particularly, ,, were, abhorrent, to, his, cold, ,, precise, but, 
, admirably, balanced, mind, ., He, was, ,, I, take, it, ,]


Conveniently, spaCy tokenizes all *punctuations and words* and returned those as individual tokens as well. Let's try the example which we didn't like earlier:

In [21]:
words = nlp("Isn't he coming home for dinner with the red-headed girl?")
print([token for token in words])

[Is, n't, he, coming, home, for, dinner, with, the, red, -, headed, girl, ?]


In [22]:
sentences = list(doc.sents)
print(sentences[13:18])

[I. A SCANDAL IN BOHEMIA

I.

To Sherlock Holmes, she is always THE woman., I have seldom heard
him mention her under any other name., In his eyes she eclipses
and predominates the whole of her sex., It was not that he felt
any emotion akin to love for Irene Adler.]


### Keras for Tokenization

In [24]:
from keras.preprocessing.text import text_to_word_sequence
result = text_to_word_sequence(text)
print(result[1:20])

['adventures', 'of', 'sherlock', 'holmes', 'by', 'sir', 'arthur', 'conan', 'doyle', 'i', 'a', 'scandal', 'in', 'bohemia', 'ii', 'the', 'red', 'headed', 'league']


### NLTK for Tokenization

Using NLTK we can tokenize the whole content into 2 ways 

1) Sentence Tokenization
2) Word Tokenization

##### Sentence Tokenization
We use the sent_tokenize() method to split a document or paragraph into sentences


In [21]:
import nltk
from nltk.tokenize import sent_tokenize
sentence_tokenization=sent_tokenize(text)
print(sentence_tokenization[150:155])

['Heavy bands of astrakhan were slashed\nacross the sleeves and fronts of his double-breasted coat, while\nthe deep blue cloak which was thrown over his shoulders was lined\nwith flame-coloured silk and secured at the neck with a brooch\nwhich consisted of a single flaming beryl.', 'Boots which extended\nhalfway up his calves, and which were trimmed at the tops with\nrich brown fur, completed the impression of barbaric opulence\nwhich was suggested by his whole appearance.', 'He carried a\nbroad-brimmed hat in his hand, while he wore across the upper\npart of his face, extending down past the cheekbones, a black\nvizard mask, which he had apparently adjusted that very moment,\nfor his hand was still raised to it as he entered.', 'From the lower\npart of the face he appeared to be a man of strong character,\nwith a thick, hanging lip, and a long, straight chin suggestive\nof resolution pushed to the length of obstinacy.', '"You had my note?"']


##### Word Tokenization
We use the word_tokenize() method to split a sentence into tokens or words

In [22]:
from nltk.tokenize import word_tokenize
word_tokenization=word_tokenize(text)
print(word_tokenization[1:30])

['ADVENTURES', 'OF', 'SHERLOCK', 'HOLMES', 'by', 'SIR', 'ARTHUR', 'CONAN', 'DOYLE', 'I', '.', 'A', 'Scandal', 'in', 'Bohemia', 'II', '.', 'The', 'Red-headed', 'League', 'III', '.', 'A', 'Case', 'of', 'Identity', 'IV', '.', 'The']


#### STOP WORD REMOVAL & CASE CHANGE

spaCy has already marked each token as a stop word or not and stored it in `is_stop` attribute of each token. This makes it very handy for text cleaning. Let's take a quick look: 

In [49]:
sentence_example = "the AI/AGI uprising cannot happen without the progress of NLP"

In [54]:
[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]

[(the, True, False),
 (AI, False, False),
 (/, False, True),
 (AGI, True, False),
 (uprising, False, False),
 (can, True, False),
 (not, True, False),
 (happen, False, False),
 (without, True, False),
 (the, True, False),
 (progress, False, False),
 (of, True, False),
 (NLP, True, False)]

In [57]:
for token in doc[:5]:
    print(token, token.is_stop, token.is_punct)

THE False False
ADVENTURES False False
OF False False
SHERLOCK False False
HOLMES False False


In [30]:
text_lower = text.lower()  # native python function
doc_lower = nlp(text_lower)

In [32]:
for token in doc_lower[:5]:
    print(token, token.is_stop)

the True
adventures False
of True
sherlock False
holmes False


In [28]:
from spacy.lang.en.stop_words import STOP_WORDS
f'spaCy has a dictionary of {len(list(STOP_WORDS))} stop words'

'spaCy has a dictionary of 305 stop words'

In [58]:
domain_stop_words = ["NLP", "Processing", "AGI"]
for word in domain_stop_words:
    STOP_WORDS.add(word)

In [59]:
[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]

[(the, True, False),
 (AI, False, False),
 (/, False, True),
 (AGI, True, False),
 (uprising, False, False),
 (can, True, False),
 (not, True, False),
 (happen, False, False),
 (without, True, False),
 (the, True, False),
 (progress, False, False),
 (of, True, False),
 (NLP, True, False)]

In [61]:
[str(token) for token in nlp(sentence_example) if not token.is_stop and not token.is_punct]

['AI', 'uprising', 'happen', 'progress']

In [62]:
[str(token) for token in nlp(sentence_example) if not token.is_stop]

['AI', '/', 'uprising', 'happen', 'progress']

## Stemming and Lemmatization

### spaCy for Lemmatization
**spaCy only supports lemmatization** 

An underscore at end, such as `lemma_` tells spaCy we are looking for something which is human readable. spaCy stores the internal hash or identifier which spaCy stores in `token.lemma`. 

In [72]:
lemma_sentence_example = "Their Apples & Banana fruit salads are amazing. Would you like meeting me at the cafe?"
[(token, token.lemma_, token.lemma, token.pos_ ) for token in nlp(lemma_sentence_example)]

[(Their, '-PRON-', 561228191312463089, 'ADJ'),
 (Apples, 'apples', 14374618037326464786, 'PROPN'),
 (&, '&', 15473034735919704609, 'CCONJ'),
 (Banana, 'banana', 2525716904149915114, 'PROPN'),
 (fruit, 'fruit', 17674554054627885835, 'NOUN'),
 (salads, 'salad', 16382906660984395826, 'NOUN'),
 (are, 'be', 10382539506755952630, 'VERB'),
 (amazing, 'amazing', 12968186374132960503, 'ADJ'),
 (., '.', 12646065887601541794, 'PUNCT'),
 (Would, 'would', 6992604926141104606, 'VERB'),
 (you, '-PRON-', 561228191312463089, 'PRON'),
 (like, 'like', 18194338103975822726, 'VERB'),
 (meeting, 'meet', 6880656908171229526, 'VERB'),
 (me, '-PRON-', 561228191312463089, 'PRON'),
 (at, 'at', 11667289587015813222, 'ADP'),
 (the, 'the', 7425985699627899538, 'DET'),
 (cafe, 'cafe', 10569699879655997926, 'NOUN'),
 (?, '?', 8205403955989537350, 'PUNCT')]

##### NLTK for Stemming and Lemmatization

##### Stemming

In [26]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

sentences = nltk.sent_tokenize(text)
stemmer = PorterStemmer()

# Stemming
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words)
print(sentences[1:10])

['A scandal bohemia II .', 'the red-head leagu iii .', 'A case ident IV .', 'the boscomb valley mysteri V. the five orang pip VI .', 'the man twist lip vii .', 'the adventur blue carbuncl viii .', 'the adventur speckl band IX .', "the adventur engin 's thumb X .", 'the adventur nobl bachelor XI .']


In [27]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
sentences1 = nltk.sent_tokenize(text)
lemmatizer = WordNetLemmatizer()

# Lemmatization
for i in range(len(sentences1)):
    words = nltk.word_tokenize(sentences1[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentences1[i] = ' '.join(words) 
print(sentences1[1:10])

['A Scandal Bohemia II .', 'The Red-headed League III .', 'A Case Identity IV .', 'The Boscombe Valley Mystery V. The Five Orange Pips VI .', 'The Man Twisted Lip VII .', 'The Adventure Blue Carbuncle VIII .', 'The Adventure Speckled Band IX .', "The Adventure Engineer 's Thumb X .", 'The Adventure Noble Bachelor XI .']
