# Text Cleaning

In [None]:
# Source of the plain-text (UTF-8) e-book on Project Gutenberg.
url = "http://www.gutenberg.org/ebooks/1661.txt.utf-8"
# Local path the download is written to.
file_name = "sherlock.txt"

In [None]:
import urllib.request

# Fetch the resource at `url` and store its raw bytes on disk as `file_name`.
# The file is opened in binary mode, so the payload is written exactly as received.
with urllib.request.urlopen(url) as response, open(file_name, 'wb') as out_file:
    data = response.read()  # entire response body, as a `bytes` object
    out_file.write(data)

In [None]:
!ls {*.txt}

In [None]:
# Peek at the first two lines of the downloaded file.
!head -2 sherlock.txt

In [None]:
# Delete lines 1-33 of sherlock.txt in place (presumably the Project Gutenberg
# preamble - verify against the downloaded file).
# NOTE: this edit is destructive and not idempotent - re-running the cell removes
# another 33 lines; re-download the file before running it again.
!sed -i 1,33d sherlock.txt

In [None]:
# Show the first five lines again to check what the file now starts with.
!head -5 sherlock.txt

## Load Data

In [None]:
# Load the whole file into RAM as a single string.
# encoding='utf-8' is passed explicitly so the text is decoded as UTF-8 instead of
# the platform default, which could corrupt non-ASCII characters.
# The `with` block closes the file handle as soon as the read finishes.
with open(file_name, 'r', encoding='utf-8') as text_file:
    text = text_file.read()
print(text[:5])

In [None]:
# Report the Python type of the loaded text and its length in characters.
print(f'The file is loaded as datatype: {type(text)} and has {len(text)} characters in it')

### Exploring Loaded Data

In [None]:
# How many distinct characters does the text contain?
# For reference, ASCII defines 128 characters (code points 0-127). The text was
# decoded as UTF-8, so any non-ASCII characters in it are counted here too and
# the total is not capped at 128.
unique_chars = sorted(set(text))  # de-duplicate, then order by code point
print(unique_chars)
print(f'There are {len(unique_chars)} unique characters, including both ASCII and Unicode character')

## Tokenization 

### Split by Whitespace

In [None]:
# str.split() with no arguments splits on runs of whitespace, so punctuation
# stays attached to the neighbouring word.
words = text.split()
# Number of whitespace-delimited tokens.
print(len(words))

In [None]:
print(words[90:200])  # start with the first chapter, ignoring the index for now

In [None]:
# Let's look at another example: 
'red-headed woman on the street'.split()

### Split by Word Extraction
**Introducing Regex**

In [None]:
import re
# Split on runs of non-word characters (\W+). The pattern is a raw string so the
# backslash reaches the regex engine untouched; in a normal string literal '\W'
# is an invalid escape sequence and triggers a warning on recent Python versions.
# Note the trailing empty string produced by the final '.'.
re.split(r'\W+', 'Words, words, words.')

In [None]:
# Tokenize the full text by splitting on runs of non-word characters.
# Raw string: '\W' in a normal string literal is an invalid escape sequence.
words_alphanumeric = re.split(r'\W+', text)

In [None]:
# Compare token counts: regex-based split vs. whitespace split.
len(words_alphanumeric), len(words)

In [None]:
# Same slice as shown for the whitespace split, now from the regex-based tokens.
print(words_alphanumeric[90:200])

In [None]:
# A case where splitting on \W+ goes wrong: the apostrophe and the hyphen are
# non-word characters, so "Isn't" and "red-headed" are each broken in two, and
# the trailing '?' leaves an empty string at the end.
# Raw string: '\W' in a normal string literal is an invalid escape sequence.
words_break = re.split(r'\W+', "Isn't he coming home for dinner with the red-headed girl?")
print(words_break)

### spaCy for Tokenization

In [None]:
%%time
import spacy
nlp = spacy.load('en')

In [None]:
# Run the spaCy pipeline over the entire text; `doc` holds the processed result.
doc = nlp(text)

In [None]:
# Materialize the tokens, then show positions 150-199.
all_tokens = list(doc)
print(all_tokens[150:200])

Conveniently, spaCy tokenizes both *punctuation and words* and returns each of them as an individual token. Let's try the example that the regex split handled badly earlier:

In [None]:
# Tokenize the sentence that the regex split handled badly.
# NOTE: this rebinds `words`, which earlier held the whitespace-split list.
words = nlp("Isn't he coming home for dinner with the red-headed girl?")
print(list(words))

In [None]:
# doc.sents yields the detected sentences; collect them in a list so it can be sliced.
sentences = list(doc.sents)
# Show sentences 13 through 17.
print(sentences[13:18])

#### STOP WORD REMOVAL & CASE CHANGE

spaCy has already marked each token as a stop word or not and stored the result in the token's `is_stop` attribute. This makes it very handy for text cleaning. Let's take a quick look:

In [None]:
# Short sample sentence used by the stop-word cells.
sentence_example = 'the AI/AGI uprising cannot happen without the progress of NLP'

In [None]:
# For each token: the token, whether spaCy flags it as a stop word, and whether it is punctuation.
[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]

In [None]:
# First five tokens of the full document with their stop-word and punctuation flags.
for token in doc[:5]:
    print(token, token.is_stop, token.is_punct)

In [None]:
# Lower-case the raw text with the built-in str.lower(), then run the spaCy
# pipeline again on the lower-cased copy.
text_lower = text.lower()  # built-in str method
doc_lower = nlp(text_lower)

In [None]:
# Stop-word flag for the first five tokens of the lower-cased document.
for token in doc_lower[:5]:
    print(token, token.is_stop)

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS
# STOP_WORDS is a set (it is extended with .add() below), so len() works on it
# directly; there is no need to copy it into a list first.
f'spaCy has a dictionary of {len(STOP_WORDS)} stop words'

In [None]:
# Extra, domain-specific terms to treat as stop words.
domain_stop_words = ["NLP", "Processing", "AGI"]
# Add them all to spaCy's English stop-word set in one call.
# NOTE(review): whether an already-loaded `nlp` pipeline picks up these additions
# (and how letter case is handled) depends on the spaCy version - confirm with the next cell.
STOP_WORDS.update(domain_stop_words)

In [None]:
# Same inspection as earlier, re-run after extending STOP_WORDS to see whether the flags changed.
[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]

In [None]:
# Keep the text of tokens that are neither stop words nor punctuation.
[str(token) for token in nlp(sentence_example) if not token.is_stop and not token.is_punct]

In [None]:
# Drop stop words only; punctuation tokens are kept.
[str(token) for token in nlp(sentence_example) if not token.is_stop]

## Stemming and Lemmatization

### spaCy for Lemmatization
**spaCy supports lemmatization only (it has no stemmer)**

A trailing underscore, as in `lemma_`, tells spaCy we want the human-readable string. The attribute without the underscore, `token.lemma`, holds the internal hash (integer identifier) that spaCy stores for that string.

In [None]:
# Sample text with plurals and inflected verbs to lemmatize.
lemma_sentence_example = "Their Apples & Banana fruit salads are amazing. Would you like meeting me at the cafe?"
# Per token: the token, its lemma as readable text (lemma_), the lemma's internal
# integer ID (lemma), and its part-of-speech tag as text (pos_).
[(token, token.lemma_, token.lemma, token.pos_ ) for token in nlp(lemma_sentence_example)]