# Class - Introduction to Natural Language Processing

In [None]:
#-----------------------------
# Natural Language Toolkit: import the package and fetch the data packages
# this notebook asks for.
import nltk
nltk.download('punkt')                         # tokenizer models; the pickle loaded below lives under tokenizers/punkt
nltk.download('averaged_perceptron_tagger')    # presumably the model behind nltk.pos_tag -- TODO confirm
nltk.download('gutenberg')                     # NOTE(review): not referenced by the cells visible here (the book is fetched over HTTP)
nltk.download('stopwords')                     # word lists read through nltk.corpus.stopwords
nltk.download('wordnet')                       # presumably the database used by WordNetLemmatizer -- TODO confirm
# Pre-load the English Punkt sentence tokenizer.
# NOTE(review): sent_tokenizer is not used by the cells visible here, and newer
# NLTK releases may need different resource names -- confirm against the installed version.
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
#-----------------------------
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, WordPunctTokenizer
from nltk.stem import PorterStemmer
import requests #web
from collections import Counter #counting words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string 
import re
#-----------------------------
# other libraries we might use
#import string

#-----------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# Default figure size in inches (width, height) for every plot in the notebook.
plt.rcParams["figure.figsize"] = (9.71, 6)
# set_style() applies the theme globally. axes_style() merely *returns* the
# style parameters (or acts as a context manager), so calling it on its own
# line changed nothing.
sns.set_style("whitegrid")
sns.set_context("talk")
# Seed NumPy's global random generator so any random draws are repeatable.
np.random.seed(0)


# simple search and counting

In [None]:
raw_text="You don't know about me without you have read a book by the name of The Adventures of Tom Sawyer; but that ain't no matter. That book was made by Mr. Mark Twain, and he told the truth, mainly. There was things which he stretched, but mainly he told the truth. That is nothing. I never seen anybody but lied one time or another, without it was Aunt Polly, or the widow, or maybe Mary. Aunt Polly--Tom's Aunt Polly, she is--and Mary, and the Widow Douglas is all told about in that book, which is mostly a true book, with some stretchers, as I said before.\nNow the way that the book winds up is this: Tom and me found the money that the robbers hid in the cave, and it made us rich. We got six thousand dollars apiece--all gold. It was an awful sight of money when it was piled up. Well, Judge Thatcher he took it and put it out at interest, and it fetched us a dollar a day apiece all the year round--more than a body could tell what to do with. The Widow Douglas she took me for her son, and allowed she would sivilize me; but it was rough living in the house all the time, considering how dismal regular and decent the widow was in all her ways; and so when I couldn't stand it no longer I lit out. I got into my old rags and my sugar-hogshead again, and was free and satisfied. But Tom Sawyer he hunted me up and said he was going to start a band of robbers, and I might join if I would go back to the widow and be respectable. So I went back.\nThe widow she cried over me, and called me a poor lost lamb, and she called me a lot of other names, too, but she never meant no harm by it. She put me in them new clothes again, and I couldn't do nothing but sweat and sweat, and feel all cramped up. Well, then, the old thing commenced again. The widow rung a bell for supper, and you had to come to time. 
When you got to the table you couldn't go right to eating, but you had to wait for the widow to tuck down her head and grumble a little over the victuals, though there warn't really anything the matter with them,--that is, nothing only everything was cooked by itself. In a barrel of odds and ends it is different; things get mixed up, and the juice kind of swaps around, and the things go better."
print(raw_text[:100])

In [None]:
re.findall(r'things', raw_text, re.IGNORECASE)

Try 3 of the regular expression matches from Exercise 7, using re.findall(). 

If you are using "()" in your regular expression, make sure to put a pair of parentheses around the entire expression so the result includes the entire string match. 
1. Using re.findall(), formulate a regular expression that matches all capitalized words ("You","Tom", "Sawyer","Aunt","Polly", etc.). 
2. Count the number of times each of the following words are used:
* Polly
* the
3. Using re.sub(), replace all instances of the word "you" with "I".
4. Tokenize the text using white space with regex.

1. Find all capitalized words.

In [None]:
# [A-Z] matches one uppercase ASCII letter and [a-z]+ requires at least one
# lowercase letter after it, so one-letter words such as "I" are not returned.
# The parentheses capture the whole pattern, so findall() returns the full match.
capitalized = re.findall('([A-Z][a-z]+)', raw_text)
print(capitalized)

2. Count the number of times each of the following words are used:
* Polly
* the
* words ending with "-ed"


| metacharacter | description |
| - | :- |
| \d | Whole Number 0 - 9 | 
| \w | Used to find a word character. A word character is a character from a-z, A-Z, 0-9, including the _ (underscore) character. | 
| \b | Used to find a match at the beginning or end of a word. | 
| [0-9]  | Used to find any character between the brackets. |
| [a-z]  | Used to find any character between the brackets. |
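| [A-z ] | Any character in the ASCII range from uppercase A to lowercase z, or a space. Note that this range also contains the six punctuation characters that sit between Z and a in ASCII; use [A-Za-z] to match letters only. |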
| (x\|y) | Used to find any of the alternatives specified |
| * | Used to match 0 or more of the previous (e.g. xy*z could correspond to "xz", "xyz", "xyyz", etc.) |



In [None]:
# Count whole words only: the \b word-boundary anchors keep "the" from also
# being counted inside "there", "them", "other", ...
numPolly = len(re.findall(r"\bPolly\b",  raw_text))                 #case-sensitive on purpose (proper noun)
numThe   = len(re.findall(r"\bthe\b",    raw_text, re.IGNORECASE))  #counts both "the" and "The"
numEd    = len(re.findall(r"\b\w+ed\b",  raw_text))                 #find words ending in -ed
numIng   = len(re.findall(r"\b\w+ing\b", raw_text))                 #find words ending in -ing
print("Polly counted       : %d"%numPolly)
print("The counted         : %d"%numThe)
print("-ed word counted    : %d"%numEd)
print("-ing word counted   : %d"%numIng)

3. Produce a list of 'so ...ly' phrases, that is, 'so' modifying a word ending in 'ly'. How many are there?

In [None]:
# Collect the distinct words that end in "ly": \w+ is one or more word
# characters, followed by a literal "ly", with \b anchoring both word edges.
# NOTE(review): the exercise text above asks for 'so ...ly' phrases; this cell
# lists every -ly word regardless of what precedes it -- confirm which is intended.
listLY   = set(re.findall(r'\b(\w+ly)\b',   raw_text))     #set() drops repeated words
print("-ly words       : ")
print(listLY)

4. Using re.sub(), replace all instances of the word "you" with "I".

In [None]:
# The \b anchors restrict every match to the standalone word, so "your" or
# "young" are left alone. Inside [...] a comma is a literal character, so the
# class is written [Yy], not [Y,y].
out1 = re.sub(r'\bYou\b', 'I', raw_text)      #case-sensitive: note that 'you' is not replaced
print(out1[:50])
out2 = re.sub(r'\b[Yy]ou\b', 'I', raw_text)   #both 'you' and 'You' replaced ### BEST
print(out2[:50])
out3 = raw_text.replace('You', 'I')           #alternative use '.replace': plain substring match, doesn't handle lower/upper case
print(out3[:50])

5. Tokenize the text using white space with regex.

In [None]:
# \s+ splits on any run of whitespace (spaces, tabs, newlines), so paragraph
# breaks no longer stay glued inside a token; strip() avoids an empty first or
# last token when the text starts or ends with whitespace.
out4 = re.split(r'\s+', raw_text.strip())
print(out4[:50])

# NLTK: Natural language toolkit

# Tokenizing

Once the text has been segmented into its tokens (paragraphs,
sentences, words), most NLP pipelines do a number of other basic
procedures for text normalization, e.g.:
* Tokenizing 
* Part of speech tagging
* Lowercasing
* Stemming
* Lemmatization
* Stopword removal
* TFIDF vectorization (term frequency–inverse document frequency)

### Start simple

This process of segmenting a string of characters into words is known as tokenization. Tokenization is a prelude to pretty much everything else we might want to do in NLP, since it tells our processing software what our basic units are. We will discuss tokenization in more detail shortly.

We also pointed out that we could compile a list of the unique vocabulary items in a string by using set() to eliminate duplicates:

In [None]:
data = "One morning I shot an elephant in my pajamas. How he got into my pajamas, I'll never know." #groucho marx
words = data.split() #default is to split on ' ' whitespace
num_words  = len(words)
num_unique = len(set(words))
print("Number of words        : %d"%num_words)
print("Number of unique words : %d"%num_unique)

In [None]:
print(set(words))

It's only tokenizing based on whitespace. There's no regard for punctuation, capitalization, apostrophes...

# Text from gutenberg

We will be loading **Dracula** by Bram Stoker.

In [None]:
url_dracula="http://www.gutenberg.org/cache/epub/345/pg345.txt"

In [None]:
def load_book(url, timeout=30):
    """Download a plain-text book and return its contents as a string.

    Parameters
    ----------
    url : str
        Address of the text file to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    str
        The response body decoded as UTF-8, with a leading byte-order mark
        removed if one is present.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        For connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of returning its body as "the book".
    response.raise_for_status()
    # "utf-8-sig" is UTF-8 that also drops the BOM some text files start with.
    return response.content.decode("utf-8-sig")

In [None]:
raw_text= load_book(url_dracula)
print(raw_text[:505])

## Preprocessing

#### Extract words from text using NLP
You'll now use nltk, the Natural Language Toolkit, to

Tokenize the text (fancy term for splitting into tokens, such as words);
Remove stopwords (words such as 'a' and 'the' that occur a great deal in nearly all English language texts).



In [None]:
tokens = nltk.word_tokenize(raw_text) #better than just using regex

We cannot reliably detect where the content begins and ends, and so have to resort to manual inspection of the file, to discover unique strings that mark the beginning and the end, before trimming raw to be just the content and nothing else.

Sometimes it may be easiest to grab a first part of the text (enough words to be unique) and search for that.

In [None]:
raw_text.find("JONATHAN HARKER\'S JOURNAL")

In [None]:
raw_text.rfind("THE END")

In [None]:
# Trim the download to the body of the novel. The boundaries are located from
# their marker strings rather than hard-coded character offsets, which would
# silently select the wrong span if the hosted file ever changes.
start_marker = "JONATHAN HARKER'S JOURNAL"
end_marker   = "THE END"
start_idx    = raw_text.find(start_marker)    # first occurrence of the opening marker
end_idx      = raw_text.rfind(end_marker)     # last occurrence of the closing marker
# find()/rfind() return -1 when a marker is missing; slicing with -1 would
# quietly produce nonsense, so stop with a clear message instead.
if start_idx == -1 or end_idx <= start_idx:
    raise ValueError("Could not locate the start/end markers in the downloaded text; "
                     "inspect raw_text and update the marker strings.")
text = raw_text[start_idx:end_idx]

In [None]:
text[:200]

## Tokenize
You want to tokenize your text, that is, split it into a list of words.

Tokenization is the process of breaking raw text into its building
parts: words, phrases, symbols, or other meaningful elements
called tokens.
A list of tokens is almost always the first step to any other NLP
task.


There are many tokenizers to choose from: sent_tokenize, word_tokenize, TreebankWordTokenizer, etc.

First let's tokenize by **sentence**.

In [None]:
text_sent = nltk.tokenize.sent_tokenize(text)
text_sent[:5]

We can also construct our own tokenizer using regular expressions, or regex.

Here we keep every run of word characters (`\w+`) as a token, which discards **whitespace** and punctuation.

In [None]:
# Create tokenizer: \w+ keeps each run of word characters (letters, digits,
# underscore) as one token, so whitespace and punctuation are dropped.
# The pattern is a raw string because '\w' is an invalid escape sequence in a
# normal string literal and triggers a warning on recent Python versions.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
# Create tokens
tokens = tokenizer.tokenize(text)
tokens[:15]

You may also wish to use the built-in `word_tokenize` function.  

In [None]:
tokenizer = nltk.tokenize.word_tokenize
tokens = tokenizer(text)
print(tokens[:15])

## Part of Speech tagging

Let's tag the Parts of Speech to each of the words.

POS tagging does not always provide the same label for a given word, but decides on the correct label for the specific context – disambiguates across the word classes.

In [None]:
tagged_words = nltk.pos_tag(tokens)
print(tagged_words[:55])

We can see that it has correctly tagged the tokens.  I'll list a few of these below:
* CC – coordinating conjunction
* RB – adverb
* IN – preposition
* NN – noun
* JJ – adjective

#### Note: if we aren't going to need these POS tags for later analysis, then we can skip this step.

## Remove punctuation and numbers

punctuation is <code>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</code>

Let's strip the punctuation from the text.

In [None]:
tokens[:10]

In [None]:
tokens_noPunct = [word for word in tokens if word[0].isalpha()]
tokens_noPunct[:15]

We can see that this has had the desired effect, mostly.  It now comes down to what we want out of the text.  The more we want to filter and tokenize, the more complex it can get from here.


### Normalizing Case: lowercase
It is common to convert all words to one case.

This means that the vocabulary will shrink in size, but some distinctions are lost (e.g. “Apple” the company vs “apple” the fruit is a commonly used example).

We can convert all words to lowercase by calling the lower() function on each word.


In [None]:
tokens_lower = [word.lower() for word in tokens_noPunct]
print(tokens_lower[:15])

Cleaning text is really hard, problem specific, and full of tradeoffs.

Remember, simple is better.

Simpler text data, simpler models, smaller vocabularies. You can always make things more complex later.

## Word frequency

Without the NLTK package, creating a frequency distribution plot (histogram) for a BoW is possible, but will take multiple lines of code to do so. Through the use of the FreqDist class, we are able to obtain the frequencies of every token in the BoW with one single line of code:

In [None]:
# Calculate frequency distribution
fdist1 = nltk.FreqDist(tokens_lower)
# Output top 10 words
for word, frequency in fdist1.most_common(10):
    print(u'{};{}'.format(word, frequency))


We can use the built-in function to visualize:

In [None]:
fdist1.plot(20);

#### Note: Most of these are stop words.

## Remove stop words

It is common practice to remove words that appear a lot in the English language, such as 'the', 'of' and 'a' (known as stopwords), because they're not so interesting. For more on all of these techniques, check out our Natural Language Processing Fundamentals in Python course.

The package nltk has a list of stopwords in English, which you'll now store.

If you get an error here, run the command 
<code>nltk.download('stopwords')</code>
to install the stopwords on your system.


For some applications like document classification, it may make sense to remove stop words. Others, maybe not.

In [None]:
stop_words = set(nltk.corpus.stopwords.words('english'))
print(len(stop_words))
print(stop_words)


In [None]:
tokens_noStop = [w for w in tokens_lower if not w in stop_words]
print(tokens_noStop[:20])

### Let's visualize the result with the stop words removed

In [None]:
# Calculate frequency distribution
fdist2 = nltk.FreqDist(tokens_noStop )
fdist2.plot(20);

## how to plot more words on a log scale?

In [None]:
#Rank every word by how often it occurs, most frequent first
sorted_counts = sorted(fdist2.items(), key=lambda pair: pair[1], reverse=True)
x_vec         = list(range(1, len(sorted_counts) + 1))   # rank of each word, starting at 1
y_vec         = [count for _, count in sorted_counts]    # keep the count, drop the word
plt.loglog(x_vec, y_vec)
plt.xlabel('word rank')
plt.ylabel('word counts')
plt.show()

## Some simple Statistics

####  What are some of the long words that appear in the text?

In [None]:
#remove duplicate words 
word_bank     = set(tokens_noStop)
#keep only words longer than 15 letters (i.e. 16 letters or more)
lengthy_words = [word for word in word_bank if len(word) > 15]
#print the lengthy words
print(lengthy_words)

In [None]:
def find_longest_word(word_list):
    """Return the longest item of ``word_list``, measured with ``len``.

    When several items share the maximum length, the first one met while
    iterating is returned. An empty ``word_list`` makes ``max`` raise
    ``ValueError``.
    """
    return max(word_list, key=len)

In [None]:
longest_word  = find_longest_word(word_bank)
print(longest_word)

Not sure that counts as the longest word.

#### The longest sentence

In [None]:
# Split the text into sentences, then report the character length of the
# longest one together with every sentence that reaches that length.
text_sent = nltk.tokenize.sent_tokenize(text)
longest = max(map(len, text_sent))
print(longest)
print([sentence for sentence in text_sent if len(sentence) == longest]) #length is measured in characters, not words

**Word Properties Table** Next there is a table of word properties, which you should compute (skip unique word stems, since we will do stemming in class on Wed).  Make a table that prints out:
1. number of words
2. number of unique words
3. average word length
4. longest word

You can decide for yourself if you want to try this again after you eliminate punctuation and function words (stop words) or not.  It's your collection!  


To get the average number of words in a sentence.

In [None]:
def average_words_in_sentence(sentences):
    """Return the mean number of whitespace-separated words per sentence.

    Parameters
    ----------
    sentences : iterable of str
        The sentences to measure; each one is split on whitespace.

    Returns
    -------
    float
        Average word count per sentence, or 0.0 when ``sentences`` is empty
        (instead of failing with a division by zero).
    """
    counts = [len(sentence.split()) for sentence in sentences]
    if not counts:
        return 0.0
    return float(sum(counts)) / len(counts) #number of words, not characters

In [None]:
avg_word_length = sum(len(word) for word in tokens_noStop) / len(tokens_noStop)
avg_sent_length = sum(len(sent) for sent in text_sent) / len(text_sent) #this counts characters, not words
#unique words
word_bank          = set(tokens_noStop)
num_unique_words   = len(word_bank)
avg_word_sent      = average_words_in_sentence(text_sent)

In [None]:
#print each below
print("Average sentence length     : %d   characters"%avg_sent_length ) #in characters
print("Average word length         : %3.2f characters"%avg_word_length )#in characters
print("Number of unique words      : %d"%num_unique_words )             #
print("Average words in sentence   : %3.2f"%avg_word_sent )             #in words

### Stemming

Stemming refers to the process of reducing each word to its root or base.

For example “fishing,” “fished,” “fisher” all reduce to the stem “fish.”

Some applications, like document classification, may benefit from stemming in order to both reduce the vocabulary and to focus on the sense or sentiment of a document rather than deeper meaning.

There are many stemming algorithms, although a popular and long-standing method is the Porter Stemming algorithm. This method is available in NLTK via the PorterStemmer class.


In [None]:
porter = PorterStemmer()
for word in ['walking', 'walks', 'walked']:
    print(porter.stem(word))


In [None]:
tokens_stemmed = [porter.stem(word) for word in tokens_noStop]
print(tokens_stemmed[:100])

You can see that words have been reduced to their stems, such as 
* “arrive” has become “arriv“
* “early” has become “earli“ 
* “morning” has become “morn“ 

### Lemmatization

Stemming can often create non-existent words, whereas lemmas are actual words

The NLTK WordNet Lemmatizer uses the WordNet database to look up lemmas; however, lemmatizing requires the part of speech (POS) of each word.

In [None]:
wnl = nltk.WordNetLemmatizer()
  
print("rocks :", wnl.lemmatize("rocks"))
print("corpora :", wnl.lemmatize("corpora"))
# a denotes adjective in "pos"
print("better :", wnl.lemmatize("better", pos ="a"))

## Stemming and Lemmatization
Lemmatisation is closely related to stemming. The difference is that a stemmer operates on a single word without knowledge of the context, and therefore cannot discriminate between words which have different meanings depending on part of speech. However, stemmers are typically easier to implement and run faster, and the reduced accuracy may not matter for some applications.

For instance:

The word "better" has "good" as its lemma. This link is missed by stemming, as it requires a dictionary look-up.

The word "walk" is the base form for word "walking", and hence this is matched in both stemming and lemmatisation.

The word "meeting" can be either the base form of a noun or a form of a verb ("to meet") depending on the context, e.g., "in our last meeting" or "We are meeting again tomorrow". Unlike stemming, lemmatisation can in principle select the appropriate lemma depending on the context.

## Building word vectors, counting words



In [None]:
word_list = ['dracula','bat','fool','red','pride','sense']
for w in word_list:
    print("%s \t\t: %d"%(w,fdist1[w]))


# Text comparison

### Cosine Similarity

Let's now compare documents in the set to other documents in the set, using **cosine similarity**.

Consider two vectors $\vec{a}$ and $\vec{b}$.  Using the dot-product, we can determine if they point in the same direction.  


$$ \cos \theta = \dfrac{\vec{a} \cdot \vec{b}}{\| \vec{a} \| \, \| \vec{b} \|}
$$

That is, the cosine of the angle between the two vectors is their dot (scalar) product divided by the product of their Euclidean norms.

 
The nice thing about cosine similarity is that it is normalized: no matter what the input vectors are, the output always lies between -1 and 1, and for non-negative  **a**  and  **b**  (such as word-count or tf-idf vectors) it lies between 0 and 1. One way to think of this is that cosine similarity is just the cosine function, which has this property. Another way to think of it is to work through the situations of maximum and minimum similarity between two context vectors, starting from the definition above.




### Tokenize our data.

In [None]:
#define function to convert raw text into tokens as detailed above
def preprocessText2Word(raw_text):
    """Turn a raw string into a list of normalized word stems.

    The text is tokenized, lowercased, stripped of English stop words,
    reduced to purely alphabetic tokens and finally Porter-stemmed.

    Parameters
    ----------
    raw_text : str
        The text to process.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    tokenizer      = nltk.tokenize.word_tokenize
    stop_words     = set(nltk.corpus.stopwords.words('english'))
    # Local stemmer: the function no longer relies on a `porter` variable that
    # only exists if an earlier notebook cell happened to be run.
    stemmer        = PorterStemmer()
    #-------
    tokens         = tokenizer(raw_text)                                      #step 2 tokenize
    tokens         = [word.lower() for word in tokens]                        #step 3 make all tokens lowercase
    tokens         = [word for word in tokens if word not in stop_words]      #step 4 remove stop words
    tokens         = [word for word in tokens if word.isalpha()]              #step 5 drop tokens that are not purely alphabetic
    tokens         = [stemmer.stem(word) for word in tokens]                  #step 6 stem
    return tokens


In [None]:
# Four tiny example "documents" to compare with each other.
allDocs = (
"The sky is blue",
"The sun is bright",
"The sun in the sky is bright",
"We can see the shining sun, the bright sun"
)
#=======================  
# tokenize all 'documents'
allDocs_processed=[]
for a in allDocs:
    # NOTE: this assignment overwrites the notebook-level `tokens` variable.
    tokens = preprocessText2Word(a) #this function does all the heavy-lifting, tokenization, remove stop words
    allDocs_processed.append(tokens) #store the token list for this document (it stays a list, it is not joined back to text here)
#freeze the per-document token lists into a tuple
allDocs_processed=tuple(allDocs_processed) #convert it to a tuple


### Use Tfidf to vectorize our data.

Using pre-processed, tokenized data, we can vectorize the data by the frequency of these tokens.

With our cleaned-up text, we can now use it for whatever analysis we want. Unfortunately, calculating tf-idf is not available in NLTK, so we use the one from scikit-learn.  

In [None]:
#=======================   
# since we want to pre-process the data ourself, we initialize with the following inputs:
# preprocessor=' '.join glues each document's token list back into one
# space-separated string, and lowercase=False is passed because the tokens
# were already lowercased during our own pre-processing.
tfidf    = TfidfVectorizer(preprocessor=' '.join,lowercase=False)   # initialize the TFIDF vectorizer
tfs_vecs = tfidf.fit_transform(allDocs_processed)                                # fit on the documents and vectorize them
#=======================
# With a single argument, the documents are compared against each other.
cosine_similarity(tfs_vecs)

### We can also find the similarity of a new vector with our training set.

In [None]:
query        = ("the very bright sun")
query_tokens = preprocessText2Word(query)
query_matrix = tfidf.transform([query_tokens])     # we need the [] to make it a list
print(cosine_similarity(query_matrix, tfs_vecs))   # find the cosine similarity