In [2]:
from nltk.corpus import names
import random
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from nltk import word_tokenize
import numpy as np
import nltk
from pprint import pprint


# Natural Language Processing

*Adapted from the online nltk book [Natural Language Processing with Python](http://www.nltk.org/book) as well as examples from Rutu Mehta's [Pydata talk](https://www.youtube.com/watch?v=gJwFHSeFg44)*

In [3]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


## NLTK has some really great tools for searching and understanding large bodies of text

In [3]:
text1.concordance("monstrous")

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


In [4]:
text1.similar("monstrous")

imperial subtly impalpable pitiable curious abundant perilous
trustworthy untoward singular lamentable few determined maddens
horrible tyrannical lazy mystifying christian exasperate


In [None]:
text2.similar("monstrous")

very exceedingly so heartily a great good amazingly as sweet
remarkably extremely vast


In [None]:
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

You can also look up and plot the relative frequency of different words

In [4]:
fdist1 = FreqDist(text7)

In [5]:
fdist1.most_common(10)

[(u',', 4885),
 (u'the', 4045),
 (u'.', 3828),
 (u'of', 2319),
 (u'to', 2164),
 (u'a', 1878),
 (u'in', 1572),
 (u'and', 1511),
 (u'*-1', 1123),
 (u'0', 1099)]

Let's find the most common words that are over 15 letters long

In [6]:
# Frequency distribution over every token of text4 (the Inaugural Address Corpus).
fdist1 = FreqDist(text4)
# most_common() with no argument walks the whole vocabulary in descending
# order of count, so no qualifying word can be cut off by an arbitrary cap.
for word, count in fdist1.most_common():
    # Keep words longer than 15 characters that occur more than 3 times.
    if len(word) > 15 and count > 3:
        # The parenthesised form prints the (word, count) tuple on Python 2 and 3.
        print((word, count))


(u'responsibilities', 27)
(u'instrumentalities', 6)
(u'misunderstanding', 5)
(u'constitutionally', 4)


We can also quickly find collocations (words that appear close to each other)

In [7]:
text7.collocations()

million *U*; New York; billion *U*; Wall Street; program trading; Mrs.
Yeargin; vice president; Stock Exchange; Big Board; Georgia Gulf;
chief executive; Dow Jones; S&P 500; says *T*-1; York Stock; last
year; Sea Containers; South Korea; American Express; San Francisco


### The Words corpus (`nltk.corpus.words`) is a plain word list of the kind often used by spell checkers. We can use it to do some interesting things:

![Word Puzzle](WordPuzzle.png)
Word Puzzle easily solved with nltk: Which words of six or more letters can be built from the letters shown, using each letter no more often than it appears, and definitely contain the center letter?

In [8]:
# Letter budget for the puzzle: a word may use each letter at most as many
# times as it occurs in this string.
fq = FreqDist('egivrvonl')
for w in nltk.corpus.words.words():
    # Cheap checks first (length, required letter 'r'), then the letter budget.
    # <= rather than < so that a word using every puzzle letter exactly once
    # still qualifies; the strict comparison rejected that full-length answer.
    if len(w) >= 6 and 'r' in w and FreqDist(w) <= fq:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(w)

glover
gorlin
govern
grovel
ignore
involver
lienor
linger
longer
lovering
noiler
overling
region
renvoi
ringle
roving
violer
virole


### We know nltk can do a lot. But we use sklearn. How can we combine the two? How can we add a lemmatizer to sklearn?


In [12]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

First, let's understand a tokenizer in nltk

In [13]:
s = ["He either was or wasn't really excited about the baseball games"]

In [14]:
tokens = word_tokenize(s[0])
tokens

['He',
 'either',
 'was',
 'or',
 'was',
 "n't",
 'really',
 'excited',
 'about',
 'the',
 'baseball',
 'games']

Now, how about the Lemmatizer

In [15]:
wnl = WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens]

['He',
 'either',
 u'wa',
 'or',
 u'wa',
 "n't",
 'really',
 'excited',
 'about',
 'the',
 'baseball',
 u'game']

Reminder: we use CountVectorizer() to convert words into numbers. Each word becomes a column in the new matrix. What if we want to use our own tokenizer? How can we do so?

In [22]:
CountVectorizer??

### Aside: Intro to Classes

Class objects support two types of operations: attribute reference and instantiation.

In [39]:
class GAClass(object):
    """Toy class used to demonstrate attribute reference and instantiation."""

    # Class attribute: read through the class or through any instance.
    num_students = 10

    def announcement(self):
        """Print the value of ``num_students`` in a short sentence."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("There are %s in this class" % self.num_students)

In [44]:
# Instantiate the class, then read its attribute through the instance.
g = GAClass()
# Single-argument print(...) behaves the same on Python 2 and 3.
print("Attribute reference: " + str(g.num_students))
g.announcement()

Attribute reference: 10
There are 10 in this class


##### Why does it seem that announcement takes an argument, but we never passed any?

By default, when you call a method on an instance, Python passes the instance itself in as the first parameter (conventionally named `self`).

So calling `g.announcement()` is the same as calling `GAClass.announcement(g)`.

In [46]:
GAClass.announcement(g)

There are 10 in this class


You might sometimes want to create an object with certain properties, right from the beginning. In this case, you would want to create a special method called `__init__`

In [48]:
class GAClass(object):
    """Toy class whose student count is supplied when the object is created."""

    def __init__(self, n_students):
        """Store ``n_students`` on the new instance as ``num_students``."""
        self.num_students = n_students

    def announcement(self):
        """Print the value of ``num_students`` in a short sentence."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("There are %s in this class" % self.num_students)
        

In [50]:
g = GAClass(15)
g.announcement()


There are 15 in this class


There is another special method called `__call__`. This lets you call the object like a function, e.g. `g(10)`. Perhaps you wanted that to mean "square the number".

In [51]:
class GAClass(object):
    """Toy class demonstrating ``__init__`` and ``__call__``."""

    def __init__(self, n_students):
        """Store ``n_students`` on the new instance as ``num_students``."""
        self.num_students = n_students

    def __call__(self, some_num):
        """Return ``some_num`` squared; this is what ``g(10)`` invokes."""
        return some_num**2

    def announcement(self):
        """Print the value of ``num_students`` in a short sentence."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("There are %s in this class" % self.num_students)
        

In [52]:
g = GAClass(15)

In [53]:
g(10)

100

In [None]:
# CountVectorizer accepts a custom tokenizer/lemmatizer, provided it is a
# callable object; defining __call__ makes instances of this class callable.
class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes each token."""

    def __init__(self):
        # One lemmatizer is created up front and reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``, in token order."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [None]:
vect = CountVectorizer(tokenizer=LemmaTokenizer())  
cv = CountVectorizer()

In [None]:
s = ["He was or wasn't really excited about the baseball games"]

In [None]:
vect.fit(s)
vect.get_feature_names()

In [None]:
cv.fit(s)
cv.get_feature_names()

## Now, let's see if we can use some basic NLP features to extract the most popular topics during the Seattle PyData talks.

First let's load the data

In [18]:
pprint??

In [17]:
import json

# Each line of the file is decoded as its own JSON object (the file carries a
# .csv extension, but it is parsed line by line with json.loads).
with open('../../data/pydata_talks.csv') as fin:
    # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
    data = [json.loads(line) for line in fin if line.strip()]
# One row per decoded record; columns come from the JSON keys.
df = pd.DataFrame(data)

IOError: [Errno 2] No such file or directory: '../../data/pydata_talks.csv'

In [64]:
df.head()

Unnamed: 0,abstract,description,time,title
0,Registration and Breakfast,Registration and Breakfast,Friday 8 a.m.-9 a.m.,Registration and Breakfast
1,See description.,"Software and computers are everywhere, revolut...",Friday 9 a.m.-9:50 a.m.,Keynote: Computer Science: America's Untapped ...
2,Machine learning is the branch of computer sci...,This tutorial will offer an introduction to th...,Friday 10 a.m.-noon,Machine Learning with Scikit-Learn
3,The first part of the tutorial will cover basi...,The goal of this tutorial is to provide effici...,Friday 10 a.m.-noon,Python for Data Science: A Rapid On-ramp Primer
4,When I started learning more about statistics ...,We will learn how to make valid statistical in...,Friday 10 a.m.-noon,Simplified statistics through simulation


In [16]:
df['all_data'] = df.abstract + " " + df.description + " " + df.title

NameError: name 'df' is not defined

Now, let's try a basic CountVectorizer

In [148]:
cv = CountVectorizer()
X = cv.fit_transform(df.all_data)

In [149]:
fn = np.array(cv.get_feature_names())
# Sum each term's counts directly on the sparse matrix instead of densifying
# it with toarray(); the (1, n_terms) result is flattened to a 1-D array.
term_totals = np.asarray(X.sum(axis=0)).ravel()
# argsort is ascending, so the last 10 indices belong to the 10 largest totals.
common_words = fn[np.argsort(term_totals)[-10:]]
# Reverse so the most frequent term is printed first.
pprint(list(common_words[::-1]))

[u'the', u'and', u'of', u'to', u'in', u'data', u'for', u'is', u'with', u'this']


We saw there was limited success there. We should enhance it by removing stop words

In [150]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(df.all_data)

In [151]:
fn = np.array(cv.get_feature_names())
# Sum each term's counts directly on the sparse matrix instead of densifying
# it with toarray(); the (1, n_terms) result is flattened to a 1-D array.
term_totals = np.asarray(X.sum(axis=0)).ravel()
# argsort is ascending, so the last 20 indices belong to the 20 largest totals.
common_words = fn[np.argsort(term_totals)[-20:]]
# Reverse so the most frequent term is printed first.
pprint(list(common_words[::-1]))

[u'data',
 u'python',
 u'learning',
 u'talk',
 u'using',
 u'analysis',
 u'll',
 u'use',
 u'pandas',
 u'machine',
 u'models',
 u'tools',
 u'time',
 u'analytics',
 u'code',
 u'science',
 u'based',
 u'discuss',
 u'spark',
 u'model']


Perhaps we can try weighting words inversely by their frequency (TF-IDF)

In [158]:
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(df.all_data)

In [161]:
fn = np.array(cv.get_feature_names())
# Sum each term's TF-IDF weights directly on the sparse matrix instead of
# densifying it with toarray(); the (1, n_terms) result is flattened to 1-D.
term_totals = np.asarray(X.sum(axis=0)).ravel()
# argsort is ascending, so the last 20 indices belong to the 20 largest totals.
common_words = fn[np.argsort(term_totals)[-20:]]
# Reverse so the highest-weighted term is printed first.
pprint(list(common_words[::-1]))

[u'data',
 u'soon',
 u'coming',
 u'python',
 u'lunch',
 u'learning',
 u'talk',
 u'breakfast',
 u'using',
 u'break',
 u'snacks',
 u'analysis',
 u'll',
 u'pandas',
 u'use',
 u'machine',
 u'spark',
 u'science',
 u'code',
 u'registration']


In [146]:
from nltk import pos_tag

In [169]:
# Keep only the (word, tag) pairs whose part-of-speech tag is 'NN'.  A list
# comprehension displays as a list on both Python 2 and Python 3, whereas
# filter() returns a lazy iterator on Python 3.
[pair for pair in pos_tag(common_words[::-1]) if pair[1] == 'NN']

[(u'talk', 'NN'),
 (u'analysis', 'NN'),
 (u'use', 'NN'),
 (u'machine', 'NN'),
 (u'science', 'NN'),
 (u'model', 'NN'),
 (u'time', 'NN'),
 (u'work', 'NN'),
 (u'software', 'NN'),
 (u'performance', 'NN'),
 (u'space', 'NN'),
 (u'deep', 'NN')]

### Building our own preprocessor

In [162]:
import re
class my_preprocessor(object):
    """Callable preprocessor that lower-cases text and strips ASCII punctuation.

    Instances are meant to be passed as the ``preprocessor`` argument of a
    vectorizer, which calls them with one document string at a time.
    """

    def __init__(self):
        # A raw-string character class lists each punctuation mark once.  The
        # earlier alternation lived in a non-raw string, where `\\|\|` collapsed
        # to the regex `\|\|` and so matched only the two characters "||"
        # together, never a lone backslash or a lone pipe.
        self.spchars = re.compile(r"""[`~!@#$%^&*()_+=\\|{}\[\]:;'"<>,?/.\-]""")

    def __call__(self, doc):
        """Return ``doc`` lower-cased with every punctuation character removed."""
        return self.spchars.sub('', doc.lower())

In [163]:
cv = CountVectorizer(preprocessor=my_preprocessor(),stop_words='english')
X = cv.fit_transform(df.all_data)

In [167]:
fn = np.array(cv.get_feature_names())
# Sum each term's counts directly on the sparse matrix instead of densifying
# it with toarray(); the (1, n_terms) result is flattened to a 1-D array.
term_totals = np.asarray(X.sum(axis=0)).ravel()
# argsort is ascending, so the last 40 indices belong to the 40 largest totals.
common_words = fn[np.argsort(term_totals)[-40:]]
# Reverse so the most frequent term is printed first.
pprint(list(common_words[::-1]))

[u'data',
 u'python',
 u'learning',
 u'talk',
 u'using',
 u'analysis',
 u'use',
 u'pandas',
 u'models',
 u'machine',
 u'tools',
 u'ill',
 u'analytics',
 u'code',
 u'science',
 u'discuss',
 u'new',
 u'model',
 u'like',
 u'used',
 u'tutorial',
 u'time',
 u'techniques',
 u'spark',
 u'work',
 u'library',
 u'methods',
 u'coming',
 u'software',
 u'based',
 u'algorithm',
 u'algorithms',
 u'building',
 u'soon',
 u'libraries',
 u'applications',
 u'statistical',
 u'performance',
 u'space',
 u'deep']


In [168]:
# Keep only the (word, tag) pairs whose part-of-speech tag is 'NN'.  A list
# comprehension displays as a list on both Python 2 and Python 3, whereas
# filter() returns a lazy iterator on Python 3.
[pair for pair in pos_tag(common_words[::-1]) if pair[1] == 'NN']

[(u'talk', 'NN'),
 (u'analysis', 'NN'),
 (u'use', 'NN'),
 (u'machine', 'NN'),
 (u'science', 'NN'),
 (u'model', 'NN'),
 (u'time', 'NN'),
 (u'work', 'NN'),
 (u'software', 'NN'),
 (u'performance', 'NN'),
 (u'space', 'NN'),
 (u'deep', 'NN')]