## Import NLTKs corpora, books etc.

!python -m nltk.downloader all

In [None]:
import nltk
import numpy as np
import pandas as pd

In [None]:
# Explore 
dir(nltk.corpus)

In [None]:
# Explore
print(nltk.corpus.__class__)
nltk.corpus

In [None]:
nltk.corpus.gutenberg.fileids()

In [None]:
from nltk import book

print(dir(nltk.book))

In [None]:
# nltk.book contains 9 texts (text1 ... text9), as can be seen in the listing above
from nltk.book import text4

In [None]:
type(text4)

In [None]:
print(text4[:10])
print(np.shape(text4))

In [None]:
len(text4)

In [None]:
# Number of unique (lower-cased) words of length >= 5 in text4

len(set([word.lower() for word in text4 if len(word) >= 5])) # Total unique : 9913

In [None]:
' '.join(text4[:200])

#### Words in context

Search for a word in the text and display each match together with its surrounding context:


In [None]:
text4.concordance?

In [None]:
#text4.concordance("people")
text4.concordance("citizen",width=30, lines=10)  #This isn't case sensitive

What other words appear in a similar range of contexts? 

In [None]:
text4.similar?

In [None]:
text4.similar("citizen", num=5) #This isn't case sensitive

Examine just the contexts that are shared by two or more words:

In [None]:
text4.common_contexts

In [None]:
text4.common_contexts(["war", "democracy",'peace'], num=5) # can be used for 2 or more words

Location of a word in the text: how many spaces from the beginning does it appear? 

This positional information can be displayed using a dispersion plot. 

You need NumPy and Matplotlib. 


In [None]:
# Start pylab inline mode, so figures will appear in the notebook
%pylab inline

In [None]:
from collections import Counter

# ?Counter
c = Counter(text4)

print(c.most_common(4))

print(sum(c.values()))

print(c['vote'])

In [None]:
import matplotlib.pyplot as plt



# Dispersion plot 
from nltk.draw.dispersion import dispersion_plot
dispersion_plot(text4, ["citizens", "democracy", "freedom", "war", "America", "vote","the"])

#### Counting
The length of a text from start to finish, in terms of the words and punctuation symbols that appear. All tokens. 







In [None]:
len(text4)

Count how often a word occurs in a text:


In [None]:
text4.count("war")

How many distinct words does the Inaugural Address corpus (`text4`) contain? 
The vocabulary of a text is just the set of tokens that it uses. 

In [None]:
# Number of distinct tokens (types). Printed explicitly because only the
# last expression of a cell is displayed automatically.
print(len(set(text4)))

# Each type is used on average this many times: a rough measure of the
# richness of the text.
len(text4) / len(set(text4))

#### Define functions: 

What do you think they do? 


In [None]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    ``text`` is any sized collection of hashable tokens (e.g. a list of
    words). The result is a float in (0, 1]; a value of 1.0 means no token
    is repeated. Raises ``ZeroDivisionError`` when ``text`` is empty.
    """
    total_tokens = float(len(text))
    distinct_tokens = len(set(text))
    return distinct_tokens / total_tokens

In [None]:
def percentage(count, total):
    """Return ``count`` as a percentage of ``total`` (0-100 scale).

    For example ``percentage(1, 4)`` is ``25.0``. The result is always a
    float. Raises ``ZeroDivisionError`` when ``total`` is zero.
    """
    # Scale by 100 so the value really is a percentage, not a fraction.
    return 100.0 * count / float(total)

Then use the defined functions:

In [None]:
from __future__ import division #to get precise (float) division in Python 2.x. In Python 3.0 you get it automatically. 
lexical_diversity(text4)

In [None]:
percentage(text4.count('the'), len(text4)) 

#### Simple statistics

Counting Words Appearing in a Text (a frequency distribution). 


In [None]:
# FreqDist does pretty much the same thing as collections.Counter:
# it maps every token to the number of times it occurs.

from nltk import FreqDist
fdist1 = FreqDist(text4)

# (token, count) pairs ordered by decreasing count. Only the top 20 are shown
# so the cell output stays readable; call fdist1.most_common() for all of them.
fdist1.most_common(20)

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
# plot the top 20 tokens
fdist1.plot(20)

In [None]:
# Hmmm..this is a good start...but we don't care about words like "for" and "and"
from nltk.corpus import stopwords

# let's use NLTK's built-in list of "stopwords"
stoplist = stopwords.words('english')
print(stoplist)

np.shape(stoplist)

In [None]:
# Frequency plot after removing stop words.
# NLTK's stop list is all lower-case, so tokens are lower-cased for the
# membership test; otherwise capitalised stop words ("The", "It", "We")
# slip through. A set makes each lookup constant-time.
stop_set = set(stoplist)
tokens = [token for token in text4 if token.lower() not in stop_set]

frequencyDistribution = nltk.FreqDist(tokens)
# Show the most frequent remaining tokens (printing `.freq` without calling
# it would only print the bound-method repr).
print(frequencyDistribution.most_common(20))

plt.figure(figsize=(12, 6))
# plot the top 20 tokens
frequencyDistribution.plot(20)

In [None]:
# Frequency plot after removing single-character tokens and leftover symbols

# We don't want these symbols (or these capitalised stop words)
symbols = {"``", "''", ":", 'The', '--', ' ', ' - ', 'It', 'We'}

# Keep tokens longer than one character that are not in the unwanted set.
# Filtering is idempotent, so re-running this cell gives the same `tokens`.
tokens = [token for token in tokens if len(token)>1 and token not in symbols]

frequencyDistribution = nltk.FreqDist(tokens)
# Show the most frequent remaining tokens (printing `.freq` without calling
# it would only print the bound-method repr).
print(frequencyDistribution.most_common(20))

plt.figure(figsize=(12, 6))
# plot the top 20 tokens
frequencyDistribution.plot(20)
# MUCH BETTER!!

In [None]:
# fdist1 maps every distinct token (type) in text4 to its count
vocabulary1 = list(fdist1.keys())  # materialise the keys view so it can be sliced
vocabulary1[:3]  # look at the first 3

- words that occur only once, called hapaxes: 

In [None]:
print(np.shape(fdist1.hapaxes()))
print(fdist1.hapaxes())

 
- words that meet a condition, are long for example
    

In [None]:
V = set(text4)
long_words = [w for w in V if len(w) > 10]
print(sorted(long_words))

- words that characterize a text (are relatively long, and occur frequently)

In [None]:
# Words with length > 12 and frequency > 7

fdist = FreqDist(text4)
sorted([w for w in set(text4) if len(w) > 12 and fdist[w] > 7])

In [None]:
np.shape(tokens)

In [None]:
# Note: you need to install the wordcloud package using pip
# Simple WordCloud
from os import path
# `scipy.misc.imread` has been removed from SciPy and was never used in this
# notebook, so it is no longer imported here (the import broke the whole cell).
import matplotlib.pyplot as plt
import random

from wordcloud import WordCloud, STOPWORDS      # need to install wordcloud package

#text = 'all your base are belong to us all of your base base base'
# Build the cloud from the filtered tokens, joined back into one string
wordcloud = WordCloud(width = 1000, height = 500).generate(' '.join(tokens))

In [None]:
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show()

#### Conditional frequency distributions

Working with the inaugural corpus:















In [None]:
from nltk.corpus import inaugural

# Peek at the first two and the last ten file IDs. Both are printed
# explicitly because only the last expression of a cell is displayed.
print(inaugural.fileids()[:2])
print(inaugural.fileids()[-10:])
# The first 4 characters of each file ID are sliced off (fileid[:4]) when
# the conditional frequency distribution is built.

How are the words "America" and "war" used over time?

In [None]:
# For every word in every inaugural address, emit a (target, year) pair when
# the lower-cased word starts with one of the target prefixes. The year is
# the first four characters of the file ID.
targets = ['america', 'war']
occurrences = (
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in targets
    if w.lower().startswith(target)
)
cfd = nltk.ConditionalFreqDist(occurrences)
cfd.plot()

Working with the Brown corpus (news category):


In [None]:
from nltk.corpus import brown

len(set(inaugural.words())) # this is same as text 4

# ?brown.words

In [None]:
news_words = brown.words(categories="news") 
print(news_words) # get the first words in the corpus

In [None]:
len(news_words)

In [None]:
freq = nltk.FreqDist(news_words)
freq.plot(30) # frequency of most commonly used words in the corpus

How are different modal verbs used in different genres of the Brown corpus? 

In [None]:
from nltk import FreqDist

verbs = ["should", "may", "can"]
genres = ["news", "government", "romance"]

# For each genre, count (case-insensitively) how often each verb occurs.
# `words` and `freq` keep the values of the last genre after the loop ends.
for g in genres:
    words = brown.words(categories=g)
    freq = FreqDist(token for token in map(str.lower, words) if token in verbs)
    print(g, freq)

In [None]:
freq # this is for romance -->  last iteration

#### Stopwords

What percentage of the words in a corpus are NOT stopwords? 


In [None]:
from nltk.corpus import stopwords
len(stopwords.words('english'))

In [None]:
def content_fraction(text):
    """Return the fraction of tokens in ``text`` that are not English stop words.

    ``text`` is a sized iterable of string tokens (e.g. a corpus word list).
    Tokens are lower-cased before being compared with NLTK's English stop
    word list. The result is a float between 0 and 1. Raises
    ``ZeroDivisionError`` when ``text`` is empty.
    """
    # A set gives constant-time membership tests; scanning the list for every
    # token is needlessly slow on corpora with a million or more words.
    stop_set = frozenset(nltk.corpus.stopwords.words('english'))
    content_count = sum(1 for w in text if w.lower() not in stop_set)
    return content_count / len(text)


In [None]:
# This contains 52.23% of non-stop words

print(content_fraction(nltk.corpus.inaugural.words()))

In [None]:
# This contains 59.09% of non-stop words

content_fraction(nltk.corpus.brown.words())

## Importing and accessing your own text

Useful libraries: 

In [None]:
import nltk, re, pprint
import requests
import urllib

# urllib.request.urlopen(link)

### User input


In [None]:
s = input("Enter some text: ")

### Online articles
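Getting text out of HTML is a sufficiently common task that older NLTK releases provided a helper function, `nltk.clean_html()`, for it. That helper has been removed in NLTK 3 (calling it raises `NotImplementedError`), so the cells below use BeautifulSoup's `get_text()` to turn an HTML string into raw text.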

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
url = "http://www.bbc.co.uk/news/education-24367153"
# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Extract all human-readable text from the parsed document
raw = soup.get_text()

In [None]:
raw

In [None]:
#raw = nltk.clean_html(html)
tokens = nltk.word_tokenize(raw)
tokens[:15]

## Text similarity

We can use both NLTK and scikit-learn for this. 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

Calculate tf-idf:

In [None]:
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(["New Year's Eve in New York",
                            "New Year's Eve in London",
                            "York is closer to London than to New York",
                            "London is closer to Bucharest than to New York"])

Calculate cosine similarity:

In [None]:
# vect.vocabulary

In [None]:
tfidf.toarray()

In [None]:
# TfidfVectorizer L2-normalises each row by default, so the matrix product of
# the tf-idf matrix with its transpose is the pairwise cosine similarity.
# `@` is an unambiguous matrix product and `.toarray()` replaces the `.A`
# shorthand, which is deprecated/removed in recent SciPy releases.
cosine = (tfidf @ tfidf.T).toarray()
print(cosine)

## Trained classification with NLTK

#### Names-gender identification example

In [None]:
from nltk.corpus import names
import random

Select relevant features. Here, the last letter of the name. 

In [None]:
def gender_features(word):
    """Return the feature dict for ``word``: its final character.

    The single feature is stored under the key ``'last_letter'``. Raises
    ``IndexError`` when ``word`` is empty.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)

What is the feature for the name Shrek? 

gender_features('Shrek')

What is the feature for your own name? 

In [None]:
# names.words('names.txt')

In [None]:
gender_features('iulia')

Train and test data: 

In [None]:
# Build the list of (name, gender) pairs. The corpus is read through
# nltk.corpus.names rather than the bare `names` import: this cell rebinds
# `names` to a list, so using the bare name would fail when the cell is re-run.
names = ([(name, 'male') for name in nltk.corpus.names.words('male.txt')] +
          [(name, 'female') for name in nltk.corpus.names.words('female.txt')])

Arrange data randomly and extract features

In [None]:
# Fixed seed so the shuffle, and therefore the train/test split and the
# reported accuracy, is the same on every Restart & Run All.
random.seed(42)
random.shuffle(names)
featuresets = [(gender_features(n), g) for (n,g) in names]
from nltk.classify import apply_features # use apply if you're working with large corpora

Divide data into training and test sets:

In [None]:
names[:10] # list containing (name,gender)

In [None]:
# Hold out the first 500 shuffled names for testing and train on all the
# rest (previously only names[500:1000] were used, discarding most of the
# labelled data). The two slices do not overlap.
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])

Use a Naive Bayes Classifier:

In [None]:
train_set[:3]

In [None]:
# train_set holds (features, label) pairs with string-valued features; nltk.NaiveBayesClassifier works on those as well
classifier = nltk.NaiveBayesClassifier.train(train_set)

Classify the test set and evaluate performance

In [None]:
print(nltk.classify.accuracy(classifier, test_set))

What are the most informative features?

In [None]:
classifier.show_most_informative_features(5)

In [None]:
classifier.classify(gender_features('iulia'))

In [None]:
classifier.classify(gender_features('cioroianu'))