## Classifying texts with NLTK
This notebook will show how to classify labeled texts using NLTK.
We are going use the movie_review data set in NLTK corpus. This dataset contains movie review labeled with positive or negtive, so we can train and text our classifier.

In [2]:
import nltk
import random
from nltk.corpus import movie_reviews

# In each category (we have pos or neg), 
for category in movie_reviews.categories():
    #take all of the file IDs (each review has its own ID)
    for fileid in movie_reviews.fileids(category):
        #then store the word_tokenized version (a list of words) for the file ID, 
        #followed by the positive or negative label in one big list.
        documents.append([list(movie_reviews.words(fileid)),category])
    
#Next, we use random to shuffle our documents. 
#This is because we're going to be training and testing. 
#If we left them in order, chances are we'd train on all of the negatives, some positives, and then test only against positives. 
random.shuffle(documents)

print(documents[1])

[['for', 'a', 'film', 'touted', 'as', 'exploring', 'relationships', 'and', 'black', 'sexuality', ',', 'trois', 'is', 'surprisingly', 'tame', '.', 'despite', 'it', "'", 's', 'lurid', 'subject', 'matter', 'and', 'it', "'", 's', 'passing', 'nod', 'to', 'fatal', 'attraction', ',', 'it', 'moves', 'along', 'with', 'flat', ',', 'uninspired', 'dialogue', 'as', 'it', 'sets', 'up', 'a', 'surprising', 'climax', 'that', 'tries', 'mightily', 'to', 'overthrow', 'the', 'considerable', 'dead', 'weight', 'of', 'the', 'rest', 'of', 'the', 'film', '.', 'freshly', 'moved', 'to', 'atlanta', ',', 'jermaine', '(', 'dourdan', ')', 'and', 'his', 'wife', ',', 'jasmine', '(', 'moore', ')', ',', 'have', 'the', 'trappings', 'of', 'a', 'perfect', 'life', '.', 'they', 'have', 'a', 'beautiful', 'house', 'in', 'suburbia', '.', 'jermaine', 'is', 'a', 'lawyer', 'on', 'the', 'fast', 'track', 'at', 'his', 'firm', '.', 'jasmine', 'is', 'his', 'supportive', 'wife', ',', 'who', 'is', 'finishing', 'up', 'her', 'college', 'deg

In [10]:
# Remove stopwords ad puctuation
from nltk.corpus import stopwords
import string

stopWords = set(stopwords.words('english'))
re_documents = []
for doc in documents:
    words = []
    for w in doc[0]:
        if w not in stopWords and w not in string.punctuation:
            words.append(w)
    re_documents.append([words,doc[1]])
re_documents[0]

[['great',
  'things',
  'come',
  'end',
  'dot',
  'com',
  'era',
  'embodies',
  'perfectly',
  'beneath',
  'mound',
  'bankruptcy',
  'paperwork',
  'lies',
  'remains',
  'former',
  'dot',
  'com',
  'darling',
  'company',
  'kozmo',
  'com',
  'online',
  'convenience',
  'store',
  'stocked',
  'ice',
  'cream',
  'porn',
  'videos',
  'basic',
  'necessities',
  'urban',
  'dweller',
  'hand',
  'delivered',
  'couriers',
  'within',
  'hour',
  'designed',
  '1997',
  'two',
  'college',
  'roommates',
  '--',
  'joseph',
  'parks',
  '27',
  'year',
  'old',
  'goldman',
  'sachs',
  'banker',
  'yong',
  'kang',
  'kozmo',
  'flamed',
  'three',
  'short',
  'years',
  'raising',
  '280',
  'million',
  'venture',
  'capital',
  'funding',
  'partnerships',
  'bigwigs',
  'starbucks',
  'amazon',
  'com',
  'december',
  '1999',
  'company',
  'boasted',
  '4',
  '000',
  'employees',
  '11',
  'cities',
  'barking',
  'ceo',
  'park',
  'attracting',
  'kinds',
  'media

In [5]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)
print (all_words.most_common(15))
print (all_words['bad'])

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
1395
