# Text Classification

In [35]:
import nltk
import string
from nltk.corpus import movie_reviews,stopwords
from nltk.tokenize import word_tokenize
import pickle

### 1. Dataset

In [2]:
movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [3]:
len(movie_reviews.words())

1583820

In [5]:
# Distinct Words
len(set(movie_reviews.words()))

39768

In [10]:
#Checking some files
movie_reviews.fileids()[-5:]

['pos/cv995_21821.txt',
 'pos/cv996_11592.txt',
 'pos/cv997_5046.txt',
 'pos/cv998_14111.txt',
 'pos/cv999_13106.txt']

### 2. Text Preprocessing

In [26]:
reviews = ' '.join(movie_reviews.words())

In [27]:
reviews[:300]

'plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what \' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the '

##### 2.1 Removing Punctuation

In [28]:
# The first two arguments of str.maketrans describe a character-to-character
# mapping (none is needed here); only the third lists characters to delete.
filtered_reviews = reviews.translate(str.maketrans('', '', string.punctuation))

In [29]:
filtered_reviews[:300]

'plot  two teen couples go to a church party  drink and then drive  they get into an accident  one of the guys dies  but his girlfriend continues to see him in her life  and has nightmares  what  s the deal  watch the movie and  sorta  find out    critique  a mind  fuck movie for the teen generation '

##### 2.2 Tokenize and Lower

In [30]:
tokens = word_tokenize(filtered_reviews.lower())

In [32]:
tokens[:10]

['plot', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', 'drink']

##### 2.3 Remove Stopwords

In [34]:
# NLTK stores the stopword lists under lowercase file ids; 'English' only
# resolves on case-insensitive filesystems.
stopwords_eng = stopwords.words('english')
# Membership is tested once per token, so look words up in a set instead of
# scanning the list each time.
stopword_lookup = set(stopwords_eng)
filtered_tokens = [word for word in tokens if word not in stopword_lookup]
filtered_tokens[:10]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get']

##### 2.4 Word Frequencies

In [36]:
counter_dict = nltk.FreqDist(filtered_tokens)
counter_dict.most_common(15)

[('film', 9519),
 ('one', 5853),
 ('movie', 5774),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2170),
 ('would', 2110),
 ('much', 2050),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1912),
 ('well', 1906)]

##### 2.5 Feature Selection

In [54]:
# Keep the 3000 most frequent tokens as the feature vocabulary;
# most_common yields (token, count) pairs and only the token is needed.
word_features = [token for token, _count in counter_dict.most_common(3000)]
word_features[:15]

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much',
 'character',
 'also',
 'get',
 'two',
 'well']

### 3. Create Document with Labels (Categories)

In [41]:
catg = movie_reviews.categories()
catg

['neg', 'pos']

In [42]:
movie_reviews.fileids(['neg'])

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [44]:
# One (token list, category) pair per review file, grouped by category in the
# order returned by movie_reviews.categories().
docs = [
    (list(movie_reviews.words(review_file)), label)
    for label in movie_reviews.categories()
    for review_file in movie_reviews.fileids(label)
]

In [45]:
len(docs)

2000

In [49]:
docs[0]

(['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'generally',
  'a

### 4. Feature Engineering

In [56]:
# Flag which of the selected feature words occur in a document
def search_features(doc,word_features):
    """Build a presence map of the feature vocabulary for one document.

    Args:
        doc: iterable of tokens that make up the document.
        word_features: iterable of words to look for.

    Returns:
        dict mapping every word in ``word_features`` to True when it occurs
        in ``doc`` and to False otherwise.
    """
    # Deduplicate once so each lookup below is a constant-time set probe.
    vocabulary = set(doc)
    return {candidate: candidate in vocabulary for candidate in word_features}

In [57]:
search_features(docs[0][0],word_features)

{'film': True,
 'one': True,
 'movie': True,
 'like': True,
 'even': True,
 'good': True,
 'time': False,
 'story': False,
 'would': True,
 'much': False,
 'character': True,
 'also': True,
 'get': True,
 'two': True,
 'well': True,
 'characters': True,
 'first': False,
 'see': True,
 'way': True,
 'make': True,
 'life': True,
 'really': True,
 'films': True,
 'plot': True,
 'little': True,
 'people': True,
 'could': False,
 'scene': False,
 'bad': True,
 'man': False,
 'never': False,
 'best': False,
 'new': True,
 'scenes': True,
 'many': False,
 'director': True,
 'know': True,
 'movies': True,
 'action': False,
 'great': False,
 'another': False,
 'love': False,
 'go': True,
 'made': False,
 'us': True,
 'big': True,
 'end': False,
 'something': False,
 'back': True,
 'still': True,
 'world': True,
 'seems': True,
 'work': False,
 'makes': True,
 'however': False,
 'every': True,
 'though': False,
 'better': False,
 'real': False,
 'audience': True,
 'enough': False,
 'seen': False

In [58]:
# One (feature dict, category) pair per review, in the same order as docs.
featureSet = [
    (search_features(review_tokens, word_features), label)
    for review_tokens, label in docs
]

In [62]:
len(featureSet[0][0])

3000

In [63]:
featureSet[0][1]

'neg'

In [121]:
featureSet[0]

({'film': True,
  'one': True,
  'movie': True,
  'like': True,
  'even': True,
  'good': True,
  'time': False,
  'story': False,
  'would': True,
  'much': False,
  'character': True,
  'also': True,
  'get': True,
  'two': True,
  'well': True,
  'characters': True,
  'first': False,
  'see': True,
  'way': True,
  'make': True,
  'life': True,
  'really': True,
  'films': True,
  'plot': True,
  'little': True,
  'people': True,
  'could': False,
  'scene': False,
  'bad': True,
  'man': False,
  'never': False,
  'best': False,
  'new': True,
  'scenes': True,
  'many': False,
  'director': True,
  'know': True,
  'movies': True,
  'action': False,
  'great': False,
  'another': False,
  'love': False,
  'go': True,
  'made': False,
  'us': True,
  'big': True,
  'end': False,
  'something': False,
  'back': True,
  'still': True,
  'world': True,
  'seems': True,
  'work': False,
  'makes': True,
  'however': False,
  'every': True,
  'though': False,
  'better': False,
  'real':

### 5. Training and Test Split

In [65]:
# 80 : 20 Split
# featureSet keeps the category-by-category order of docs, so the held-out
# set is taken from both ends of the list and the middle is used for training.
# NOTE(review): there is no shuffling; this relies on the two categories
# sitting at opposite ends of the list — confirm the class sizes.
X_test = featureSet[:200] + featureSet[1800:]
X_train = featureSet[200:1800]

### 6. Classification

In [75]:
classifier = nltk.NaiveBayesClassifier.train(X_train)

In [76]:
model_accuracy = nltk.classify.accuracy(classifier,X_test)

In [77]:
model_accuracy

0.84

In [109]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     13.0 : 1.0
                 idiotic = True              neg : pos    =     10.2 : 1.0
                   mulan = True              pos : neg    =      9.0 : 1.0
                  seagal = True              neg : pos    =      9.0 : 1.0
             outstanding = True              pos : neg    =      8.8 : 1.0
                   anger = True              pos : neg    =      8.7 : 1.0
            breathtaking = True              pos : neg    =      8.4 : 1.0
                   inept = True              neg : pos    =      7.4 : 1.0
                   damon = True              pos : neg    =      7.3 : 1.0
                  finest = True              pos : neg    =      7.0 : 1.0
                    lame = True              neg : pos    =      6.5 : 1.0
             wonderfully = True              pos : neg    =      6.3 : 1.0
              schumacher = True              neg : pos    =      6.2 : 1.0

**Homework:** Test the model with our own review

In [82]:
prob_result = classifier.prob_classify(X_test[0][0])

In [85]:
prob_result.prob('neg')

0.999998790793755

### 7. Saving and Loading the Model

In [101]:
#Save Model

In [95]:
filename = '../AML 2304/gnb_trained_model.pk1'

In [96]:
# The context manager closes the file even if pickle.dump raises.
with open(filename,'wb') as save_classifier:
    pickle.dump(classifier,save_classifier)

In [102]:
# Load Trained Model

In [104]:
# NOTE: pickle.load can execute arbitrary code, so only load model files from
# a trusted source. The context manager closes the file even if loading fails.
with open(filename,'rb') as classifier_trained:
    loaded_classifier = pickle.load(classifier_trained)

In [106]:
model_accuracy = nltk.classify.accuracy(loaded_classifier,X_test)
model_accuracy

0.84

### Homework

In [167]:
my_review_text = word_tokenize('breathtaking outstanding a mulan finest damon good good Movie')

label = 'pos'

# Build the feature dict with the same helper used for the training documents.
# Tokens are lowercased first because word_features was derived from lowercased
# text, so a token such as 'Movie' would otherwise never match 'movie'.
my_review_final = search_features(
    [token.lower() for token in my_review_text], word_features
)



In [168]:
my_review_text
# my_review_final

['breathtaking',
 'outstanding',
 'a',
 'mulan',
 'finest',
 'damon',
 'good',
 'good',
 'Movie']

In [169]:
type(X_test[0])

tuple

In [170]:
review = (my_review_final,label)
# my_review_final

In [171]:
# type(review)
classifier.classify(my_review_final)

'neg'