# Importing data and performing basic stuff

In [1]:
import nltk

In [2]:
from nltk.corpus import movie_reviews

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
movie_reviews.fileids()

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [5]:
movie_reviews.fileids('neg')

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [6]:
len(movie_reviews.fileids())

2000

In [7]:
movie_reviews.words(movie_reviews.fileids()[0])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

# Formatting data

In [8]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [9]:
import random
random.shuffle(documents)
documents[0:5]

[(['i', 'have', 'to', 'admit', 'that', 'i', 'disliked', ...], 'pos'),
 (['"', 'spawn', '"', 'features', 'good', 'guys', ',', ...], 'neg'),
 (['susan', 'granger', "'", 's', 'review', 'of', '"', ...], 'pos'),
 (['studio', '54', 'attracted', 'so', 'many', 'weird', ...], 'neg'),
 (['"', '.', '.', '.', 'because', 'i', "'", 'm', 'a', ...], 'neg')]

# Cleaning data

In [10]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [11]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [12]:
from nltk.corpus import stopwords
import string
stops = stopwords.words('english') + list(string.punctuation)

In [13]:
from nltk import pos_tag

In [14]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])                                 
            clean_word = lemmatizer.lemmatize(w, get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [15]:
import time

In [16]:
start = time.time()
documents = [(clean_review(document), category) for document, category in documents]
end = time.time()
print("Cleaning time: ", end - start)

Cleaning time:  340.1286940574646


In [17]:
documents[0]

(['admit',
  'disliked',
  'film',
  'initially',
  'certianly',
  'every',
  'taste',
  'sheer',
  'torture',
  'sit',
  'restless',
  'mood',
  'say',
  'right',
  'mood',
  'absolutely',
  'incredible',
  'second',
  'favorite',
  'movie',
  '1998',
  'would',
  'shoo',
  'first',
  'almost',
  'year',
  'perhaps',
  'big',
  'turn',
  'many',
  'film',
  'unconventionality',
  'hard',
  'press',
  'compare',
  'film',
  'see',
  'artsy',
  'incredibly',
  'slow',
  'amazingly',
  'work',
  'beautifully',
  'second',
  'view',
  'realize',
  'film',
  'follow',
  'three',
  'act',
  'structure',
  'think',
  'sort',
  'structure',
  'first',
  'saw',
  'first',
  'act',
  'serf',
  'set',
  'character',
  'sort',
  'exists',
  'even',
  'moreso',
  'set',
  'mood',
  'tension',
  'restlesness',
  'perhaps',
  'even',
  'feeling',
  'boredom',
  'shatter',
  'intense',
  'violence',
  'second',
  'encompasses',
  'movie',
  'majority',
  'film',
  'one',
  'extend',
  'battle',
  'sc

# Train - Test split

In [18]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

# Reformatting data so as to feed it to in built nltk classifiers

In [19]:
all_words = []
for doc in documents:
    all_words += doc[0]

In [20]:
freq = nltk.FreqDist(all_words)                 #will retrurn a freq distribution object
common = freq.most_common(3000)
features = [i[0] for i in common]

In [21]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'come',
 'also',
 'bad',
 'give',
 'life',
 'two',
 'look',
 'way',
 'know',
 'seem',
 'first',
 'end',
 '--',
 'year',
 'work',
 'thing',
 'plot',
 'say',
 'play',
 'really',
 'little',
 'show',
 'people',
 'could',
 'man',
 'star',
 'love',
 'never',
 'try',
 'great',
 'director',
 'best',
 'performance',
 'new',
 'big',
 'many',
 'action',
 'actor',
 'want',
 'u',
 'watch',
 'find',
 'think',
 'role',
 'act',
 'another',
 'back',
 'audience',
 'world',
 'something',
 'turn',
 'still',
 'day',
 'old',
 'set',
 'however',
 'use',
 'every',
 'begin',
 'though',
 'guy',
 'part',
 'comedy',
 'feel',
 'cast',
 'real',
 'enough',
 'around',
 'point',
 'interest',
 'last',
 'run',
 'write',
 'young',
 'may',
 'fact',
 'name',
 'long',
 'funny',
 'script',
 'actually',
 'right',
 'minute',
 'woman',
 'effect',
 'almost',
 'lot'

In [22]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [23]:
# NOTE: through the above function we are changing the format of the data
# because nltk classifiers requires the data to be in the format of :
# array of tuples , where each tuple has a dictionary of features and a category

In [24]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

# Classification using NLTK Naive Bayes

In [25]:
from nltk import NaiveBayesClassifier

In [26]:
classifier = NaiveBayesClassifier.train(training_data)

In [27]:
nltk.classify.accuracy(classifier, testing_data)

0.786

In [28]:
classifier.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     17.6 : 1.0
                 idiotic = True              neg : pos    =     11.6 : 1.0
                  seagal = True              neg : pos    =     10.9 : 1.0
                 emperor = True              pos : neg    =      8.4 : 1.0
               ludicrous = True              neg : pos    =      8.3 : 1.0
                  turkey = True              neg : pos    =      7.8 : 1.0
             wonderfully = True              pos : neg    =      7.0 : 1.0
              schumacher = True              neg : pos    =      6.9 : 1.0
                   anger = True              pos : neg    =      6.6 : 1.0
             beautifully = True              pos : neg    =      6.2 : 1.0
                   inept = True              neg : pos    =      6.2 : 1.0
                   ideal = True              pos : neg    =      6.0 : 1.0
                 refresh = True              pos : neg    =      6.0 : 1.0

# using sklearn classifiers but data in the format of nltk classifiers

In [29]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [30]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [31]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [32]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.862

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [35]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [36]:
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.832

# if we want to use sklearn classifier and have data in sklearn classifiers format 

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
train_set = {"the sky sky is blue", "the sun is bright"}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
a.todense()

matrix([[1, 0, 1],
        [1, 2, 1]], dtype=int64)

In [39]:
count_vec.get_feature_names()

['is', 'sky', 'the']

In [40]:
a = ["ad", "is"]
" ".join(a)

'ad is'

In [41]:
categories = [category for document, category in documents]

In [42]:
text_documents = [" ".join(document) for document, category in documents]

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [45]:
count_vec = CountVectorizer(max_features = 2000 , ngram_range=(1,3) , max_df=0.8 , min_df=0.1)
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 2, 0, ..., 0, 0, 0],
        [0, 0, 2, ..., 0, 0, 1],
        [0, 0, 1, ..., 0, 0, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 1]], dtype=int64)

In [46]:
count_vec.get_feature_names()

['able',
 'act',
 'action',
 'actor',
 'actually',
 'add',
 'age',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'american',
 'among',
 'another',
 'anyone',
 'anything',
 'appear',
 'around',
 'ask',
 'attempt',
 'attention',
 'audience',
 'away',
 'back',
 'bad',
 'base',
 'beautiful',
 'become',
 'becomes',
 'begin',
 'behind',
 'believe',
 'best',
 'big',
 'bit',
 'black',
 'body',
 'book',
 'boring',
 'boy',
 'break',
 'bring',
 'brother',
 'call',
 'camera',
 'car',
 'care',
 'career',
 'carry',
 'case',
 'cast',
 'certainly',
 'chance',
 'change',
 'character',
 'charm',
 'chase',
 'child',
 'city',
 'classic',
 'close',
 'co',
 'come',
 'comedy',
 'comic',
 'completely',
 'consider',
 'could',
 'couple',
 'course',
 'create',
 'credit',
 'cut',
 'dark',
 'daughter',
 'david',
 'day',
 'dead',
 'deal',
 'death',
 'despite',
 'dialogue',
 'die',
 'different',
 'direct',
 'direction',
 'director',
 'do',
 'drama',
 'dream',
 'early',
 'easy',
 'effec

In [47]:
x_test_features = count_vec.transform(x_test)

In [48]:
x_test_features

<500x449 sparse matrix of type '<class 'numpy.int64'>'
	with 48277 stored elements in Compressed Sparse Row format>

In [49]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features , y_train)
svc.score(x_test_features , y_test)

0.788

In [51]:
from sklearn.naive_bayes import GaussianNB
nb1 = GaussianNB()
nb1.fit(x_train_features.todense() , y_train)
nb1.score(x_test_features.todense() , y_test)

0.72