## NLP on Movie_Reviews Dataset

In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
print(len(movie_reviews.fileids('pos')))
print(len(movie_reviews.fileids('neg')))

1000
1000


In [4]:
movie_reviews.words(movie_reviews.fileids('pos')[5])

['on', 'june', '30', ',', '1960', ',', 'a', 'self', ...]

In [5]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [6]:
import random
random.shuffle(documents)
documents[:5]

[(['a', 'couple', 'of', 'criminals', '(', 'mario', 'van', ...], 'neg'),
 (['capsule', ':', 'combine', 'one', 'quart', 'of', ...], 'neg'),
 (['the', 'makers', 'of', 'jurassic', 'park', '&', 'the', ...], 'pos'),
 (['if', 'you', "'", 've', 'ever', 'perused', 'my', ...], 'pos'),
 (['numerous', 'comparisons', 'can', 'be', 'made', ...], 'neg')]

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
import string
stop_words = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stop_words.update(punctuations)
print(stop_words, string.punctuation)

{'his', '=', 'will', 'because', 'any', 'themselves', "didn't", 'but', 'or', 'll', 'after', 'wasn', 'are', 'out', 'then', '~', 'down', 'by', "you'll", 'those', 'yours', 'with', '-', 'very', 'shouldn', 'does', "she's", '<', 'we', "mustn't", '{', 'yourselves', 't', '&', 'your', 'it', "doesn't", '}', 'whom', 'from', 'd', 'between', 'where', 'm', 'i', "hadn't", 'ours', 'have', 'own', '!', 'her', 'were', 'now', 'these', 'doing', '?', 'here', 'there', '^', 'and', 'how', "mightn't", 'again', "isn't", 'few', '$', '@', 'wouldn', 'do', 'myself', 'their', 's', 'you', 'ain', 'during', 'why', "wouldn't", '/', 'isn', 'himself', 'itself', "weren't", 'be', 'herself', 'against', 'under', 'at', 'was', '_', 'an', 'a', 'its', "shan't", 'hadn', 'each', '.', '%', 'ourselves', 'over', 'being', 'as', 'should', 'if', 'haven', "you're", 'won', "don't", 'don', 'yourself', "should've", ';', "you'd", '+', 'other', 'both', 'shan', 'what', 'am', 'this', 'about', '#', '>', 'when', 'who', 'mustn', 'mightn', 'couldn', "

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop_words:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [10]:
documents = [(clean_review(document), category) for document, category in documents]

In [11]:
print(documents[0])

(['couple', 'criminal', 'mario', 'van', 'peebles', 'loretta', 'devine', 'move', 'rich', 'family', 'house', 'hope', 'con', 'jewel', 'however', 'someone', 'else', 'steal', 'jewel', 'able', 'get', 'writer', 'mario', 'van', 'peebles', 'delivers', 'clever', 'script', 'several', 'unexpected', 'plot', 'twist', 'director', 'mario', 'van', 'peebles', 'undermines', 'high', 'point', 'haphazard', 'camera', 'work', 'edit', 'pace', 'felt', 'though', 'film', 'wrap', 'hour', 'mark', 'ala', 'still', '35', 'minute', 'go', 'daniel', 'baldwin', 'believe', 'type', 'give', 'best', 'performance', 'film', 'outshine', 'talented', 'member', 'cast', 'r'], 'neg')


In [12]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [13]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [14]:
import nltk

In [15]:
freq = nltk.FreqDist(all_words) # Frequency Distribution i.e the freq of occurance of a particular word
common = freq.most_common(3000) # The most frequent words also referred as most_common(3000)
features = [i[0] for i in common]

In [16]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [17]:
get_feature_dict(training_documents[0][0])

{'film': True,
 'movie': False,
 'one': False,
 'make': False,
 'like': False,
 'character': False,
 'get': True,
 'see': False,
 'go': True,
 'time': False,
 'well': False,
 'scene': False,
 'even': False,
 'good': False,
 'story': False,
 'take': False,
 'would': False,
 'much': False,
 'come': False,
 'bad': False,
 'also': False,
 'life': False,
 'two': False,
 'seem': False,
 'look': False,
 'give': True,
 'way': False,
 'end': False,
 'first': False,
 'know': False,
 '--': False,
 'work': True,
 'year': False,
 'thing': False,
 'plot': True,
 'really': False,
 'play': False,
 'say': False,
 'little': False,
 'people': False,
 'show': False,
 'man': False,
 'could': False,
 'never': False,
 'great': False,
 'love': False,
 'try': False,
 'director': True,
 'performance': True,
 'big': False,
 'star': False,
 'new': False,
 'best': True,
 'many': False,
 'u': False,
 'want': False,
 'action': False,
 'actor': False,
 'watch': False,
 'think': False,
 'find': False,
 'role': False,


In [18]:
# training_documents

In [19]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

In [20]:
# training_data[0]

In [21]:
from nltk import NaiveBayesClassifier

In [22]:
classifier = NaiveBayesClassifier.train(training_data)

In [23]:
nltk.classify.accuracy(classifier, testing_data)

0.786

In [24]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     18.1 : 1.0
             outstanding = True              pos : neg    =     11.8 : 1.0
                 stiller = True              pos : neg    =     10.2 : 1.0
             wonderfully = True              pos : neg    =      9.1 : 1.0
                 winslet = True              pos : neg    =      8.8 : 1.0
                   jolie = True              neg : pos    =      7.3 : 1.0
                   mulan = True              pos : neg    =      6.7 : 1.0
              whatsoever = True              neg : pos    =      6.7 : 1.0
                 refresh = True              pos : neg    =      6.4 : 1.0
                 idiotic = True              neg : pos    =      6.0 : 1.0
              schumacher = True              neg : pos    =      5.9 : 1.0
                   waste = True              neg : pos    =      5.9 : 1.0
                  random = True              neg : pos    =      5.8 : 1.0

### Using CountVectorizer 
#### to basicallly convert the data into the format that sklearn requires(like a 2D X array and Y array being output) from the already cleaned data from nltk

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
train_set = {'the sky is sky blue', 'the sun is bright'}
count_vec = CountVectorizer(max_features=3)
a = count_vec.fit_transform(train_set)
a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [27]:
count_vec.get_feature_names_out()

array(['is', 'sky', 'the'], dtype=object)

In [28]:
categories = [category for document, category in documents]

In [29]:
text_documents = [' '.join(document) for document, category in documents]
# text_documents

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [32]:
count_vec = CountVectorizer(max_features=2000, ngram_range=(2,3)) # The ngram usually used to 
x_train_features = count_vec.fit_transform(x_train)               # group n words to one feature,
x_train_features.todense()                                        # likely example - 'not good' is a bigram

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
count_vec.get_feature_names_out()

array(['10 10', '10 minute', '10 scale', ..., 'young woman', 'yun fat',
       'zeta jones'], dtype=object)

In [34]:
x_test_features = count_vec.transform(x_test)
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 12711 stored elements in Compressed Sparse Row format>

Now we have both separate arrays of x_train and x_test on which any sklearn algorithm can be used