# Movie Reviews Sentiment Analysis using NLP 

### Import nltk module and movie review data

In [1]:
import nltk
from nltk.corpus import movie_reviews
from nltk import pos_tag

In [2]:
# Check field ids of movie reviews
movie_reviews.fileids('neg')[0:10]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt']

In [3]:
# movie_reviews.words(id) breaks the review with movie_id = id into words
movie_reviews.words(movie_reviews.fileids()[2])

['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...]

### Store the review data in a documents list which is made up of a tuple containing two enteries: a list of words and the category of the review

In [4]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [5]:
# Shuffle the enteries to obtain a good mix of data
import random
random.shuffle(documents)
documents[0:5]

[(['in', '1970s', ',', 'many', 'european', ...], 'neg'),
 (['actually', ',', 'i', "'", 'm', 'fairly', 'sure', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['at', 'first', 'glance', ',', 'daylight', 'would', ...], 'pos'),
 (['in', '1912', ',', 'a', 'ship', 'set', 'sail', 'on', ...], 'pos')]

### Import WordNetLemmatizer and create a lemmatizer object

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

### A function to convert the Part Of Speech given by pos_tag to one accepted by the lemmatizer

In [7]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

### Make a list of all the stop words and punctuations

In [8]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

### Function to clean a list of words by doing two steps
   #### 1. Removing stop words
   #### 2. Lemmatizing the words (make all words as root words , Eg, playing, played are transformed to play)

In [9]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [10]:
# Call function on all reviews
documents = [(clean_review(document), category) for document, category in documents]

### Import the module Count Vectorizer to convert data in 2D array format

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

### Divide data into X and Y i.e text_documents and categories

In [12]:
text_documents = [" ".join(document) for document, category in documents]
categories = [category for document, category in documents]

### Split data into testing and training data

In [13]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [14]:
count_vec = CountVectorizer(max_features = 2000, ngram_range=(1,2), max_df = 0.8)
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 6, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [15]:
x_test_features = count_vec.transform(x_test)

### Apply any classification model to the data

#### 1. SVC

In [16]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features, y_train)
svc.score(x_test_features, y_test)

0.846

#### 2. Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train_features, y_train)
model.score(x_test_features, y_test)

0.86

#### 3. Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(x_train_features, y_train)
model.score(x_test_features, y_test)

0.664

#### 4. Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train_features, y_train)

model.score(x_test_features, y_test)

0.828

#### 5. Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(x_train_features.toarray(), y_train)

model.score(x_test_features.toarray(), y_test)

0.74

#### 6. KNN

In [21]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5) 
model.fit(x_train_features.toarray(), y_train)

model.score(x_test_features.toarray(), y_test)

0.624

#### 7. Multinomial Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB() 
model.fit(x_train_features.toarray(), y_train)

model.score(x_test_features.toarray(), y_test)

0.858