In [1]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [2]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 19% 5.00M/25.7M [00:00<00:00, 24.0MB/s]
100% 25.7M/25.7M [00:00<00:00, 85.8MB/s]


In [3]:
!unzip imdb-dataset-of-50k-movie-reviews.zip 

Archive:  imdb-dataset-of-50k-movie-reviews.zip
replace IMDB Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [4]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
dataset = pd.read_csv('IMDB Dataset.csv')

In [6]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
pos = dataset[dataset['sentiment']=='positive']
neg = dataset[dataset['sentiment']=='negative']

In [8]:
pos.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive


In [9]:
data = pos.iloc[:5000]

In [10]:
data = pd.concat([data, neg[:5000]], ignore_index=True)

In [11]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive
4,"Probably my all-time favorite movie, a story o...",positive


In [12]:
data.shape

(10000, 2)

In [13]:
import random
data = data.sample(frac = 1)

In [14]:
data.reset_index(inplace=True)

In [15]:
data.drop(['index'], inplace=True, axis=1)

In [16]:
data.head()

Unnamed: 0,review,sentiment
0,"I gotta say, Clive Barker's Undying is by far ...",positive
1,Give me a break. How can anyone say that this ...,negative
2,The decline series is amazing and director PS ...,positive
3,I would not be giving away too much of the fil...,negative
4,This film was very well advertised. I am an av...,negative


In [17]:
doc = []

for i,j in zip(data['review'],data['sentiment']):
    word = nltk.word_tokenize(i)
    doc.append((word, j))
    
print(doc[0])
    

(['I', 'got', 'ta', 'say', ',', 'Clive', 'Barker', "'s", 'Undying', 'is', 'by', 'far', 'the', 'best', 'horror', 'game', 'to', 'have', 'ever', 'been', 'made', '.', 'I', "'ve", 'played', 'Resident', 'Evil', ',', 'Silent', 'Hill', 'and', 'the', 'Evil', 'Dead', 'and', 'Castlevania', 'games', 'but', 'none', 'of', 'them', 'have', 'captured', 'the', 'pure', 'glee', 'with', 'which', 'this', 'game', 'tackles', 'its', 'horrific', 'elements', '.', 'Barker', 'is', 'good', 'at', 'what', 'he', 'does', ',', 'which', 'is', 'attach', 'the', 'horror', 'to', 'our', 'world', ',', 'and', 'it', 'shows', 'as', 'his', 'hand', 'is', 'clearly', 'everywhere', 'in', 'this', 'game', '.', 'Heck', ',', 'even', 'his', 'voice', 'is', 'in', 'the', 'game', 'as', 'one', 'of', 'the', 'main', 'characters', '.', 'Full', 'of', 'lush', 'visuals', 'and', 'enough', 'atmosphere', 'to', 'shake', 'a', 'stick', 'at', ',', 'Undying', 'is', 'the', 'game', 'to', 'beat', 'in', 'my', 'books', 'as', 'the', 'best', 'horror', 'title', '.',

In [18]:
words = []
for i in data['review']:
        for a in (nltk.word_tokenize(i)):
            words.append(a.lower())
     

In [19]:
words[:10]

['i', 'got', 'ta', 'say', ',', 'clive', 'barker', "'s", 'undying', 'is']

In [20]:
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')

import string

In [21]:
clean_words = []
for word in words:
    if word not in stopwords_english and word not in string.punctuation:
        clean_words.append(word)

clean_words[:10]

['got',
 'ta',
 'say',
 'clive',
 'barker',
 "'s",
 'undying',
 'far',
 'best',
 'horror']

In [22]:
len(clean_words)

1305249

In [23]:
from nltk import FreqDist
freq = FreqDist(clean_words)

In [24]:
len(freq)

71434

In [25]:
most_common = freq.most_common(4000)
word_features = [item[0] for item in most_common]

In [26]:
def feature(document):
    doc = set(document)
    
    feature = {}
    
    for word in word_features:
        feature['contains(%s)' % word] = (word in doc)
        
    return feature


In [27]:
feature_set = [(feature(doc), category) for (doc, category) in doc]

In [28]:
len(feature_set)

10000

In [29]:
train = feature_set[:7001]
test = feature_set[7001:]

In [30]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train)

In [31]:
from nltk import classify 
 
accuracy = classify.accuracy(classifier, test)
print (accuracy) 

0.8432810936978993
