In [1]:
# Import Libraries
import nltk
from nltk.corpus import movie_reviews

In [2]:
# Preview the corpus as one flat sequence of tokens (words and punctuation marks).
movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [3]:
# Total number of tokens in 'movie_reviews'. Punctuation marks are separate
# tokens in words(), so they are included in this count.
len(movie_reviews.words())

1583820

In [4]:
# Category labels used by the corpus.
movie_reviews.categories()

['neg', 'pos']

In [5]:
# Frequency distribution (token -> number of occurrences) over every token
# in 'movie_reviews'.
nltk.FreqDist(movie_reviews.words())

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [6]:
# Number of occurrences of the token 'happy' in the whole corpus.
# NOTE(review): this rebuilds the full distribution just to read one count;
# building it once and reusing it would avoid the repeated pass.
nltk.FreqDist(movie_reviews.words())['happy']

215

In [7]:
# The 15 most frequent tokens in 'movie_reviews' as (token, count) pairs,
# most frequent first.
nltk.FreqDist(movie_reviews.words()).most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

In [8]:
# Only the last expression of a cell is echoed, so the three listings are bound
# to names and the first two are summarised explicitly with print().
all_ids = movie_reviews.fileids()        # file ids of every review
pos_ids = movie_reviews.fileids('pos')   # file ids of positive reviews
neg_ids = movie_reviews.fileids('neg')   # file ids of negative reviews

print(len(all_ids), 'file ids in total, e.g.', all_ids[:3])
print(len(pos_ids), 'positive file ids, e.g.', pos_ids[:3])

# File ids of negative reviews (shown as the cell output).
neg_ids

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [9]:
# Tokens of the single review whose file id is 'neg/cv001_19502.txt'.
movie_reviews.words('neg/cv001_19502.txt')

['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...]

In [10]:
# all_words is an nltk.FreqDist: a dict-like mapping from each token in
# 'movie_reviews' to the number of times it occurs.
all_words = nltk.FreqDist(movie_reviews.words())

In [11]:
# feature_vector holds the 4000 most frequent tokens of the corpus,
# ordered from most to least frequent.
feature_vector = [token for token, _count in all_words.most_common(4000)]

In [12]:
# Build the presence/absence feature map for one example review.

# One movie review is chosen. Its tokens are gathered into a set once so that
# every membership test below is a hash lookup instead of a scan over the
# whole review for each of the feature words.
review = movie_reviews.words('neg/cv954_19932.txt')
review_vocab = set(review)

# True is assigned if the feature word can also be found in the review,
# otherwise False.
feature = {word: word in review_vocab for word in feature_vector}

# The feature words which are assigned True, in feature_vector order.
[word for word in feature_vector if feature[word]]

[',',
 'the',
 '.',
 'a',
 'and',
 'of',
 'to',
 "'",
 'is',
 'in',
 's',
 '"',
 'it',
 'that',
 '-',
 ')',
 '(',
 'as',
 'with',
 'for',
 'this',
 'film',
 'i',
 'he',
 'but',
 'on',
 'are',
 't',
 'by',
 'be',
 'one',
 'movie',
 'an',
 'who',
 'not',
 'you',
 'from',
 'at',
 'was',
 'have',
 'they',
 'has',
 'all',
 'there',
 'like',
 'so',
 'about',
 'more',
 'what',
 'when',
 'their',
 ':',
 'just',
 'can',
 'if',
 'we',
 'into',
 'only',
 'no',
 'time',
 'story',
 'would',
 'been',
 'much',
 'get',
 'other',
 'do',
 'two',
 'characters',
 'first',
 'see',
 '!',
 'way',
 'because',
 'make',
 'life',
 'off',
 'too',
 'does',
 'had',
 'while',
 'people',
 'over',
 'could',
 'me',
 'scene',
 'bad',
 'my',
 'best',
 'these',
 'don',
 'new',
 'scenes',
 'know',
 'through',
 'great',
 'another',
 'made',
 'end',
 'work',
 'those',
 'down',
 'every',
 'though',
 'better',
 'audience',
 'seen',
 'going',
 'year',
 'performance',
 'same',
 'old',
 'your',
 'years',
 'comedy',
 'funny',
 'ac

In [13]:
# document pairs each review's word list with its category label:
# one (words, category) tuple per (file id, category) combination.
document = [
    (movie_reviews.words(fid), label)
    for fid in movie_reviews.fileids()
    for label in movie_reviews.categories(fid)
]

document

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg'),
 (['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...], 'neg'),
 (['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...], 'neg'),
 (['that', "'", 's', 'exactly', 'how', 'long', 'the', ...], 'neg'),
 (['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...], 'neg'),
 (['plot', ':', 'a', 'young', 'french', 'boy', 'sees', ...], 'neg'),
 (['best', 'remembered', 'for', 'his', 'understated', ...], 'neg'),
 (['janeane', 'garofalo', 'in', 'a', 'romantic', ...], 'neg'),
 (['and', 'now', 'the', 'high', '-', 'flying', 'hong', ...], 'neg'),
 (['a', 'movie', 'like', 'mortal', 'kombat', ':', ...], 'neg'),
 (['she', 'was', 'the', 'femme', 'in', 

In [14]:
len(document)

2000

In [15]:
# we define a function that finds the features

def find_feature(word_list, features=None):
    """Map each feature word to whether it occurs in ``word_list``.

    Parameters
    ----------
    word_list : iterable of hashable tokens
        The tokens of one review.
    features : iterable of tokens, optional
        The words to test for. When omitted, the notebook-level
        ``feature_vector`` is used, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    dict
        ``{feature_word: bool}`` with one entry per feature word, in the
        order the feature words are given. The value is True when the word
        is present in ``word_list`` and False otherwise.
    """
    if features is None:
        features = feature_vector

    # Collect the review's tokens once: each lookup below is then a hash
    # probe rather than a linear scan of the review per feature word.
    present = set(word_list)

    return {word: word in present for word in features}

In [16]:
# Checking the function 'find_feature' on the first review:
# document[0] is a (words, category) pair, so document[0][0] is its word list.

find_feature(document[0][0])

{',': True,
 'the': True,
 '.': True,
 'a': True,
 'and': True,
 'of': True,
 'to': True,
 "'": True,
 'is': True,
 'in': True,
 's': True,
 '"': True,
 'it': True,
 'that': True,
 '-': True,
 ')': True,
 '(': True,
 'as': True,
 'with': True,
 'for': True,
 'his': True,
 'this': True,
 'film': True,
 'i': True,
 'he': True,
 'but': True,
 'on': True,
 'are': True,
 't': True,
 'by': True,
 'be': True,
 'one': True,
 'movie': True,
 'an': True,
 'who': True,
 'not': True,
 'you': True,
 'from': True,
 'at': False,
 'was': False,
 'have': True,
 'they': True,
 'has': True,
 'her': True,
 'all': True,
 '?': True,
 'there': True,
 'like': True,
 'so': True,
 'out': True,
 'about': True,
 'up': True,
 'more': True,
 'what': True,
 'when': True,
 'which': True,
 'or': True,
 'she': False,
 'their': False,
 ':': True,
 'some': False,
 'just': True,
 'can': False,
 'if': False,
 'we': True,
 'him': True,
 'into': True,
 'even': True,
 'only': True,
 'than': False,
 'no': True,
 'good': True,
