In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
movie_reviews.fileids()

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [4]:
len(movie_reviews.fileids())

2000

In [5]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [6]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [7]:
import random
random.shuffle(documents)
documents[0:5]

[(['i', 'feel', 'sorry', 'for', 'the', 'financial', ...], 'neg'),
 (['my', 'fellow', 'americans', 'is', 'a', 'movie', ...], 'pos'),
 (['come', 'on', 'hollywood', ',', 'surprise', 'me', '.', ...], 'neg'),
 (['certainly', 'no', 'one', 'would', 'rent', 'ed', ...], 'neg'),
 (['written', 'by', 'alex', 'cox', ',', 'tod', 'davies', ...], 'neg')]

In [8]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
from nltk import pos_tag
from nltk.corpus import stopwords
import string

stop = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stop.update(punctuations)

In [11]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])   ## pos_tag accepts an array, returns an array of tuples
            clean_word = lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [12]:
documents = [(clean_review(document),category) for document,category in documents]

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
categories = [category for doc,category in documents]

In [16]:
text_documents = [" ".join(doc) for doc,cat in documents]

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train,x_test,y_train,y_test = train_test_split(text_documents,categories)

In [22]:
count_vec = CountVectorizer(max_features = 3000)
a = count_vec.fit_transform(x_train)
a.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 2, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '12',
 '13',
 '15',
 '17',
 '18',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '2000',
 '2001',
 '25',
 '30',
 '40',
 '50',
 '54',
 '60',
 '70',
 '80',
 '8mm',
 '90',
 '_the',
 'abandon',
 'ability',
 'able',
 'aboard',
 'absolutely',
 'absurd',
 'abuse',
 'academy',
 'accent',
 'accept',
 'accident',
 'accidentally',
 'acclaim',
 'accompany',
 'accomplish',
 'accord',
 'account',
 'accuse',
 'achieve',
 'achievement',
 'across',
 'act',
 'action',
 'activity',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addict',
 'addition',
 'address',
 'admire',
 'admit',
 'admittedly',
 'adult',
 'advance',
 'advantage',
 'adventure',
 'advice',
 'affair',
 'affect',
 'affection',
 'affleck',
 'aforementioned',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'agrees',
 'ahead',
 'aid',
 'aim',
 'air',
 'airplane',
 'al',
 'ala',
 'alan',
 'albeit',
 'albert',
 'alec',
 'alex',
 'ali',


In [27]:
x_train_features = a
x_train_features

<1500x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 282426 stored elements in Compressed Sparse Row format>

In [25]:
x_test_features = count_vec.transform(x_test)

In [26]:
x_test_features

<500x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 96894 stored elements in Compressed Sparse Row format>

In [28]:
from sklearn.svm import SVC

In [29]:
svc = SVC()
svc.fit(x_train_features, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [42]:
svc.score(x_test_features, y_test)

0.818

### adding n-gram range

In [98]:
count_vec = CountVectorizer(max_features = 2000,ngram_range = (1,3), max_df=0.7)
a = count_vec.fit_transform(x_train)
a.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 2, 0, ..., 0, 0, 0]], dtype=int64)

In [99]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '54',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'action film',
 'action movie',
 'action scene',
 'action sequence',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'along way',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony'

In [100]:
x_train_features2 = a
x_train_features2

<1500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 249764 stored elements in Compressed Sparse Row format>

In [101]:
x_test_features2 = count_vec.transform(x_test)

In [102]:
svc = SVC()
svc.fit(x_train_features2, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [103]:
svc.score(x_test_features2, y_test)

0.828