In [3]:
from nltk.corpus import movie_reviews

In [4]:
type(movie_reviews)

nltk.corpus.util.LazyCorpusLoader

In [5]:
movie_reviews.fileids()
len(movie_reviews.fileids())


2000

In [6]:
movie_reviews.categories()

['neg', 'pos']

In [7]:
# 1. making data with its category document = [([list of text],category)]

In [8]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
        
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [9]:
# 2. shuffle the document in a random order

In [10]:
import random 
random.shuffle(documents)
documents[0:5]

[(['so', 'here', 'is', 'the', 'second', 'of', '1999', ...], 'pos'),
 (['here', "'", 's', 'a', 'word', 'analogy', ':', ...], 'pos'),
 (['the', 'keen', 'wisdom', 'of', 'an', 'elderly', ...], 'pos'),
 (['synopsis', ':', 'captain', 'picard', 'and', 'the', ...], 'pos'),
 (['a', 'new', 'entry', 'in', 'the', '"', 'revisionist', ...], 'neg')]

In [11]:
# 3. remove the stop words and use lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [13]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [14]:
# create stop words

from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [15]:
from nltk import pos_tag
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w]) # <-- postag always take array
            clean_word = lemmatizer.lemmatize(w,pos = get_simple_pos(pos[0][1])) #<-- postag return a tuple
            output_words.append(clean_word.lower())
    return output_words

In [16]:
documents = [(clean_review(document), category) for document, category in documents]

In [17]:
documents[0]

(['second',
  '1999',
  'remake',
  'classic',
  'horror',
  'movie',
  'first',
  'dumb',
  'pathetic',
  'ok',
  'remake',
  'classic',
  'haunt',
  'come',
  'highly',
  'await',
  'remake',
  'house',
  'haunt',
  'hill',
  'classic',
  'star',
  'vincent',
  'price',
  'man',
  'pay',
  'group',
  'people',
  '100',
  '000',
  'stay',
  'house',
  'survive',
  'night',
  'get',
  'money',
  'well',
  'get',
  'jurisdiction',
  'yet',
  'see',
  'film',
  'dvd',
  'way',
  'sure',
  'would',
  'really',
  'like',
  'remake',
  'sure',
  'enough',
  'come',
  'great',
  'storyline',
  'fall',
  'back',
  'geoffrey',
  'rush',
  'fantastic',
  'role',
  'incredible',
  'performance',
  'steven',
  'price',
  'owner',
  'amusement',
  'park',
  'destine',
  'scare',
  'wit',
  'people',
  'wife',
  'evelyn',
  'price',
  'throw',
  'party',
  'friend',
  'steven',
  'dismay',
  'idea',
  'make',
  'list',
  'somehow',
  'someone',
  'something',
  'change',
  'list',
  'five',
  'peop

In [18]:
# 4. Use countVectoriser

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
categories = [category for document, category in documents]


In [21]:
text_documents = [" ".join(document) for document, category in documents]

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
x_train,x_test,y_train,y_test = train_test_split(text_documents,categories)

In [33]:
count_vect = CountVectorizer(max_features = 2000, ngram_range=(1,2))
x_train_features = count_vect.fit_transform(x_train)
x_train_features.todense()


matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 1, 0]], dtype=int64)

In [34]:
count_vect.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'action film',
 'action movie',
 'action scene',
 'action sequence',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'alan',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'along way',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyone',
 'a

In [35]:
x_test_features = count_vect.transform(x_test)

In [36]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
svc.score(x_test_features,y_test)

0.784