In [67]:
from nltk.corpus import movie_reviews

In [68]:
 >>> import nltk
>>> nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\blank\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [69]:
movie_reviews.categories()

['neg', 'pos']

In [70]:
movie_reviews.fileids('pos')

['pos/cv000_29590.txt',
 'pos/cv001_18431.txt',
 'pos/cv002_15918.txt',
 'pos/cv003_11664.txt',
 'pos/cv004_11636.txt',
 'pos/cv005_29443.txt',
 'pos/cv006_15448.txt',
 'pos/cv007_4968.txt',
 'pos/cv008_29435.txt',
 'pos/cv009_29592.txt',
 'pos/cv010_29198.txt',
 'pos/cv011_12166.txt',
 'pos/cv012_29576.txt',
 'pos/cv013_10159.txt',
 'pos/cv014_13924.txt',
 'pos/cv015_29439.txt',
 'pos/cv016_4659.txt',
 'pos/cv017_22464.txt',
 'pos/cv018_20137.txt',
 'pos/cv019_14482.txt',
 'pos/cv020_8825.txt',
 'pos/cv021_15838.txt',
 'pos/cv022_12864.txt',
 'pos/cv023_12672.txt',
 'pos/cv024_6778.txt',
 'pos/cv025_3108.txt',
 'pos/cv026_29325.txt',
 'pos/cv027_25219.txt',
 'pos/cv028_26746.txt',
 'pos/cv029_18643.txt',
 'pos/cv030_21593.txt',
 'pos/cv031_18452.txt',
 'pos/cv032_22550.txt',
 'pos/cv033_24444.txt',
 'pos/cv034_29647.txt',
 'pos/cv035_3954.txt',
 'pos/cv036_16831.txt',
 'pos/cv037_18510.txt',
 'pos/cv038_9749.txt',
 'pos/cv039_6170.txt',
 'pos/cv040_8276.txt',
 'pos/cv041_21113.txt',
 

In [71]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

# cleaning data 

In [72]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
        

In [73]:
documents[4]

(['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')

In [74]:
import random
random.shuffle(documents)
documents[0:5]

[(['this', 'is', 'one', 'of', 'the', 'most', 'funny', ...], 'pos'),
 (['oliver', 'stone', "'", 's', 'latest', 'feature', ...], 'pos'),
 (['tim', 'burton', 'has', 'now', 'completed', 'his', ...], 'neg'),
 (['synopsis', ':', 'original', '"', 'jurassic', 'park', ...], 'neg'),
 (['susan', 'granger', "'", 's', 'review', 'of', '"', ...], 'neg')]

In [75]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else :
        return wordnet.NOUN
        

In [76]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [77]:
from nltk import pos_tag

In [78]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [79]:
def clean_review(words):
    output_words = []
    for w in words:
        if(w.lower() not in stops):
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words
            
                                                                            

In [80]:
documents = [(clean_review(document),category) for document,category in documents]

In [81]:
documents[4]

(['susan',
  'granger',
  'review',
  'musketeer',
  'universal',
  'picture',
  'hollywood',
  'launch',
  'another',
  'assault',
  'classic',
  'literature',
  '50',
  'million',
  'adaptation',
  'alexandre',
  'duma',
  'novel',
  'strong',
  'action',
  'weak',
  'drama',
  'fuse',
  'hong',
  'kong',
  'martial',
  'art',
  '17th',
  'century',
  'swordplay',
  'story',
  'chronicle',
  'adventure',
  'dash',
  'artagnan',
  'justin',
  'chamber',
  'leaf',
  'village',
  'gascogne',
  'head',
  'paris',
  'join',
  'king',
  'louis',
  'xiii',
  'elite',
  'guard',
  'royal',
  'musketeer',
  'search',
  'man',
  'kill',
  'parent',
  '14',
  'year',
  'earlier',
  'put',
  'conflict',
  'formidable',
  'febre',
  'tim',
  'roth',
  'vicious',
  'henchman',
  'connive',
  'cardinal',
  'richelieu',
  'stephen',
  'rea',
  'traditional',
  'musketeer',
  'trio',
  'aramis',
  'nick',
  'moran',
  'athos',
  'jan',
  'gregor',
  'kremp',
  'porthos',
  'steve',
  'speirs',
  'off

# Building feature set

In [82]:
training_doc = documents[0:1500]
testing_doc = documents[1500:]

In [83]:
all_words = []
for doc in training_doc:
    all_words += doc[0]

In [84]:
freq = nltk.FreqDist(all_words)

In [85]:
common = freq.most_common(3000)

In [86]:
features = []
for i in common :
    features.append(i[0])

In [87]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'give',
 'come',
 'also',
 'two',
 'life',
 'bad',
 'way',
 'look',
 'seem',
 'know',
 '--',
 'first',
 'end',
 'year',
 'work',
 'thing',
 'plot',
 'play',
 'say',
 'little',
 'really',
 'show',
 'people',
 'star',
 'love',
 'could',
 'man',
 'never',
 'director',
 'try',
 'performance',
 'new',
 'great',
 'best',
 'big',
 'actor',
 'many',
 'action',
 'u',
 'watch',
 'want',
 'find',
 'role',
 'act',
 'audience',
 'think',
 'another',
 'back',
 'turn',
 'still',
 'something',
 'set',
 'day',
 'however',
 'world',
 'old',
 'use',
 'guy',
 'begin',
 'though',
 'cast',
 'part',
 'comedy',
 'every',
 'enough',
 'around',
 'feel',
 'point',
 'name',
 'run',
 'young',
 'real',
 'woman',
 'interest',
 'write',
 'last',
 'long',
 'right',
 'funny',
 'fact',
 'may',
 'minute',
 'friend',
 'lot',
 'effect',
 'although',
 'script'

In [88]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [89]:
training_data = [(get_feature_dict(doc),category) for doc, category in training_doc]

In [90]:
testing_data = [(get_feature_dict(doc),category) for doc, category in testing_doc]

In [91]:
training_data[0]

({'film': True,
  'movie': False,
  'one': True,
  'make': True,
  'like': True,
  'character': False,
  'get': False,
  'see': True,
  'go': True,
  'time': True,
  'well': False,
  'scene': True,
  'even': True,
  'good': False,
  'story': True,
  'take': True,
  'would': True,
  'much': True,
  'give': False,
  'come': False,
  'also': True,
  'two': False,
  'life': True,
  'bad': False,
  'way': True,
  'look': False,
  'seem': False,
  'know': False,
  '--': True,
  'first': True,
  'end': True,
  'year': True,
  'work': False,
  'thing': True,
  'plot': False,
  'play': True,
  'say': True,
  'little': True,
  'really': True,
  'show': True,
  'people': False,
  'star': True,
  'love': True,
  'could': False,
  'man': False,
  'never': False,
  'director': True,
  'try': False,
  'performance': False,
  'new': False,
  'great': True,
  'best': True,
  'big': True,
  'actor': False,
  'many': False,
  'action': False,
  'u': False,
  'watch': False,
  'want': False,
  'find': Tru

# Classification 

In [92]:
from nltk import NaiveBayesClassifier

In [93]:
classifier = NaiveBayesClassifier.train(training_data)

In [94]:
nltk.classify.accuracy(classifier,testing_data)

0.772

In [95]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     21.2 : 1.0
                   anger = True              pos : neg    =     13.7 : 1.0
               stupidity = True              neg : pos    =      9.9 : 1.0
                  sinise = True              neg : pos    =      9.1 : 1.0
                 winslet = True              pos : neg    =      8.9 : 1.0
             outstanding = True              pos : neg    =      8.5 : 1.0
              federation = True              pos : neg    =      8.2 : 1.0
             wonderfully = True              pos : neg    =      7.8 : 1.0
                 idiotic = True              neg : pos    =      7.4 : 1.0
                  seagal = True              neg : pos    =      6.7 : 1.0
                   damon = True              pos : neg    =      6.6 : 1.0
                    jedi = True              pos : neg    =      6.5 : 1.0
             beautifully = True              pos : neg    =      6.5 : 1.0

# Using sklearn to classify

In [98]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [101]:
svc = SVC()
classifier_sklearn  = SklearnClassifier(svc)

In [103]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [104]:
nltk.classify.accuracy(classifier_sklearn,testing_data)

0.838

In [105]:
from sklearn.ensemble import RandomForestClassifier

In [107]:
rfc = RandomForestClassifier()
classifier_sklearn1  = SklearnClassifier(rfc)

In [108]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [110]:
nltk.classify.accuracy(classifier_sklearn1,testing_data)

0.796

# why didnt i get 98 accuracy

In [111]:
#no idea

In [112]:
from sklearn.feature_extraction.text import CountVectorizer

In [113]:
train_set = {"the sky is blue","blue is a colour"}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)

In [114]:
a.todense()

matrix([[1, 0, 1],
        [1, 1, 1]], dtype=int64)

In [115]:
count_vec.get_feature_names()

['blue', 'colour', 'is']

In [116]:
a = ["ad","is"]
" ".join(a)

'ad is'

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
categories = [category for document,category in documents] # for splitting into yTrain and yTest

In [120]:
text_documents = [" ".join(document) for document,category in documents]

In [121]:
#now the documents are in an array of text documents which is suitable for the nltk input

In [123]:
x_train,x_test,y_train,y_test = train_test_split(text_documents,categories)

In [149]:
count_vec = CountVectorizer(max_features = 2000,ngram_range = (2,3))
x_train_features = count_vec.fit_transform(x_train)

In [151]:
count_vec.get_feature_names()

['10 10',
 '10 minute',
 '10 scale',
 '10 scale scale',
 '10 year',
 '100 million',
 '14 year',
 '15 minute',
 '15 year',
 '19th century',
 '20 minute',
 '20 year',
 '20th century',
 '30 minute',
 '30 year',
 '90 minute',
 'absolutely nothing',
 'academy award',
 'ace ventura',
 'across country',
 'act ability',
 'act film',
 'act like',
 'act movie',
 'act one',
 'act talent',
 'action adventure',
 'action comedy',
 'action film',
 'action flick',
 'action hero',
 'action movie',
 'action packed',
 'action scene',
 'action sequence',
 'action thriller',
 'actor film',
 'actor play',
 'actually get',
 'adam sandler',
 'african american',
 'al pacino',
 'alan smithee',
 'albert brook',
 'ali larter',
 'alien film',
 'almost always',
 'almost entirely',
 'almost every',
 'along line',
 'along way',
 'already know',
 'also direct',
 'also feature',
 'also funny',
 'also get',
 'also give',
 'also good',
 'also happens',
 'also help',
 'also include',
 'also like',
 'also make',
 'also one

In [152]:
x_train_features

<1500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 41111 stored elements in Compressed Sparse Row format>

In [153]:
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [154]:
count_vec.get_feature_names()

['10 10',
 '10 minute',
 '10 scale',
 '10 scale scale',
 '10 year',
 '100 million',
 '14 year',
 '15 minute',
 '15 year',
 '19th century',
 '20 minute',
 '20 year',
 '20th century',
 '30 minute',
 '30 year',
 '90 minute',
 'absolutely nothing',
 'academy award',
 'ace ventura',
 'across country',
 'act ability',
 'act film',
 'act like',
 'act movie',
 'act one',
 'act talent',
 'action adventure',
 'action comedy',
 'action film',
 'action flick',
 'action hero',
 'action movie',
 'action packed',
 'action scene',
 'action sequence',
 'action thriller',
 'actor film',
 'actor play',
 'actually get',
 'adam sandler',
 'african american',
 'al pacino',
 'alan smithee',
 'albert brook',
 'ali larter',
 'alien film',
 'almost always',
 'almost entirely',
 'almost every',
 'along line',
 'along way',
 'already know',
 'also direct',
 'also feature',
 'also funny',
 'also get',
 'also give',
 'also good',
 'also happens',
 'also help',
 'also include',
 'also like',
 'also make',
 'also one

In [155]:
x_test_features = count_vec.fit_transform(x_test)

In [156]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 14102 stored elements in Compressed Sparse Row format>

In [157]:
svc = SVC()
svc.fit(x_train_features,y_train)

SVC()

In [158]:
svc.score(x_test_features,y_test)

0.524

In [148]:
#why am i getting low accuracy