In [1]:
from keras.datasets import imdb

# load_data() returns a (train, test) pair, each itself an (inputs, labels) pair.
train_split, test_split = imdb.load_data()
X_train, y_train = train_split
X_test, y_test = test_split


In [2]:
# Report how many reviews each split contains (train first, then test).
for split in (X_train, X_test):
    print(split.shape)

(25000,)
(25000,)


In [5]:
import pandas as pd

# Ids 0-2 of the encoded reviews are mapped to these markers, so every rank
# returned by get_word_index() is shifted past them in the reverse vocabulary.
SPECIAL_TOKENS = ('<pad>', '<sos>', '<unk>')
# Leading text removed from each decoded review: the start marker plus the
# joining space (6 characters).  Assumes load_data() was called with its
# default start marker, so every sequence begins with id 1.
START_PREFIX = '<sos> '

word_to_index = imdb.get_word_index()
index_to_word = {rank + len(SPECIAL_TOKENS): word for word, rank in word_to_index.items()}
index_to_word.update(enumerate(SPECIAL_TOKENS))


def decode_reviews(encoded_reviews):
    """Map each sequence of word ids back to a space-separated string.

    Parameters
    ----------
    encoded_reviews : iterable of iterables of int
        Reviews encoded as ids that are keys of ``index_to_word``.

    Returns
    -------
    list of str
        One decoded string per review, in input order.

    Raises
    ------
    KeyError
        If an id is not present in ``index_to_word``.
    """
    return [
        ' '.join(index_to_word[word_id] for word_id in review)
        for review in encoded_reviews
    ]


def make_review_frame(decoded_reviews, labels):
    """Build a ('reviews', 'label') DataFrame with the start prefix sliced off.

    The first ``len(START_PREFIX)`` characters of every review are dropped
    unconditionally, exactly like a fixed-width slice.
    """
    frame = pd.DataFrame({'reviews': decoded_reviews, 'label': labels})
    frame['reviews'] = frame['reviews'].str[len(START_PREFIX):]
    return frame


# Same pipeline for both splits; the intermediate lists keep their names.
train_reviews = decode_reviews(X_train)
test_reviews = decode_reviews(X_test)

train = make_review_frame(train_reviews, y_train)
test = make_review_frame(test_reviews, y_test)

print('<<<<<<<<<<<< Train Dataset for MNB >>>>>>>>>>>>>\n', train)
print('<'*13, 'Test Dataset form MNB', '>'*13, '\n', test)

<<<<<<<<<<<< Train Dataset for MNB >>>>>>>>>>>>>
                                                  reviews  label
0      this film was just brilliant casting location ...      1
1      big hair big boobs bad music and a giant safet...      0
2      this has to be one of the worst films of the 1...      0
3      the scots excel at storytelling the traditiona...      1
4      worst mistake of my life br br i picked this m...      0
...                                                  ...    ...
24995  this is a racist movie but worthy of study and...      1
24996  bela lugosi plays a doctor who will do anythin...      0
24997  in a far away galaxy is a planet called ceta i...      0
24998  six degrees had me hooked i looked forward to ...      1
24999  as a big fan of the original film it's hard to...      0

[25000 rows x 2 columns]
<<<<<<<<<<<<< Test Dataset form MNB >>>>>>>>>>>>> 
                                                  reviews  label
0      please give this one a miss br br

In [7]:
# Rebind the feature/label names to the decoded text and its labels.
# NOTE: X_train / X_test previously held integer-encoded reviews.  After this
# cell they hold strings, so the cell that decodes ids via index_to_word
# cannot be re-run without reloading the data first.
X_train = train['reviews'].values
y_train = train['label'].values
X_test = test['reviews'].values
y_test = test['label'].values

# One line per split: feature shape followed by label shape.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(25000,) (25000,)
(25000,) (25000,)


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is learned from the training text only; the test text is
# then projected onto that same vocabulary so both matrices share columns.
cv = CountVectorizer(binary=False)
X_traincv = cv.fit_transform(raw_documents=X_train)
X_testcv = cv.transform(raw_documents=X_test)

(X_traincv.shape, X_testcv.shape)

((25000, 74703), (25000, 74703))

In [9]:
# Show the vocabulary terms present in the first training review.  Only that
# single row is handed to inverse_transform, so the rest of the matrix is not
# decoded just to be discarded.
print(cv.inverse_transform(X_traincv[0])[0])

['this' 'film' 'was' 'just' 'brilliant' 'casting' 'location' 'scenery'
 'story' 'direction' 'everyone' 'really' 'suited' 'the' 'part' 'they'
 'played' 'and' 'you' 'could' 'imagine' 'being' 'there' 'robert' 'redford'
 'is' 'an' 'amazing' 'actor' 'now' 'same' 'director' 'norman' 'father'
 'came' 'from' 'scottish' 'island' 'as' 'myself' 'so' 'loved' 'fact'
 'real' 'connection' 'with' 'witty' 'remarks' 'throughout' 'were' 'great'
 'it' 'much' 'that' 'bought' 'soon' 'released' 'for' 'retail' 'would'
 'recommend' 'to' 'watch' 'fly' 'fishing' 'cried' 'at' 'end' 'sad' 'know'
 'what' 'say' 'if' 'cry' 'must' 'have' 'been' 'good' 'definitely' 'also'
 'congratulations' 'two' 'little' 'boy' 'of' 'paul' 'children' 'are'
 'often' 'left' 'out' 'praising' 'list' 'think' 'because' 'stars' 'play'
 'them' 'all' 'grown' 'up' 'such' 'big' 'profile' 'whole' 'but' 'these'
 'should' 'be' 'praised' 'done' 'don' 'lovely' 'true' 'someone' 'life'
 'after' 'shared' 'us']


In [15]:
# Peek at the last ten entries of the learned vocabulary.
feature_names = cv.get_feature_names_out()
print(feature_names[-10:])

['était' 'état' 'étc' 'évery' 'êxtase' 'ís' 'ísnt' 'østbye' 'über'
 'üvegtigris']


In [18]:
# Distinct label values present in the training labels (sanity check that the
# task is binary before fitting the classifier).
set(y_train)

{0, 1}

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training count matrix.
# The fit call stays as the cell's last expression so the estimator is displayed.
mnb = MultinomialNB()
mnb.fit(X=X_traincv, y=y_train)

In [21]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted classifier on the held-out test matrix.
pred = mnb.predict(X_testcv)
acc = accuracy_score(y_true=y_test, y_pred=pred)
cr = classification_report(y_true=y_test, y_pred=pred)

# Accuracy as a percentage, followed by the per-class report.
print('Accuracy :{:.2f} %'.format(acc * 100), cr, sep='\n')

Accuracy :81.42 %
              precision    recall  f1-score   support

           0       0.78      0.88      0.83     12500
           1       0.86      0.75      0.80     12500

    accuracy                           0.81     25000
   macro avg       0.82      0.81      0.81     25000
weighted avg       0.82      0.81      0.81     25000

