In [None]:
import sklearn
import os
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

from sklearn.datasets import load_files

In [None]:
# Input-pipeline settings shared by every split.
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 100
seed = 42


def _cache_and_prefetch(dataset):
    """Return `dataset` wrapped with cache() and an AUTOTUNE-sized prefetch()."""
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)


# Arguments common to the two subsets carved out of aclImdb/train. Both calls
# pass the same seed and the same validation_split (0.2).
_split_options = {
    'batch_size': batch_size,
    'validation_split': 0.2,
    'seed': seed,
}

# Training subset. class_names is read from the dataset object returned by the
# loader, before it is wrapped with cache/prefetch.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset='training', **_split_options)
class_names = raw_train_ds.class_names
train_ds = _cache_and_prefetch(raw_train_ds)

# Validation subset, taken from the same directory as the training subset.
val_ds = _cache_and_prefetch(
    tf.keras.utils.text_dataset_from_directory(
        'aclImdb/train', subset='validation', **_split_options))

# Test set, read in full from its own directory.
test_ds = _cache_and_prefetch(
    tf.keras.utils.text_dataset_from_directory(
        'aclImdb/test', batch_size=batch_size))

In [None]:
import numpy as np


def _dataset_to_arrays(dataset):
    """Materialize a batched (texts, labels) dataset as two aligned arrays.

    Every batch is read exactly once and its texts and labels are stored
    together, so element i of the returned texts always belongs to element i
    of the returned labels. Collecting the two columns in separate passes
    would only stay aligned if the dataset yields batches in the same order
    on every iteration.

    Args:
        dataset: iterable yielding (texts, labels) batch pairs.

    Returns:
        A (texts, labels) tuple, each concatenated along axis 0.

    Raises:
        ValueError: if the dataset yields no batches (np.concatenate needs at
            least one array).
    """
    text_batches, label_batches = [], []
    for texts, labels in dataset:
        text_batches.append(texts)
        label_batches.append(labels)
    return (np.concatenate(text_batches, axis=0),
            np.concatenate(label_batches, axis=0))


# Flatten each batched dataset into a pair of in-memory arrays that
# scikit-learn can consume directly.
movies_train, labels_train = _dataset_to_arrays(train_ds)
movies_val, labels_val = _dataset_to_arrays(val_ds)
movies_test, labels_test = _dataset_to_arrays(test_ds)

# (texts, labels) tuple for the validation subset.
validation_data = movies_val, labels_val


In [None]:
# Initialize TfidfVectorizer to create the TF-IDF representation of the corpus.
# Parameters:
#   min_df        - minimum document frequency; given as a float it is a
#                   proportion, so 0.1 keeps only terms that occur in at least
#                   10% of the training documents
#   tokenizer     - callable that splits each document into tokens
#                   (nltk.word_tokenize needs NLTK's Punkt tokenizer data to be
#                   installed, e.g. via nltk.download)
#   token_pattern - None, because the default regex is not used once a custom
#                   tokenizer is supplied; leaving the default in place makes
#                   scikit-learn emit a UserWarning about the unused parameter
#   max_features  - maximum number of terms to keep in the vocabulary
vectorizer = TfidfVectorizer(min_df=.1,
                             tokenizer=nltk.word_tokenize,
                             token_pattern=None,
                             max_features=1000)

# Fit on the training text and transform it in one step: this is where the
# TF-IDF representation of the training data is built.
movies_train_tfidf = vectorizer.fit_transform(movies_train)

In [None]:
# The next step is to train a classifier and use it to predict the classes of the test data
# We will use Multinomial Naive Bayes as our classifier
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Create a Multinomial Naive Bayes model with default settings and fit it on
# the TF-IDF matrix of the training reviews and their labels.
classifier = MultinomialNB()
classifier.fit(X=movies_train_tfidf, y=labels_train)

In [None]:
# Find accuracy based on the test set.
# accuracy_score is imported explicitly: a bare `import sklearn` does not
# guarantee that the `sklearn.metrics` submodule has been loaded, so
# `sklearn.metrics.accuracy_score` can fail with AttributeError on a fresh kernel.
from sklearn.metrics import accuracy_score

# Transform the test reviews with the vectorizer that was fitted on the
# training text (transform only, no refitting), then predict their labels.
movies_test_tfidf = vectorizer.transform(movies_test)
labels_pred = classifier.predict(movies_test_tfidf)

# Last expression of the cell, so the accuracy is shown as the cell output.
accuracy_score(labels_test, labels_pred)

In [None]:

# Summarize the test-set predictions as a confusion matrix.
# normalize=None is passed explicitly, as before.
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(
    y_true=labels_test,
    y_pred=labels_pred,
    normalize=None,
)
print(conf_matrix)


In [None]:
# Plot the confusion matrix and print the per-class metrics.
# Uncomment the next line if you want the plots to appear inline.
# %matplotlib inline
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    classification_report,
)
import matplotlib.pyplot as plt

# Font size used by the figure below.
plt.rcParams.update({'font.size': 12})

print(class_names)

# Build the display from the previously computed matrix and label the axes
# with the dataset's class names.
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=class_names,
)
disp.plot(cmap=plt.cm.Blues, xticks_rotation=75)

# Write the figure to disk, then show it.
plt.savefig('confusionTfidf.svg')
plt.show()

# Text report for the same test-set predictions.
report = classification_report(labels_test, labels_pred, target_names=class_names)
print(report)