In [1]:
import random
import numpy as np
# Seed both the stdlib and the NumPy global RNGs with one shared value so that
# later random operations (e.g. shuffling) give the same result on every run.
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)

In [2]:
import os
import random

# Root of the aclImdb dataset directory, relative to the working directory.
_ACLIMDB_ROOT = "../Datasets/aclImdb/"

# Each split ("train" / "test") has one sub-folder per label ("pos" / "neg").
dataset_train_pos_path = _ACLIMDB_ROOT + "train/pos/"
dataset_train_neg_path = _ACLIMDB_ROOT + "train/neg/"

dataset_test_pos_path = _ACLIMDB_ROOT + "test/pos/"
dataset_test_neg_path = _ACLIMDB_ROOT + "test/neg/"

In [3]:
def read_dataset(dataset_path, label):
    """Read every file in ``dataset_path`` and pair its text with ``label``.

    Args:
        dataset_path: Directory whose entries are each opened as a text file.
        label: Value attached to every file's contents (e.g. "pos" or "neg").

    Returns:
        A list of ``(contents, label)`` tuples, one per directory entry,
        ordered by file name.

    Raises:
        OSError: If the directory cannot be listed or an entry cannot be opened.
        UnicodeDecodeError: If a file's bytes are not valid UTF-8.
    """
    contents_labels = []
    # os.listdir() returns entries in arbitrary order; sorting makes the result
    # (and any seeded shuffle applied to it afterwards) reproducible.
    for fn in sorted(os.listdir(dataset_path)):
        path = os.path.join(dataset_path, fn)
        # Decode explicitly instead of relying on the platform's default
        # encoding, which varies between systems.
        # NOTE(review): assumes the dataset files are UTF-8 encoded - confirm.
        with open(path, encoding="utf-8") as f:
            contents_labels.append((f.read(), label))
    return contents_labels

In [4]:
# Load each split as lists of (text, label) pairs; positives are read before
# negatives, and the training folders before the test folders.
train_pos, train_neg = (
    read_dataset(folder, sentiment)
    for folder, sentiment in (
        (dataset_train_pos_path, "pos"),
        (dataset_train_neg_path, "neg"),
    )
)

test_pos, test_neg = (
    read_dataset(folder, sentiment)
    for folder, sentiment in (
        (dataset_test_pos_path, "pos"),
        (dataset_test_neg_path, "neg"),
    )
)

In [5]:
# Merge the per-label lists into one list per split (positives first).
train = [*train_pos, *train_neg]
test = [*test_pos, *test_neg]

In [6]:
# Shuffle each split in place (train first, then test) so the positive and
# negative examples are interleaved rather than grouped by label.
for _split in (train, test):
    random.shuffle(_split)

In [7]:
# Transpose the (text, label) pairs into two parallel tuples per split:
# one holding the texts and one holding the matching labels.
train_data, y_train = tuple(zip(*train))
test_data, y_test = tuple(zip(*test))

In [8]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts into the same feature space.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)

# Report (n_documents, n_features) of the training matrix.
print("The dimensions of our vectors:", X_train.shape, "- - -", sep="\n")


The dimensions of our vectors:
(25000, 74849)
- - -
CPU times: user 13.4 s, sys: 440 ms, total: 13.8 s
Wall time: 14.7 s


In [9]:
%%time

from sklearn.svm import LinearSVC

# fit() returns the estimator itself, so construction and training are chained.
svm_classifier = LinearSVC().fit(X_train, y_train)

# Predicted label for every test document.
predictions = svm_classifier.predict(X_test)

CPU times: user 799 ms, sys: 63 ms, total: 862 ms
Wall time: 1.17 s


In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy on the test split, followed by the per-class
# precision / recall / F1 table.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: {}\n".format(accuracy))

report = classification_report(y_test, predictions)
print(report)


Accuracy: 0.8772

              precision    recall  f1-score   support

         neg       0.87      0.89      0.88     12500
         pos       0.89      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [11]:
# Two hand-written sample reviews used as a quick sanity check of the trained
# model. The "restuarant" misspelling in the second text and in the variable
# name is kept exactly as originally written.
good_review = "The restaurant was really great! I ate wonderful food and had a very good time"
bad_review = "The restuarant was awful. The staff were rude and the food was horrible. I hated it"

restuarant_reviews = [good_review, bad_review]
# Only transform() here: the vectorizer was already fitted on the training data.
vectors = vectorizer.transform(restuarant_reviews)
review_labels = svm_classifier.predict(vectors)
print(review_labels)

['pos' 'neg']
