In [34]:
# standard imports
import os
import string
import re
from collections import Counter

# third party imports
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
import joblib
import numpy as np

In [35]:
# filepaths are true to the dataset folder (downloaded from https://ai.stanford.edu/~amaas/data/sentiment/)
# data files are divided into 'positive' and 'negative' folders
POS_DIR_TRAIN = 'aclimdb/train/pos'
NEG_DIR_TRAIN = 'aclimdb/train/neg'
POS_DIR_TEST = 'aclimdb/test/pos'
NEG_DIR_TEST = 'aclimdb/test/neg'

In [36]:
pos_files_train = os.listdir(POS_DIR_TRAIN)
neg_files_train = os.listdir(NEG_DIR_TRAIN)
pos_files_test = os.listdir(POS_DIR_TEST)
neg_files_test = os.listdir(NEG_DIR_TEST)

In [37]:
# preprocessing

pstemmer = PorterStemmer()
def clean_text(text):
    text = text.lower()
    text = text.replace('br', '')
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def preprocessor(text):
    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = [pstemmer.stem(token) for token in tokens if token not in ENGLISH_STOP_WORDS]
    return ' '.join(tokens)

In [38]:
x_raw = []
y_raw = []

In [39]:

for filepath in pos_files_train:
    with open(os.path.join(POS_DIR_TRAIN, filepath), 'r', encoding='utf-8') as f:
        document = preprocessor(f.read())
        x_raw.append(document)
        y_raw.append('pos')

for filepath in neg_files_train:
    with open(os.path.join(NEG_DIR_TRAIN, filepath), 'r', encoding='utf-8') as f:
        document = preprocessor(f.read())
        x_raw.append(document)
        y_raw.append('neg')

for filepath in pos_files_test:
    with open(os.path.join(POS_DIR_TEST, filepath), 'r', encoding='utf-8') as f:
        document = preprocessor(f.read())
        x_raw.append(document)
        y_raw.append('pos')

for filepath in neg_files_test:
    with open(os.path.join(NEG_DIR_TEST, filepath), 'r', encoding='utf-8') as f:
        document = preprocessor(f.read())
        x_raw.append(document)
        y_raw.append('neg')

In [41]:
# to check balance of dataset
pos_count = 0
neg_count = 0
for y in y_raw:
    match y:
        case 'pos':
            pos_count += 1
        case 'neg':
            neg_count += 1

print('pos:', pos_count)
print('neg:', neg_count)

pos: 25000
neg: 25000


In [43]:
x_train, x_test, y_train, y_test = train_test_split(x_raw, y_raw, stratify=y_raw, test_size=0.30, random_state=42)

In [45]:
tfidf_transformer = TfidfTransformer()
vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)
def tfidf_transform(x):
    x_train_counts = vectorizer.fit_transform(x)
    x_train_counts.shape

    x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
    x_train_tfidf.shape

    return x_train_tfidf

In [46]:
x_train_idf = tfidf_transform(x_train)

In [47]:
x_test_idf = tfidf_transformer.transform(vectorizer.transform(x_test))

In [48]:
clf = svm.SVC(random_state=42,probability=True, kernel='linear')
clf.fit(x_train_idf, y_train)

In [50]:
# get result of experiment
print('Accuracy: ', clf.score(x_test_idf, y_test))

Accuracy:  0.9272666666666667


In [None]:
joblib.dump({'clf': clf, 'tfidf': tfidf_transformer, 'vectorizer': vectorizer, 'x_train': x_train, 'x_test': x_test, 'y_train': y_train, 'y_test': y_test}, 'linearsvm-extra-proba.joblib')

In [1]:
clf.predict_proba(tfidf_transformer.transform(vectorizer.transform(['today was not bad'])))

NameError: name 'clf' is not defined

In [19]:
probs = clf.predict_proba(x_test_idf)