In [2]:
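# Based on Marco Bonzanini's tutorial "Sentiment analysis with Python and scikit-learn":
# https://marcobonzanini.com/2015/01/19/sentiment-analysis-with-python-and-scikit-learn/
# Accompanying gist:
# https://gist.github.com/bonzanini/c9248a239bbab0e0d42e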

In [3]:
import sys
import os
import time

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report


In [5]:
# Root folders of the corpora; the loading cell joins each root with a class
# label, so every root is expected to hold one sub-folder per label.
data_dir = [
    "./data/txt_sentoken/",
    "./data/aclImdb_v1/train/",
    "./data/aclImdb_v1/test/",
]

In [6]:
# Sentiment labels; each one is also used as a sub-folder name under every
# data root when the files are read.
classes = ['pos', 'neg']

# Accumulators for the raw document text and the matching labels.
# They are filled by the loading loop in the next cell.
train_data, train_labels = [], []
test_data, test_labels = [], []

In [7]:
# Read every document under <root>/<class>/ and route it to the train or test
# accumulators.
#
# The accumulators are reset here so that re-running this cell does not append
# the whole corpus a second time.
train_data, train_labels = [], []
test_data, test_labels = [], []

for curr_class in classes:
    for root in data_dir:  # `root`, not `dir`, so the builtin dir() is not shadowed
        dirname = os.path.join(root, curr_class)
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sample order (and hence the fitted models) reproducible across runs.
        for fname in sorted(os.listdir(dirname)):
            path = os.path.join(dirname, fname)
            if not os.path.isfile(path):
                continue  # open() would fail on a stray sub-directory
            # Decode explicitly instead of relying on the platform-default
            # encoding; undecodable bytes are replaced rather than aborting
            # the whole load.
            with open(path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()
            # Files whose name starts with 'cv9' form the held-out test set.
            # NOTE(review): the split looks only at the file name, so every
            # file under ./data/aclImdb_v1/test/ lands in the TRAINING data
            # unless its name happens to start with 'cv9' -- confirm that
            # this is intended.
            if fname.startswith('cv9'):
                test_data.append(content)
                test_labels.append(curr_class)
            else:
                train_data.append(content)
                train_labels.append(curr_class)

# Build TF-IDF feature vectors from the raw documents.
tfidf_options = {
    'min_df': 5,            # drop terms that occur in fewer than 5 documents
    'max_df': 0.8,          # drop terms that occur in more than 80% of documents
    'sublinear_tf': True,   # use 1 + log(tf) instead of the raw term frequency
    'use_idf': True,        # apply inverse-document-frequency reweighting
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the training documents only;
# the test documents are merely projected onto that fitted vocabulary.
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

In [8]:
def _fit_and_time(classifier, train_x, train_y, test_x):
    """Fit ``classifier`` on the training set, predict on the test set, time both.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_x: training feature matrix.
        train_y: training labels.
        test_x: test feature matrix.

    Returns:
        Tuple ``(prediction, train_seconds, predict_seconds)``.
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted mid-run.
    started = time.perf_counter()
    classifier.fit(train_x, train_y)
    fitted = time.perf_counter()
    prediction = classifier.predict(test_x)
    finished = time.perf_counter()
    return prediction, fitted - started, finished - fitted


# Perform classification with SVM, kernel=rbf (the SVC default)
# NOTE(review): the recorded output shows this model predicting 'neg' for every
# test sample; the default kernel parameters probably need tuning -- confirm.
classifier_rbf = svm.SVC()
prediction_rbf, time_rbf_train, time_rbf_predict = _fit_and_time(
    classifier_rbf, train_vectors, train_labels, test_vectors)

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
prediction_linear, time_linear_train, time_linear_predict = _fit_and_time(
    classifier_linear, train_vectors, train_labels, test_vectors)

# Perform classification with LinearSVC (linear SVM, liblinear-based implementation)
classifier_liblinear = svm.LinearSVC()
prediction_liblinear, time_liblinear_train, time_liblinear_predict = _fit_and_time(
    classifier_liblinear, train_vectors, train_labels, test_vectors)

In [9]:
# Report timings and per-class metrics for each of the three fitted models.
# Each entry: (heading, training seconds, prediction seconds, predicted labels).
results = (
    ("Results for SVC(kernel=rbf)", time_rbf_train, time_rbf_predict, prediction_rbf),
    ("Results for SVC(kernel=linear)", time_linear_train, time_linear_predict, prediction_linear),
    ("Results for LinearSVC()", time_liblinear_train, time_liblinear_predict, prediction_liblinear),
)
for heading, train_secs, predict_secs, predicted in results:
    print(heading)
    print("Training time: %fs; Prediction time: %fs" % (train_secs, predict_secs))
    print(classification_report(test_labels, predicted))

Results for SVC(kernel=rbf)
Training time: 2116.871869s; Prediction time: 11.698378s
             precision    recall  f1-score   support

        neg       0.50      1.00      0.67       100
        pos       0.00      0.00      0.00       100

avg / total       0.25      0.50      0.33       200

Results for SVC(kernel=linear)
Training time: 1855.812136s; Prediction time: 4.933268s
             precision    recall  f1-score   support

        neg       0.93      0.95      0.94       100
        pos       0.95      0.93      0.94       100

avg / total       0.94      0.94      0.94       200

Results for LinearSVC()
Training time: 0.920675s; Prediction time: 0.000388s
             precision    recall  f1-score   support

        neg       0.92      0.96      0.94       100
        pos       0.96      0.92      0.94       100

avg / total       0.94      0.94      0.94       200



  'precision', 'predicted', average, warn_for)


In [10]:
def _print_report(heading, train_secs, predict_secs, predicted):
    """Print one model's heading, timings and classification report.

    Args:
        heading: title line printed first.
        train_secs: training duration in seconds.
        predict_secs: prediction duration in seconds.
        predicted: labels predicted for the test set; compared against the
            module-level ``test_labels``.
    """
    print(heading)
    print("Training time: %fs; Prediction time: %fs" % (train_secs, predict_secs))
    print(classification_report(test_labels, predicted))


# This cell prints the same three reports as the previous results cell.
_print_report("Results for SVC(kernel=rbf)", time_rbf_train, time_rbf_predict, prediction_rbf)
_print_report("Results for SVC(kernel=linear)", time_linear_train, time_linear_predict, prediction_linear)
_print_report("Results for LinearSVC()", time_liblinear_train, time_liblinear_predict, prediction_liblinear)

Results for SVC(kernel=rbf)
Training time: 2116.871869s; Prediction time: 11.698378s
             precision    recall  f1-score   support

        neg       0.50      1.00      0.67       100
        pos       0.00      0.00      0.00       100

avg / total       0.25      0.50      0.33       200

Results for SVC(kernel=linear)
Training time: 1855.812136s; Prediction time: 4.933268s
             precision    recall  f1-score   support

        neg       0.93      0.95      0.94       100
        pos       0.95      0.93      0.94       100

avg / total       0.94      0.94      0.94       200

Results for LinearSVC()
Training time: 0.920675s; Prediction time: 0.000388s
             precision    recall  f1-score   support

        neg       0.92      0.96      0.94       100
        pos       0.96      0.92      0.94       100

avg / total       0.94      0.94      0.94       200



  'precision', 'predicted', average, warn_for)
