In [2]:
import re
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, accuracy_score

In [8]:
def clean_str(string):
 """
 Tokenization/string cleaning for all datasets except for SST.
 Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
 """
 string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
 string = re.sub(r"\'s", " \'s", string)
 string = re.sub(r"\'ve", " \'ve", string)
 string = re.sub(r"n\'t", " n\'t", string)
 string = re.sub(r"\'re", " \'re", string)
 string = re.sub(r"\'d", " \'d", string)
 string = re.sub(r"\'ll", " \'ll", string)
 string = re.sub(r",", " , ", string)
 string = re.sub(r"!", " ! ", string)
 string = re.sub(r"\(", " \( ", string)
 string = re.sub(r"\)", " \) ", string)
 string = re.sub(r"\?", " \? ", string)
 string = re.sub(r"\s{2,}", " ", string)
 return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
 """
 Loads MR polarity data from files, splits the data into words and generates labels.
 Returns split sentences and labels.
 """
 # Load data from files
 positive_examples = list(open(positive_data_file, "r", encoding='latin1').readlines())
 positive_examples = [s.strip() for s in positive_examples]
 negative_examples = list(open(negative_data_file, "r", encoding='latin1').readlines())
 negative_examples = [s.strip() for s in negative_examples]
 # Split by words
 x = positive_examples + negative_examples
 x = [clean_str(sent) for sent in x]
 x = np.array(x)
 # Generate labels
 positive_labels = [1] * len(positive_examples)
 negative_labels = [0] * len(negative_examples)
 y = np.concatenate([positive_labels, negative_labels], 0)


 shuffle_indices = np.random.permutation(np.arange(len(y)))
 shuffled_x = x[shuffle_indices]
 shuffled_y = y[shuffle_indices]

 return shuffled_x, shuffled_y

In [10]:
positive_data_file = 'data/rt-polarity.pos'
negative_data_file = 'data/rt-polarity.neg'
x, y = load_data_and_labels(positive_data_file, negative_data_file)

In [12]:
x[:5]

array(['y tu mam tambi n es un buen filme gracias a lo poco convencional de su narrativa , y es quiz el proyecto m s arriesgado en la carrera de alfonso cuar n',
       'a solid piece of journalistic work that draws a picture of a man for whom political expedience became a deadly foreign policy',
       'passion , melodrama , sorrow , laugther , and tears cascade over the screen effortlessly',
       "rubbo 's humorously tendentious intervention into the who wrote shakespeare controversy",
       'a serviceable euro trash action extravaganza , with a decent sense of humor and plenty of things that go boom handguns , bmws and seaside chateaus'],
      dtype='<U266')

In [14]:
y[:5]

array([1, 1, 1, 1, 1])

In [16]:
test_size = 2000
x_train, y_train = x[:-2000], y[:-2000]
x_test, y_test = x[-2000:], y[-2000:]
label_map = {0: 'negative', 1: 'positive'}

In [20]:
class SVM_Classifier(object):

 def __init__(self, use_chi=False):

     self.use_chi = use_chi # Whether use chi-square test for feature selection
     # SVM
     self.model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
     # use tf-idf extract features
     self.feature_processor = TfidfVectorizer()
     # chi-square test for feature selection
     if use_chi:
         self.feature_selector = SelectKBest(chi2, k=10000) # 34814 -> 10000

 def fit(self, x_train, y_train, x_test, y_test):

     x_train_fea = self.feature_processor.fit_transform(x_train)
     if self.use_chi:
         x_train_fea = self.feature_selector.fit_transform(x_train_fea, y_train)
     self.model.fit(x_train_fea, y_train)

     train_accuracy = self.model.score(x_train_fea, y_train)
     print("Training Accuracy：{}".format(round(train_accuracy, 3)))

     x_test_fea = self.feature_processor.transform(x_test)
     if self.use_chi:
         x_test_fea = self.feature_selector.transform(x_test_fea)
     y_predict = self.model.predict(x_test_fea)
     test_accuracy = accuracy_score(y_test, y_predict)
     print("Test Accuracy：{}".format(round(test_accuracy, 3)))
     print('Test set evaluate：')
     print(classification_report(y_test, y_predict, target_names=['negative', 'positive']))

 def single_predict(self, text):
     text_fea = self.feature_processor.transform([text])
     if self.use_chi:
         text_fea = self.feature_selector.transform(text_fea)
     predict_idx = self.model.predict(text_fea)[0]
     predict_label = label_map[predict_idx]
     return predict_label

In [30]:
svm_classifier = SVM_Classifier(use_chi=True)
svm_classifier.fit(x_train, y_train, x_test, y_test)

Training Accuracy：0.913
Test Accuracy：0.779
Test set evaluate：
              precision    recall  f1-score   support

    negative       0.76      0.81      0.78       994
    positive       0.80      0.75      0.77      1006

    accuracy                           0.78      2000
   macro avg       0.78      0.78      0.78      2000
weighted avg       0.78      0.78      0.78      2000



In [32]:
def feature_analysis():
 feature_names = svm_classifier.feature_processor.get_feature_names()
 feature_scores = svm_classifier.feature_selector.scores_
 fea_score_tups = list(zip(feature_names, feature_scores))
 fea_score_tups.sort(key=lambda tup: tup[1], reverse=True)

 return fea_score_tups

feature_analysis()[:500]

[('too', 28.28585442482266),
 ('bad', 21.596840129419903),
 ('dull', 13.058269632641245),
 ('performances', 11.149637461116432),
 ('boring', 10.103624326714655),
 ('moving', 9.700284819109008),
 ('mess', 9.434895894263384),
 ('and', 9.165126740251702),
 ('worst', 8.660604329033024),
 ('portrait', 8.56652131225373),
 ('engrossing', 8.42915982065027),
 ('just', 8.354582669130831),
 ('heart', 8.198028511056354),
 ('flat', 8.183087215639144),
 ('enjoyable', 8.096339231087413),
 ('solid', 8.060624872980421),
 ('tv', 7.9731943198450335),
 ('no', 7.821643006724187),
 ('best', 7.283384976023441),
 ('fun', 7.217372823606684),
 ('fascinating', 6.846478116737199),
 ('entertaining', 6.715363202099347),
 ('powerful', 6.66429259453359),
 ('fails', 6.639406854122845),
 ('thoughtful', 6.389097646937579),
 ('rare', 6.380979113013787),
 ('pretentious', 6.355045964251607),
 ('video', 6.251450134186545),
 ('touching', 6.2412588957278485),
 ('pointless', 6.127617277929383),
 ('nothing', 6.069203875403463),

In [34]:
svm_classifier.single_predict("beautiful actors, great movie")

'positive'

In [36]:
svm_classifier.single_predict("it's really boring")

'negative'