In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [33]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than letters, digits and ( ) , ! ? ' ` becomes a space;
      2. the contraction suffixes 's, 've, n't, 're, 'd, 'll are detached from the
         preceding word by a single space;
      3. the punctuation marks , ! ( ) ? are surrounded by spaces;
      4. runs of whitespace are collapsed, and the result is stripped and lower-cased.

    Args:
        string: raw text to normalise.

    Returns:
        The cleaned, lower-cased string.
    """
    # Anything outside the allowed alphabet is replaced by a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach contraction suffixes; the order of the patterns is kept from the
    # original implementation. "\g<0>" re-inserts the matched suffix itself.
    for suffix_pattern in (r"\'s", r"\'ve", r"n\'t", r"\'re", r"\'d", r"\'ll"):
        string = re.sub(suffix_pattern, r" \g<0>", string)

    # Pad punctuation with spaces. The replacement is the bare character: the
    # previous " \( " style replacements left a literal backslash in the output
    # and relied on invalid string escape sequences.
    string = re.sub(r"([,!()?])", r" \1 ", string)

    # Collapse the whitespace runs introduced above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data_and_labels(positive_data_file, negative_data_file, seed=None):
    """
    Loads MR polarity data from files, cleans every sentence and generates labels.

    Args:
        positive_data_file: path to a text file with one positive example per line.
        negative_data_file: path to a text file with one negative example per line.
        seed: optional seed for the shuffle. ``None`` (the default) draws the
            permutation from NumPy's global random state, exactly as before;
            passing an int makes the returned order reproducible.

    Returns:
        A tuple ``(shuffled_x, shuffled_y)``: a NumPy array of cleaned sentence
        strings and the matching label array (1 = positive, 0 = negative).
        Both are shuffled with the same permutation.
    """
    def _read_examples(path):
        # The context manager closes the file handle deterministically; the
        # previous open(...).readlines() call never closed it.
        with open(path, "r", encoding='latin1') as handle:
            return [line.strip() for line in handle]

    # Load data from files
    positive_examples = _read_examples(positive_data_file)
    negative_examples = _read_examples(negative_data_file)

    # Clean every sentence; positives come first, then negatives
    x = np.array([clean_str(sent) for sent in positive_examples + negative_examples])

    # Generate labels in the same order as x
    y = np.array([1] * len(positive_examples) + [0] * len(negative_examples), dtype=int)

    # Shuffle data: one permutation indexes both arrays so pairs stay aligned
    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffle_indices = rng.permutation(np.arange(len(y)))

    return x[shuffle_indices], y[shuffle_indices]

In [35]:
# Locations of the positive / negative review files.
positive_data_file = 'data/rt-polarity.pos'
negative_data_file = 'data/rt-polarity.neg'
# load_data_and_labels shuffles with NumPy's global random state; seeding it
# here makes the shuffle (and therefore the train/test split) reproducible.
np.random.seed(42)
x, y = load_data_and_labels(positive_data_file, negative_data_file)

In [37]:
# Peek at the first five cleaned sentences.
x[:5]

array(["eric byler 's nuanced pic avoids easy sentiments and explanations",
       "'linklater fans , or pretentious types who want to appear avant garde will suck up to this project '",
       'so so entertainment',
       "after an uncertain start , murder hits and generally sustains a higher plateau with bullock 's memorable first interrogation of gosling",
       "its direction , its script , and weaver 's performance as a vaguely discontented woman of substance make for a mildly entertaining 77 minutes , if that 's what you 're in the mood for"],
      dtype='<U266')

In [39]:
# Labels of the first five samples (1 = positive, 0 = negative).
y[:5]

array([1, 0, 0, 1, 1])

In [41]:
# Hold out the last `test_size` samples of the already-shuffled data for testing.
test_size = 2000
# Explicit split index; clamped at 0 so a dataset smaller than test_size puts
# everything in the test split, as the negative-index slices did.
split_at = max(len(x) - test_size, 0)
x_train, y_train = x[:split_at], y[:split_at]
x_test, y_test = x[split_at:], y[split_at:]
# Maps a numeric class label to its human-readable name.
label_map = {0: 'negative', 1: 'positive'}

In [43]:
class NB_Classifier:
    """TF-IDF features fed into a multinomial naive Bayes text classifier.

    ``fit`` learns the TF-IDF vocabulary and the model from the training texts
    and prints evaluation figures; ``single_predict`` classifies one string.
    ``single_predict`` looks class names up in a module-level ``label_map``
    dict, which must be defined before it is called.
    """

    def __init__(self):
        # Multinomial naive Bayes with Laplace (add-one) smoothing.
        self.model = MultinomialNB(alpha=1)
        # TF-IDF vectorizer that turns raw text into model features.
        self.feature_processor = TfidfVectorizer()

    def fit(self, x_train, y_train, x_test, y_test):
        """Train on the training split and print train/test evaluation.

        Args:
            x_train: iterable of training texts.
            y_train: labels for ``x_train``.
            x_test: iterable of held-out texts, used for evaluation only.
            y_test: labels for ``x_test``.

        Returns:
            None. Training accuracy, test accuracy and a classification report
            are printed.
        """
        # The vocabulary / IDF weights are learned from the training texts only.
        x_train_fea = self.feature_processor.fit_transform(x_train)
        self.model.fit(x_train_fea, y_train)

        train_accuracy = self.model.score(x_train_fea, y_train)
        print("Training Accuracy：{}".format(round(train_accuracy, 3)))

        # transform (not fit_transform): reuse the training vocabulary.
        x_test_fea = self.feature_processor.transform(x_test)
        # Predict once and reuse the result for both the accuracy and the report.
        y_predict = self.model.predict(x_test_fea)
        test_accuracy = accuracy_score(y_test, y_predict)
        print("Test Accuracy：{}".format(round(test_accuracy, 3)))

        print('Test set evaluate：')
        print(classification_report(y_test, y_predict, target_names=['0', '1']))

    def single_predict(self, text):
        """Classify a single text.

        Args:
            text: the string to classify.

        Returns:
            A tuple ``(label_name, probability)``: the ``label_map`` entry of the
            most probable class and the probability the model assigns to it.
        """
        text_fea = self.feature_processor.transform([text])
        class_probs = self.model.predict_proba(text_fea)[0]
        # predict_proba columns follow self.model.classes_, so translate the
        # winning column into its class label instead of using the label itself
        # as a column index (that only works when labels are exactly 0..n-1).
        best_col = class_probs.argmax()
        predict_idx = self.model.classes_[best_col]
        predict_label = label_map[predict_idx]
        predict_prob = class_probs[best_col]

        return predict_label, predict_prob

In [45]:
# Train on the training split; fit() also prints the accuracy figures and a
# classification report for the held-out test split.
nb_classifier = NB_Classifier()
nb_classifier.fit(x_train, y_train, x_test, y_test)

Training Accuracy：0.927
Test Accuracy：0.772
Test set evaluate：
              precision    recall  f1-score   support

           0       0.76      0.78      0.77       972
           1       0.79      0.76      0.77      1028

    accuracy                           0.77      2000
   macro avg       0.77      0.77      0.77      2000
weighted avg       0.77      0.77      0.77      2000



In [47]:
# Normalise the text with clean_str first, the same way the training sentences were cleaned.
nb_classifier.single_predict(clean_str("beautiful actors, great movie"))

('positive', 0.7602870931272715)

In [49]:
# Normalise the text with clean_str first, the same way the training sentences were cleaned.
nb_classifier.single_predict(clean_str("it's really boring"))

('negative', 0.8769542497116805)