In [13]:
import numpy as np
import sklearn
import os
import string
import re
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB

def convert_text(s):
    """Normalise a piece of text for bag-of-words processing.

    The string is lowercased, then every character that is not an ASCII
    letter or digit (punctuation, whitespace, newlines, non-ASCII symbols)
    is replaced by a single space. Characters are replaced, not dropped.
    """
    lowered = s.lower()
    return re.sub("[^a-zA-Z0-9]", " ", lowered)

def read_txts(dir_path="./txt_sentoken/pos/"):
    """Read every file in a directory and return the normalised texts.

    Parameters
    ----------
    dir_path : str
        Directory to read; a trailing slash is optional.

    Returns
    -------
    list of str
        One entry per directory entry, each passed through ``convert_text``.
    """
    txt_list = []
    # NOTE: os.listdir returns entries in arbitrary order, so the order of the
    # returned texts is whatever the filesystem reports.
    for name in os.listdir(dir_path):
        path = os.path.join(dir_path, name)
        # The context manager closes the handle even if reading fails; the
        # previous version leaked one open file per review.
        with open(path, 'r') as fin:
            txt = " ".join(fin.readlines())
        txt_list.append(convert_text(txt))
    return txt_list

In [14]:
class PoissonNB:
    """Naive Bayes classifier with a Poisson likelihood for count features.

    Every feature is modelled, independently for each class, as Poisson
    distributed with a rate equal to the mean of that feature over the
    training rows of the class (plus a small smoothing term).
    """

    def __init__(self, class_prior=None):
        """
        class_prior : np.array, size (n_classes,)
        Prior probabilities of the classes. If specified the priors are not
        adjusted according to the data. Entries are matched to the class
        labels taken in ascending order.
        """
        self.probabilities = class_prior

    def fit(self, X, y, epsilon=1e-9):
        """
        Fit Poisson Naive Bayes according to X, y

        Parameters
        ----------
        X : np.array, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : np.array, shape (n_samples,)
            Target values.
        epsilon : float, default 1e-9
            Positive constant added to every estimated rate so that a
            feature never seen in a class does not produce log(0).

        Returns
        -------
        self : PoissonNB
            The fitted estimator.

        Raises
        ------
        ValueError
            If the shapes of X and y are inconsistent, X is empty,
            epsilon is not positive, or class_prior has the wrong size.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be a 1-D array with one label per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")
        if epsilon <= 0:
            raise ValueError("epsilon must be positive")

        # np.unique returns the labels sorted, which fixes the class order used
        # by every per-class array below.
        labels, counts = np.unique(y, return_counts=True)
        self.classes = set(labels.tolist())
        self.class_labels_ = labels

        if self.probabilities is None:
            priors = counts / counts.sum()
        else:
            priors = np.asarray(self.probabilities, dtype=float)
            if priors.shape != labels.shape:
                raise ValueError("class_prior must contain one entry per class")
        # A zero prior legitimately maps to -inf (the class is never predicted).
        with np.errstate(divide="ignore"):
            self.class_log_prior_ = np.log(priors)

        # Maximum-likelihood Poisson rate per (class, feature) is the sample mean.
        self.rates_ = np.vstack([X[y == label].mean(axis=0) for label in labels]) + epsilon
        self.epsilon = epsilon
        return self

    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : np.array, shape = [n_samples, n_features]

        Returns
        -------
        C : np.array, shape = [n_samples]
            Predicted target values for X

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not hasattr(self, "rates_"):
            raise RuntimeError("PoissonNB.predict called before fit")
        X = np.asarray(X, dtype=float)
        # Poisson log-likelihood is sum_j (x_j * log(rate_j) - rate_j - log(x_j!));
        # the factorial term is the same for every class, so it is dropped.
        joint = X @ np.log(self.rates_).T - self.rates_.sum(axis=1) + self.class_log_prior_
        return self.class_labels_[np.argmax(joint, axis=1)]

#### Функция, которая считывает рецензии из файлов, но не нормализует их текст

In [15]:
def Get_pure_text(dir_path="./txt_sentoken/pos/"):
    """Read every file in a directory and return the texts without normalisation.

    Lines of each file are joined with a space, newline characters are
    removed, and leading/trailing '/' characters are stripped. Case and
    punctuation are left untouched.

    Parameters
    ----------
    dir_path : str
        Directory to read; a trailing slash is optional.

    Returns
    -------
    list of str
        One entry per directory entry.
    """
    txt_list = []
    # NOTE: os.listdir returns entries in arbitrary order, so the order of the
    # returned texts is whatever the filesystem reports.
    for name in os.listdir(dir_path):
        path = os.path.join(dir_path, name)
        # The context manager closes the handle even if reading fails; the
        # previous version leaked one open file per review.
        with open(path, 'r') as fin:
            txt = " ".join(fin.readlines()).replace('\n', '')
        txt_list.append(txt.strip('/'))
    return txt_list

#### Функция, создающая списки для обучающей и контрольной выборки, а также векторы правильных ответов для этих выборок

In [16]:
def Create_train_and_test(positive, negative, n_train=700):
    """Split positive and negative reviews into train and test sets.

    The first ``n_train`` items of each class go to the training set and the
    rest to the test set. In both sets the positive items come first.

    Parameters
    ----------
    positive, negative : list
        Samples of the positive and the negative class.
    n_train : int, default 700
        Number of samples taken from the start of each class for training.

    Returns
    -------
    x_train, x_test : list
        Training and test samples.
    y_train, y_test : list of int
        Matching labels: 1 for positive samples, 0 for negative ones.
    """
    pos_train, pos_test = list(positive[:n_train]), list(positive[n_train:])
    neg_train, neg_test = list(negative[:n_train]), list(negative[n_train:])

    x_train = pos_train + neg_train
    x_test = pos_test + neg_test

    # Build the labels from the actual slice sizes. The previous
    # `i <= len(x) / 2` test marked the first negative sample as positive and
    # was only approximately right when both classes had the same size.
    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    return x_train, x_test, y_train, y_test

#### Получаем позитивные и негативные рецензии (а также необработанные рецензии) 

In [17]:
# Normalised review texts, one list per class.
pos = read_txts('./pos')
neg = read_txts('./neg')

# Un-normalised texts of the same reviews: positives first, then negatives.
pure_text = [*Get_pure_text('./pos'), *Get_pure_text('./neg')]

#### Формируем списки обучающих и контрольных выборок с ответами

In [18]:
# Split both classes into train/test samples together with their 0/1 labels.
(x_train, x_test, y_train, y_test) = Create_train_and_test(positive=pos, negative=neg)

#### С помощью CountVectorizer создаем 2 матрицы объект-признак для обучающей и контрольной выборок

In [19]:
# The vectorizer is fitted on the training reviews only; the test reviews are
# then encoded with the same fitted vectorizer.
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

# NOTE: x_train / x_test are rebound from lists of texts to dense count
# matrices here, so re-run the split cell before re-running this one.
x_train = vectorizer.transform(x_train).toarray()
x_test = vectorizer.transform(x_test).toarray()

#### Реализуем гауссовский наивный байесовский классификатор и определяем его точность

In [20]:
# Gaussian naive Bayes on the raw count features.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
# Accuracy is the share of test reviews whose predicted label equals the true one.
print(
    'Accuracy for Naive Bayes Classifier:',
    round(np.mean(gnb.predict(x_test) == y_test), 2),
)

Accuracy for Naive Bayes Classifier: 0.62


#### Реализуем мультиномиальный наивный байесовский классификатор и определяем его точность

In [21]:
# Multinomial naive Bayes on the same count features.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
# Accuracy is the share of test reviews whose predicted label equals the true one.
print(
    'Accuracy for Multinomial Classifier:',
    round(np.mean(mnb.predict(x_test) == y_test), 2),
)

Accuracy for Multinomial Classifier: 0.82


#### Функция, которая принимает на вход строку с текстом рецензии, обученный классификатор и объект класса CountVectorizer, и классифицирует рецензию

In [22]:
# Classifies the text of a single review
def Classify(text, tr_vectorizer, tr_classifier):
    """Print a review followed by the sentiment predicted for it.

    Parameters
    ----------
    text : str
        Raw review text; it is echoed as given and normalised with
        ``convert_text`` before being vectorised.
    tr_vectorizer : object
        Fitted vectorizer whose ``transform`` result supports ``toarray()``.
    tr_classifier : object
        Fitted classifier whose ``predict`` returns 1 for a positive review.

    Returns
    -------
    None
        The verdict is only printed.
    """
    print('Text of the review: \n', text)
    # The vectorizer expects a collection of documents, hence the one-item list.
    features = tr_vectorizer.transform([convert_text(text)]).toarray()
    res = tr_classifier.predict(features)
    print('\n')
    if res == 1:
        verdict = 'Result: Review is positive!'
    else:
        verdict = 'Result: Review is negative :c'
    print(verdict)

#### Выводы о наивном байесовском классификаторе
 Наивный байесовский классификатор является эффективным инструментом для работы с большими объемами данных.
Когда выполняется условие независимости признаков элементов обучающей выборки, данный алгоритм позволяет легко и быстро осуществлять как бинарную, так и многоклассовую классификацию.
 Несмотря на то, что точность данного классификатора не всегда является достаточно высокой, он прекрасно подходит для задач классификации текстов.

#### Пример классификации рецензии (на "родных" данных)

In [25]:
# Interactive demo: classify one of the loaded reviews with the chosen model.
try:
    number = int(input('Choose the number of a review (от 1 до 2000): '))
    classifier_type = input('What classifier do you prefer? mnb или gnb?: ')
    print('\n')
    # Python accepts 0 and negative indices, which would silently pick a review
    # from the end of the list, so the 1-based number is range-checked here.
    if not 1 <= number <= len(pure_text):
        raise IndexError('review number out of range')
    if classifier_type == 'mnb':
        Classify(pure_text[number - 1], vectorizer, mnb)
    elif classifier_type == 'gnb':
        Classify(pure_text[number - 1], vectorizer, gnb)
    else:
        print('Choose the mnb or gnb classifier!')
# Only bad user input is reported as such; any other failure is left to
# surface with its own traceback instead of a misleading message.
except (ValueError, IndexError):
    print('Error! Enter the number of a review properly!')

Choose the number of a review (от 1 до 2000): 5
What classifier do you prefer? mnb или gnb?: mnb


Text of the review: 
 moviemaking is a lot like being the general manager of an nfl team in the post-salary cap era -- you've got to know how to allocate your resources .  every dollar spent on a free-agent defensive tackle is one less dollar than you can spend on linebackers or safeties or centers .  in the nfl , this leads to teams like the detroit lions , who boast a superstar running back with a huge contract , but can only field five guys named herb to block for him .  in the movies , you end up with films like " spawn " , with a huge special-effects budget but not enough money to hire any recognizable actors .  jackie chan is the barry sanders of moviemaking .  he spins and darts across the screen like sanders cutting back through the defensive line .  watching jackie in operation condor as he drives his motorcycle through the crowded streets of madrid , fleeing an armada of pursuer

# Бонус 2
#### Пример классификации рецензий, взятых с сайта (https://www.cs.cornell.edu/people/pabo/movie-review-data/)

In [24]:
# Positive and negative reviews together (positives first)
new_texts = Get_pure_text(dir_path='./tokens/pos') + Get_pure_text(dir_path='./tokens/neg')

try:
    number = int(input('Choose the number of a review (from 1 to 1386): '))
    classifier_type = input('What classifier do you prefer? mnb or gnb?: ')
    print('\n')
    # Python accepts 0 and negative indices, which would silently pick a review
    # from the end of the list, so the 1-based number is range-checked here.
    if not 1 <= number <= len(new_texts):
        raise IndexError('review number out of range')
    if classifier_type == 'mnb':
        Classify(new_texts[number - 1], vectorizer, mnb)
    elif classifier_type == 'gnb':
        Classify(new_texts[number - 1], vectorizer, gnb)
    else:
        print('Choose the mnb or gnb classifier!')
# Only bad user input is reported as such; any other failure is left to
# surface with its own traceback instead of a misleading message.
except (ValueError, IndexError):
    print('Error! Enter the number of a review properly!')

Choose the number of a review (from 1 to 1386): 4
What classifier do you prefer? mnb or gnb?: mnb


Text of the review: 
 cast : mel gibson ( jerry fletcher ) , julia roberts ( alice sutton ) , patrick stewart ( dr . jonas ) , cylk cozart ( agent lowry ) director : richard donner certification : r ( usa ) presumably for violence , language , and intense situations year of production : 1997 i was fortunate enough to attend an advance screening for the upcoming thriller conspiracy theory . this was , of course , a big deal for me because reviewing movies is basically just a hobby for me and i never get a chance at something like this . not only did i get to see an advance screening , i was able to see an advance screening of a * very good * movie . the very fast-paced film stars mel gibson as jerry fletcher , a fast-talking , witty , comical taxi driver in new york city . gibson's performance is terrific , and his character is similar to that of martin riggs in the lethal weapon films . 