<a href="https://colab.research.google.com/github/PaulBarriere/TSE-NBSVM/blob/main/NBSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NBSVM Project

Paul Barriere and Nicolas Le Gall

In [None]:
# Packages
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import *
from sklearn.metrics import *

## I) Import data

In [None]:
# Example data:
# Each split is held in two parallel lists, e.g. train_x and train_y.
# train_x contains one text (review) per element.
# train_y contains the matching labels: 1 means positive, -1 means negative.

train_x = ['This canon is very bad, bad. Peace of shit!!', 'Perfect and even incredible for me!', 'Incredible pictures!']
train_y = [-1,1,1]

test_x = ["It's very bad", 'So super cool', 'Greatest of all time']
test_y = [-1, 1, 1]

## II) Define functions

In [None]:
def vectorize(Train_x_sample, ngrams):
  """Fit a CountVectorizer on the reviews and build the feature count matrix.

  Args:
    Train_x_sample: list of strings (the reviews).
    ngrams: tuple forwarded as ``ngram_range``; for example (1, 1) keeps only
      unigrams and (1, 2) keeps both unigrams and bigrams.

  Returns:
    V: list of strings, the vocabulary (one entry per feature).
    F: array of shape (len(V), len(Train_x_sample)) whose i-th column is
      f(i), the feature count vector of review i as defined in the paper.
    vectorization: the fitted CountVectorizer, reused later to transform
      the test data.
  """
  vectorization = CountVectorizer(ngram_range=ngrams)
  vectorization.fit(Train_x_sample)

  # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
  # prefer its replacement and only fall back on old releases.
  if hasattr(vectorization, "get_feature_names_out"):
    V = vectorization.get_feature_names_out().tolist()
  else:
    V = vectorization.get_feature_names()

  # transform() yields a (reviews x features) matrix; densify and transpose
  # so that every column is one review.
  F = vectorization.transform(Train_x_sample).toarray().T

  return V, F, vectorization




def get_P(alpha, Train_y_sample, F):
  """Return the vector P defined in the paper (smoothed positive counts).

  P is alpha plus the sum of the columns f(i) of F taken over every review i
  whose label in Train_y_sample equals 1.

  Args:
    alpha: smoothing parameter added to every feature count.
    Train_y_sample: labels of the training reviews (1 positive, -1 negative),
      in the same order as the columns of F.
    F: feature count matrix, one row per feature and one column per review.

  Returns:
    One-dimensional array with one smoothed count per feature.
  """
  positive_columns = [
      column for column, label in enumerate(Train_y_sample) if label == 1
  ]
  positive_counts = np.sum(F[:, positive_columns], axis=1)
  return alpha + positive_counts


def get_Q(alpha, Train_y_sample, F):
  """Return the vector Q defined in the paper (smoothed negative counts).

  Q is alpha plus the sum of the columns f(i) of F taken over every review i
  whose label in Train_y_sample equals -1.

  Args:
    alpha: smoothing parameter added to every feature count.
    Train_y_sample: labels of the training reviews (1 positive, -1 negative),
      in the same order as the columns of F.
    F: feature count matrix, one row per feature and one column per review.

  Returns:
    One-dimensional array with one smoothed count per feature.
  """
  # Negative reviews are encoded as -1 (not 0); matching on 0 selected no
  # column at all and made Q equal to alpha for every feature.
  neg_list = [i for i, value in enumerate(Train_y_sample) if value == -1]
  restrict_F = F[:, neg_list]
  Q = alpha + np.sum(restrict_F, axis=1)
  return Q


def get_R(alpha, Train_y_sample, F):
  """Return the log-count ratio vector r defined in the paper.

  r = log((p / ||p||_1) / (q / ||q||_1)), where p and q are the smoothed
  positive and negative count vectors computed by get_P and get_Q.

  Args:
    alpha: smoothing parameter forwarded to get_P and get_Q.
    Train_y_sample: labels of the training reviews (1 positive, -1 negative).
    F: feature count matrix, one row per feature and one column per review.

  Returns:
    One-dimensional array with one log-count ratio per feature.
  """
  P = get_P(alpha, Train_y_sample, F)
  Q = get_Q(alpha, Train_y_sample, F)

  # The paper normalises both count vectors by their L1 norm so that each
  # becomes a distribution over features; the L2 norm used before does not.
  P = P / np.linalg.norm(P, ord=1)
  Q = Q / np.linalg.norm(Q, ord=1)

  return np.log(P / Q)



# --------------------------------------------------------------------------
# Multinomial Naive Bayes (MNB)

def MNB_fit_model(Train_x_sample, Train_y_sample, 
                  ngrams, alpha = 1, binarized=False):
  """Fit the Multinomial Naive Bayes linear model of the paper.

  Args:
    Train_x_sample: list of strings (the training reviews).
    Train_y_sample: labels of the training reviews (1 positive, -1 negative).
    ngrams: tuple of two integers forwarded to vectorize.
    alpha: smoothing parameter.
    binarized: if truthy, every entry of F is replaced by 1 when it is
      strictly positive and 0 otherwise before the ratio r is computed.

  Returns:
    W: the weight vector; for MNB this is the log-count ratio r.
    B: the bias log(N+/N-), with N+ / N- the number of positive / negative
      training reviews.
    binarized: the flag received, returned so it can be handed to predict.
    vectorization: the fitted CountVectorizer, needed to transform new text.

  Raises:
    ZeroDivisionError: if Train_y_sample contains no -1 label.
  """
  # Only the count matrix and the fitted vectorizer are needed here.
  _vocabulary, F, vectorization = vectorize(Train_x_sample, ngrams)

  # Switch from counts to presence indicators when requested.
  if binarized:
    F = np.where(F > 0, 1, 0)

  # For MNB the weight vector is the log-count ratio itself.
  W = get_R(alpha, Train_y_sample, F)

  # Bias: log of the ratio between positive and negative training reviews.
  labels = list(Train_y_sample)
  nb_pos = labels.count(1)
  nb_neg = labels.count(-1)
  B = np.log(nb_pos / nb_neg)

  return W, B, binarized, vectorization


def predict(binarized,W,B,vectorization,sample):
  """Predict the sentiment of each review with a fitted linear model.

  The aim is to use this function for all cases: MNB, SVM and NBSVM.

  Args:
    binarized: whether the model was fitted on binarized features; if truthy
      the counts of ``sample`` are binarized the same way before scoring.
    W: weight vector found during fitting (one weight per feature).
    B: bias found during fitting.
    vectorization: the fitted vectorizer used to transform ``sample``.
    sample: list of strings (reviews) to classify.

  Returns:
    Array with the sign of W.x + B per review: 1 (positive), -1 (negative),
    or 0 when the score is exactly zero.
  """
  # Express the sample in terms of the training vocabulary, one column per
  # review, to match the layout used during fitting.
  sample = vectorization.transform(sample).toarray().T

  # Apply the same feature transformation as at training time; previously
  # the flag was accepted but ignored.
  if binarized:
    sample = np.where(sample > 0, 1, 0)

  # Compute the prediction
  predictions = np.sign(np.dot(W.T, sample) + B)
  return predictions

def eval(predictions, y_test):
  """Print the accuracy and the classification report of some predictions.

  The aim is to use this function for all cases: MNB, SVM and NBSVM.

  Args:
    predictions: predicted labels, each equal to 1 or -1.
    y_test: true labels, each equal to 1 or -1.

  Returns:
    None; the accuracy and the report (precision, recall, F1 score) are
    printed, not returned.
  """
  # NOTE: this name shadows the builtin eval(); it is kept because the rest
  # of the notebook calls it under this name.
  accuracy = accuracy_score(y_test, predictions)
  print('accuracy {}'.format(accuracy))

  report = classification_report(y_test, predictions)
  print(report)

In [None]:
# Illustration on this example: the first 10 elements of the vocabulary V
# (first value returned by vectorize, unigrams and bigrams).
V = vectorize(train_x, (1, 2))[0]
V[:10]

['and',
 'and even',
 'bad',
 'bad bad',
 'bad peace',
 'canon',
 'canon is',
 'even',
 'even incredible',
 'for']

In [None]:
# Illustration on this example: the first two columns of the matrix F (i.e.
# the first two reviews) and its first 10 rows (i.e. the first 10 elements
# of V).
F = vectorize(train_x, (1, 2))[1]
F[:10, :2]

array([[0, 1],
       [0, 1],
       [2, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1]])

In [None]:
# Log-count ratio r on the toy data, with alpha = 1 and the count matrix F
# built in the previous cell.
get_R(1,train_y, F)

array([ 0.23156537,  0.23156537, -0.46158181, -0.46158181, -0.46158181,
       -0.46158181, -0.46158181,  0.23156537,  0.23156537,  0.23156537,
        0.23156537,  0.63703048,  0.23156537,  0.23156537, -0.46158181,
       -0.46158181,  0.23156537, -0.46158181, -0.46158181, -0.46158181,
       -0.46158181,  0.23156537,  0.23156537,  0.23156537, -0.46158181,
       -0.46158181, -0.46158181, -0.46158181, -0.46158181])

## III) Naive Bayes

In [None]:
# Fit the MNB model on the toy training data (unigrams + bigrams, raw counts).
W, B, binarized, vectorization = MNB_fit_model(Train_x_sample = train_x, 
                                              Train_y_sample = train_y,
                                              ngrams = (1,2), 
                                              alpha = 1, 
                                              binarized=False)

In [None]:
# Predict the test reviews with the fitted MNB parameters.
pred = predict(binarized,W,B,vectorization,test_x)
pred

array([-1.,  1.,  1.])

In [None]:
# Print accuracy and classification report for the MNB predictions.
eval(pred,test_y)

accuracy 1.0
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



## IV) SVM

In [None]:
# Plain linear SVM on the raw counts. F is (features x reviews), so it is
# transposed to give fit() one row per review.
clf = svm.LinearSVC()
clf.fit(F.T, train_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
# Transform the test reviews with the vectorizer returned by MNB_fit_model
# in the Naive Bayes section, then predict with the SVM.
pred_svm = clf.predict(vectorization.transform(test_x))
pred_svm

array([-1,  1,  1])

In [None]:
# Print accuracy and classification report for the SVM predictions.
eval(pred_svm,test_y)

accuracy 1.0
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



## V) NBSVM

In [None]:
# NBSVM: the SVM is trained on the elementwise product r * f(i) of every
# review, not on a dot product. np.dot(r, F) collapsed all features into a
# single number per review (a 1-D array), which LinearSVC.fit rejects with a
# ValueError. F.T is (reviews x features) and r has one entry per feature,
# so broadcasting scales each feature column by its log-count ratio.
nbsvm_features = F.T * get_R(1, train_y, F)
clf = svm.LinearSVC()
clf.fit(nbsvm_features, train_y)

ValueError: ignored