In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd drive/MyDrive/BERT_assign/

/content/drive/MyDrive/BERT_assign


In [3]:
#Read the dataset - Amazon fine food reviews
import pandas as pd
reviews = pd.read_csv(r"Reviews.csv")
#check the info of the dataset
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [4]:
reviews.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [5]:
df = pd.DataFrame(zip(reviews.Text,reviews.Score),columns=("text","ratings"))
df.head()

Unnamed: 0,text,ratings
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [6]:
df = df[df["ratings"] != 3]

In [7]:
df["ratings"] = df["ratings"].apply(lambda x:1 if(x > 2) else 0)


## Sentiment Analysis using SPACY library

In [15]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
sp = spacy.load('en_core_web_sm',disable=['parser', 'tagger', 'ner'])

all_stopwords = sp.Defaults.stop_words


In [16]:
import string
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [21]:
def dataCleaning(sentence):
  
  doc = sp(sentence)
  tokens = []
  for token in doc:

    if token.lemma_ != '-PRON-':
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_
    tokens.append(temp)
  clean_tokens = []
  for token in tokens:
    if token not in punct and token not in all_stopwords:
      clean_tokens.append(token)
  return clean_tokens



In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["text"], df["ratings"], test_size = 0.2)
print(X_train.shape,y_test.shape)

(420651,) (105163,)


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC

tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = LinearSVC()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)


In [25]:
# Training the model
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function dataCleaning at 0x7fed65ecb560>)),
                ('svm', LinearSVC())])

In [26]:
# Testing on the test dataset
y_pred = pipe.predict(X_test)

In [27]:
# Printing the classification report and the confusion matrix
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.73      0.78     16585
           1       0.95      0.97      0.96     88578

    accuracy                           0.94    105163
   macro avg       0.90      0.85      0.87    105163
weighted avg       0.93      0.94      0.93    105163




[[12039  4546]
 [ 2254 86324]]


In [29]:

from sklearn.metrics import f1_score,accuracy_score

print(f1_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

0.9621060139984843
0.9353384745585425


### Using NLTK package

In [30]:
import nltk

In [45]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [46]:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = list(set(stopwords.words('english')))

stop_words.remove("not")
lemmatizer = WordNetLemmatizer()

In [50]:
import string
import re
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [51]:
def nltk_preprocess(f):

  f = f.lower()
  f = f.split(" ")

  f = [word for word in f if not word in (stop_words)]
  

  f = [lemmatizer.lemmatize(word) for word in f]
  f = " ".join(f)
  f = re.sub(r'[^\w\s]','',f)
  



  return f

print(nltk_preprocess("Rikin NSYSK id !@# hello FDfdefe 2525"))

rikin nsysk id  hello fdfdefe 2525


In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["text"], df["ratings"], test_size = 0.2)
print(X_train.shape,y_test.shape)

(420651,) (105163,)


In [53]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC

tfidf = TfidfVectorizer(tokenizer = nltk_preprocess)
svm = LinearSVC()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function nltk_preprocess at 0x7fed4f7faef0>)),
                ('svm', LinearSVC())])

In [54]:
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))


from sklearn.metrics import f1_score,accuracy_score

print(f1_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.72      0.00      0.00     16590
           1       0.84      1.00      0.91     88573

    accuracy                           0.84    105163
   macro avg       0.78      0.50      0.46    105163
weighted avg       0.82      0.84      0.77    105163




[[   39 16551]
 [   15 88558]]
0.9144680455592157
0.8424731131671785
