In [2]:
import pandas as pd
import numpy as np

In [5]:
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [9]:
df = pd.read_csv('/drive/MyDrive/Sentiment_analysis/a1_RestaurantReviews_HistoricDump.tsv',delimiter = '\t',quoting = 3)

In [10]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [12]:
df.shape

(900, 2)

# Data preprossing

In [13]:
import re
import nltk

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

In [26]:
import string

punc = string.punctuation

def remove_punc(text):
  return text.translate(str.maketrans('','',punc))

In [30]:
df['Review']=df['Review'].apply(remove_punc)        ## remove punctuation

In [32]:
df['Review']=df['Review'].str.lower()               ## Convert into lower

In [33]:
df.head()

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [36]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def word_split(text):
  return word_tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [38]:
df['Review'] = df['Review'].apply(word_split)     ### Word tokenized here

In [97]:
def Stem_words(text):
  return ''.join([ps.stem(i) for i in text])

In [98]:
df['Review'].apply(Stem_words)

0                                       wow loved  place
1                                        crust  not good
2                            not tasty   texture   nasty
3      stopped    late may bank holiday  rick steve r...
4                     selection   menu  great     prices
                             ...                        
895     want  first say  server  great    perfect ser...
896                               pizza selections  good
897                                strawberry tea   good
898           highly unprofessional  rude   loyal patron
899                            overall  great experience
Name: Review, Length: 900, dtype: object

In [49]:
##Stopwords

def remove_stopwords(text):
  new_text = []
  for word in text:
    if word in all_stopwords:
      new_text.append('')
    else:
      new_text.append(word)
  x = new_text[:]
  new_text.clear()
  return ' '.join(x)

In [54]:
df['Review']=df['Review'].apply(remove_stopwords)

In [55]:
df.head()

Unnamed: 0,Review,Liked
0,wow loved place,1
1,crust not good,0
2,not tasty texture nasty,0
3,stopped late may bank holiday rick steve r...,1
4,selection menu great prices,1


# Data Transformation

In [85]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)

In [86]:
X = cv.fit_transform(df['Review']).toarray()
y = df['Liked']

In [92]:
# Saving BoW dictionary to later use in prediction
import pickle
bow_path = '/content/drive/MyDrive/Sentiment_analysis/c1_BoW_Sentiment_Model.pkl'
pickle.dump(cv, open(bow_path, "wb"))

# Dividing dataset into training and test set

In [93]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Model fitting (Naive Bayes)

In [94]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [95]:
# Exporting NB Classifier to later use in prediction
import joblib
joblib.dump(classifier, '/content/drive/MyDrive/Sentiment_analysis/c2_Classifier_Sentiment_Model') 

['/content/drive/MyDrive/Sentiment_analysis/c2_Classifier_Sentiment_Model']

In [96]:
y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[57 21]
 [27 75]]


0.7333333333333333