In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
dataset = pd.read_csv('Reviews.csv', nrows=80000)
dataset = dataset.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)

In [3]:
def html_tag_remove(review):
  soup = BeautifulSoup(review, 'lxml')
  return soup.get_text()

In [4]:
def removeContractions(review):
    phrase = re.sub(r"won't", "will not", review)
    phrase = re.sub(r"can\'t", "can not", review)
    phrase = re.sub(r"n\'t", " not", review)
    phrase = re.sub(r"\'re", " are", review)
    phrase = re.sub(r"\'s", " is", review)
    phrase = re.sub(r"\'d", " would", review)
    phrase = re.sub(r"\'ll", " will", review)
    phrase = re.sub(r"\'t", " not", review)
    phrase = re.sub(r"\'ve", " have", review)
    phrase = re.sub(r"\'m", " am", review)
    return phrase

In [5]:
def removeAlphaNumericWords(review):
     return re.sub("\S*\d\S*", "", review).strip()
def removeSpecialChars(review):
     return re.sub('[^a-zA-Z]', ' ', review)
def scorePartition(x):
    if x < 3:
        return 0
    return 1

In [6]:
def TextPreprocessing(review):
    review = html_tag_remove(review)
    review = removeContractions(review)
    review = removeAlphaNumericWords(review)
    review = removeSpecialChars(review) 
    review = review.lower()  
    review = review.split()
    lmtzr = WordNetLemmatizer()
    review = [lmtzr.lemmatize(word, 'v') for word in review if not word in set(stopwords.words('english'))]
    review = " ".join(review)    
    return review

In [7]:
actualScore = dataset['Score']
positiveNegative = actualScore.map(scorePartition) 
dataset['Score'] = positiveNegative

In [8]:
import tqdm
corpus = []   
for index, row in tqdm.tqdm(dataset.iterrows()):
    review = TextPreprocessing(row['Text'])
    corpus.append(review)

71493it [14:15, 83.61it/s] 


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,3), max_features = 5000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,6].values

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [11]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.6892307692307692


In [13]:
def predictNewReview(newReview):
  if newReview =='':
    print('Invalid Review')  
  else:
    newReview = TextPreprocessing(newReview)
    new_review = cv.transform([newReview]).toarray()  
    prediction =  classifier.predict(new_review)
    print(prediction)
    if prediction[0] == 1:
      print( "Positive Review" )
    else:        
      print( "Negative Review")

In [14]:
r = "Worth every penny. The packaging was good as well. The consistency was on point and so were the flavours. However the packaging could be better."
predictNewReview(r)

[1]
Positive Review


In [16]:
r = "It was expensive as hell. It wasn't even tasting good. The only good thing was the packaging."
predictNewReview(r)

[0]
Negative Review


In [21]:
from joblib import dump, load
dump(classifier, '/content/drive/MyDrive/Colab Notebooks/Text prediction /sentiment_model.joblib') 

['/content/drive/MyDrive/Colab Notebooks/Text prediction /sentiment_model.joblib']