# Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing libraries for scraping

In [2]:
from bs4 import BeautifulSoup
import csv

Getting the links from csv file

In [3]:
with open('Links.csv') as f:
    reader = csv.reader(f)
    links = [row[0] for row in reader]

In [7]:
from selenium import webdriver
driver = webdriver.Chrome('C:\webdrivers/chromedriver')

In [8]:
reviews = []
for link in links:
    response = driver.get(link)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'html.parser')
    i = soup.find_all('div', {'class':'reviewSnippetCard'})
    for j in i:
        review = j.find('p', {'class':'snippetSummary'}).text
        rating = int(j.find('span', {'class': 'stars-icon-star-on'})['style'][6:-2])/20
        reviews.append([review,rating])

In [9]:
dataset = pd.DataFrame(reviews, columns=['Review', 'Rating'])

In [10]:
dataset.to_csv('reviews.csv', index=False)

# Applying Natural Language Processing

In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [13]:
ps = PorterStemmer()

Preprocessing the data for applying NLP

In [14]:
corpus = []
for i in range(len(dataset)):
    review = re.sub('@[\w]*', ' ', dataset['Review'][i])
    review = re.sub('^a-zA-Z#', ' ', review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000)

In [16]:
X = dataset['Review'].values
y = dataset['Rating'].values

In [17]:
X = cv.fit_transform(corpus).toarray()

In [18]:
X[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
y

array([4., 4., 5., 3., 5., 4., 4., 4., 4., 5., 4., 4., 5., 5., 5., 5., 4.,
       5., 4., 4., 5., 4., 4., 5., 4., 5., 4., 5., 4., 4., 1., 3., 2., 4.,
       5., 4., 4., 3., 4., 4., 1., 4., 4., 4., 5., 5., 2., 4., 5., 5., 4.,
       4., 4., 4., 4., 5., 4., 4., 3., 4., 3., 4., 4., 3., 3., 3., 4., 5.,
       5., 1., 4., 4., 5., 4., 4., 3., 4., 4., 4., 4., 5., 4., 4., 5., 5.,
       5., 4., 4., 4., 4., 2., 4., 4., 3., 2., 4., 2., 4., 5., 5., 4., 4.,
       4., 5., 5., 4., 4., 1., 4., 4., 3., 3., 3., 4., 4., 4., 4., 3., 4.,
       4.])

In [21]:
for i in range(len(y)):
    if y[i] >= 3.5:
        y[i] = 1 # 1  means Positive
    else:
        y[i] = 0 # 0 means Negative

In [22]:
y

array([1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1.,
       1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1.,
       1.])

# Splitting into Training Set and Test Set

In [23]:
from sklearn.model_selection import train_test_split
X_train ,X_test ,y_train ,y_test = train_test_split(X, y, test_size = 0.2)

In [24]:
print(len(X_train))
print(len(X_test))
print(len(X))

96
24
120


# Applying Naive Bayes Classifier

In [26]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

In [27]:
nb.fit(X_train, y_train)
nb.score(X_train, y_train)

0.9791666666666666

In [28]:
nb.score(X_test, y_test)

0.8333333333333334