# Assignment IV
Sai Nithish

## Web Scrapping

In [8]:
import requests
from bs4 import BeautifulSoup as bs4
import pandas as pd

In [9]:
def scrape(url, count = 10):
    n = count//10
    ratings = []
    reviews = []
    titles = []
    for j in range(n):
        URL = f"{url}&pageNumber={j}"
        page = requests.get(URL)
        soup = bs4(page.content, "html.parser")
        t = soup.find_all('a',attrs={"data-hook":"review-title" , "class":"a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold"})
        r = soup.find_all('i',attrs={"data-hook":"review-star-rating"})
        re = soup.find_all('span',attrs={"data-hook":"review-body" , "class":"a-size-base review-text review-text-content"})
        for i in range(len(r)):
            ratings.append(r[i].text[0])
            reviews.append(re[i].text[1:-1])
            titles.append(t[i].text[1:-1])

    rev = pd.DataFrame()
    rev['ratings'] = ratings
    rev['reviews'] = reviews
    rev['titles'] = titles
    return rev


In [10]:
url = "https://www.amazon.in/Apple-MacBook-Chip-13-inch-256GB/product-reviews/B08N5W4NNB/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
#df = scrape(url,30000)

## Loading dataset

In [11]:
df = pd.read_csv('./Mac.csv',index_col=False)


In [12]:
df = df[['ratings','reviews']]

In [13]:
df.head()

Unnamed: 0,ratings,reviews
0,4,Loved every bit of this gorgeous laptop. The s...
1,5,The battery backup is just awesome as it easil...
2,5,Thanks to Amazon for quick and safe delivery 🙏...
3,5,Best laptop till date and unbeatable desgin an...
4,5,This laptop is not having SMC options as we ha...


## Data preprocessing

In [14]:
df['ratings'].value_counts()

5    541
4     66
1     64
3     17
2      9
Name: ratings, dtype: int64

In [15]:
df.duplicated().sum()

48

In [16]:
df = df.drop_duplicates()

In [32]:
df = df.dropna()

In [17]:
!pip install -q contractions

## Text Preprocessing

In [18]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import contractions

In [19]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nitis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nitis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nitis\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [117]:
def clean(text):
    text = text.lower()   
    text = re.sub('\.', ' ', text) 
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = contractions.fix(text)
    stop = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop])
    lemmatizer = WordNetLemmatizer()
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    text = re.sub(' +', ' ', text)
    return text

In [28]:
df['reviews']

0      Loved every bit of this gorgeous laptop. The s...
1      The battery backup is just awesome as it easil...
2      Thanks to Amazon for quick and safe delivery 🙏...
3      Best laptop till date and unbeatable desgin an...
4      This laptop is not having SMC options as we ha...
                             ...                        
691    Awesome Product, but order a samsung T7 500gig...
693    Discounts were worth it. Apple products are a ...
694    Amazing device with lighting fast chip inside ...
695    Even though a I'm a first time user for Mac , ...
696    Amazon sometimes miss to mention device serial...
Name: reviews, Length: 649, dtype: object

In [34]:
df['reviews'] = df['reviews'].apply(clean)

## Word Embedding

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [95]:
vec = CountVectorizer()
X = vec.fit_transform(df["reviews"])
X

<646x2718 sparse matrix of type '<class 'numpy.int64'>'
	with 12019 stored elements in Compressed Sparse Row format>

In [96]:
X = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
X

Unnamed: 0,aake,aakh,aand,ability,able,abrupt,absolute,absolutely,accept,acceptable,...,yesterday,yet,youtube,yr,yu,zephyrus,zero,zip,zoom,zoomed
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
642,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
643,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
644,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
y = df['ratings']
y

0      4
1      5
2      5
3      5
4      5
      ..
691    5
693    5
694    4
695    5
696    4
Name: ratings, Length: 646, dtype: int64

## Class balancing

In [98]:
y.value_counts()

5    498
1     63
4     59
3     17
2      9
Name: ratings, dtype: int64

In [88]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [99]:
oversample = RandomOverSampler()
# fit and apply the transform
X, y = oversample.fit_resample(X, y)

In [100]:
y.value_counts()

4    498
5    498
3    498
2    498
1    498
Name: ratings, dtype: int64

## Modelling

In [101]:
from sklearn.model_selection import train_test_split

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=5, test_size=0.2)

In [103]:
print('Shape of Train data : ', X_train.shape)
print('Shape of Test data : ', X_test.shape)

Shape of Train data :  (1992, 2718)
Shape of Test data :  (498, 2718)


In [104]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [105]:
def report(X_train, X_test, y_train, y_test):
    print('Training : \n', classification_report(y_train, X_train))
    print('\nTesting : \n', classification_report(y_test, X_test))

### SVM

In [106]:
from sklearn import svm

In [107]:
clf = svm.SVC(class_weight='balanced', probability=True)
clf.fit(X_train, y_train)

In [108]:
y_pred = clf.predict(X_test)
y_pred.shape

(498,)

In [109]:
report(clf.predict(X_train), y_pred, y_train, y_test)

Training : 
               precision    recall  f1-score   support

           1       1.00      0.91      0.95       393
           2       1.00      1.00      1.00       402
           3       1.00      0.96      0.98       401
           4       0.97      0.85      0.91       404
           5       0.79      0.99      0.88       392

    accuracy                           0.94      1992
   macro avg       0.95      0.94      0.94      1992
weighted avg       0.95      0.94      0.94      1992


Testing : 
               precision    recall  f1-score   support

           1       1.00      0.95      0.98       105
           2       1.00      1.00      1.00        96
           3       1.00      0.90      0.95        97
           4       0.97      0.78      0.86        94
           5       0.74      0.98      0.85       106

    accuracy                           0.92       498
   macro avg       0.94      0.92      0.93       498
weighted avg       0.94      0.92      0.93       4

### Random Forest

In [112]:
from sklearn.ensemble import RandomForestClassifier


In [113]:
clf = RandomForestClassifier(max_depth=6)
clf.fit(X_train, y_train)

In [114]:
y_pred = clf.predict(X_test)
y_pred.shape

(498,)

In [115]:
report(clf.predict(X_train), y_pred, y_train, y_test)

Training : 
               precision    recall  f1-score   support

           1       0.96      0.76      0.85       393
           2       1.00      0.89      0.94       402
           3       0.99      0.85      0.92       401
           4       0.71      0.73      0.72       404
           5       0.58      0.83      0.69       392

    accuracy                           0.81      1992
   macro avg       0.85      0.81      0.82      1992
weighted avg       0.85      0.81      0.82      1992


Testing : 
               precision    recall  f1-score   support

           1       0.85      0.72      0.78       105
           2       1.00      0.89      0.94        96
           3       0.99      0.77      0.87        97
           4       0.60      0.65      0.63        94
           5       0.50      0.70      0.58       106

    accuracy                           0.74       498
   macro avg       0.79      0.75      0.76       498
weighted avg       0.79      0.74      0.76       4