In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
headers = {
    'authority': 'www.amazon.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}


In [3]:
reviews_url = 'https://www.amazon.in/HP-Micro-Edge-Anti-Glare-15s-fq5111TU/dp/B0B6F5V23N/ref=sr_1_1_sspa?crid=2W61VS83SVMTG&keywords=hp%2Blaptop&qid=1708004340&sprefix=hp%2Caps%2C302&sr=8-1-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&th=1'

In [4]:
len_page = 4

In [5]:
def reviewsHtml(url, len_page):
    
    # Empty List define to store all pages html data
    soups = []
    
    # Loop for gather all 3000 reviews from 300 pages via range
    for page_no in range(1, len_page + 1):
        
        # parameter set as page no to the requests body
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no,
        }
        
        # Request make for each page
        response = requests.get(url, headers=headers)
        
        # Save Html object by using BeautifulSoup4 and lxml parser
        soup = BeautifulSoup(response.text, 'lxml')
        
        # Add single Html page data in master soups list
        soups.append(soup)
        
    return soups

In [6]:
def getReviews(html_data):

    # Create Empty list to Hold all data
    data_dicts = []
    
    # Select all Reviews BOX html using css selector
    boxes = html_data.select('div[data-hook="review"]')
    
    # Iterate all Reviews BOX 
    for box in boxes:
        
        # Select Name using css selector and cleaning text using strip()
        # If Value is empty define value with 'N/A' for all.
        try:
            name = box.select_one('[class="a-profile-name"]').text.strip()
        except Exception as e:
            name = 'N/A'

        try:
            stars = box.select_one('[data-hook="review-star-rating"]').text.strip().split(' out')[0]
        except Exception as e:
            stars = 'N/A'   

        try:
            title = box.select_one('[data-hook="review-title"]').text.strip()
        except Exception as e:
            title = 'N/A'

        try:
            # Convert date str to dd/mm/yyy format
            datetime_str = box.select_one('[data-hook="review-date"]').text.strip().split(' on ')[-1]
            date = datetime.strptime(datetime_str, '%B %d, %Y').strftime("%d/%m/%Y")
        except Exception as e:
            date = 'N/A'

        try:
            description = box.select_one('[data-hook="review-body"]').text.strip()
        except Exception as e:
            description = 'N/A'

        # create Dictionary with al review data 
        data_dict = {
            'Name' : name,
            'Stars' : stars,
            'Title' : title,
            'Date' : date,
            'Description' : description
        }

        # Add Dictionary in master empty List
        data_dicts.append(data_dict)
    
    return data_dicts

In [7]:
html_datas = reviewsHtml(reviews_url, len_page)

# Empty List to Hold all reviews data
reviews = []

# Iterate all Html page 
for html_data in html_datas:
    
    # Grab review data
    review = getReviews(html_data)
    
    # add review data in reviews empty list
    reviews += review

In [10]:
df_reviews = pd.DataFrame(reviews)

In [11]:
print(df_reviews)

                          Name Stars  \
0      Dr. Ramesh Babu Devalla   5.0   
1                 rakesh saini   4.0   
2                  Placeholder   5.0   
3             Sudheendra Adiga   1.0   
4                 SHYAM BUTIYA   5.0   
5                  Placeholder   4.0   
6               Ramamoorthy. R   3.0   
7   KHARADI JOVANBHAI KALUBHAI   5.0   
8      Dr. Ramesh Babu Devalla   5.0   
9                 rakesh saini   4.0   
10                 Placeholder   5.0   
11            Sudheendra Adiga   1.0   
12                SHYAM BUTIYA   5.0   
13                 Placeholder   4.0   
14              Ramamoorthy. R   3.0   
15  KHARADI JOVANBHAI KALUBHAI   5.0   
16     Dr. Ramesh Babu Devalla   5.0   
17                rakesh saini   4.0   
18                 Placeholder   5.0   
19            Sudheendra Adiga   1.0   
20                SHYAM BUTIYA   5.0   
21                 Placeholder   4.0   
22              Ramamoorthy. R   3.0   
23  KHARADI JOVANBHAI KALUBHAI   5.0   


In [12]:
df_reviews.to_csv('hpreviews.csv', index=False)

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  # Import Naive Bayes
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score
import numpy as np 

In [14]:
df = pd.read_csv('hpreviews.csv')

In [15]:
X = df['Description'] 
y = df['Stars'] 

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),  
    ('model', MultinomialNB())  # Use Naive Bayes (MultinomialNB)
])

In [18]:
pipeline.fit(X_train, y_train)

In [19]:
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.0


In [20]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

Accuracy Score: 1.0


In [21]:
new_comments = ["This is amazing!", "Not working"]
predicted_ratings = pipeline.predict(new_comments)
predicted_ratings = np.round(predicted_ratings).astype(int)

In [62]:
print("Predicted Ratings:", predicted_ratings)

Predicted Ratings: [5 5]
