# Web Scraping (Economics and Business Book)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import time
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [2]:
import plotly
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected = True)
import plotly.graph_objs as go

In [3]:
# Number of best-seller pages to scrape; the scraping loop calls get_info(1) .. get_info(pages).
pages=2

def _text_or(tag, default):
    """Return ``tag.text`` when ``tag`` was found, otherwise ``default``."""
    return tag.text if tag is not None else default


def get_info(page_no):
    """Scrape one page of the Amazon India best-seller list requested below.

    Parameters
    ----------
    page_no : int or str
        Page number; it is interpolated into the request URL twice.

    Returns
    -------
    list of list of str
        One ``[name, author, rating, customer_rated, type, price]`` record per
        product tile found on the page. A field that cannot be found is
        replaced by a placeholder: ``'unknown-product'``, ``'Anonymous'``,
        ``'-1'``, ``'0'``, ``'0'`` and ``'0'`` respectively.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}

    url = 'https://www.amazon.in/gp/bestsellers/books/1318068031/ref=zg_bs_pg_'+str(page_no)+'?ie=UTF8&pg='+str(page_no)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page into an empty result.
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(r.content, 'html.parser')

    all_record=[]
    for d in soup.find_all('div',{'class':'a-section a-spacing-none aok-relative'}):
        # The title is read from the alt text of the image inside the item span;
        # every step is guarded so one malformed tile cannot abort the whole page.
        name_span = d.find('span',{'class':'aok-inline-block zg-item'})
        name_div = name_span.div if name_span is not None else None
        name_img = name_div.img if name_div is not None else None
        if name_img is not None:
            book_name = name_img.get('alt', 'unknown-product')
        else:
            book_name = 'unknown-product'

        # The author is looked up as a link first, then as a plain span.
        author = d.find('a',{'class':'a-size-small a-link-child'})
        if author is None:
            # attrs must be a dict; the original passed a set literal here.
            author = d.find('span',{'class':'a-size-small a-color-base'})

        rating = d.find('span',{'class':'a-icon-alt'})
        customer_rated = d.find('a',{'class':'a-size-small a-link-normal'})
        book_type = d.find('span',{'class':'a-size-small a-color-secondary'})
        price = d.find('span',{'class':'p13n-sc-price'})

        all_record.append([
            book_name,
            _text_or(author, 'Anonymous'),
            _text_or(rating, '-1'),
            _text_or(customer_rated, '0'),
            _text_or(book_type, '0'),
            _text_or(price, '0'),
        ])
    return all_record

In [4]:
# Scrape pages 1..pages; each element of `records` is the list of book records from one page.
records=[get_info(page_no) for page_no in range(1,pages+1)]

In [5]:
def flatten(l):
    """Concatenate the sub-lists of ``l`` into one flat list (one level deep)."""
    flat=[]
    for sublist in l:
        flat.extend(sublist)
    return flat

flatten(records)

[['World’s Greatest Books For Personal Growth & Wealth (Set of 4 Books): Perfect Motivational Gift Set',
  'Dale Carnegie',
  '4.5 out of 5 stars',
  '1,564',
  'Paperback',
  '₹\xa0299.00'],
 ['The Intelligent Investor (English) Paperback – 2013',
  'Benjamin Graham',
  '4.4 out of 5 stars',
  '13,250',
  'Paperback',
  '₹\xa0401.00'],
 ['The Psychology of Money',
  'Morgan Housel',
  '4.5 out of 5 stars',
  '1,986',
  'Paperback',
  '₹\xa0328.00'],
 ['Rich Dad Poor Dad : What The Rich Teach Their Kids About Money That the Poor and Middle Class Do Not!',
  'Robert T. Kiyosaki',
  '4.5 out of 5 stars',
  '35,703',
  'Mass Market Paperback',
  '₹\xa0375.06'],
 ['A New Idea of India: Individual Rights in a Civilisational State',
  'Harsh Madhusudan',
  '4.8 out of 5 stars',
  '54',
  'Hardcover',
  '₹\xa0578.00'],
 ['Think and Grow Rich',
  'Napoleon Hill',
  '4.5 out of 5 stars',
  '30,574',
  'Paperback',
  '₹\xa0149.00'],
 ['The Intelligent Investor Rev Ed.',
  'Benjamin Graham',
  '4

In [6]:
# Build the working table: one row per scraped book record.
df=pd.DataFrame(
    data=flatten(records),
    columns=['Book_Name','Author','Rating','Customer_Rated','Type','Price'],
)

In [7]:
# Preview the first two rows of the scraped table.
df.head(2)

Unnamed: 0,Book_Name,Author,Rating,Customer_Rated,Type,Price
0,World’s Greatest Books For Personal Growth & W...,Dale Carnegie,4.5 out of 5 stars,1564,Paperback,₹ 299.00
1,The Intelligent Investor (English) Paperback –...,Benjamin Graham,4.4 out of 5 stars,13250,Paperback,₹ 401.00


In [8]:
# Keep only the text before the first space (e.g. '4.5' from '4.5 out of 5 stars').
df['Rating']=df['Rating'].str.split(' ').str[0]

In [9]:
# Convert ratings to numbers. errors='coerce' turns any unparsable value into NaN;
# the previous errors='ignore' is deprecated and would leave the whole column as
# strings if a single value failed to parse.
df['Rating']=pd.to_numeric(df['Rating'],errors='coerce')

In [10]:
# Strip thousands separators ('13,250' -> '13250'); the comma is matched literally.
df['Customer_Rated']=df['Customer_Rated'].str.replace(',','',regex=False)

In [11]:
# Keep only what follows the rupee sign.
df['Price']=df['Price'].str.split('₹').str[-1]

In [12]:
# Remove thousands separators from the price text; the comma is matched literally.
df['Price']=df['Price'].str.replace(',','',regex=False)

In [13]:
# Keep the part before the decimal point, discarding the fractional amount.
df['Price']=df['Price'].str.split('.').str[0]

In [14]:
# Cast the cleaned price strings to integers.
df['Price']=df['Price'].astype(int)

In [15]:
# Convert review counts to numbers. errors='coerce' turns any unparsable value into
# NaN; the previous errors='ignore' is deprecated and would leave the whole column
# as strings if a single value failed to parse.
df['Customer_Rated']=pd.to_numeric(df['Customer_Rated'],errors='coerce')

In [16]:
# '0' is the placeholder get_info stores for a missing field; mark it as missing.
df=df.replace('0',np.nan)

In [17]:
# Numeric zeros (placeholders that were already converted to numbers) are treated as missing too.
df=df.replace(0,np.nan)

In [18]:
# Number of missing values in each column.
count_nan=df.isna().sum()

In [19]:
# Show the per-column missing-value counts.
count_nan

Book_Name          0
Author             0
Rating             0
Customer_Rated     2
Type               1
Price             15
dtype: int64

In [20]:
# Keep only the rows that have no missing value in any column.
df=df.dropna()

In [21]:
# Display the books from highest to lowest rating (returns a sorted copy; df is unchanged).
df.sort_values(by='Rating',ascending=False)

Unnamed: 0,Book_Name,Author,Rating,Customer_Rated,Type,Price
47,Pandemonium: The Great Indian Banking Tragedy,Tamal Bandyopadhyay,5.0,3.0,Hardcover,585.0
4,A New Idea of India: Individual Rights in a Ci...,Harsh Madhusudan,4.8,54.0,Hardcover,578.0
23,A New Idea of India: Individual Rights in a Ci...,Harsh Madhusudan,4.8,54.0,Kindle Edition,499.0
14,The Joys Of Compounding: The Passionate Pursui...,Gautam Baid,4.7,66.0,Paperback,399.0
56,Joys Of Compounding: The Passionate Pursuit of...,Gautam Baid,4.7,66.0,Kindle Edition,298.0
...,...,...,...,...,...,...
9,Basics of Indian Stock Market: Learn Markets F...,ANGSHUMAN ADHIKARI,3.9,59.0,Kindle Edition,100.0
20,THE SCAM: from Harshad Mehta to Ketan Parekh A...,Debashis Basu,3.8,29.0,Kindle Edition,413.0
94,Seven habbits of highly effective People E boo...,by Robert T Kiyoyaski (Author),3.7,24.0,Paperback,270.0
80,The Secret (English) Paperback – 2019 by By Rh...,Rhonda Byrne,3.0,12.0,Paperback,116.0


In [22]:
# Display the books from most to least expensive (returns a sorted copy; df is unchanged).
df.sort_values(by='Price',ascending=False)

Unnamed: 0,Book_Name,Author,Rating,Customer_Rated,Type,Price
99,How to Make Money in Intraday Trading: A Maste...,Ashwani Gujral,4.1,401.0,Paperback,889.0
22,The 7 Habits of Highly Effective People: Power...,Stephen R. Covey,4.3,9675.0,Kindle Edition with Audio/Video,738.0
92,Economics on your tips - Indian economic devel...,Gaurav Jain,4.4,280.0,Hardcover,700.0
51,No Rules Rules: Netflix and the Culture of Rei...,Reed Hastings,4.6,931.0,Paperback,650.0
90,Elon Musk: How the Billionaire CEO of SpaceX a...,Ashlee Vance,4.5,7841.0,Paperback,614.0
...,...,...,...,...,...,...
84,The Richest Man in Babylon: George S. Clason I...,George S. Clason,4.4,12428.0,Kindle Edition,17.0
21,The Art of Public Speaking by Dale Carnegie (I...,Dale Carnegie,4.1,705.0,Kindle Edition,11.0
11,How to take decisions (Management Sutras Book 5),Devdutt Pattanaik,4.2,800.0,Kindle Edition,10.0
18,10 Ways to Motivate Yourself: Change Your Life...,Steve Chandler,4.3,1121.0,Kindle Edition,10.0


In [23]:
# How many books share each rating value.
df['Rating'].value_counts()

4.5    24
4.3    16
4.4    15
4.6    10
4.2     4
4.1     3
4.7     2
4.8     2
4.0     2
3.7     1
3.8     1
3.9     1
3.0     1
5.0     1
1.0     1
Name: Rating, dtype: int64

In [24]:
# How many books there are of each format (Type).
df['Type'].value_counts()

Paperback                          45
Kindle Edition                     28
Hardcover                           8
Kindle Edition with Audio/Video     1
Perfect Paperback                   1
Mass Market Paperback               1
Name: Type, dtype: int64

In [25]:
# Fold the paperback variants into a single 'Paperback' category.
# The column is reassigned through assign() instead of calling
# replace(..., inplace=True) on df['Type']: the chained in-place form operates on
# a temporary Series and does not update df under pandas Copy-on-Write.
df=df.assign(Type=df['Type'].replace({'Paperback Bunko':'Paperback','Mass Market Paperback':'Paperback','Perfect Paperback':'Paperback'}))

In [26]:
# Re-check the format counts after merging the paperback variants.
df['Type'].value_counts()

Paperback                          47
Kindle Edition                     28
Hardcover                           8
Kindle Edition with Audio/Video     1
Name: Type, dtype: int64

In [27]:
# Ten most expensive books: sort by Price descending and keep the first 10 rows.
top_10_price=df.sort_values(by='Price',ascending=False).head(10)
top_10_price

Unnamed: 0,Book_Name,Author,Rating,Customer_Rated,Type,Price
99,How to Make Money in Intraday Trading: A Maste...,Ashwani Gujral,4.1,401.0,Paperback,889.0
22,The 7 Habits of Highly Effective People: Power...,Stephen R. Covey,4.3,9675.0,Kindle Edition with Audio/Video,738.0
92,Economics on your tips - Indian economic devel...,Gaurav Jain,4.4,280.0,Hardcover,700.0
51,No Rules Rules: Netflix and the Culture of Rei...,Reed Hastings,4.6,931.0,Paperback,650.0
90,Elon Musk: How the Billionaire CEO of SpaceX a...,Ashlee Vance,4.5,7841.0,Paperback,614.0
33,The Scam: From Harshad Mehta To Ketan Parekh A...,Debashis Basu and Sucheta Dalal,4.0,13.0,Paperback,600.0
47,Pandemonium: The Great Indian Banking Tragedy,Tamal Bandyopadhyay,5.0,3.0,Hardcover,585.0
83,"Case Study Handbook, Revised Edition: A Studen...",William Ellet,4.5,70.0,Paperback,583.0
4,A New Idea of India: Individual Rights in a Ci...,Harsh Madhusudan,4.8,54.0,Hardcover,578.0
91,"The 4-Hour Work Week: Escape the 9-5, Live Any...",Timothy Ferriss,4.3,8699.0,Paperback,524.0


In [28]:
# Fifteen most expensive books: sort by Price descending and keep the first 15 rows.
top_15_price=df.sort_values(by='Price',ascending=False).head(15)
top_15_price

Unnamed: 0,Book_Name,Author,Rating,Customer_Rated,Type,Price
99,How to Make Money in Intraday Trading: A Maste...,Ashwani Gujral,4.1,401.0,Paperback,889.0
22,The 7 Habits of Highly Effective People: Power...,Stephen R. Covey,4.3,9675.0,Kindle Edition with Audio/Video,738.0
92,Economics on your tips - Indian economic devel...,Gaurav Jain,4.4,280.0,Hardcover,700.0
51,No Rules Rules: Netflix and the Culture of Rei...,Reed Hastings,4.6,931.0,Paperback,650.0
90,Elon Musk: How the Billionaire CEO of SpaceX a...,Ashlee Vance,4.5,7841.0,Paperback,614.0
33,The Scam: From Harshad Mehta To Ketan Parekh A...,Debashis Basu and Sucheta Dalal,4.0,13.0,Paperback,600.0
47,Pandemonium: The Great Indian Banking Tragedy,Tamal Bandyopadhyay,5.0,3.0,Hardcover,585.0
83,"Case Study Handbook, Revised Edition: A Studen...",William Ellet,4.5,70.0,Paperback,583.0
4,A New Idea of India: Individual Rights in a Ci...,Harsh Madhusudan,4.8,54.0,Hardcover,578.0
91,"The 4-Hour Work Week: Escape the 9-5, Live Any...",Timothy Ferriss,4.3,8699.0,Paperback,524.0


In [29]:
import plotly.express as px

In [30]:
# Ten most expensive books: price per author, coloured by format, marker size taken from the rating.
fig = px.scatter(
    top_10_price,
    x="Price",
    y="Author",
    color="Type",
    size='Rating',
    hover_data=['Book_Name'],
)
fig.show()

In [31]:
# Fifteen most expensive books: price per author, coloured by rating.
fig = px.scatter(
    top_15_price,
    x="Price",
    y="Author",
    color="Rating",
    hover_data=['Book_Name'],
)
# Fixed-size markers with a dark outline, applied to marker-mode traces only.
fig.update_traces(
    marker={'size': 12, 'line': {'width': 2, 'color': 'DarkSlateGrey'}},
    selector={'mode': 'markers'},
)
fig.show()

In [32]:
# Fifteen most expensive books: price per author, coloured by format.
fig = px.scatter(
    top_15_price,
    x="Price",
    y="Author",
    color="Type",
    hover_data=['Book_Name'],
)
# Fixed-size markers with a dark outline, applied to marker-mode traces only.
fig.update_traces(
    marker={'size': 12, 'line': {'width': 2, 'color': 'DarkSlateGrey'}},
    selector={'mode': 'markers'},
)
fig.show()

In [45]:
# Rebuild a fresh table from the scraped records (no rows dropped) for the format breakdown.
data=pd.DataFrame(
    data=flatten(records),
    columns=['Book_Name','Author','Rating','Customer_Rated','Type','Price'],
)

In [46]:
# Clean the fresh table with the same steps that were applied to `df`.
# errors='coerce' replaces the deprecated errors='ignore', which would leave a
# whole column as strings if a single value failed to parse.

# Rating: keep the text before the first space, then convert to a number.
data['Rating']=pd.to_numeric(data['Rating'].str.split(' ').str[0],errors='coerce')

# Customer_Rated: drop thousands separators, then convert to a number.
data['Customer_Rated']=pd.to_numeric(data['Customer_Rated'].str.replace(',','',regex=False),errors='coerce')

# Price: keep what follows the rupee sign, drop thousands separators,
# keep the part before the decimal point, and cast to int.
data['Price']=(
    data['Price']
    .str.split('₹').str[-1]
    .str.replace(',','',regex=False)
    .str.split('.').str[0]
    .astype(int)
)

In [47]:
# Format counts across all scraped records, including the '0' placeholder for a missing format.
data['Type'].value_counts()

Paperback                          46
Kindle Edition                     28
Audible Audiobook                  14
Hardcover                           8
Perfect Paperback                   1
Kindle Edition with Audio/Video     1
Mass Market Paperback               1
0                                   1
Name: Type, dtype: int64

In [48]:
# Fold the paperback variants into a single 'Paperback' category.
# The column is reassigned through assign() instead of calling
# replace(..., inplace=True) on data['Type']: the chained in-place form operates
# on a temporary Series and does not update the frame under pandas Copy-on-Write.
data=data.assign(Type=data['Type'].replace({'Paperback Bunko':'Paperback','Mass Market Paperback':'Paperback','Perfect Paperback':'Paperback'}))

In [49]:
# Count the books per format; value_counts() returns a Series indexed by the format label.
type_book=data['Type'].value_counts()

In [50]:
# Turn the counts Series into a two-column frame (format label, count).
type_book=type_book.to_frame().reset_index()

In [51]:
# Name the two columns: the format label and its count.
type_book.columns=['Type','Total']

In [52]:
# Show the per-format totals.
type_book

Unnamed: 0,Type,Total
0,Paperback,48
1,Kindle Edition,28
2,Audible Audiobook,14
3,Hardcover,8
4,Kindle Edition with Audio/Video,1
5,0,1


In [53]:
# Remove the placeholder row: '0' is what get_info stores when no format was found.
# Filtering on the value, rather than dropping the hard-coded row label 5, keeps
# this cell correct if the number of formats changes and makes it safe to re-run.
type_book=type_book[type_book['Type']!='0']

In [54]:
# Show the per-format totals after removing the placeholder row.
type_book

Unnamed: 0,Type,Total
0,Paperback,48
1,Kindle Edition,28
2,Audible Audiobook,14
3,Hardcover,8
4,Kindle Edition with Audio/Video,1


In [55]:
import plotly.graph_objects as go

In [56]:
# Pie chart of the number of best-sellers per book format.
data1 = {
    "values": type_book['Total'],
    "labels": type_book['Type'],
    "name": "Book",
    "hoverinfo": "label+percent+name",
    # NOTE(review): four colours are listed while type_book may hold more rows;
    # presumably plotly falls back to its default colours for the rest — confirm.
    "marker": {'colors': ['#FF8C00', '#E9967A', '#FFE4C4','#FAEBD7']},
    # Pull out the third slice only; the list is built from len(type_book) so it
    # always has one entry per slice.
    "pull": [0.2 if position == 2 else 0 for position in range(len(type_book))],
    "type": "pie"
}

# Use a dedicated name for the trace list so the `data` DataFrame is not overwritten.
pie_traces = [data1]

# The title is set once, in update_layout below; the earlier "Distribution of
# Movie Rating" title was wrong and was overwritten there anyway.
layout = go.Layout(
    {
        "grid": {"rows": 1, "columns": 1}
    }
)
fig = go.Figure(data = pie_traces, layout = layout)

fig.update_layout(
    title={
        # The bold tag is now closed properly (was "<b>...<b>").
        'text': "<b>Best Selling Book Type</b>",
        'y': 0.9,
        'x': 0.49,
        'xanchor': 'center',
        'yanchor': 'top'},
    font=dict(family="Times New Roman", size=15),
    legend={"x": 0.78, "y": 0.85},
)
iplot(fig)