In [279]:
from pymongo import MongoClient
import requests
from selenium import webdriver
import bs4
import re
from pprint import pprint
import time
import pickle

In [246]:
mongo_uri = 'mongodb://localhost:10000/'
url = 'https://www.amazon.in/dp/B07DJHY82F/ref=gbph_img_m-5_d182_b23b14bf?smid=A23AODI1X2CEAE&pf_rd_p=a3a8dc53-aeed-4aa1-88bb-72ce9ddad182&pf_rd_s=merchandised-search-5&pf_rd_t=101&pf_rd_i=1389401031&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_r=P3FSQH2KEB3B5QQ1NQD5'

- [X] Product Title
- [ ] Product Description
- [X] Product Enlarge Image
- [X] Product Price (With Exchange and Without Exchange)
- [X] Product Colours
- [X] No. of Reviews
- [X] Star Rating
- [X] Technical Details
- [X] Most Recent 100 Reviews
- [X] All Reviews

In [248]:
#Scraping Data
def scrape_selenium():
    data = {}
    browser = webdriver.Firefox()
    print("Opening Browser")
    browser.get(url)
    print("Scraping Product Details")
    data = {
        'title': browser.find_element_by_id('productTitle').text,
        'price': {
            'with_exchange': re.sub(' +',' ',browser.find_element_by_id('maxBuyBackDiscountSection').text),
            'mrp': re.sub(' +','',browser.find_element_by_class_name('priceBlockStrikePriceString').text),
            'deal_price': re.sub(' +','',browser.find_element_by_id('priceblock_dealprice').text),
        },
        'colors': [i.get_attribute('alt') for i in browser.find_elements_by_class_name('imgSwatch')],
        'no_of_reviews': int(re.sub(',','',browser.find_element_by_id('acrCustomerReviewText').text.split()[0])),
        'images': [re.search('\"(.+?)\"',i.get_attribute('style'))[0] for i in browser.find_elements_by_class_name('ivThumbImage')  if i.get_attribute('style')!=''],
        'ratings': float(browser.find_element_by_class_name('arp-rating-out-of-text').text.split()[0]), 
        'tech_details':{},
        'all_reviews': [],
        'most_recent_reviews': [],
    }

    print("Scraping Technical Details")
    # scraping Technical Details
    for i in browser.find_element_by_class_name('pdTab').find_elements_by_tag_name('tr'):
        key_value = i.find_elements_by_tag_name('td')
        if key_value[0].text.split():
            data['tech_details'][key_value[0].text.strip()] = ''.join(key_value[1].text.strip())
    
    for i in browser.find_elements_by_css_selector("span[data-action='main-image-click']"):
        try:
            i.click()
        except Exception as e:
            continue
    data['images'] = [re.search('\"(.+?)\"',i.get_attribute('style'))[0] for i in browser.find_elements_by_class_name('ivThumbImage')  if i.get_attribute('style')!='']
 
    reviews_url = browser.find_element_by_css_selector("a[data-hook='see-all-reviews-link-foot']").get_attribute('href')
    return data, reviews_url, browser

In [250]:
def most_recent(rurl):
    # Fetch 100 most recent reviews
    print("Fetching 100 most recent reviews")
    most_recent_reviews = []
    revurl = rurl + '&sortBy=recent&pageNumber='
    for page in range(1,11):
        reurl = revurl + str(page)
        resp = requests.get(reurl)
        count = 0
        print("Page :",page,end='\r')
        while resp.status_code!=200:
            count += 1
            print(count,end='\r')
        soup = bs4.BeautifulSoup(resp.text)
        rev_list = soup.select_one('#cm_cr-review_list').select('.review')
        #print(rev_list)
        for rev in rev_list:
            review = {
                'customer': rev.select_one('.a-profile-name').text.strip(),
                'rating': float(re.search('\d(\.\d)?',rev.select_one('.review-rating')['class'][2])[0]),
                'title': rev.select_one('.review-title').text.strip(),
                'review': rev.select_one('.review-text').get_text(separator=' ').strip(),
            }
            #print(review)
            most_recent_reviews.append(review)
    return most_recent_reviews

In [251]:
def all_reviews(rurl):
    # Fetch all reviews
    print("Fetching all Reviews")
    reviews = []
    revurl = rurl + '&sortBy=helpful&pageNumber='
    pages = data['no_of_reviews']//10 + (data['no_of_reviews']%10 + 9)//10  
    for page in range(1,pages+1):
        reurl = revurl + str(page)
        resp = requests.get(reurl)
        count = 0
        print("Page :",page,end='\r')
        while resp.status_code!=200:
            count += 1
            print(count,end='\r')
            time.sleep(5)
        soup = bs4.BeautifulSoup(resp.text)
        rev_list = soup.select_one('#cm_cr-review_list').select('.review')
        #print(rev_list)
        for rev in rev_list:
            review = {
                'customer': rev.select_one('.a-profile-name').text.strip(),
                'rating': float(re.search('\d(\.\d)?',rev.select_one('.review-rating')['class'][2])[0]),
                'title': rev.select_one('.review-title').text.strip(),
                'review': rev.select_one('.review-text').get_text(separator=' ').strip(),
            }
            #print(review)
            reviews.append(review)
    return reviews

In [274]:
# Scraping All Reviews
def fetch_all_reviews(browser, rurl):
    print("Fetching All Reviews")
    reviews = []
    pages = data['no_of_reviews']//10 + (data['no_of_reviews']%10 + 9)//10
    revurl = rurl + '&sortBy=helpful&pageNumber='
    for page in range(1,pages+1):
        print("Page :",page,end='\r')
        browser.get(revurl+str(page))
        try:
            browser.find_element_by_class_name("no-reviews-section")
        except Exception as e:
            pass
        else:
            break
        review_list = browser.find_element_by_id('cm_cr-review_list').find_elements_by_class_name('review')
        for rev in review_list:
            review = {
                'customer': rev.find_element_by_class_name('a-profile-name').text,
                'rating': float(re.search('\d(\.\d)?',rev.find_element_by_class_name('review-rating').get_attribute('class'))[0]),
                'title': rev.find_element_by_class_name('review-title').text.strip(),
                'review': ''.join(rev.find_element_by_class_name('review-text').text.strip()),
            }
            #print(review)
            reviews.append(review)
    return reviews

In [247]:
# Database Functions
def connect():
    client = MongoClient(mongo_uri)
    return client.data

def insert_product(data):
    db = connect().products
    if db.find_one({'':data['']}):
        db.products.insert(data)
        return True
    return False

def insert_reviews(data, pid):
    db = connect().products
    if db.find_one({'':data['']}):
        db.products.insert(data)
        return True
    return False


In [249]:
data, rurl, browser = scrape_selenium()
data['most_recent_reviews'] = most_recent(rurl)
data['all_reviews'] = fetch_all_reviews(browser, rurl)
pprint(data)

Opening Browser
Scraping Product Details
Scraping Technical Details
{'all_reviews': [],
 'colors': ['Mirror Black', 'Midnight Black', 'Speed Orange'],
 'images': ['"https://images-na.ssl-images-amazon.com/images/I/41VkqnrF85L._AA50_.jpg"',
            '"https://images-na.ssl-images-amazon.com/images/I/31zrxJ74RHL._AA50_.jpg"',
            '"https://images-na.ssl-images-amazon.com/images/I/31-GdZkkyUL._AA50_.jpg"',
            '"https://images-na.ssl-images-amazon.com/images/I/41YJBZcf4UL._AA50_.jpg"',
            '"https://images-na.ssl-images-amazon.com/images/I/21bXdUPe8fL._AA50_.jpg"',
            '"https://images-na.ssl-images-amazon.com/images/I/21uJnr-B7TL._AA50_.jpg"',
            '"https://images-na.ssl-images-amazon.com/images/I/31ohyErZTPL._AA50_.jpg"',
            '"https://images-na.ssl-images-amazon.com/images/I/21oIB5yMVgL._AA50_.jpg"',
            '"https://images-na.ssl-images-amazon.com/images/I/214iFzKjoXL._AA50_.jpg"',
            '"https://images-na.ssl-images-amazo

In [280]:
#saving the data in pickle
with open('data.pickle', 'wb') as f:
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

#loading the pickle data
with open('data.pickle', 'rb') as f:
    b = pickle.load(f)

True