<a href="https://colab.research.google.com/github/SukratiJain/UnwrapAndScrap/blob/main/Amazon_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Scraper for Amazon products available in a given ZIP code: it fetches all products matching a search keyword and stores them locally in a CSV file and a SQLite database.
##### Technologies used: Python 3, Requests, lxml, SQLite3

In [32]:
import json
import re
import csv
import requests

from lxml import html
import sqlite3

# Search parameters read by the Amazon class below.
search_keyword = "baby milk"
zipcode = "10081"

# Open (or create) the local SQLite database and make sure the results
# table exists before any scraping starts.
conn = sqlite3.connect('amazon_location.db')
cursor = conn.cursor()
table = cursor.execute(
    'CREATE TABLE IF NOT EXISTS Baby_Milk('
    '[Product Name] text, Price text, [Product ID] text, '
    '[Review Count] text, Rating text, Sponsored text );'
)

class Amazon():
    """Scrape Amazon search results for one delivery zip code.

    The workflow (see ``authorize``) is: fetch the home page, extract the
    CSRF tokens, switch the delivery location, then scrape the first search
    results page into ``amazon_location.csv`` and the ``Baby_Milk`` table of
    ``amazon_location.db``.

    The zip code and the search keyword are read from the module-level
    names ``zipcode`` and ``search_keyword``.
    """

    def __init__(self):
        # Column order shared by the CSV header and the database insert.
        self.keys = ["Product Name", "Price", "Product ID", "Review Count", "Rating", "Sponsored"]
        self.URL_usa = "https://www.amazon.com/"
        self.Change_Location_Url = (
            "https://www.amazon.com/gp/delivery/ajax/address-change.html"
        )
        self.CSRF_Token_Url = (
            "https://www.amazon.com/gp/glow/get-address-selections.html?deviceType=desktop"
            "&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
        )
        self.Headers_User_Agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62"
        )
        self.Headers = {"Accept-Language": "en", "User-Agent": self.Headers_User_Agent}
        # Running number of product cards processed so far.
        self.count = 0
        # Seconds to wait on each HTTP request; without a timeout a stalled
        # connection would block the scraper forever.
        self.Timeout = 30
        # File object behind the writer returned by create_csv(); kept so it
        # can be closed (and therefore flushed) once scraping is done.
        self._csv_file = None

    @staticmethod
    def _first(nodes, default=""):
        """Return the first item of an XPath result list, or *default* if empty."""
        return nodes[0] if nodes else default

    def _insert_row(self, cursor, row):
        """Insert one product *row* (ordered like ``self.keys``) into Baby_Milk.

        Values are bound as parameters instead of being pasted into the SQL
        text, so scraped titles containing quotes cannot break the statement.
        """
        columns = ", ".join("[%s]" % key for key in self.keys)
        placeholders = ", ".join("?" for _ in self.keys)
        query = "INSERT INTO Baby_Milk (%s) VALUES (%s)" % (columns, placeholders)
        # Every column is declared as text; str() also turns lxml's string
        # subclass into a plain str before binding.
        cursor.execute(query, [str(value) for value in row])

    def _close_csv(self):
        """Close the CSV file opened by create_csv(), if any."""
        if self._csv_file is not None:
            self._csv_file.close()
            self._csv_file = None

    def get_token(self, response):
        """Extract the CSRF token needed to change the delivery address.

        Args:
            response: the home-page response; its text is searched for the
                ``anti-csrftoken-a2z`` value and its cookies are forwarded.

        Returns:
            ``(csrf_token, response)`` where ``response`` is the reply of the
            address-selection endpoint.

        Raises:
            IndexError: if either token cannot be found in the page text.
        """
        found = re.findall(r'anti-csrftoken-a2z&quot;:&quot;(.*?);', response.text)
        if not found:
            raise IndexError("anti-csrftoken-a2z token not found in response")
        token = found[0].replace('&quot', "")

        headers = {
            "anti-csrftoken-a2z": token,
            "user-agent": self.Headers_User_Agent,
        }
        response = requests.get(
            url=self.CSRF_Token_Url,
            headers=headers,
            cookies=response.cookies,
            timeout=self.Timeout,
        )
        found = re.findall(r'CSRF_TOKEN : "(.+?)"', response.text)
        if not found:
            raise IndexError("CSRF token not found")
        return found[0], response

    def change_location(self, csrf_token, response):
        """Ask Amazon to switch the delivery location to the global ``zipcode``.

        Args:
            csrf_token: token returned by ``get_token``.
            response: response whose cookies are sent with the request.

        Returns:
            The cookies of the address-change response when Amazon reports
            the address as valid, otherwise ``None``.
        """
        headers = {
            "anti-csrftoken-a2z": csrf_token,
            "user-agent": self.Headers_User_Agent,
        }
        response = requests.post(
            url=self.Change_Location_Url,
            data={
                "locationType": "LOCATION_INPUT",
                "zipCode": zipcode,
                "storeContext": "generic",
                "deviceType": "web",
                "pageType": "Gateway",
                "actionSource": "glow",
                "almBrandId": "undefined",
            },
            headers=dict(headers),
            cookies=dict(response.cookies),
            timeout=self.Timeout,
        )
        success = json.loads(response.text)["isValidAddress"]
        if success == 1:
            print("Pincode changed to %s" % zipcode)
            return response.cookies
        return None

    def create_csv(self):
        """Create ``amazon_location.csv``, write the header row, return the writer.

        The underlying file stays open for subsequent rows; ``authorize``
        closes it after scraping.
        """
        # Do not leak a handle left over from an earlier call.
        self._close_csv()
        filename = "amazon_location.csv"
        self._csv_file = open(filename, 'w', encoding='utf-8', newline='')
        csvwriter = csv.writer(self._csv_file)
        csvwriter.writerow(self.keys)
        return csvwriter

    def upload_to_db(self):
        """Open ``amazon_location.db``, ensure the table exists, return ``(conn, cursor)``."""
        conn = sqlite3.connect('amazon_location.db')
        cursor = conn.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS Baby_Milk([Product Name] text, Price text, [Product ID] text, [Review Count] text, Rating text, Sponsored text );')
        return conn, cursor

    def scrap_data(self, url, response, csvwriter):
        """Scrape one search results page into the CSV writer and the database.

        Args:
            url: search URL; the global ``search_keyword`` is sent as ``k``.
            response: cookies to send with the search request (despite the
                name, this is passed straight to ``cookies=``).
            csvwriter: writer that receives one row per product.
        """
        print("Getting all %s products" % search_keyword)
        search_rsp = requests.get(
            url=url,
            headers=self.Headers,
            cookies=response,
            params={"k": search_keyword},
            timeout=self.Timeout,
        )

        tree = html.fromstring(search_rsp.text)
        cards = tree.xpath("//div[contains(@data-component-type,'s-search-result')]")

        # Use a connection owned by this call so the method does not depend
        # on (or close) a module-level connection.
        conn, cursor = self.upload_to_db()
        try:
            for card in cards:
                self.count += 1
                title = self._first(
                    card.xpath('.//span[@class="a-size-base-plus a-color-base a-text-normal"]//text()')
                )
                print("Product", self.count, title)

                price = ''.join(card.xpath('.//span[@class="a-price-whole"]//text()'))
                price += self._first(card.xpath('.//span[@class="a-price-fraction"]//text()'))

                if card.xpath(".//span[contains(text(),'Sponsored')]//text()"):
                    sponsored = "Yes"
                else:
                    sponsored = "No"

                # The rating text comes first and the review count third;
                # when fewer than three text nodes exist both stay empty.
                review = card.xpath('.//*[@class="a-row a-size-small"]/span//text()')
                if len(review) >= 3:
                    review_count = review[2]
                    rating = review[0].split(" ")[0]
                else:
                    review_count = ""
                    rating = ""

                # The product ID (ASIN) is read from the product's own page.
                product_id = ""
                product_link = self._first(card.xpath('.//h2//@href'))
                if product_link:
                    product_rsp = requests.get(
                        url=self.URL_usa + product_link,
                        headers=self.Headers,
                        cookies=search_rsp.cookies,
                        timeout=self.Timeout,
                    )
                    product_tree = html.fromstring(product_rsp.text)
                    product_id = self._first(
                        product_tree.xpath('//*[contains(text(),"ASIN")]//following-sibling::span//text() | //*[contains(text(),"ASIN")]//following-sibling::td//text()')
                    )

                data_list = [title, price, product_id, review_count, rating, sponsored]
                csvwriter.writerow(data_list)
                self._insert_row(cursor, data_list)

            # NOTE: pagination is disabled; only the first results page is
            # scraped. A "Next" link, if followed, would be found with
            # //span[@class='s-pagination-strip']//*[contains(text(),'Next')]//@href
            conn.commit()
        finally:
            conn.close()

    def authorize(self):
        """Run the whole pipeline: tokens, location change, then scraping."""
        ##### EXTRACT TOKEN AND CSRF TOKEN
        response = requests.get(url=self.URL_usa, headers=self.Headers, timeout=self.Timeout)
        csrf_token, response = self.get_token(response)

        ##### CHANGE ZIP CODE
        cookies = self.change_location(csrf_token, response)
        if cookies is None:
            # change_location() returns None when Amazon rejects the address.
            print("Could not change location to %s" % zipcode)
            return

        csvwriter = self.create_csv()

        url = self.URL_usa + "s"

        try:
            self.scrap_data(url=url, response=dict(cookies), csvwriter=csvwriter)
        finally:
            # Closing flushes buffered rows even if scraping was interrupted.
            self._close_csv()


if __name__ == '__main__':
    # Script entry point: build the scraper and run the full pipeline.
    scraper = Amazon()
    scraper.authorize()



Pincode changed to 10081
https://www.amazon.com/s?k=baby+milk
Product 1 Enfamil NeuroPro Baby Formula Milk Powder, 20.7 oz reusable tub (Pack of 6) & Ready to Use Liquid, 8 fl oz (24 Bottles) Dual Prebiotic, DHA for Brain Support
Product 2 Enfamil DHA-In-Sol for Infants & Toddlers Supplements, Supports Brain Development, Vegan, Expert Recommended DHA & ARA, Easy-to-use Dropper, 2 Fl Oz, Pack of 3
Product 3 Similac 360 Total Care Sensitive Infant Formula with 5 HMO Prebiotics for Fussiness & Gas Due to Lactose Sensitivity, Non-GMO, Baby Formula Powder Tub, Unflavored, 118 Oz, Pack of 4


KeyboardInterrupt: ignored