## Amazon User-Profile Product's Category Crawling

Step 2: Crawling product categories

Crawl the category of every product in the Step 1 purchase-history records so that the products can be grouped, their relationships understood, and the data analyzed more easily.

### Before running the crawling code you should:
- Make sure your data from Step 1 is complete and contains no duplicates.
- Use the API from the [`Scraper API website`](https://www.scraperapi.com/?fp_ref=viktoriia91).
- Put a `cookie.txt` file containing the cookies copied from Amazon (AMZ) in the working directory; they are sent as the request cookies.
- Make sure your internet connection is stable.
### When running the crawling code you should:
- Check the `crawled data amount` to make sure it is reasonable.
- Follow the instructions in the notes.
### After running the crawling code you should:
- Check the data amount and make sure the output has the `Correct file name`.


In [None]:
# Dependencies for the crawling cells below.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from fake_useragent import UserAgent

# Shared User-Agent generator; `ua.random` is read when building the headers
# of each request.
user_agent = UserAgent()
ua = user_agent
import concurrent.futures
import tqdm
import warnings

# Report each distinct warning only once instead of on every occurrence.
warnings.filterwarnings(action='once')
# NOTE(review): urlencode is not referenced by any of the visible cells.
from urllib.parse import urlencode


In [None]:
# Check the model name.
# `modelname` is interpolated into the Step1 input path and the Step2 output
# path, so it has to match the name used in the Step1 CSV file.
modelname = 'Dell UltraSharp U3415W 34-Inch Curved LED-Lit Monitor'


In [None]:
# Open the Step1 data of the single model named by `modelname`.
# The 'Asin' column of this frame is what the crawl iterates over.
crawl_data = pd.read_csv(f'Step1/AMZ_PH_STEP1_{modelname}.csv', encoding='utf-8')
# Row count, to compare against the success/failure counts printed after crawling.
print('Data amounts:', len(crawl_data))


In [None]:
# Load the raw "name=value; name=value" cookie string from cookie.txt and turn
# it into the name -> value dict that is passed to requests as `cookies`.
with open('cookie.txt', 'r') as f:
    output = f.read()
cookies = {}
for line in output.split(';'):
    # partition() never raises, unlike unpacking the result of split('=', 1).
    name, separator, value = line.strip().partition('=')
    # Skip empty fragments (trailing ';' or newline) and fragments without
    # '=' instead of aborting the whole cell with a ValueError.
    if not separator or not name:
        continue
    cookies[name] = value


In [None]:
# Proxy configuration handed to requests.
# requests picks the proxy by the scheme of the requested URL. The product
# pages are requested over https, so without an "https" entry the proxy would
# never be used for them.
# SECURITY NOTE(review): the proxy credential is hard-coded here; consider
# loading it from the environment and rotating this key before sharing the
# notebook.
proxies = {
    "http": "http://scraperapi:f6f47b89744ec60336e92cb702d9c31e@proxy-server.scraperapi.com:8001",
    "https": "http://scraperapi:f6f47b89744ec60336e92cb702d9c31e@proxy-server.scraperapi.com:8001",
}


In [None]:
# Request the target product page
def request_web(url, idx, timeout=60):
    """Fetch one product page and return its HTML text.

    Args:
        url: Absolute URL of the page to download.
        idx: Position of the URL in the crawl list. Accepted so existing
            callers keep working; it is not used by the request itself.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        The response body as ``str``, or ``False`` when the request failed.
    """
    # A fresh random User-Agent for every request.
    headers = {"User-Agent": ua.random}
    try:
        response = requests.get(
            url,
            headers=headers,
            proxies=proxies,
            verify=False,
            cookies=cookies,
            timeout=timeout,
        )
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "try again later";
        # KeyboardInterrupt and programming errors are no longer swallowed.
        return False
    return response.text


In [None]:
# Get category, price and brand with light cleaning
def get_content(soap):
    """Extract breadcrumb categories, price and brand from a parsed page.

    Args:
        soap: Parsed page object exposing BeautifulSoup's ``find_all`` and
            ``find`` methods.

    Returns:
        ``False`` when the page has no breadcrumb links; otherwise a dict
        with the keys ``'category'`` (list of cleaned breadcrumb labels),
        ``'price'`` (text after the first ``$`` or ``'Non-Available'``) and
        ``'Brand'`` (brand text or ``'None'``).
    """
    links = soap.find_all("a", class_="a-link-normal a-color-tertiary")
    if not links:
        # No breadcrumb links: report failure before doing any other lookup.
        return False

    # Keep only the anchor's inner text, with spaces, newlines and the
    # HTML-escaped ampersand removed.
    category = [
        str(link)
        .split('">')[1]
        .split('</a>')[0]
        .replace(" ", "")
        .replace('\n', "")
        .replace('&amp;', "")
        for link in links
    ]

    price_tag = soap.find("span", class_='a-offscreen')
    try:
        price = str(price_tag).split('$')[1].replace('</span>', '')
    except IndexError:
        # No '$' in the markup (including a missing tag, rendered as 'None').
        price = 'Non-Available'

    try:
        brand = soap.find(text='Brand').findNext('td').text
    except AttributeError:
        # Either the 'Brand' label or the cell following it is missing.
        brand = 'None'

    return {'category': category, 'price': price, 'Brand': brand}


In [None]:
# Get all info, including category, into a dataframe
def scrape(arg):
    """Crawl one product page and return its details as a DataFrame.

    Args:
        arg: ``(url, idx)`` tuple. ``url`` must contain ``/dp/<asin>``;
            ``idx`` is forwarded to ``request_web``.

    Returns:
        A DataFrame with the columns ``Asin``, ``Brand``, ``Category``,
        ``Sub_category``, ``Breakdown_category`` and ``Price``. It holds one
        row when the page was fetched and parsed, and no rows otherwise.
    """
    url, idx = arg
    columns = [
        'Asin',
        'Brand',
        'Category',
        'Sub_category',
        'Breakdown_category',
        'Price',
    ]
    no_rows = pd.DataFrame({column: [] for column in columns})

    # Download the page exactly once; every extra call is another request.
    html = request_web(url, idx)
    if html is False:
        return no_rows

    # Parse once and reuse the result instead of re-parsing per field.
    content = get_content(BeautifulSoup(html, "lxml"))
    # Fewer than two breadcrumb entries cannot fill Category/Sub_category;
    # treat the page as not crawled rather than raising IndexError, which
    # would propagate out of the worker thread.
    if content is False or len(content['category']) < 2:
        time.sleep(2)
        return no_rows

    category = content['category']
    # NOTE(review): every value except Asin and Breakdown_category is wrapped
    # in a one-element list, so those cells hold lists. Kept as-is so the
    # output matches existing data; confirm whether this is intended.
    row = {
        'Asin': url.split('/dp/')[1],
        'Category': [category[1]],
        'Sub_category': [category[-2]],
        'Breakdown_category': category[-1],
        'Price': [content['price']],
        'Brand': [content['Brand']],
    }
    # DataFrame.append no longer exists in pandas 2.x; build the row directly.
    result = pd.DataFrame([row], columns=columns)
    time.sleep(3)
    return result


In [None]:
# Empty accumulator frame; crawled rows are added to it by run().
final_format = pd.DataFrame(
    {
        column: []
        for column in (
            'Asin',
            'Brand',
            'Category',
            'Sub_category',
            'Breakdown_category',
            'Price',
        )
    }
)


In [None]:
# Define execution code
def run(asin_list):
    """Crawl every ASIN in ``asin_list`` concurrently and store the results.

    Args:
        asin_list: Iterable of ASIN values; each one is turned into an
            ``https://www.amazon.com/dp/<asin>`` URL.

    Side effects:
        Rebinds the module-level ``final_format`` to the previous rows plus
        the rows returned by ``scrape``. Rows collected before an exception
        or interrupt are still stored.
    """
    global final_format
    base_url = 'https://www.amazon.com'
    url_list = [f"{base_url}/dp/{asin}" for asin in asin_list]
    # Number the URLs themselves. Deriving the indices from
    # len(crawl_data) - 1 made zip() silently drop the last URL whenever the
    # whole Step1 list was passed in.
    arg = [(url, idx) for idx, url in enumerate(url_list)]

    frames = []
    try:
        # Leaving the with-block already shuts the executor down.
        with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
            for result in tqdm.tqdm_notebook(
                executor.map(scrape, arg), total=len(url_list)
            ):
                frames.append(result)
    finally:
        # One concat at the end replaces the removed DataFrame.append and
        # avoids re-copying the accumulated frame for every result.
        collected = [
            frame for frame in [final_format, *frames] if not frame.empty
        ]
        if collected:
            final_format = pd.concat(collected, ignore_index=True)


#### Whether or not it failed, just keep running the next cell

In [None]:
# Execute the crawl for every product (ASIN) in the Step1 data of this model.
run(crawl_data['Asin'])


In [None]:
# Drop the ASINs that were crawled and keep the failed ones for another try.
# A membership test keeps an ASIN that occurs more than once in Step1 but was
# never crawled; counting occurrences across concatenated columns would
# silently discard it.
notSuccessInFinalFormat_Asin = crawl_data.loc[
    ~crawl_data['Asin'].isin(final_format['Asin']), 'Asin'
].drop_duplicates()


In [None]:
# Summary of this pass: crawled rows, ASINs still missing, Step1 total, and a
# preview of the collected data.
print(
    'Success:' + str(len(final_format)),
    'Failed:' + str(len(notSuccessInFinalFormat_Asin)),
    'Total Data in Step1:' + str(len(crawl_data)),
    final_format.head(),
)


In [None]:
# Save the crawled data to the Step2 CSV named after `modelname`.
# NOTE(review): presumably the Step2/ directory has to exist already — confirm.
final_format.to_csv(f'Step2/AMZ_PH_STEP2_{modelname}.csv')


### Keep re-running the cells below until enough data has been crawled

In [None]:
# Retry only the ASINs that are still missing from final_format.
run(notSuccessInFinalFormat_Asin)


In [None]:
# Recompute the ASINs that are still missing after the retry.
# A membership test keeps an ASIN that occurs more than once in Step1 but was
# never crawled; counting occurrences across concatenated columns would
# silently discard it.
notSuccessInFinalFormat_Asin = crawl_data.loc[
    ~crawl_data['Asin'].isin(final_format['Asin']), 'Asin'
].drop_duplicates()


In [None]:
# Summary after the retry: crawled rows, ASINs still missing, Step1 total, and
# a preview of the collected data.
print(
    'Success:' + str(len(final_format)),
    'Failed:' + str(len(notSuccessInFinalFormat_Asin)),
    'Total Data in Step1:' + str(len(crawl_data)),
    final_format.head(),
)


In [None]:
# Write the accumulated data to the same Step2 CSV path used by the earlier save.
final_format.to_csv(f'Step2/AMZ_PH_STEP2_{modelname}.csv')
