## <strong>Step 1. Data Collection</strong>
1. Crawl the product IDs from the website.
2. Crawl the product details using each product ID.

### <strong>Import the necessary libraries</strong>

In [4]:
import requests
import time
from tqdm import tqdm
import random
import pandas as pd
import helpers as hp

### <strong>1. Crawl the product IDs from the website</strong>

#### Functions to collect all product IDs from a category

In [5]:
def parse_products(product_json):
    """Reduce one product listing entry to a record holding only its id.

    Args:
        product_json: Mapping for a single product; only its ``'id'`` key
            is read.

    Returns:
        dict: ``{'p_id': <id>}``; the value is ``None`` when the key is
        absent.
    """
    return {'p_id': product_json.get('id')}


def get_product_id_all_pages(max_pages=49, timeout=30):
    """Collect product ids from consecutive pages of the product listing API.

    Pages ``1..max_pages`` are requested in order with ``hp._HEADERS`` and
    the query parameters in ``hp._PARAMS_PRODUCTS_ID``. Crawling stops at the
    first response whose status code is not 200.

    Args:
        max_pages: Number of the last page to request (inclusive). The
            default of 49 matches the previously hard-coded ``range(1, 50)``.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises, so a stalled connection cannot hang the crawl forever.

    Returns:
        list[dict]: One ``{'p_id': ...}`` record per product found, in the
        order the API returned them (duplicates are not removed here).
    """
    all_products = []
    for page in tqdm(range(1, max_pages + 1)):
        # Build the query per request instead of writing 'page' into the
        # shared hp dict, which would leak state to other users of it.
        params = {**hp._PARAMS_PRODUCTS_ID, 'page': page}
        response = requests.get('https://tiki.vn/api/v2/products',
                                headers=hp._HEADERS, params=params,
                                timeout=timeout)

        if response.status_code != 200:
            break

        # A 200 body without a 'data' list is treated as an empty page
        # rather than crashing on iteration over None.
        products_json = response.json().get('data') or []
        all_products.extend(parse_products(product) for product in products_json)
        # NOTE: there is no delay between page requests; add a sleep here if
        # the server starts throttling.
    return all_products

#### Write the product IDs to a CSV file

In [6]:
# Crawl every listing page, then persist the collected ids as a CSV file
result = get_product_id_all_pages() 
df = pd.DataFrame(result)
# index=False keeps the DataFrame's row index out of the file
df.to_csv('../../../data/raw/products_id.csv', index=False)

100%|██████████| 49/49 [00:31<00:00,  1.56it/s]


### <strong>2. Crawl the product details using each product ID</strong>

In [7]:
# Load the product ids from the raw products_id.csv file
df_id = pd.read_csv('../../../data/raw/products_id.csv')

#### Remove duplicate product IDs from the list

In [8]:
# Rows whose p_id occurs more than once. They are reported explicitly because
# a bare expression in the middle of a cell is evaluated and then discarded.
duplicated_ids = df_id[df_id.duplicated(['p_id'], keep=False)]
print('{} rows share a product id with another row'.format(len(duplicated_ids)))

# Keep the first occurrence of every product id and only the p_id column
df_id = df_id.drop_duplicates(subset=['p_id'], keep='first')[['p_id']]

#### Write the de-duplicated product IDs back to the CSV file

In [9]:
# Overwrite the ids file with the current contents of df_id (no index column)
df_id.to_csv('../../../data/raw/products_id.csv', index=False)

#### Function to crawl the product data

In [10]:
def _nested_field(parent, key, field, default=None):
    """Return ``parent[key][field]``, or ``default`` if ``parent[key]`` is missing, empty or not a dict."""
    child = parent.get(key)
    if not isinstance(child, dict) or not child:
        return default
    return child.get(field)


def parser_product(product_detail_json):
    """Flatten one product-detail payload into a flat record.

    Args:
        product_detail_json: Mapping decoded from the product detail
            response. Missing keys are tolerated.

    Returns:
        dict: Keys ``p_id``, ``p_name``, ``p_id_shop``, ``p_shop_name``,
        ``p_brand``, ``p_categories``, ``p_day_created``,
        ``p_sold_quantity``, ``p_original_price``, ``p_current_price`` and
        ``p_discount_rate``, in that order. Missing or empty descriptive
        fields become ``None``; missing or empty price, discount and
        sold-quantity containers become ``0``.
    """
    get = product_detail_json.get
    days_since_created = get('day_ago_created')

    return {
        'p_id': get('id') or None,
        'p_name': get('name') or None,
        'p_id_shop': _nested_field(product_detail_json, 'current_seller', 'id'),
        'p_shop_name': _nested_field(product_detail_json, 'current_seller', 'name'),
        'p_brand': _nested_field(product_detail_json, 'brand', 'name'),
        'p_categories': _nested_field(product_detail_json, 'categories', 'name'),
        # 0 is a legitimate value here (zero days ago), so it must survive
        # the "empty means missing" filter applied to the other fields.
        'p_day_created': (
            days_since_created
            if days_since_created or days_since_created == 0
            else None
        ),
        'p_sold_quantity': _nested_field(
            product_detail_json, 'quantity_sold', 'value', default=0),
        'p_original_price': get('original_price') or 0,
        'p_current_price': get('price') or 0,
        'p_discount_rate': get('discount_rate') or 0,
    }

def get_product_detail(p_ids, delay=2, timeout=30):
    """Download and flatten the detail record of every given product id.

    Each id is requested from the product detail endpoint with
    ``hp._HEADERS`` and ``hp._PARAMS_PRODUCTS_DETAIL``. Responses with a
    status other than 200 are skipped. A network failure or an undecodable
    body for one id is reported and skipped, so the records already collected
    are not lost.

    Args:
        p_ids: Sized sequence of product ids (``len`` is used for the
            progress bar total).
        delay: Seconds to sleep after each request. Defaults to the
            previously hard-coded 2 seconds.
        timeout: Seconds to wait for each HTTP response before giving up on
            that id.

    Returns:
        list[dict]: One ``parser_product`` record per successfully fetched
        product, in input order.
    """
    result = []
    print('Start crawling...')
    for p_id in tqdm(p_ids, total=len(p_ids)):
        payload = None
        try:
            response = requests.get(
                'https://tiki.vn/api/v2/products/{}'.format(p_id),
                headers=hp._HEADERS,
                params=hp._PARAMS_PRODUCTS_DETAIL,
                timeout=timeout,
            )
            if response.status_code == 200:
                payload = response.json()
        except (requests.RequestException, ValueError) as error:
            # ValueError covers JSON decoding errors on older requests
            # versions. tqdm.write prints without breaking the progress bar.
            tqdm.write('Skipping product {}: {}'.format(p_id, error))

        if payload is not None:
            result.append(parser_product(payload))
        # Pause between requests regardless of the outcome
        time.sleep(delay)
    print('Crawling completed!')
    return result

In [None]:
# Fetch the detail record of every id in df_id and save them to products_detail.csv
list_products = get_product_detail(df_id.p_id.to_list())
df_product = pd.DataFrame(list_products)
# index=False keeps the DataFrame's row index out of the file
df_product.to_csv('../../../data/raw/products_detail.csv', index=False)