<a href="https://colab.research.google.com/github/Trantracy/web-scraping/blob/master/Tiki_Categories_SQLite.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to crawl Tiki categories and store into SQLite DB

In [0]:
from bs4 import BeautifulSoup
import requests
import sqlite3

TIKI_URL = 'https://tiki.vn'

In [0]:
conn = sqlite3.connect('tiki.db')
cur = conn.cursor()

In [0]:
cur.execute("DROP TABLE categories;")

<sqlite3.Cursor at 0x7f68db7de420>

In [0]:
def create_categories_table():
    query = """
        CREATE TABLE IF NOT EXISTS categories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name VARCHAR(255),
            url TEXT, 
            parent_id INT, 
            create_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY CREATE TABLE', err)
create_categories_table()

In [0]:
# cur.execute("INSERT INTO categories (name, url, parent_id) VALUES ('test','test_url', 1);")

In [0]:
def select_all():
    return cur.execute('SELECT * FROM categories;').fetchall()

def delete_all():
    return cur.execute('DELETE FROM categories;')

In [0]:
delete_all()
select_all()

[]

In [0]:
class Category:
    def __init__(self, cat_id, name, url, parent_id):
        self.cat_id = cat_id
        self.name = name
        self.url = url
        self.parent_id = parent_id

    def __repr__(self):
        return "ID: {}, Name: {}, URL: {}, Parent_id: {}".format(self.cat_id, self.name, self.url, self.parent_id)

    def save_into_db(self):
        query = """
            INSERT INTO categories (name, url, parent_id)
            VALUES (?, ?, ?);
        """
        val = (self.name, self.url, self.parent_id)
        try:
            cur.execute(query, val)
            self.cat_id = cur.lastrowid
        except Exception as err:
            print('ERROR BY INSERT:', err)


In [0]:
def get_url(url):
    # time.sleep(1)
    try:
        response = requests.get(url).text
        response = BeautifulSoup(response, 'html.parser')
        return response
    except Exception as err:
            print('ERROR BY REQUEST:', err)

In [0]:
def get_main_categories(save_db=False):
    soup = get_url(TIKI_URL)

    result = []
    for a in soup.findAll('a', {'class':'MenuItem__MenuLink-tii3xq-1 efuIbv'}):
        cat_id = None
        name = a.find('span', {'class':'text'}).text
        url = a['href']
        parent_id = None

        cat = Category(cat_id, name, url, parent_id)
        if save_db:
            cat.save_into_db()
        result.append(cat)
    return result

In [0]:
main_categories = get_main_categories(save_db=True)

In [0]:
def get_sub_categories(category, save_db=False):
    name = category.name
    url = category.url
    result = []

    try:
        soup = get_url(url)
        div_containers = soup.findAll('div', {'class':'list-group-item is-child'})
        for div in div_containers:
            sub_id = None
            sub_name = div.a.text
            sub_url = 'http://tiki.vn' + div.a['href']
            sub_parent_id = category.cat_id

            sub = Category(sub_id, sub_name, sub_url, sub_parent_id)
            if save_db:
                sub.save_into_db()
            result.append(sub)
    except Exception as err:
        print('ERROR BY GET SUB CATEGORIES:', err)

    return result

In [0]:
main_categories

In [0]:
cat = main_categories[0]
get_sub_categories(cat)

In [0]:
from collections import deque

de = deque([1, 2, 3])

de.extend([4, 5])
de.append(6)

In [0]:
def get_all_categories(main_categories):
    de = deque(main_categories)
    count = 0

    while de:
        parent_cat = de.popleft()
        sub_cats = get_sub_categories(parent_cat, save_db=True)
        # print(sub_cats)
        de.extend(sub_cats)
        count += 1

        if count % 100 == 0:
            print(count, 'times')


In [0]:
get_all_categories(main_categories)

[ID: None, Name: 
                                Máy tính bảng                                                                (93)
, URL: http://tiki.vn/may-tinh-bang/c1794?src=c.1789.hamburger_menu_fly_out_banner, Parent_id: 18, ID: None, Name: 
                                Máy đọc sách                                                                (33)
, URL: http://tiki.vn/may-doc-sach/c28856?src=c.1789.hamburger_menu_fly_out_banner, Parent_id: 18, ID: None, Name: 
                                Điện thoại Smartphone                                                                (212)
, URL: http://tiki.vn/dien-thoai-smartphone/c1795?src=c.1789.hamburger_menu_fly_out_banner, Parent_id: 18, ID: None, Name: 
                                Điện thoại bàn                                                                (93)
, URL: http://tiki.vn/dien-thoai-ban/c8061?src=c.1789.hamburger_menu_fly_out_banner, Parent_id: 18, ID: None, Name: 
                                Điện thoại p