**Data field**


In [37]:
import requests
import bs4
import json
import sys
import psycopg2
import logging


## Get date from json file


In [38]:
fjs = open("info.json", "r")
data_dict = json.load(fjs)
fjs.close()

fps = open("password.txt", "r")
pw = fps.read()
fps.close()

data_field = data_dict["event"]["data_field"]
url = data_dict["event"]["url"]
table_name = data_dict["event"]["table_name"]

## Connect to Database


In [39]:
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def create_connection():
    connection = psycopg2.connect(
        host=data_dict["host"],
        database=data_dict["database"],
        user=data_dict["user"],
        password=pw
    )
    connection.set_session(autocommit=True)
    return connection

connection = create_connection()
logger.info(f"connection log: {connection}")

INFO:root:connection log: <connection object at 0x000002324459F150; dsn: 'user=postgres password=xxx dbname=postgres host=localhost', closed: 0>


## Create the table in database


In [40]:
def create_table(connection, table_name, data_field):
    logger.info(f"Start create table {table_name}")
    list_field_name_field_type = [f'{field["field"]} {field["type"]}' for field in data_field] 
    result_field_name_field_type = ','.join(list_field_name_field_type)

    create_table_str = f"""
        create table if not exists {table_name} (
            {result_field_name_field_type}
        )
    """ # tạo string kiểu <tên cột> <kiểu dữ liệu> để tạo bảng trong DBMS

    cursor = connection.cursor() # tạo con trỏ tới DBMS
    cursor.execute(create_table_str) # thực thi tạo bảng trong DBMS

create_table(connection, table_name, data_field)

INFO:root:Start create table comic_web_data


## Crawl Data


In [41]:
total_stories = []

info = requests.get(url, headers={})

soup = bs4.BeautifulSoup(info.content, 'lxml')

stories = soup.find_all("div", {"class": "item"})

for story in stories:
    try:
        story_id = story["class"][1].split('-')[1]
    except:
        story_id = "null"
    try:
        story_link = story.find("a")["href"]
    except:
        story_link = "null"
    try:
        poster_image_link = story.find("img")["src"]
    except:
        poster_image_link = "null"
    try:
        story_name = story.find("img")["alt"]
    except:
        story_name = "null"

    story_dict = {
        "id": story_id,
        "story_link": story_link,
        "poster_image_link": poster_image_link,
        "story_name": story_name,
    }
    total_stories.append(story_dict)

print(len(total_stories))
for story in total_stories:
    print(story)

16
{'id': '1', 'story_link': 'https://truyenmoi.org/truyen-bach-luyen-thanh-than', 'poster_image_link': 'https://truyenmoi.org/images/thumb129/bach-luyen-thanh-than-1616536478.jpg', 'story_name': 'Bách Luyện Thành Thần'}
{'id': '2', 'story_link': 'https://truyenmoi.org/vu-luyen-dien-phong', 'poster_image_link': 'https://truyenmoi.org/images/thumb129/vu-luyen-dien-phong-1616873103.jpg', 'story_name': 'Võ Luyện Đỉnh Phong'}
{'id': '3', 'story_link': 'https://truyenmoi.org/vo-than-chua-te', 'poster_image_link': 'https://truyenmoi.org/images/thumb129/vo-than-chua-te-1616940080.jpg', 'story_name': 'Võ Thần Chúa Tể'}
{'id': '4', 'story_link': 'https://truyenmoi.org/vo-thuong-than-de', 'poster_image_link': 'https://truyenmoi.org/images/thumb129/vo-thuong-than-de-1617174241.jpg', 'story_name': 'Vô Thượng Thần Đế'}
{'id': '5', 'story_link': 'https://truyenmoi.org/phong-than-chau', 'poster_image_link': 'https://truyenmoi.org/images/thumb129/phong-than-chau-1626670616.jpg', 'story_name': 'Phong T

**Store Data**


In [42]:
def insert_database(connection, table_name, data_stories):
    logger.info(f"Start insert data into table {table_name}")
    list_field_name = [f'{field["field"]}' for field in data_field] 
    result_field_name = ','.join(list_field_name)

    data_insert = [f"({item['id']},'{item['story_link']}','{item['poster_image_link']}','{item['story_name']}')" for idx, item in enumerate(data_stories)]

    result_insert = ','.join(data_insert)

    sql_insert = f"""
        INSERT INTO {table_name} ({result_field_name})
        VALUES {result_insert};
    """
    print(sql_insert)
    cursor = connection.cursor()
    cursor.execute(sql_insert)

insert_database(connection, table_name, total_stories)

INFO:root:Start insert data into table comic_web_data

        INSERT INTO comic_web_data (id,story_link,poster_image_link,story_name)
        VALUES (1,'https://truyenmoi.org/truyen-bach-luyen-thanh-than','https://truyenmoi.org/images/thumb129/bach-luyen-thanh-than-1616536478.jpg','Bách Luyện Thành Thần'),(2,'https://truyenmoi.org/vu-luyen-dien-phong','https://truyenmoi.org/images/thumb129/vu-luyen-dien-phong-1616873103.jpg','Võ Luyện Đỉnh Phong'),(3,'https://truyenmoi.org/vo-than-chua-te','https://truyenmoi.org/images/thumb129/vo-than-chua-te-1616940080.jpg','Võ Thần Chúa Tể'),(4,'https://truyenmoi.org/vo-thuong-than-de','https://truyenmoi.org/images/thumb129/vo-thuong-than-de-1617174241.jpg','Vô Thượng Thần Đế'),(5,'https://truyenmoi.org/phong-than-chau','https://truyenmoi.org/images/thumb129/phong-than-chau-1626670616.jpg','Phong Thần Châu'),(6,'https://truyenmoi.org/cung-chieu-co-vo-quan-nhan-100058','https://truyenmoi.org/images/thumb129/cung-chieu-co-vo-quan-nhan-1620470550.jpg'