In [32]:
from bs4 import BeautifulSoup

def parse_html(html):
    """Extract the fields of one property listing from its HTML page.

    Args:
        html: Page markup; handed to ``BeautifulSoup`` with the
            ``'html.parser'`` backend.

    Returns:
        dict: Keys ``title``, ``price``, ``price_per_m2``, ``address``,
        ``area``, ``bedrooms``, ``detail_info``, ``frontage``,
        ``road_access``, ``house_direction``, ``balcony_direction``,
        ``num_floors``, ``num_bedrooms``, ``num_toilets``, ``legal_status``,
        ``interior``, ``img_data``, ``contact_name`` and ``zalo_url``.
        ``img_data`` is a list of ``data-src`` attribute values; every other
        value is a string, and is ``''`` when the element it comes from is
        not found in the page.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def get_text(tag, default=''):
        """Return the stripped text of ``tag``, or ``default`` if ``tag`` is falsy."""
        return tag.get_text(strip=True) if tag else default

    def find_labelled_value(item_class, label_class, value_class, label):
        """Return the value text of the first item whose label text equals ``label``.

        An item whose label matches but which has no value span is skipped,
        so a later item with the same label can still supply the value.
        """
        for item in soup.find_all('div', class_=item_class):
            label_span = item.find('span', class_=label_class)
            if not label_span or label_span.get_text(strip=True) != label:
                continue
            value_span = item.find('span', class_=value_class)
            if value_span:
                return value_span.get_text(strip=True)
        return ''

    def find_value_for_title(title):
        """Look ``title`` up among the short-info items."""
        return find_labelled_value(
            're__pr-short-info-item js__pr-short-info-item', 'title', 'value', title
        )

    def find_value_for_specs_title(title):
        """Look ``title`` up among the specs-content items."""
        return find_labelled_value(
            're__pr-specs-content-item',
            're__pr-specs-content-item-title',
            're__pr-specs-content-item-value',
            title,
        )

    title_text = get_text(soup.find('h1', class_='re__pr-title pr-title js__pr-title'))

    # The price is read from the first short-info item on the page.
    price_tag = soup.find('div', class_='re__pr-short-info-item js__pr-short-info-item')
    price_value = get_text(price_tag.find('span', class_='value')) if price_tag else ''

    # The per-m2 figure is the 'ext' span that follows the 'Mức giá' title span.
    price_per_m2_tag = price_tag.find('span', class_='title', string='Mức giá') if price_tag else None
    price_per_m2 = get_text(price_per_m2_tag.find_next_sibling('span', class_='ext')) if price_per_m2_tag else ''

    address = get_text(soup.find('span', class_='re__pr-short-description js__pr-address'))

    area_value = find_value_for_title('Diện tích')
    bedrooms_value = find_value_for_title('Phòng ngủ')

    detail_info = get_text(
        soup.find('div', class_='re__section-body re__detail-content js__section-body js__pr-description js__tracking')
    )

    # Only images that carry a data-src attribute are collected.
    img_data_srcs = [img['data-src'] for img in soup.select('.slick-track img') if img.has_attr('data-src')]

    contact_name = get_text(soup.find('div', class_='re__contact-name js_contact-name'))

    zalo_tag = soup.find('a', class_='re__btn re__btn-se-border--md re__btn-icon-left--md js__zalo-chat js__zalo-chat-qr')
    # .get() instead of indexing: an anchor without data-href must yield ''
    # like every other missing field, not raise KeyError.
    zalo_url = zalo_tag.get('data-href', '') if zalo_tag else ''

    return {
        'title': title_text,
        'price': price_value,
        'price_per_m2': price_per_m2,
        'address': address,
        'area': area_value,
        'bedrooms': bedrooms_value,
        'detail_info': detail_info,
        'frontage': find_value_for_specs_title('Mặt tiền'),
        'road_access': find_value_for_specs_title('Đường vào'),
        'house_direction': find_value_for_specs_title('Hướng nhà'),
        'balcony_direction': find_value_for_specs_title('Hướng ban công'),
        'num_floors': find_value_for_specs_title('Số tầng'),
        'num_bedrooms': find_value_for_specs_title('Số phòng ngủ'),
        'num_toilets': find_value_for_specs_title('Số toilet'),
        'legal_status': find_value_for_specs_title('Pháp lý'),
        'interior': find_value_for_specs_title('Nội thất'),
        'img_data': img_data_srcs,
        'contact_name': contact_name,
        'zalo_url': zalo_url,
    }

# NOTE(review): this rebinds the built-in name `list` for the rest of the kernel
# session, and `html_content` is not defined in the cells shown here — confirm an
# earlier cell sets it, otherwise this line fails on Restart & Run All.
list = parse_html(html_content)


In [33]:
import csv

# Hàm xuất dữ liệu ra file CSV
def export_to_csv(data, filename):
    """Append one record to a CSV file, writing the header row when needed.

    Args:
        data: Mapping of column name to value. If it has an ``'img_data'``
            entry that is a non-string iterable of strings, that entry is
            written as a single ``', '``-joined string. The mapping itself
            is not modified.
        filename: Path of the CSV file. It is opened in append mode with
            UTF-8 encoding and created if it does not exist.
    """
    # Imported here because the notebook only imports `os` in a later cell.
    import os

    # Work on a copy: joining in place would corrupt the caller's list, and a
    # second call with the same dict would then join the string's characters.
    row = dict(data)
    images = row.get('img_data')
    if images is not None and not isinstance(images, str):
        row['img_data'] = ', '.join(images)

    # An existing but empty file still needs a header row.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=list(row))
        if needs_header:
            writer.writeheader()
        writer.writerow(row)

In [36]:
import os
import glob
import logging
from datetime import datetime

# Logging setup: send INFO-and-above records to process_log.log, each line
# prefixed with its timestamp.
logging.basicConfig(
    level=logging.INFO,
    filename='process_log.log',
    format='%(asctime)s - %(message)s',
)

# Đọc tất cả các file .html trong thư mục và xử lý từng file
def read_and_process_html_files(directory, csv_filename):
    """Parse every ``*.html`` file in a directory and append each result to a CSV.

    Each file is read as UTF-8, passed to ``parse_html``, and the returned
    record is handed to ``export_to_csv``. Start, per-file and total timings
    are written through ``logging.info``.

    Args:
        directory: Directory searched (non-recursively) for ``*.html`` files.
        csv_filename: Path of the CSV file that receives one row per HTML file.
    """
    # Sorted so rows reach the CSV in a reproducible order; glob's own
    # ordering is arbitrary.
    html_files = sorted(glob.glob(os.path.join(directory, '*.html')))
    total_files = len(html_files)

    start_time = datetime.now()
    logging.info(f'Start processing {total_files} files.')

    for html_file in html_files:
        file_start_time = datetime.now()
        with open(html_file, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # The input handle is already closed; parsing and export only need the text.
        data = parse_html(html_content)
        export_to_csv(data, csv_filename)

        # total_seconds() gives a plain number, so the "seconds" unit in the
        # message is accurate (a raw timedelta renders as H:MM:SS.ffffff).
        file_seconds = (datetime.now() - file_start_time).total_seconds()
        logging.info(f'Processed file {html_file} in {file_seconds:.3f} seconds.')

    total_seconds = (datetime.now() - start_time).total_seconds()
    logging.info(f'Finished processing all files in {total_seconds:.3f} seconds.')


In [37]:
# Run the pipeline: parse every .html file under directory_path and append the
# resulting rows to csv_filename.
directory_path = 'data_crawled/1'
csv_filename = 'property_infor.csv'
read_and_process_html_files(directory_path, csv_filename)