In [2]:
import os
import re
import pandas as pd
import chardet
from bs4 import BeautifulSoup

# Detect input file
def htmlInput(file_path):
    """Read an HTML file using a detected character encoding and parse it.

    Args:
        file_path: Path of the HTML file to load.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    """
    with open(file_path, "rb") as file:
        encoding_result = chardet.detect(file.read())

    # chardet reports ``None`` when it cannot guess (for example on an empty
    # file); fall back to UTF-8 instead of the platform's default encoding.
    encoding = encoding_result["encoding"] or "utf-8"

    # errors="replace": a wrong guess or a stray byte becomes U+FFFD instead
    # of raising UnicodeDecodeError and stopping the whole run.
    with open(file_path, "r", encoding=encoding, errors="replace") as file:
        html_content = file.read()

    return BeautifulSoup(html_content, "html.parser")

#Extract title
def extract_title(soup):
    """Return the document's ``<title>`` text with surrounding whitespace removed.

    Args:
        soup: Parsed document exposing a ``title`` attribute (a BeautifulSoup
            object in this script).

    Returns:
        The stripped title text, or ``None`` when the document has no
        ``<title>`` element or that element has no single text string.
    """
    title_tag = soup.title
    if not title_tag:
        return None

    # ``.string`` is None for an empty <title> or one wrapping several child
    # nodes; calling .strip() on it would raise AttributeError.
    title_text = title_tag.string
    if title_text is None:
        return None

    return title_text.strip()

#Extract body
def extract_body(soup):
    """Collect the text of every ``<p>`` inside the article-body containers.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list with the whitespace-stripped text of each paragraph found in
        the ``<div itemprop="articleBody">`` elements, in document order.
        The list is empty when no such container exists.
    """
    return [
        paragraph.get_text(strip=True)
        for container in soup.find_all("div", itemprop="articleBody")
        for paragraph in container.find_all('p')
    ]

#Extract tags
def extract_tags(soup):
    """Return the link texts found in the article's breadcrumb element.

    Args:
        soup: Parsed document exposing ``find``.

    Returns:
        A list with the whitespace-stripped text of each ``<a>`` inside the
        breadcrumb ``<div>``, or ``None`` when that ``<div>`` is not found.
    """
    breadcrumb = soup.find("div", class_="__item_article-breadcrumb css-7obyzk e1wlf1s66")
    if not breadcrumb:
        return None
    return [link.get_text(strip=True) for link in breadcrumb.find_all('a')]

#Extract date
def extract_pubdate(soup):
    """Return the article's publication timestamp as ``YYYY-MM-DD HH:MM``.

    The text of the date element is split on whitespace and read by position:
    token 0 is the day, token 2 is the year and token 3 (optional) is the
    time. The month is recognised by its Thai abbreviation anywhere in the
    text. 543 is subtracted from the year (Buddhist Era to Gregorian offset).

    Args:
        soup: Parsed document exposing ``find``.

    Returns:
        The formatted timestamp, the date alone when no time token is
        present, or ``None`` when the date element is missing or its text
        cannot be parsed.
    """
    month_list = {'ม.ค.': '01', 'ก.พ.': '02', 'มี.ค.': '03', 'เม.ย.': '04', 'พ.ค.': '05', 'มิ.ย.': '06', 'ก.ค.': '07', 'ส.ค.': '08', 'ก.ย.': '09', 'ต.ค.': '10', 'พ.ย.': '11', 'ธ.ค.': '12'}
    date_time_div = soup.find('div', {'class': '__item_article-date css-1v3en5e e1wlf1s65'})
    if date_time_div is None:
        return None

    tokens = date_time_div.get_text(strip=True).split()
    if len(tokens) < 3:
        # Not enough pieces for day / month / year.
        return None

    normalized = " ".join(tokens)
    numMonth = next(
        (number for abbreviation, number in month_list.items() if abbreviation in normalized),
        None,
    )
    if numMonth is None:
        # Unknown month: report "no date" instead of a malformed string.
        return None

    try:
        day = int(tokens[0])
        year = int(tokens[2]) - 543
    except ValueError:
        # Day or year is not numeric, so the text is not in the expected layout.
        return None

    # Zero-pad the day so it matches the two-digit month.
    datestamp = f"{year}-{numMonth}-{day:02d}"
    timestamp = tokens[3] if len(tokens) > 3 else ""
    return f"{datestamp} {timestamp}".rstrip()

#Extract intro
def extract_intro(soup):
    """Return the text of the article's intro element.

    Args:
        soup: Parsed document exposing ``find``.

    Returns:
        The whitespace-stripped text of the intro ``<div>``, or ``None`` when
        that ``<div>`` is not found.
    """
    intro_div = soup.find("div", class_="css-1wn93q2 evs3ejl67")
    return intro_div.get_text(strip=True) if intro_div else None
def extract_urlPic(soup):
    """Placeholder for picture-URL extraction; currently always returns ``None``.

    Args:
        soup: Parsed document (unused).

    Returns:
        ``None``.
    """
    return None

# Resume point: "csv_progress.txt" holds the article ID written at the end of
# the previous parsing run.
if os.path.exists('csv_progress.txt'):
    with open('csv_progress.txt', 'r') as f:
        progress_start = int(f.readline().strip())
        # Restart one ID before the recorded value.
        start_id = progress_start - 1
else:
    start_id = 1

article_id = start_id

article_root = '/Users/macintoshhd/Documents/sp2023/DataSet/thairath/article/'
process = '/Users/macintoshhd/Documents/sp2023/DataSet/thairath/progress.txt'

# Upper bound: "progress.txt" holds the last progress of crawling.
if os.path.exists(process):
    with open(process, 'r') as f:
        progress_end = int(f.readline().strip())
        end_id = progress_end + 1
else:
    # Without the crawler's progress file there is no upper bound. Use an
    # empty ID range instead of failing with NameError in the loop below.
    end_id = start_id - 1
    print("Crawl progress file not found: " + process)

for article_id in range(start_id, end_id + 1):
    article_dir = article_root + str(article_id)
    file_path = article_dir + "/index.txt"

    # IDs without a downloaded page are skipped silently.
    if not os.path.exists(file_path):
        continue

    print("Index TXT Processing ID: " + str(article_id))

    soup = htmlInput(file_path)
    title = extract_title(soup)
    intro = extract_intro(soup)
    articles = extract_body(soup)
    pubdate = extract_pubdate(soup)
    tag_content = extract_tags(soup)
    url = extract_urlPic(soup)

    data_dict = {'Title': title, 'Intro': intro, 'Article': articles, 'DateTime': pubdate, 'Tags': tag_content, 'url_picture': url}

    # Each field is written as a "[::Name::]" header followed by its value;
    # fields whose value is None are left out of the file.
    with open(article_dir + "/parsed.txt", 'w', encoding="utf-8") as f:
        for key, value in data_dict.items():
            if value is None:
                continue

            if key == 'Tags' and isinstance(value, list):
                # One tag per line.
                f.write(f"[::{key}::]\n")
                for tags in value:
                    f.write(f"{tags}\n")

            elif key == 'Article' and isinstance(value, list):
                # One paragraph per line followed by a blank line;
                # paragraphs that are exactly 'SPONSORED' are dropped.
                f.write(f"[::{key}::]\n")
                for bodys in value:
                    if bodys == 'SPONSORED':
                        continue
                    f.write(f"{bodys}\n")
                    f.write("\n")
            else:
                f.write(f"[::{key}::]\n{value}\n")


# Write the last visited article_id to 'csv_progress.txt' after the loop ends.
# Skip the write when the ID range was empty: rewriting start_id would move
# the stored resume point back by one on every such run.
if start_id <= end_id:
    with open('csv_progress.txt', 'w') as f:
        f.write(str(article_id) + '\n')

Index TXT Processing ID: 2346991
Index TXT Processing ID: 2346992
Index TXT Processing ID: 2346994
Index TXT Processing ID: 2346995
