In [1]:
import os
import csv
import re
import pandas as pd
import chardet
from datetime import datetime
from bs4 import BeautifulSoup


# Read an HTML file using its detected encoding and parse it
def htmlInput(file_path):
    """Load an HTML file from disk and parse it with BeautifulSoup.

    The file is read once as bytes, its encoding is guessed with chardet,
    and the decoded text is parsed with the built-in ``html.parser``.

    Args:
        file_path: Path of the HTML file to load.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes cannot be decoded with the
            detected (or fallback) encoding.
    """
    with open(file_path, "rb") as file:
        raw_bytes = file.read()

    # chardet reports an encoding of None when it cannot make a guess (for
    # example on an empty file). Fall back to UTF-8 explicitly instead of
    # letting the platform's locale encoding be picked implicitly.
    encoding = chardet.detect(raw_bytes)["encoding"] or "utf-8"

    # Decode the bytes already in memory rather than re-reading the file.
    # Newlines are normalised the same way text-mode reading would.
    html_content = raw_bytes.decode(encoding)
    html_content = html_content.replace("\r\n", "\n").replace("\r", "\n")

    return BeautifulSoup(html_content, "html.parser")

#Extract title
def extract_title(soup):
    """Return the stripped text of the document's ``<title>`` tag.

    Args:
        soup: Parsed document exposing a ``title`` attribute.

    Returns:
        The title text with surrounding whitespace removed, or ``None``
        when there is no ``<title>`` tag or the tag has no single string
        (its ``.string`` is ``None``).
    """
    title_tag = soup.title
    # ``.string`` is None for an empty <title> or one with nested markup;
    # calling .strip() on it would raise AttributeError.
    if not title_tag or title_tag.string is None:
        return None
    return title_tag.string.strip()

#Extract body
def extract_body(soup):
    """Collect the paragraph texts of every ``articleBody`` div.

    Args:
        soup: Parsed document supporting ``find_all``.

    Returns:
        ``None`` when the document has no ``div`` with
        ``itemprop="articleBody"``. Otherwise a list with the stripped text
        of each ``<p>`` inside those divs, in document order, with
        non-breaking spaces turned into plain spaces and empty strings
        dropped (the list may be empty).
    """
    containers = soup.find_all("div", itemprop="articleBody")
    if not containers:
        return None

    # Lazily clean each paragraph; empty results are filtered out below.
    cleaned = (
        paragraph.get_text(strip=True).replace('\xa0', ' ')
        for container in containers
        for paragraph in container.find_all('p')
    )
    return [text for text in cleaned if text != '']

#Extract tags
def extract_tags(soup):
    """Collect the article's tags from the category header and tag footer.

    The category text is read from the ``div`` with class
    ``ud-post-category-title`` and the footer tags from every ``<a>`` inside
    the ``div`` with class ``td-post-source-tags``.

    Args:
        soup: Parsed document supporting ``find``.

    Returns:
        * a list of the footer tags followed by the category, when both
          footer tags and a non-empty category exist;
        * a list of only the footer tags, when the category is missing or
          empty;
        * the category string alone (possibly empty), when there are no
          footer tags;
        * ``None`` when there are no footer tags and no category div.
    """
    # NOTE(review): the sibling selector uses a 'td-' prefix; confirm that
    # 'ud-post-category-title' is the real class name and not a typo.
    top_div = soup.find('div', class_='ud-post-category-title')
    bott_div = soup.find('div', class_='td-post-source-tags')

    # The category div is optional; without this guard a page lacking it
    # raised AttributeError on None.
    top_tag = top_div.get_text(strip=True) if top_div is not None else None

    bott_tag = []
    if bott_div:
        bott_tag = [tag_link.get_text(strip=True) for tag_link in bott_div.find_all('a')]

    if not bott_tag:
        return top_tag
    if top_tag:
        return bott_tag + [top_tag]
    # Footer tags without a usable category: previously no branch assigned a
    # result here, which raised UnboundLocalError.
    return bott_tag

#Extract date
def extract_pubdate(soup):
    """Return the article's publication time as ``DD/MM/YYYY HH:MM:SS``.

    The value is read from the ``datetime`` attribute of the first
    ``<time class="entry-date">`` tag and must look like
    ``2023-08-15T10:30:00+07:00``. The clock time is reformatted as written;
    the UTC offset is parsed but not included in the output.

    Args:
        soup: Parsed document supporting ``find``.

    Returns:
        The formatted timestamp, or ``None`` when the tag is missing or has
        no (or an empty) ``datetime`` attribute.

    Raises:
        ValueError: If the attribute is present but does not match the
            expected format.
    """
    time_tag = soup.find('time', {'class': 'entry-date'})
    # Guard the tag itself: calling .get() on a missing tag raised
    # AttributeError before the None check could ever run.
    if time_tag is None:
        return None

    date_time = time_tag.get('datetime')
    if not date_time:
        return None

    date_object = datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S%z")
    return date_object.strftime("%d/%m/%Y %H:%M:%S")


#Extract intro
def extract_intro(soup):
    """Return the article's intro text, or ``None`` if none can be found.

    The first ``div`` with ``itemprop="articleBody"`` that contains at least
    one ``<p>`` decides the result: the text of the first ``<strong>`` found
    in its paragraphs, or, when no paragraph has a ``<strong>``, the text of
    its first paragraph.

    Args:
        soup: Parsed document supporting ``find_all``.

    Returns:
        The stripped intro text, or ``None`` when there is no article body
        div containing a paragraph.
    """
    for intro_div in soup.find_all("div", itemprop="articleBody"):
        intro_paragraphs = intro_div.find_all('p')
        if not intro_paragraphs:
            # Nothing usable here; indexing [0] would raise IndexError.
            continue

        for intro_paragraph in intro_paragraphs:
            intro_strong = intro_paragraph.find('strong')
            if intro_strong:
                # Return immediately so a later div cannot overwrite the
                # intro that was already found.
                return intro_strong.get_text(strip=True)

        # No emphasised lead in this div: fall back to its first paragraph.
        return intro_paragraphs[0].get_text(strip=True)

    return None

def extract_urlPic(soup):
    """Return the featured image's ``data-cfsrc`` attribute, if any.

    Looks for an ``img`` with class ``entry-thumb`` inside the ``div`` with
    class ``td-post-featured-image``.

    Args:
        soup: Parsed document supporting ``find``.

    Returns:
        The value of the image's ``data-cfsrc`` attribute, or ``None`` when
        the div, the image, or the attribute is absent.
    """
    featured = soup.find("div", class_="td-post-featured-image")
    if featured is None:
        return None

    thumb = featured.find("img", class_="entry-thumb")
    return None if thumb is None else thumb.get("data-cfsrc")


# Resume point for parsing: 'parsing_progress.txt' holds the article id
# written by a previous run. Start one id before it, or at 1 when no
# checkpoint file exists yet.
if not os.path.exists('parsing_progress.txt'):
    start_id = 1
else:
    with open('parsing_progress.txt', 'r') as f:
        progress_start = int(f.readline().strip())
    start_id = progress_start - 1

article_id = start_id


# Crawler checkpoint file: its first line holds the id of the last crawled
# article.
process = 'E:/Crimson_News/src/CrawlingCode/prachachat/progress.txt'

if not os.path.exists(process):
    # Without the crawler checkpoint there is no upper bound for parsing.
    # Fail here with a clear message instead of a NameError on `end_id`
    # later (or, in a notebook, silently reusing a stale `end_id`).
    raise FileNotFoundError(f"Crawling progress file not found: {process}")

with open(process, 'r') as f:
    progress_end = int(f.readline().strip())

# The upper bound is one past the crawler's last recorded id.
end_id = progress_end + 1

# Parse every crawled article in the id range. Each article is handled
# independently: a failure is printed and logged as 'Error' in
# 'parsing_number.csv', then the loop moves on to the next id.
for article_id in range(start_id, end_id+1):
    article_dir = "E:/Crimson_News/DataSet/prachachat/article/" + str(article_id)
    file_path = article_dir + "/index.txt"

    # Ids without a crawled page are skipped without being logged.
    if not os.path.exists(file_path):
        continue

    try:
        soup = htmlInput(file_path)
        title = extract_title(soup)
        intro = extract_intro(soup)
        articles = extract_body(soup)
        pubdate = extract_pubdate(soup)
        tag_content = extract_tags(soup)
        url = extract_urlPic(soup)

        # Remove text that would otherwise be stored twice.
        if title == intro:
            intro = None
        if articles and intro == articles[0]:
            articles.pop(0)
        # `articles` may be None, or emptied by the pop above, so it must be
        # re-checked before indexing; the unguarded access raised and turned
        # body-less articles into 'Error' rows.
        if articles and title == articles[0]:
            articles.pop(0)

        data_dict = {'Title': title, 'Intro': intro, 'Article': articles, 'DateTime': pubdate, 'Tags': tag_content, 'url_picture': url}

        # Output format: a "[::Key::]" header line followed by the value.
        # Fields whose value is None are omitted entirely.
        with open(article_dir + "/parsing.txt", 'w', encoding="utf-8") as f:
            for key, value in data_dict.items():
                if value is None:
                    continue

                if key == 'Tags' and isinstance(value, list):
                    # One tag per line.
                    f.write(f"[::{key}::]\n")
                    for tags in value:
                        f.write(f"{tags}\n")
                elif key == 'Article' and isinstance(value, list):
                    # One paragraph per line, each followed by a blank line;
                    # 'SPONSORED' marker paragraphs are dropped.
                    f.write(f"[::{key}::]\n")
                    for bodys in value:
                        if bodys == 'SPONSORED':
                            continue
                        f.write(f"{bodys}\n")
                        f.write("\n")
                else:
                    f.write(f"[::{key}::]\n{value}\n")

        # Checkpoint the last successfully parsed id.
        with open('parsing_progress.txt', 'w') as f:
            f.write(f'{article_id}')

        with open('parsing_number.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            validation = 'Valid'
            writer.writerow([article_id, validation])

    except Exception as e:
        # Per-article boundary: report, record the failure, keep going.
        print(f"Error processing article {article_id}: {e}")
        with open('parsing_number.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            validation = 'Error'
            writer.writerow([article_id, validation])


# After the loop ends, store the loop's final article_id (plus a trailing
# newline) in 'parsing_progress.txt'.
with open('parsing_progress.txt', 'w') as f:
    f.write(f"{article_id}\n")

Error processing article 13: module 'datetime' has no attribute 'strptime'
Error processing article 20: module 'datetime' has no attribute 'strptime'
Error processing article 21: module 'datetime' has no attribute 'strptime'
Error processing article 22: module 'datetime' has no attribute 'strptime'
Error processing article 25: module 'datetime' has no attribute 'strptime'
Error processing article 32: module 'datetime' has no attribute 'strptime'
Error processing article 34: module 'datetime' has no attribute 'strptime'
Error processing article 35: module 'datetime' has no attribute 'strptime'
Error processing article 41: module 'datetime' has no attribute 'strptime'
Error processing article 42: module 'datetime' has no attribute 'strptime'
Error processing article 48: module 'datetime' has no attribute 'strptime'
Error processing article 50: module 'datetime' has no attribute 'strptime'
Error processing article 51: module 'datetime' has no attribute 'strptime'
Error processing article 

Traceback (most recent call last):
  File "C:\Users\sp2023-stock\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3505, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\sp2023-stock\AppData\Local\Temp\ipykernel_8532\731614023.py", line 136, in <module>
    soup = htmlInput(file_path)
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sp2023-stock\AppData\Local\Temp\ipykernel_8532\731614023.py", line 16, in htmlInput
    soup = BeautifulSoup(html_content, "html.parser")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sp2023-stock\anaconda3\Lib\site-packages\bs4\__init__.py", line 335, in __init__
    self._feed()
  File "C:\Users\sp2023-stock\anaconda3\Lib\site-packages\bs4\__init__.py", line 478, in _feed
    self.builder.feed(self.markup)
  File "C:\Users\sp2023-stock\anaconda3\Lib\site-packages\bs4\builder\_htmlparser.py", line 380, in feed
    parser.feed(markup)
  File "C:\Users\sp2023-stock\anaconda3\Lib\htm