In [4]:
import os
import csv
import re
import pandas as pd
import chardet
from datetime import datetime
from bs4 import BeautifulSoup

# Detect input file
def htmlInput(file_path):
    with open(file_path, "r", encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, "html.parser")
    return soup

#Extract title
def extract_title(soup):
    title = soup.title.string.strip() if soup.title else None
    return title

#Extract body
def extract_body(soup):
    article_divs = soup.find_all('div', class_="detail")
    articles = []

    if article_divs:
        for article_div in article_divs:
            content_details = article_div.find_all('div', class_='content-detail')
            for content_detail in content_details:
                p_tags = content_detail.find_all('p')
                if not p_tags:
                    parts = content_detail.stripped_strings
                    for part in parts:
                        articles.append(part)
                else:
                    for p_tag in p_tags:
                        articles.append(p_tag.get_text(strip=True))

    articles = [s.replace('\x8d', ' ') for s in articles]
    articles = [item for item in articles if item != '']

    # Remove the caption text
    articles = [re.sub(r'\[caption.*?\[/caption\]', '', s) for s in articles]

    return articles

#Extract tags
def extract_tags(soup):
    tag_div = soup.find('div', class_='cat--title')
    tags = []

    if tag_div:
        tag_content = tag_div.get_text(strip=True)
    else: 
        tag_content = None
    return tag_content
def extract_pubdate(soup):
    month_list = {
        'มกราคม': '01', 
        'กุมภาพันธ์': '02', 
        'มีนาคม': '03', 
        'เมษายน': '04', 
        'พฤษภาคม': '05', 
        'มิถุนายน': '06', 
        'กรกฎาคม': '07', 
        'สิงหาคม': '08', 
        'กันยายน': '09', 
        'ตุลาคม': '10', 
        'พฤศจิกายน': '11', 
        'ธันวาคม': '12'
    }

    date_time_div = soup.find('div', class_='date')
    pubdate = None

    if date_time_div is not None:
        date_time_string = date_time_div.get_text(strip=True)

        # Split the string into words
        words = date_time_string.split()

        # Identify the components
        day = words[0]
        month = month_list.get(words[1])  # Get the numeric month
        year = words[2]
        if year.isdigit():
            year = str(int(year) - 543)

        # Form the date and time in the desired format
        pubdate = f"{year}-{month}-{day} "

    return pubdate

#Extract title
def extract_intro(soup):
    intro_div = soup.find("h2", class_="content-blurb")
    intro = ""
    if intro_div:
        intro = intro_div.get_text(strip=True)
    else:
        intro = None

    return intro

def extract_urlPic(soup):
    img_div = soup.find("div", class_="td-post-featured-image")

    if img_div is not None:
        img_tag = img_div.find("img", class_="entry-thumb")

        if img_tag is not None:
            image_link = img_tag.get("data-cfsrc")
        else:
            image_link = None
    else:
        image_link = None

    return image_link



#start using "csv_progress" (current progress of parsing)
if os.path.exists('parsing_progress.txt'):
    with open('parsing_progress.txt','r') as f:
        progress_start = int(f.readline().strip())
        start_id = progress_start - 1
else: 
    start_id = 1 
    
article_id = start_id


process = 'E:/Crimson_News/src/CrawlingCode/thansettakij/progress.txt'

#start using "progress" (last progress of crwaling)
if os.path.exists(process):
    with open(process,'r') as f:
        progress_end = int(f.readline().strip())
        end_id = progress_end + 1

        
for article_id in range(start_id, end_id+1):
    file_path = "E:/Crimson_News/DataSet/thansettakij/article/" + str(article_id) + "/index.txt"
    
    if os.path.exists(file_path):
        try:
        
            soup = htmlInput(file_path)
            title = extract_title(soup)
            intro = extract_intro(soup)
            articles = extract_body(soup)
            pubdate = extract_pubdate(soup)
            tag_content = extract_tags(soup)
            url = extract_urlPic(soup)

            if title == intro:
                intro = None
            if articles and intro == articles[0]:
                articles.pop(0)
            if title == articles[0]:
                articles.pop(0)


            data_dict = {'Title': title, 'Intro': intro, 'Article': articles, 'DateTime': pubdate, 'Tags': tag_content, 'url_picture': url}

            with open(os.path.join("E:/Crimson_News/DataSet/thansettakij/article/" + str(article_id) +"/parsing.txt"), 'w', encoding="utf-8") as f:
                for key, value in data_dict.items():
                    if value is not None:
                        if key == 'Tags' and isinstance(value, list):
                            f.write(f"[::{key}::]\n")
                            for tags in value:
                                f.write(f"{tags}\n")

                        elif key == 'Article' and isinstance(value, list):
                            f.write(f"[::{key}::]\n")
                            for bodys in value:
                                if bodys == 'SPONSORED':
                                    continue
                                else:
                                    f.write(f"{bodys}\n")
                                    f.write("\n")
                        else:
                            f.write(f"[::{key}::]\n{value}\n")
                            
                            
            with open('parsing_progress.txt', 'w') as f:
                f.write(f'{article_id}')

            with open('parsing_number.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                validation = 'Valid'
                writer.writerow([article_id, validation])

                    
        except Exception as e:
            print(f"Error processing article {article_id}: {e}")
            with open('parsing_number.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                validation = 'Error'
                writer.writerow([article_id, validation])
                continue


# Write the last processed article_id to 'csv_progress.txt' after loop ends
with open('parsing_progress.txt', 'w') as f:
    f.write(str(article_id) + '\n')

Error processing article 231565: list index out of range
Error processing article 234511: list index out of range
Error processing article 235800: list index out of range
Error processing article 239626: list index out of range
Error processing article 253039: list index out of range
Error processing article 259503: list index out of range
Error processing article 266125: list index out of range
Error processing article 289439: list index out of range
Error processing article 294026: list index out of range
Error processing article 294029: list index out of range
Error processing article 294030: list index out of range
Error processing article 294031: list index out of range
Error processing article 305421: list index out of range
Error processing article 305425: list index out of range
Error processing article 305426: list index out of range
Error processing article 306002: list index out of range
Error processing article 306725: list index out of range
Error processing article 306727

Error processing article 367442: list index out of range
Error processing article 367746: list index out of range
Error processing article 372740: list index out of range
Error processing article 372741: list index out of range
Error processing article 373054: list index out of range
Error processing article 373055: list index out of range
Error processing article 373056: list index out of range
Error processing article 373085: list index out of range
Error processing article 373578: list index out of range
Error processing article 373579: list index out of range
Error processing article 373711: list index out of range
Error processing article 373714: list index out of range
Error processing article 373832: list index out of range
Error processing article 373834: list index out of range
Error processing article 374022: list index out of range
Error processing article 374054: list index out of range
Error processing article 375692: list index out of range
Error processing article 375741

Error processing article 472163: list index out of range
Error processing article 472165: list index out of range
Error processing article 472166: list index out of range
Error processing article 472167: list index out of range
Error processing article 472168: list index out of range
Error processing article 472306: list index out of range
Error processing article 472308: list index out of range
Error processing article 472309: list index out of range
Error processing article 472310: list index out of range
Error processing article 472313: list index out of range
Error processing article 472321: list index out of range
Error processing article 472756: list index out of range
Error processing article 472757: list index out of range
Error processing article 472765: list index out of range
Error processing article 472767: list index out of range
Error processing article 472768: list index out of range
Error processing article 473044: list index out of range
Error processing article 473045

Error processing article 484396: list index out of range
Error processing article 486055: list index out of range
Error processing article 487022: 'charmap' codec can't decode byte 0x81 in position 2441: character maps to <undefined>
Error processing article 488300: 'charmap' codec can't decode byte 0x9e in position 2454: character maps to <undefined>
Error processing article 490902: 'charmap' codec can't decode byte 0x9d in position 2462: character maps to <undefined>
Error processing article 491168: 'charmap' codec can't decode byte 0x81 in position 2533: character maps to <undefined>
Error processing article 492025: 'charmap' codec can't decode byte 0x81 in position 2444: character maps to <undefined>
Error processing article 493788: 'charmap' codec can't decode byte 0x9d in position 2465: character maps to <undefined>
Error processing article 495593: 'charmap' codec can't decode byte 0x9e in position 2496: character maps to <undefined>
Error processing article 495677: 'charmap' cod

Error processing article 533136: 'charmap' codec can't decode byte 0x9d in position 2463: character maps to <undefined>
Error processing article 533156: 'charmap' codec can't decode byte 0x81 in position 2447: character maps to <undefined>
Error processing article 533164: 'charmap' codec can't decode byte 0x9e in position 2444: character maps to <undefined>
Error processing article 533234: 'charmap' codec can't decode byte 0x81 in position 2438: character maps to <undefined>
Error processing article 533953: 'charmap' codec can't decode byte 0x9e in position 2480: character maps to <undefined>
Error processing article 534009: list index out of range
Error processing article 534140: 'charmap' codec can't decode byte 0x81 in position 2457: character maps to <undefined>
Error processing article 535217: 'charmap' codec can't decode byte 0x81 in position 2438: character maps to <undefined>
Error processing article 535268: 'charmap' codec can't decode byte 0x9e in position 2496: character map

Error processing article 569952: [Errno 28] No space left on device
Error processing article 569953: [Errno 28] No space left on device
Error processing article 569954: [Errno 28] No space left on device
Error processing article 569955: [Errno 28] No space left on device
Error processing article 569956: [Errno 28] No space left on device
Error processing article 569957: [Errno 28] No space left on device
Error processing article 569958: [Errno 28] No space left on device
Error processing article 569959: [Errno 28] No space left on device
Error processing article 569960: [Errno 28] No space left on device
Error processing article 569961: [Errno 28] No space left on device
Error processing article 569962: [Errno 28] No space left on device
Error processing article 569963: [Errno 28] No space left on device
Error processing article 569964: [Errno 28] No space left on device
Error processing article 569965: [Errno 28] No space left on device
Error processing article 569966: [Errno 28] No s

Error processing article 570075: [Errno 28] No space left on device
Error processing article 570076: [Errno 28] No space left on device
Error processing article 570077: [Errno 28] No space left on device
Error processing article 570078: [Errno 28] No space left on device
Error processing article 570079: [Errno 28] No space left on device
Error processing article 570080: [Errno 28] No space left on device
Error processing article 570081: [Errno 28] No space left on device
Error processing article 570082: [Errno 28] No space left on device
Error processing article 570083: [Errno 28] No space left on device
Error processing article 570084: [Errno 28] No space left on device
Error processing article 570085: [Errno 28] No space left on device
Error processing article 570086: [Errno 28] No space left on device
Error processing article 570087: [Errno 28] No space left on device
Error processing article 570088: [Errno 28] No space left on device
Error processing article 570089: [Errno 28] No s

OSError: [Errno 28] No space left on device