# 先爬2020年表特版所有的文章

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import json
import random
import os
import re
from tqdm import tqdm

# 照片處理
from urllib.parse import urlparse
from PIL import Image
from io import BytesIO

In [None]:
# HTTP headers sent with the PTT page requests in this notebook.
# NOTE(review): the `over18=1` cookie presumably answers the board's
# age-confirmation prompt — confirm against the site.
HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'cookie': 'over18=1',
}

# Drop the output of any previous run so this run starts from an empty file.
if os.path.exists('2020_articles.jsonl'):
    os.remove('2020_articles.jsonl')

In [None]:
def extract_meta_value(soup, label):
    """Return the header value displayed next to ``label`` on an article page.

    The i-th ``span.article-meta-tag`` element is paired with the i-th
    ``span.article-meta-value`` element; the stripped text of the first value
    whose tag text equals ``label`` is returned.

    Fallback (only when ``label`` is '時間'): each ``span.f2`` element is
    searched for a ``MM/DD/YYYY HH:MM:SS`` timestamp, and the first one found
    is returned reformatted as ``'%a %b %d %H:%M:%S %Y'``.

    Returns ``None`` when nothing matches.
    """
    meta_pairs = zip(
        soup.select('span.article-meta-tag'),
        soup.select('span.article-meta-value'),
    )
    for name_span, value_span in meta_pairs:
        if name_span.text.strip() == label:
            return value_span.text.strip()

    # Only the time field has a fallback source.
    if label != '時間':
        return None

    stamp_pattern = r'(\d{2}/\d{2}/\d{4})\s+(\d{2}:\d{2}:\d{2})'
    for footer_text in [span.text.strip() for span in soup.select('span.f2')]:
        found = re.search(stamp_pattern, footer_text)
        if not found:
            continue
        date_part, time_part = found.groups()
        parsed = datetime.strptime(f'{date_part} {time_part}', '%m/%d/%Y %H:%M:%S')
        return parsed.strftime('%a %b %d %H:%M:%S %Y')

    return None


def _fetch_post_datetime(post_url):
    """Download one article page and return its posting time as a datetime.

    Returns ``None`` (after printing a note) when the page answers HTTP 404,
    when no time value can be extracted, or when the extracted string does
    not match ``'%a %b %d %H:%M:%S %Y'``. Any other HTTP error status is
    raised by ``raise_for_status``.
    """
    res_post = requests.get(post_url, headers=HEADERS, timeout=10)
    # The 404 check must come before raise_for_status, otherwise a single
    # deleted article would abort the whole crawl.
    if res_post.status_code == 404:
        print("該文章為 404，略過")
        return None
    res_post.raise_for_status()

    post_soup = BeautifulSoup(res_post.text, 'html.parser')
    post_time = extract_meta_value(post_soup, '時間')
    if not post_time:
        print("無法解析時間")
        return None
    try:
        return datetime.strptime(post_time, '%a %b %d %H:%M:%S %Y')
    except ValueError:
        # The header text is not guaranteed to follow the expected layout.
        print("無法解析時間")
        return None


def crawl_articles(year=2020, start_index=2329, end_index=2712, output_path=None):
    """Crawl PTT Beauty index pages and record every article posted in ``year``.

    Each index page from ``start_index`` to ``end_index`` (inclusive) is
    fetched, and every listed article is downloaded so its own timestamp
    decides which year it belongs to. Each kept article is appended to
    ``output_path`` as one JSON line with the keys ``date`` (MMDD),
    ``title`` and ``url``. The file is opened in append mode and never
    truncated here.

    Index ranges noted by the original author for other years:
    2025: 3911-4001, 2024: 3647-3916, 2023: 3367-3647, 2021: 2712-3060,
    2020: 2329-2712.

    Args:
        year: Calendar year to collect.
        start_index: First index page number to visit.
        end_index: Last index page number to visit (inclusive).
        output_path: Destination JSONL file; defaults to
            ``'<year>_articles.jsonl'``.

    Raises:
        requests.HTTPError: An index page, or an article page with an error
            status other than 404, could not be retrieved.
    """
    if output_path is None:
        output_path = f'{year}_articles.jsonl'

    year_started = False  # True once the first article of `year` was seen

    for index in range(start_index, end_index + 1):
        url = f'https://www.ptt.cc/bbs/Beauty/index{index}.html'
        print("\n")
        print(f'目前的列表: {url}')

        res = requests.get(url, headers=HEADERS, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        for entry in soup.select('div.r-ent'):
            link_tag = entry.select_one('a')
            if not link_tag:
                continue  # list entry without a link

            post_url = 'https://www.ptt.cc' + link_tag['href']
            title_text = link_tag.text.strip()
            date_tag = entry.select_one('div.date')
            post_date = date_tag.text.strip() if date_tag else ''
            print(f'抓到的文章：{post_url} | 標題：{title_text} | 列表時間：{post_date}')

            # Always read the timestamp from the article that is actually
            # being processed; a response left over from an earlier
            # iteration would stamp every article with the wrong date.
            dt = _fetch_post_datetime(post_url)
            if dt is None:
                continue

            if dt.year < year:
                print(f'跳過早於 {year} 年的文章')
                continue
            if dt.year > year:
                # Once the target year has begun, the first article of the
                # following year marks the end of the range of interest.
                if year_started and dt.year == year + 1:
                    print(f"抓到 {year + 1} 年文章，結束爬蟲")
                    return
                print(f'跳過 {dt.year} 年的文章')
                continue

            year_started = True
            mmdd = dt.strftime('%m%d')

            # Filtering
            if not title_text:
                print('略過標題為空白或空字串')
                continue
            if '[公告]' in title_text:  # also covers forwarded 'Fw:[公告]' titles
                print('略過公告文')
                continue

            article_data = {
                'date': mmdd,
                'title': title_text,
                'url': post_url,
            }

            # Append per article so an interrupted crawl keeps what it has.
            with open(output_path, 'a', encoding='utf-8') as fa:
                fa.write(json.dumps(article_data, ensure_ascii=False) + '\n')

            time.sleep(random.uniform(0.3, 0.5))

In [None]:
# Run the crawl with the defaults built into crawl_articles; matching
# articles are appended to 2020_articles.jsonl.
crawl_articles()

# 只抓取有"正妹"的文章

In [None]:
def extract_image_urls(text):
    """Return every http(s) URL in ``text`` that ends in an image extension.

    Recognised extensions are jpg, jpeg, png and gif, matched
    case-insensitively. A URL stops at whitespace or a double quote, and
    anything after the extension (such as a query string) is not included.
    Matches are returned in order of appearance, duplicates included.
    """
    image_link = re.compile(
        r'https?://[^\s"]+\.(?:jpg|jpeg|png|gif)(?=\b|$)',
        re.IGNORECASE,
    )
    return [hit.group(0) for hit in image_link.finditer(text)]

def _load_article_text(url, timeout=40):
    """Fetch one article and return ``(body, push_texts)``.

    ``body`` is the text of ``#main-content`` up to the first
    "※ 發信站" marker; ``push_texts`` is the text of every
    ``div.push span.push-content`` element. Returns ``None`` when the page
    has no ``#main-content`` or the marker is missing.

    Raises:
        requests.RequestException: The request failed or returned an error
            status.
    """
    res = requests.get(url, headers=HEADERS, timeout=timeout)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, "html.parser")
    main_content = soup.select_one("#main-content")
    if not main_content:
        return None

    body, marker, _ = main_content.get_text(separator="\n").partition("※ 發信站")
    if not marker:
        return None

    push_texts = [push.text for push in soup.select("div.push span.push-content")]
    return body, push_texts


def Keyword(keyword: str, articles_path: str = '2020_articles.jsonl', output_path=None):
    """Collect image URLs from stored articles that mention ``keyword``.

    Every article listed in ``articles_path`` (one JSON object per line with
    ``title`` and ``url`` keys) is downloaded once. An article qualifies
    when ``keyword`` occurs in its title or in its body text before the
    "※ 發信站" marker. Image links are then extracted from the body and
    from the push comments. The de-duplicated URLs, in first-seen order, are
    written to ``output_path`` as ``{"image_urls": [...]}``.

    Args:
        keyword: Substring to look for in titles and bodies.
        articles_path: JSONL file to read the article list from.
        output_path: JSON file to write; defaults to
            ``'2020_keyword_<keyword>.json'``.
    """
    if output_path is None:
        output_path = f"2020_keyword_{keyword}.json"

    with open(articles_path, 'r', encoding='utf-8') as f:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        articles = [json.loads(line) for line in f if line.strip()]

    matched = 0
    image_urls = []

    for article in tqdm(articles, desc="處理特定文章"):
        url = article["url"]
        title_hit = keyword in article["title"]
        if title_hit:
            matched += 1

        # Throttle before each request so the server is not hammered.
        time.sleep(random.uniform(0.1, 0.3))
        try:
            parts = _load_article_text(url)
        except requests.RequestException as e:
            # One unreachable article must not abort the whole scan.
            print(f"[!] 無法下載文章內容：{e}")
            continue

        if parts is None:
            if title_hit:
                print("無法找到發信站標記，跳過")
            continue

        content, push_texts = parts
        if not title_hit:
            if keyword not in content:
                continue
            matched += 1

        print(f"處理中：{url}")
        print("符合條件，開始擷取圖片連結")

        # Join with newlines so a URL ending one fragment can never fuse
        # with the text that starts the next one.
        image_urls.extend(extract_image_urls("\n".join([content, *push_texts])))

    print(f"找到 {matched} 篇文章（標題或內文含關鍵字「{keyword}」）")

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output file is reproducible between runs.
    unique_image_urls = list(dict.fromkeys(image_urls))

    result = {
        "image_urls": unique_image_urls
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"完成 keyword 抽圖：{output_path}，共 {len(unique_image_urls)} 張圖片")

In [None]:
# Gather image URLs from every stored article whose title or body contains "正妹".
Keyword("正妹") 

# 將剛剛的圖片url進行下載保存

In [None]:
# Destination folder for the downloaded pictures; created if missing.
save_dir = '../raw_images/2020_正妹_images'
os.makedirs(save_dir, exist_ok=True)

# Load the URL list written by the keyword-extraction step.
with open('2020_keyword_正妹.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
image_urls = data.get("image_urls", [])

print(f"共載入 {len(image_urls)} 張圖片網址，開始下載")

# Browser-like request headers, built once and reused for every image.
download_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Referer': 'https://www.google.com/',
}

# Download each image; the file name is the URL's position in the list.
for idx, url in enumerate(tqdm(image_urls, desc="Downloading")):
    # URLs under this prefix are deliberately excluded.
    if url.startswith("https://d.img.vision/dddshay/"):
        print(f"[!] 跳過：{url}")
        continue

    try:
        response = requests.get(url, timeout=35, headers=download_headers)
        response.raise_for_status()

        # Decode whatever format arrived, normalise to RGB, store as PNG.
        image = Image.open(BytesIO(response.content)).convert("RGB")
        filename = f"{idx:05d}.png"
        filepath = os.path.join(save_dir, filename)
        image.save(filepath, format="PNG")
    except Exception as e:
        # Best effort: report the failure and move on to the next URL.
        print(f"[!] 第 {idx} 張圖片處理失敗：{url} | 錯誤：{e}")