In [5]:
from pathlib import Path

import requests
import pandas as pd
from bs4 import BeautifulSoup

from crawler_utilities import extract_article_op_IP

In [2]:
# Output directory for everything this notebook saves.
FOLDER_PATH = Path("./ptt_articles_op_IP")
# Make sure the directory exists; missing parents are created and an
# already-existing directory is not treated as an error.
FOLDER_PATH.mkdir(exist_ok=True, parents=True)

In [3]:
# Index page of the PTT Gossiping board where the crawl starts.
url = "https://www.ptt.cc/bbs/Gossiping/index.html"

# Request headers sent with every HTTP call (a desktop Chrome User-Agent string).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
}

# Cookie sent with every request.
# NOTE(review): presumably this bypasses the board's age-confirmation page — confirm.
cookies = {"over18": "1"}

In [4]:
# Crawl the board index pages, newest first, and collect whatever
# extract_article_op_IP returns for each linked article (the recorded output
# shows [IP, country] pairs).
PAGES_TO_CRAWL = 2    # number of index pages to walk back through
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can wait indefinitely

ip_country = []
# Walk with a local cursor instead of overwriting `url`, so re-running this
# cell always restarts from the configured index page.
page_url = url
for _ in range(PAGES_TO_CRAWL):
    # Fetch the index page. Stop cleanly on network/HTTP errors instead of
    # parsing an error page as if it were an article list.
    try:
        res = requests.get(page_url, headers=headers, cookies=cookies, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch {page_url}: {exc}")
        break
    soup = BeautifulSoup(res.text, "html.parser")

    # Every article entry on the index page lives in a <div class="title">.
    title_tag_list = soup.select('div[class="title"]')

    for tag in title_tag_list:
        # The <a> inside the title block carries both the title text and the
        # article link; entries without one (presumably deleted articles)
        # fall back to placeholder values.
        title_a_tag = tag.select_one("a")
        has_link = title_a_tag is not None
        title = title_a_tag.text if has_link else "No Title"
        article_url = "https://www.ptt.cc" + title_a_tag["href"] if has_link else "No URL"

        print(f"Title: {title}")
        print(f"URL: {article_url}")

        # Nothing to fetch when the entry has no link.
        if not has_link:
            print("No URL found, skipping...")
            print()
            continue

        # Look up the original poster's IP info; falsy results are skipped.
        result = extract_article_op_IP(article_url, headers, cookies)
        if result:
            print(result)
            print()
            ip_country.append(result)

    # Find the "上頁" (previous page) button and move the cursor to it.
    for btn_tag in soup.select('a[class="btn wide"]'):
        if "上頁" in btn_tag.text:
            page_url = "https://www.ptt.cc" + btn_tag["href"]
            break
    else:
        # No previous-page button on this page: nothing older to crawl.
        print("No previous page found.")
        break
    print("=====================================")

Title: [問卦] 現在讀餐飲系的在幹嘛?
URL: https://www.ptt.cc/bbs/Gossiping/M.1745247891.A.365.html
['118.160.47.8', '臺灣']

Title: [問卦] 今年還有哪個龍頭會啪一聲沒了？
URL: https://www.ptt.cc/bbs/Gossiping/M.1745247916.A.D62.html
['101.12.144.110', '臺灣']

Title: Re: [爆卦] 李顯龍妻子fb轉嗆習近平是黑幫老大的文章
URL: https://www.ptt.cc/bbs/Gossiping/M.1745247932.A.940.html
['114.34.108.96', '臺灣']

Title: [新聞] 預算凍刪導致外交部發不出護照？　招標
URL: https://www.ptt.cc/bbs/Gossiping/M.1745248028.A.0CD.html
['42.72.127.126', '臺灣']

Title: [問卦] 為什麼上帝可以變黑人  白雪公主不行?
URL: https://www.ptt.cc/bbs/Gossiping/M.1745248041.A.DFC.html
['123.194.132.183', '臺灣']

Title: Re: [問卦] 楊舒雅嗆：鍵盤酸民直接來我的演講talk
URL: https://www.ptt.cc/bbs/Gossiping/M.1745248099.A.1AB.html
['50.53.5.43', '美國']

Title: [問卦] 為什麼新竹這麼多看板啊
URL: https://www.ptt.cc/bbs/Gossiping/M.1745248289.A.A1C.html
['111.251.194.165', '臺灣']

Title: [公告] 八卦板板規(2025.01.21)
URL: https://www.ptt.cc/bbs/Gossiping/M.1737398137.A.644.html
['118.165.88.146', '臺灣']

Title: Fw: [公告] 請避免與登入1次之帳號進行交易 發錢
URL: https://www.ptt.cc/

In [6]:
# Tabulate the collected results: one row per article, with the IP address
# in the first column and the country in the second.
columns = ["IP地址", "國家"]
df = pd.DataFrame(columns=columns, data=ip_country)
df

Unnamed: 0,IP地址,國家
0,118.160.47.8,臺灣
1,101.12.144.110,臺灣
2,114.34.108.96,臺灣
3,42.72.127.126,臺灣
4,123.194.132.183,臺灣
5,50.53.5.43,美國
6,111.251.194.165,臺灣
7,118.165.88.146,臺灣
8,220.135.133.179,臺灣
9,59.120.192.119,臺灣


In [7]:
# Save the table inside the folder created from FOLDER_PATH. The previous
# hard-coded path spelled the directory with a different case
# ("ptt_articles_op_ip"), which points at a non-existent directory on
# case-sensitive filesystems. utf-8-sig writes a BOM at the start of the file.
df.to_csv(FOLDER_PATH / 'pttGossiping.csv', index=False, encoding='utf-8-sig')