In [None]:
import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup as bs
import json
from urllib.parse import urljoin

# Decide whether a news item's date is recent enough (default window: 3 days).
def check_time(soup: bs, max_age: timedelta = timedelta(days=3)):
    """Report whether the date in the item's ``<time>`` tag is within ``max_age`` of now.

    The tag's text (child strings joined with a single space) must start with
    ``"<day> <year>.<month>"``, for example ``"05 2024.03"``.

    Args:
        soup: Parsed element that contains a ``<time>`` tag.
        max_age: Largest allowed distance between the parsed date and the
            current local time. Defaults to three days.

    Returns:
        ``(True, str(date))`` when the parsed date lies within ``max_age`` of
        now, in either direction; otherwise ``(False, None)``.

    Raises:
        ValueError: If there is no ``<time>`` tag, or its text does not have
            the expected layout or does not describe a valid date.
    """
    time_tag = soup.find("time")
    if time_tag is None:
        raise ValueError("no <time> element found in item")

    raw_text = time_tag.get_text(separator=" ")
    try:
        # Too few fields make the unpacking raise ValueError, just like a
        # non-numeric field or an impossible date does below.
        day_text, year_month = raw_text.split(" ")[:2]
        year_text, month_text = year_month.split(".")[:2]
        input_date = datetime(
            year=int(year_text), month=int(month_text), day=int(day_text)
        )
    except ValueError as err:
        raise ValueError(f"unrecognised date text: {raw_text!r}") from err

    # abs() keeps the comparison symmetric: a date slightly in the future is
    # treated the same as one slightly in the past.
    if abs(datetime.now() - input_date) <= max_age:
        return (True, str(input_date))
    return (False, None)


def get_page(soup: bs):
    """Return the current page number shown in the pagination list.

    Looks for ``<ul class="pagination paginations">`` and, inside it, the
    ``<li class="current">`` entry whose text is converted with ``int``.
    When either element is missing a notice is printed and 1 is returned.
    """
    pager = soup.find("ul", class_="pagination paginations")
    if not pager:
        print("没有找到分页元素，假设为第一页")
        return 1

    # The entry for the page being displayed is expected to carry class="current".
    active_entry = pager.find("li", class_="current")
    if not active_entry:
        print("没有找到当前页信息，假设为第一页")
        return 1

    return int(active_entry.get_text())

def get_next_page_url(soup: bs, cur_url: str):
    """Return the absolute URL of the next listing page, or ``None`` if there is none.

    Args:
        soup: Parsed listing page.
        cur_url: URL the page was fetched from; used to resolve a relative href.

    Returns:
        The href of the ``<a>`` inside ``<li class="next">`` joined onto
        ``cur_url``. ``None`` when there is no such ``<li>``, when it holds no
        ``<a>``, or when the ``<a>`` has no (or an empty) ``href``.
    """
    next_entry = soup.find("li", class_="next")
    if not next_entry:
        # No "next" entry at all: this is the last page.
        return None

    link = next_entry.find("a")
    href = link.get("href") if link else None
    if not href:
        # A "next" entry without a usable link also means there is nowhere to
        # go. An empty href would resolve back to cur_url and make the caller
        # fetch the same page forever.
        return None

    return urljoin(cur_url, href)



def get_li(soup: bs):
    """Return every ``<li>`` inside the page's news list.

    The list is looked up as ``<ul class="news-list">``. That class name is an
    assumption and has to be adjusted to the real page structure. When no such
    element exists a notice is printed and an empty list is returned.
    """
    container = soup.find("ul", class_="news-list")
    if not container:
        print("没有找到新闻列表")
        return []
    return container.find_all("li")



class Item:
    """A single news article: its URL, title, publication time and body text."""

    def __init__(self, url, title, time):
        """Record the article metadata; the body text is filled in by get_html()."""
        self.url = url
        self.title = title
        self.time = time
        # Initialised here so store() can recognise "not fetched yet" instead
        # of failing with AttributeError on a missing attribute.
        self.content = None

    def get_html(self, timeout=10):
        """Download the article page and keep the text of its content block.

        Args:
            timeout: Seconds passed to ``requests.get`` so that an unresponsive
                server cannot block the crawl indefinitely.

        Raises:
            ValueError: If the page has no ``<div class="v_news_content">``.
            requests.RequestException: If the request fails or times out.
        """
        response = requests.get(self.url, timeout=timeout)
        response.encoding = 'utf-8'
        soup = bs(response.text, 'html.parser')
        body = soup.find("div", attrs={"class": "v_news_content"})
        if body is None:
            raise ValueError(f"no v_news_content block found at {self.url}")
        self.content = body.get_text()

    def store(self, path):
        """Write all instance attributes to ``path`` as UTF-8 encoded JSON.

        Raises:
            ValueError: If no content has been fetched yet.
        """
        if self.content is None:
            raise ValueError("Content is empty")
        # Serialise before opening the file so a serialisation error cannot
        # leave a truncated file behind.
        payload = json.dumps(vars(self), ensure_ascii=False, indent=4)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(payload)

# Remaining topic slugs, currently left out of name_li below:
# "globalization", "technology", "logistics", "esg", "recreation-life"
if __name__ == "__main__":
    # Crawl each topic listing page by page and save every recent article as
    # "<count>.json" in the working directory. URLs that could not be processed
    # are collected in error_li.
    name_li = ["corporate-news"]
    error_li = []
    url_li = [f"https://www.alibabanews.com/topic/{name}/" for name in name_li]
    count = 0

    for cur_url in url_li:
        visited = set()  # listing URLs already fetched for this topic
        is_continue = True

        while is_continue:
            # A "next" link that leads back to an already crawled page would
            # otherwise keep this loop running forever.
            if cur_url in visited:
                break
            visited.add(cur_url)

            print(f"抓取页面: {cur_url}")
            try:
                # Without a timeout an unresponsive server blocks indefinitely.
                response = requests.get(cur_url, timeout=10)
            except requests.RequestException as e:
                error_li.append(cur_url)
                print(f"错误: {cur_url} - {e}")
                break
            response.encoding = 'utf-8'
            soup = bs(response.text, 'html.parser')

            # The page number is not used further; the call is kept because
            # get_page() prints a notice when pagination markup is missing.
            page = get_page(soup)
            li = get_li(soup)

            if not li:
                print("未能找到新闻列表，跳过该页面。")
                break

            for item in li:
                url = None
                try:
                    link = item.find("a")
                    title = link.get("title")
                    url = urljoin(cur_url, link.get("href"))
                    is_recent, published = check_time(item)
                except (AttributeError, IndexError, TypeError, ValueError) as e:
                    # One malformed entry (no link, no or odd date) must not
                    # abort the whole crawl: record it and move on.
                    failed = url or cur_url
                    error_li.append(failed)
                    print(f"错误: {failed} - {e}")
                    continue

                if not is_recent:
                    # First item outside the time window: stop crawling this
                    # topic, as the original flow does.
                    is_continue = False
                    break

                item_obj = Item(url, title, published)
                try:
                    item_obj.get_html()
                    item_obj.store(f"{count}.json")
                    count += 1
                    print(url)
                except Exception as e:
                    # Best effort per article: log, remember the URL, continue.
                    error_li.append(url)
                    print(f"错误: {url} - {e}")

            if is_continue:
                # Follow pagination; no next page means this topic is done.
                next_page_url = get_next_page_url(soup, cur_url)
                if next_page_url:
                    cur_url = next_page_url
                else:
                    is_continue = False


抓取页面: https://www.alibabanews.com/topic/corporate-news/
没有找到新闻列表
未能找到新闻列表，跳过该页面。
抓取页面: https://www.alibabanews.com/topic/globalization/
没有找到新闻列表
未能找到新闻列表，跳过该页面。
抓取页面: https://www.alibabanews.com/topic/technology/
没有找到新闻列表
未能找到新闻列表，跳过该页面。
抓取页面: https://www.alibabanews.com/topic/logistics/
没有找到新闻列表
未能找到新闻列表，跳过该页面。
抓取页面: https://www.alibabanews.com/topic/esg/
没有找到新闻列表
未能找到新闻列表，跳过该页面。
抓取页面: https://www.alibabanews.com/topic/recreation-life/
没有找到新闻列表
未能找到新闻列表，跳过该页面。


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
