In [None]:
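# Overview
# This notebook downloads the body text of crawled news articles (extracted with goose3)
# and stores each article, grouped by its news-topic URL, in newsDatas.json.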


In [122]:
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese
from goose3.text import StopWordsKorean
from goose3.text import StopWordsArabic
import pandas as pd
from fake_useragent import UserAgent
import time
import random

In [10]:
# goose抓內文
def main_text_goose(url, timeout=10) -> str:
    """Download a news page and return its main body text extracted by goose3.

    Args:
        url: Link to the article. It is requested once without custom headers
            only to learn the final URL after redirects (the incoming link is
            a Google News redirect link), then the final URL is fetched with a
            browser-like user agent.
        timeout: Seconds passed to every ``requests.get`` call so that a
            stalled server cannot block the caller forever.

    Returns:
        The extracted body text, or ``""`` when the server still answers with a
        status code >= 400 after one retry, when goose3 extracts no text, or
        when any exception is raised along the way (the error is printed).
    """
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"}
    # goose3 needs a dedicated stop-word class for these languages.
    stopwords_by_language = {
        'zh': StopWordsChinese,
        'ar': StopWordsArabic,
        'ko': StopWordsKorean,
    }
    try:
        # Resolve the original article link behind the Google News redirect.
        url = requests.get(url, timeout=timeout).url
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code >= 400:
            # Crawler was blocked: wait a random 1-3 seconds and retry once
            # with a random user agent.
            # Reference: https://ithelp.ithome.com.tw/articles/10224979
            time.sleep(random.uniform(1, 3))
            user_agent = UserAgent()
            response = requests.get(url=url, headers={'user-agent': user_agent.random}, timeout=timeout)
            if response.status_code >= 400:
                # Still blocked, give up.
                print("goose爬蟲被擋--Response Code:{}".format(response.status_code))
                print("文章網址: {}".format(url))
                return ""

        # Without an explicit charset requests decodes text/* as ISO-8859-1,
        # which garbles non-Latin pages; fall back to the detected encoding.
        if 'charset' not in response.headers.get('Content-Type', '').lower():
            response.encoding = response.apparent_encoding
        html = response.text

        # Detect the article language from the Content-Language header and keep
        # only the primary subtag of the first entry ("zh-TW, en" -> "zh").
        language = None
        header_language = response.headers.get("Content-Language")
        if header_language:
            first_tag = header_language.split(',')[0].strip()
            language = first_tag.replace('_', '-').split('-')[0].lower() or None

        # Choose the goose3 configuration once instead of building throw-away
        # Goose instances; None means the default extractor.
        if language is None:
            config = None
        elif language in stopwords_by_language:
            config = {'stopwords_class': stopwords_by_language[language]}
        else:
            config = {'use_meta_language': False, 'target_language': language}

        # The context manager closes the extractor once the text is read.
        with Goose(config) as g:
            article = g.extract(raw_html=html)
            title = article.title
            content = article.cleaned_text

        # The page may be Chinese even though no language was announced:
        # a title without body text is retried with the Chinese stop words.
        if language is None and title and not content:
            with Goose({'stopwords_class': StopWordsChinese}) as g:
                content = g.extract(raw_html=html).cleaned_text

        # Report pages for which nothing could be extracted.
        if content:
            return content
        print("goose抓不到內文")
        print("文章網址: {}".format(url))
        return ""
    except Exception as e:
        # Best-effort scraper: any failure is reported and yields "".
        print("can not get main text(goose)\n", e)
        return ""

In [172]:
# 將新聞的相關資料(標題、連結、內文等)寫入json檔
def writeNewsDataToJson(newsMedia="", newsTitle="", newsUrl="", newsTopicUrl=""):
    """Fetch the body text of one news article and store it in ``newsDatas.json``.

    The file is a list of records, one per topic:
    ``{"topicUrl": ..., "allRelatedNews": [{"media", "title", "text", "url"}, ...]}``.
    The article is appended to the record whose ``topicUrl`` equals
    ``newsTopicUrl``; a new record is created when the topic is not in the file
    yet, and the file itself is created when it does not exist.

    Nothing is written when the body text cannot be fetched, when the article
    URL is already stored under its topic, or when reading/updating the file
    fails (the error is printed).

    Args:
        newsMedia: Name of the media outlet that published the article.
        newsTitle: Article title.
        newsUrl: Article link; also used to detect duplicates within a topic.
        newsTopicUrl: Link of the news topic the article belongs to.

    Returns:
        None.
    """
    # Only articles whose body text could be fetched are stored.
    text = main_text_goose(newsUrl)
    if text == "":
        return

    fileName = 'newsDatas.json'
    newNews = {
        'media': newsMedia,  # publishing media outlet
        'title': newsTitle,
        'text': text,        # body text
        'url': newsUrl
    }
    # One-row frame used whenever the topic has to be created.
    newTopic = pd.DataFrame({
        'topicUrl': [newsTopicUrl],
        'allRelatedNews': [[newNews]]
    })

    # Load what is already stored.
    try:
        df = pd.read_json(fileName)
    except FileNotFoundError:
        # The JSON file does not exist yet; start from an empty frame.
        print('找不到json檔，將重新建立新的json檔')
        df = pd.DataFrame()
    except Exception as e:
        print("b發生非預期的錯誤：{}".format(e))
        return

    try:
        if df.empty:
            # New file, or every topic was removed by hand: the new topic
            # becomes the only record.
            df = newTopic
        else:
            topicNews = df.loc[df['topicUrl'] == newsTopicUrl, 'allRelatedNews']
            if topicNews.empty:
                # The topic is not stored yet: add it with the article.
                df = pd.concat([df, newTopic], ignore_index=True)
            else:
                # The cell holds a reference to the topic's Python list, so the
                # in-place append below is visible through df.
                relatedNews = topicNews.iloc[0]
                if any(item['url'] == newsUrl for item in relatedNews):
                    return  # already stored, nothing to write
                relatedNews.append(newNews)
        df.to_json(fileName, orient='records', indent=4, force_ascii=False)
    except Exception as e:
        print("a發生非預期的錯誤：{}".format(e))
    return
