In [18]:
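# Overview:
# This notebook fetches the reader comments of the Yahoo News articles listed in
# newsDatas.json and computes a positive/negative sentiment score for each comment.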

In [19]:
import json
import pandas as pd
import import_ipynb
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import re
from chinese_tokenizer import chineseWordTokenizer
from emotionalDictionaryExtractor import get_positive_emotional_tokens
from emotionalDictionaryExtractor import get_negative_emotional_tokens

In [20]:
# Module-level globals shared by the scoring functions.
good_word = get_positive_emotional_tokens() # all positive-emotion words
bad_word = get_negative_emotional_tokens()  # all negative-emotion words

In [21]:
# Load the news data from the JSON file.
# The encoding is given explicitly so the non-ASCII (Chinese) content is decoded
# the same way on every platform instead of depending on the locale default.
with open('newsDatas.json', encoding='utf-8') as f:
    all_news = json.load(f)

In [22]:
def getMessage(newUrl: str, count: int = 100, timeout: float = 10.0):
    """Fetch and tokenize the reader comments of one Yahoo News article.

    The article page is downloaded first because it carries the two
    identifiers the comment endpoint needs (``spaceId`` and ``context``).
    The comment endpoint is then queried (``sortBy=popular``) and the
    ``userText`` of every returned message is passed through
    ``chineseWordTokenizer``.

    Args:
        newUrl: URL of the Yahoo News article.
        count: Value sent as the ``count`` field of the comment request.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list with one ``chineseWordTokenizer`` result per comment.
        On any failure (network error, HTTP error status, unexpected page or
        JSON layout) the error is printed and an empty list is returned.
    """
    try:
        # Download and parse the article page.
        page = requests.get(newUrl, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        # spaceId: taken from the element that follows the first
        # s.yimg.com/aaq/c/ script tag; the value is the first
        # single-quoted string in that element.
        id_soup = soup.find_all(
            'script', src=re.compile("https://s.yimg.com/aaq/c/"))
        space_id = str(id_soup[0].find_next()).split("'")[1]

        # context: fifth '/'-separated segment of the al:ios:url meta content.
        context_soup = soup.find('meta', property="al:ios:url")
        context = context_soup['content'].split('/')[4]

        # Request the comment list for this article.
        url = (
            "https://tw.news.yahoo.com/_td-news/api/resource/canvass.getMessageListForContext_ns;apiVersion=v1;context="
            f"{context};count={count}"
            ";index=null;lang=zh-Hant-TW;namespace=yahoo_content;oauthConsumerKey=frontpage.oauth.canvassKey;oauthConsumerSecret=frontpage.oauth.canvassSecret;rankingProfile=;region=TW;sortBy=popular;spaceId="
            f"{space_id};"
        )
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        return [
            chineseWordTokenizer(item['details']['userText'])
            for item in response.json()['canvassMessages']
        ]
    except Exception as e:
        # Best effort: one unreadable article must not abort the whole crawl.
        print("讀取某新聞的留言時發生錯誤: {}".format(e))
        return []


In [23]:
def get_allCommentsOnSpecificTopicFromYahoo(topicNum:int):
    """Collect the tokenized comments of every Yahoo article in one topic.

    Looks up the first entry of the already-loaded ``all_news`` whose
    ``topicNum`` matches, and for each of its related news items published by
    "Yahoo奇摩新聞" fetches the comments through ``getMessage``.

    Returns:
        A dict keyed by ``docID`` whose values hold ``media``, ``title`` and
        ``message_tokens``. It is empty when the topic is not found, and a
        fresh empty dict is also returned (after printing the error) when
        anything goes wrong along the way.
    """
    yahoo_media = "Yahoo奇摩新聞"
    collected = {}
    try:
        # Only the first topic with a matching number is considered.
        matched_topic = next(
            (entry for entry in all_news if entry['topicNum'] == topicNum),
            None,
        )
        if matched_topic is not None:
            for article in matched_topic['allRelatedNews']:
                if article['media'] != yahoo_media:
                    continue
                tokens = getMessage(article['url'])
                collected[article['docID']] = {
                    'media': article['media'],
                    'title': article['title'],
                    'message_tokens': tokens,
                }
    except Exception as e:
        print("讀取所有新聞的留言時發生錯誤: {}".format(e))
        return dict()
    return collected


In [24]:
def calculate_tf(word_list, comment):
    """Count the tokens of ``comment`` that are members of ``word_list``.

    Every occurrence counts, so a token repeated in ``comment`` is counted
    once per repetition.
    """
    hits = 0
    for token in comment:
        if token in word_list:
            hits += 1
    return hits

In [25]:
def calculate_comment_score_in_a_news(docID,allCommentsOnSpecificTopic,txt_file):
    """Score every comment of one article and write a report section.

    A comment's score is its number of tokens found in ``good_word`` minus its
    number of tokens found in ``bad_word``. The article id, title, comment
    count, per-comment scores and the mean score (rounded to 3 decimals) are
    written to ``txt_file``. Nothing is written for an article with no comments.

    Args:
        docID: Key of the article inside ``allCommentsOnSpecificTopic``.
        allCommentsOnSpecificTopic: Mapping from docID to a dict holding at
            least ``title`` and ``message_tokens``.
        txt_file: Writable text stream that receives the report.
    """
    article = allCommentsOnSpecificTopic[docID]
    tokenized_comments = article['message_tokens']
    if len(tokenized_comments) == 0:
        return

    per_comment_scores = [
        calculate_tf(good_word, tokens) - calculate_tf(bad_word, tokens)
        for tokens in tokenized_comments
    ]
    score_sum = sum(per_comment_scores)

    emit = txt_file.write
    emit(f"新聞{docID}\n")
    emit(f"標題： {article['title']}\n")
    emit(f"留言人數： {len(tokenized_comments)}\n")
    emit(f"個別留言分數： {per_comment_scores}\n")
    emit(f"平均留言分數： {round(score_sum/len(tokenized_comments), 3)}\n")
    emit("-----"*10 + "\n")

In [26]:
def calculate_comment_score_on_specific_topic(topicNum, output_path="0525以前的使用者評分結果(by ckiptagger).txt"):
    """Score the comments of every Yahoo article in a topic and append a report.

    The comments are gathered with ``get_allCommentsOnSpecificTopicFromYahoo``.
    When at least one article is returned, a topic header is appended to the
    report file, followed by one section per article written by
    ``calculate_comment_score_in_a_news``, followed by three blank lines.
    When no article is returned the file is not touched.

    Args:
        topicNum: Number of the topic to process.
        output_path: Report file to append to.
    """
    allCommentsOnSpecificTopic = get_allCommentsOnSpecificTopicFromYahoo(topicNum)
    if not allCommentsOnSpecificTopic:
        return

    # The context manager closes the file even if scoring raises; the explicit
    # encoding keeps the Chinese report text independent of the platform locale.
    with open(output_path, "a", encoding="utf-8") as txt_file:
        txt_file.write(f"主題 {topicNum}\n")
        txt_file.write("="*10 + "\n")
        for docID in allCommentsOnSpecificTopic:
            calculate_comment_score_in_a_news(docID,allCommentsOnSpecificTopic,txt_file)
        txt_file.write("\n\n\n")
    

In [29]:
# Trial run: score the comments of every article in topics 418 and 419.
for i in (418, 419):
    calculate_comment_score_on_specific_topic(topicNum=i)
