In [75]:
# 說明:
# 本ipynb提供某篇新聞所有贊成與反對的留言

In [76]:
import json
import string
import pandas as pd
import import_ipynb
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import re
from chinese_tokenizer import chineseWordTokenizer
from emotionalDictionaryExtractor import get_positive_emotional_tokens
from emotionalDictionaryExtractor import get_negative_emotional_tokens

In [77]:
# Module-level globals: sentiment lexicons shared by the scoring code in this notebook.
good_word = get_positive_emotional_tokens() # all positive-emotion tokens; matches add to a comment's score
bad_word = get_negative_emotional_tokens()  # all negative-emotion tokens; matches subtract from a comment's score

In [78]:
# Load the news data from the JSON file.
# The encoding is passed explicitly so that the non-ASCII (Chinese) text is decoded
# the same way on every platform instead of depending on the locale default.
# Later cells iterate `all_news` as a sequence of topics, each holding an
# 'allRelatedNews' list whose entries carry 'title' and 'url'.
with open('newsDatas.json', encoding='utf-8') as f:
    all_news = json.load(f)

In [79]:
# Crawl the user comments of a single Yahoo news article
def getMessage(newUrl: str, count: int = 100, timeout: float = 10.0):
    """Fetch the reader comments of one Yahoo News article and tokenize them.

    The article page is downloaded first to recover the two identifiers the
    comment API needs (``spaceId`` and ``context``); the comment list is then
    requested from the ``canvass.getMessageListForContext_ns`` endpoint,
    sorted by popularity.

    Args:
        newUrl: URL of the article page, e.g.
            'https://tw.news.yahoo.com/<slug>-201000270.html'.
        count: Value sent as the ``count`` field of the comment request
            (how many comments to ask for). Defaults to 100.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A tuple ``(messages_origin, messages_token)``: the raw comment texts
        and, in the same order, the result of ``chineseWordTokenizer`` on each
        text. On any failure (network error, unexpected page layout,
        unexpected JSON shape) the error is printed and ``([], [])`` is
        returned.
    """
    try:
        # Download and parse the article page.
        page = requests.get(newUrl, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        # spaceId: taken from the element that follows the first <script>
        # whose src points at https://s.yimg.com/aaq/c/ ; the value is the
        # first single-quoted string in that element's markup.
        id_soup = soup.find_all(
            'script', src=re.compile("https://s.yimg.com/aaq/c/"))
        space_id = str(id_soup[0].find_next()).split("'")[1]

        # context: the fifth '/'-separated piece of the al:ios:url meta content.
        context_soup = soup.find('meta', property="al:ios:url")
        context = context_soup['content'].split('/')[4]

        # Request the comment list for this article.
        url = (
            "https://tw.news.yahoo.com/_td-news/api/resource/canvass.getMessageListForContext_ns;apiVersion=v1;context="
            + context + ";count=" + str(count)
            + ";index=null;lang=zh-Hant-TW;namespace=yahoo_content;oauthConsumerKey=frontpage.oauth.canvassKey;oauthConsumerSecret=frontpage.oauth.canvassSecret;rankingProfile=;region=TW;sortBy=popular;spaceId="
            + space_id + ";"
        )
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        messages_origin = []
        messages_token = []
        for item in response.json()['canvassMessages']:
            text = item['details']['userText']
            tokens = chineseWordTokenizer(text)
            messages_origin.append(text)
            messages_token.append(tokens)
        return messages_origin, messages_token
    except Exception as e:
        # Best-effort crawl: report the problem and hand back empty results
        # so callers can continue without comments.
        print("讀取某新聞的留言時發生錯誤: {}".format(e))
        return [], []


In [80]:
# m_o,m_t = getMessage("https://tw.news.yahoo.com/%E8%8A%AF%E8%8F%AF%E5%A4%A7%E6%88%B0-%E9%96%8B%E6%89%93-%E5%BE%90%E5%B7%A7%E8%8A%AF%E5%85%AC%E9%96%8B-%E8%A8%B1%E6%B7%91%E8%8F%AF%E7%9B%B8%E7%B0%BF-%E9%9B%99%E6%96%B9%E4%BA%92%E5%97%86%E8%96%AA%E6%B0%B4%E5%B0%8F%E5%81%B7-134702324.html")
# print(m_o[5])
# print(m_t[5])

In [81]:
# Look up a news article by title and return its comments, both raw and tokenized
def get_all_comments_on_a_News(title:str):
    """Return the comments of the news article whose title equals ``title``.

    Searches every topic in the module-level ``all_news`` and, for the first
    entry in a topic's 'allRelatedNews' whose 'title' matches exactly, crawls
    its 'url' with ``getMessage``.

    Args:
        title: Exact title of the news article to look up.

    Returns:
        A dict with two keys, always present:
        'messages_origin' (list of raw comment texts) and
        'messages_token' (list of tokenized comments, same order).
        Both lists are empty when the title is not found or when an error
        occurs while searching, so callers can index the result safely.
    """
    try:
        for topic in all_news:
            for news in topic['allRelatedNews']:
                if news['title'] == title:
                    messages_origin, messages_token = getMessage(news['url'])
                    return {'messages_origin': messages_origin, 'messages_token': messages_token}
    except Exception as e:
        print("讀取某新聞的留言時發生錯誤: {}".format(e))
        return {'messages_origin': [], 'messages_token': []}

    # No article matched: say so and return the same shape as a successful lookup.
    print("找不到標題為「{}」的新聞".format(title))
    return {'messages_origin': [], 'messages_token': []}


In [82]:
# n = get_all_comments_on_a_News("「芯華大戰」開打！徐巧芯公開「許淑華相簿」 雙方互嗆薪水小偷")
# print(n['messages_origin'])
# print(n['messages_token'])

In [83]:
# Count how many tokens of a comment occur in a word list (raw term count)
def calculate_tf(word_list, comment):
    """Return the number of items in ``comment`` that are members of ``word_list``.

    Every occurrence is counted, so a token that appears twice in ``comment``
    and is present in ``word_list`` contributes 2.

    Args:
        word_list: Container supporting ``in`` (the lexicon to match against).
        comment: Iterable of tokens to check.

    Returns:
        The integer number of matching tokens (0 when nothing matches).
    """
    hits = 0
    for token in comment:
        if token in word_list:
            hits += 1
    return hits

In [88]:
# Compute the positive/negative score of every user comment on one news article
def calculate_comment_score_in_a_news(title,messages_token):
    """Score each tokenized comment against the positive and negative lexicons.

    A comment's score is the number of its tokens found in ``good_word`` minus
    the number found in ``bad_word``: > 0 leans positive, < 0 leans negative,
    0 is undetermined.

    Args:
        title: Title of the news article. Not used in the computation; kept
            for interface compatibility.
        messages_token: Sequence of tokenized comments (one token sequence per
            comment). May be empty or None.

    Returns:
        A list of integer scores, one per comment and in the same order.
        Always a list: empty for empty/None input, and empty (after printing
        the error) if scoring fails.
    """
    try:
        # Empty or missing input yields an empty list, matching the error path,
        # so callers never receive None.
        if messages_token is None or len(messages_token) == 0:
            return []
        return [
            calculate_tf(good_word, message_token) - calculate_tf(bad_word, message_token)
            for message_token in messages_token
        ]
    except Exception as e:
        print("計算某篇新聞的所有留言分數時發生錯誤: {}".format(e))
        return []


In [116]:
def _write_comment_section(txt_file, heading, messages):
    """Write one report section: a separator, the heading, then each comment followed by a dashed rule."""
    txt_file.write("========================\n")
    txt_file.write(f"{heading}\n\n")
    for message in messages:
        txt_file.write(f"{message}\n")
        txt_file.write("-----------------------------\n")


def print_news_pos_neg_comments(title:str):
    """Classify the comments of one news article and append them to a text report.

    Comments are fetched with ``get_all_comments_on_a_News`` and scored with
    ``calculate_comment_score_in_a_news``. A score > 0 counts as supporting,
    < 0 as opposing, and 0 as undetermined. The result is appended to
    "某篇新聞的所有留言.txt" as a title line followed by three sections
    (supporting, opposing, undetermined), each comment in its original order.

    Args:
        title: Exact title of the news article to report on.

    Returns:
        None. If no comment data can be obtained for ``title`` a message is
        printed and nothing is written.
    """
    news = get_all_comments_on_a_News(title)
    # The lookup may yield None or a dict without the expected keys; bail out
    # instead of failing on the subscript below.
    if not news or 'messages_origin' not in news or 'messages_token' not in news:
        print("找不到新聞留言，未寫入檔案: {}".format(title))
        return

    messages = news['messages_origin']
    scores = calculate_comment_score_in_a_news(title, news['messages_token']) or []
    # If scoring failed or came back short, treat the unscored comments as
    # undetermined (score 0) rather than raising IndexError.
    scores = list(scores) + [0] * (len(messages) - len(scores))

    positive_messages = []  # comments that support the news
    negative_messages = []  # comments that oppose the news
    unknown_messages = []   # comments with no clear stance
    for message, score in zip(messages, scores):
        if score > 0:
            positive_messages.append(message)
        elif score < 0:
            negative_messages.append(message)
        else:
            unknown_messages.append(message)

    # Append mode keeps earlier reports. `with` guarantees the file is closed
    # even if a write fails; UTF-8 is explicit so any character in a comment
    # (e.g. emoji) can be written regardless of the platform default.
    # NOTE(review): if the report file already holds text written with another
    # encoding, appending UTF-8 will mix encodings — confirm or start a new file.
    with open("某篇新聞的所有留言.txt", "a", encoding="utf-8") as txt_file:
        txt_file.write(f"標題： {title}\n")
        _write_comment_section(txt_file, "贊成的留言：", positive_messages)
        _write_comment_section(txt_file, "反對的留言：", negative_messages)
        _write_comment_section(txt_file, "立場不明確的留言：", unknown_messages)
    

In [117]:
# Entry point: pass the exact title of the news article whose comments should be classified and written to the report file
print_news_pos_neg_comments("「芯華大戰」開打！徐巧芯公開「許淑華相簿」 雙方互嗆薪水小偷")