In [113]:
# 說明:
# 本ipynb利用情緒詞字典幫新聞文章算出正負面的分數

In [114]:
import jieba
#import logging
import pandas as pd
import string
import import_ipynb
from collections import defaultdict
from emotionalDictionaryExtractor import get_positive_emotional_tokens
from emotionalDictionaryExtractor import get_negative_emotional_tokens

In [115]:
#第一次使用jieba套件時要執行以下程式碼
# jieba.set_dictionary('chinese_tokens_for_jieba\dict.txt.big') #讓jieba使用繁體詞庫來斷詞
# jieba.setLogLevel(logging.ERROR)  #jieba跑程式時不會出現一堆log輸出(如jieba Prefix dict has been built successfully)

In [116]:
#全域變數
good_word = get_positive_emotional_tokens() #所有正向情緒詞彙
bad_word = get_negative_emotional_tokens()  #所有負向情緒詞彙

In [117]:
#對新聞文章的內文進行中文斷詞
def chineseWordTokenizer(newsText:str) -> list:
    #讓jieba切token
    tokens = jieba.lcut(newsText)
    
    #去除標點符號
    chinesePunctuations = ['＂', '＃', '＄', '％', '＆', '＇', '（', '）', '＊', '＋', '，', '－', '／', '：', '；', '＜', '＝', '＞', '＠', '［', '＼', '］', '＾', '＿', '｀', '｛', '｜', '｝', '～', '｟', '｠', '｢', '｣', '､', '\u3000', '、', '〃', '〈', '〉', '《', '》', '「', '」', '『', '』', '【', '】', '〔', '〕', '〖', '〗', '〘', '〙', '〚', '〛', '〜', '〝', '〞', '〟', '〰', '〾', '〿', '–', '—', '‘', '’', '‛', '“', '”', '„', '‟', '…', '‧', '﹏', '﹑', '﹔', '·', '！', '？', '｡', '。']
    tokens = [token for token in tokens if token not in chinesePunctuations]

    return tokens

In [118]:
#從json檔讀取某主題的所有新聞的媒體、標題，內文，並把內文切成tokens
def get_allRelatedNewsOnSpecificTopic(topicNum:int):
    try:
        df = pd.read_json('newsDatas.json')
        allRelatedNewsOnSpecificTopic = []
        for news in df[df['topicNum'] == topicNum].iloc[0]['allRelatedNews']:
            tokens = chineseWordTokenizer(news['text'])
            tempNews = {'docID':news['docID'],'media':news['media'],'title':news['title'],'tokens':tokens}
            allRelatedNewsOnSpecificTopic.append(tempNews)
        return allRelatedNewsOnSpecificTopic
    except Exception as e:
        print("讀取某主題的所有新聞時發生錯誤: {}".format(e))

def get_allRelatedNewsOnSpecificTopic_Title(topicNum:int):
    try:
        df = pd.read_json('newsDatas.json')
        allRelatedNewsOnSpecificTopic_Title = []
        for news in df[df['topicNum'] == topicNum].iloc[0]['allRelatedNews']:
            tokens = chineseWordTokenizer(news['title'])
            tempNews = {'docID':news['docID'],'media':news['media'],'title':news['title'],'tokens':tokens}
            allRelatedNewsOnSpecificTopic_Title.append(tempNews)
        #print(allRelatedNewsOnSpecificTopic_Title)
        return allRelatedNewsOnSpecificTopic_Title
    except Exception as e:
        print("讀取某主題的所有新聞Title時發生錯誤: {}".format(e))

In [119]:
#計算某詞的tf
def calculate_tf(word_list, document):
    count = sum([1 for w in document if w in word_list])
    return count

#計算某主題中所有新聞文章的分數
def calculate_tf_in_news_on_specific_topic(topicNum):
    tf_dict = defaultdict(dict)
    scores = []
    document_list = []
    document_list_title = []
    allRelatedNews = get_allRelatedNewsOnSpecificTopic(topicNum)
    allRelatedNews_Title = get_allRelatedNewsOnSpecificTopic_Title(topicNum)
    for news in allRelatedNews:
        document_list.append(news['tokens'])
        
    for news_title in allRelatedNews_Title:
        document_list_title.append(news_title['tokens'])

    for i_title, document_title in enumerate(document_list_title):
        tf_dict[i_title]["title"] = allRelatedNews[i_title]["title"]
        tf_dict[i_title]["docID"] = allRelatedNews[i_title]["docID"]
        tf_dict[i_title]["media"] = allRelatedNews[i_title]["media"]
        #標題權重較重，所以分數乘2
        good_score_title = 2*calculate_tf(good_word, document_title)
        bad_score_title = 2*calculate_tf(bad_word, document_title)
        score_title = good_score_title - bad_score_title
        
        document = document_list[i_title]
        good_score = calculate_tf(good_word, document)
        bad_score = calculate_tf(bad_word, document)
        score = good_score - bad_score

        total_score = score_title + score
        tf_dict[i_title]["score"] = total_score
        scores.append(total_score)

    inv_list = ""
    highest_score_list = ""
    lowest_score_list = ""
    
    #將分數不為0的文章加入inverted list
    for i, score in enumerate(scores):
        doc_title = tf_dict[i]["title"]
        doc_name = tf_dict[i]["docID"]
        doc_media = tf_dict[i]["media"]
        if score != 0:
            inv_list += f"({doc_name},{score})"
            if score == max(scores):
                highest_score_list += f"新聞標題:{doc_title}, 媒體:{doc_media}, 文章代號:{doc_name}, 分數:{score}\n"
            if score == min(scores):
                lowest_score_list += f"新聞標題:{doc_title}, 媒體:{doc_media}, 文章代號:{doc_name}, 分數:{score}\n"
    print(f"inv_list={inv_list}")
    print(f"最高分 :\n {highest_score_list}")
    print(f"最低分 :\n {lowest_score_list}")
    


In [120]:
#測試 
# 計算某主題的所有文章分數
for i in range(1, 12563):
    calculate_tf_in_news_on_specific_topic(topicNum = i)
#print(get_allRelatedNewsOnSpecificTopic(4))

inv_list=(1-1,-12)(1-2,2)(1-3,6)(1-4,-5)(1-5,4)(1-6,5)(1-7,-8)(1-8,6)(1-9,2)(1-10,19)(1-12,-2)(1-13,3)(1-14,9)(1-15,-3)(1-16,-4)(1-17,4)(1-18,-2)(1-19,9)(1-20,3)(1-21,23)(1-22,1)(1-23,5)(1-24,-5)(1-25,2)(1-26,-7)(1-27,-5)(1-28,-3)(1-29,19)(1-30,-19)(1-31,2)(1-32,12)(1-33,-4)(1-34,4)(1-35,2)(1-36,3)(1-37,-6)(1-38,-1)(1-39,1)(1-40,-2)(1-41,-10)(1-42,-1)(1-44,-10)(1-45,-7)(1-46,4)(1-47,8)(1-48,8)(1-49,2)(1-50,4)(1-51,-2)(1-52,9)
最高分 :
 新聞標題:國民黨北市立委初選徐巧芯勝出 將對戰民進黨許淑華| 政治, 媒體:中央社即時新聞, 文章代號:1-21, 分數:23

最低分 :
 新聞標題:徐巧芯指控涉嫌「包車載人投票」 費鴻泰嗆：讓人想起走路工事件 | 政治 | CTWANT, 媒體:CTWANT, 文章代號:1-30, 分數:-19

inv_list=(2-1,18)(2-2,15)(2-3,12)(2-4,6)(2-5,38)(2-6,22)(2-7,-6)(2-8,24)(2-9,22)(2-10,31)(2-11,12)(2-12,8)(2-13,30)(2-14,43)(2-15,20)(2-16,4)(2-17,5)(2-18,29)(2-19,31)(2-20,39)(2-21,8)(2-22,17)(2-23,31)(2-24,4)(2-25,3)(2-26,6)(2-27,5)(2-28,29)(2-29,16)(2-30,11)(2-31,20)(2-32,4)(2-33,12)(2-34,26)(2-35,13)(2-36,22)(2-37,12)(2-38,39)(2-39,-7)(2-40,9)(2-41,34)(2-42,20)(2-43,22)(2-44,8)(2-45,8)(2-46,19)(2-47,32)

TypeError: 'NoneType' object is not iterable