In [115]:
# 說明:
# 本ipynb利用情緒詞字典幫新聞文章算出正負面的分數

In [116]:
import jieba
#import logging
import pandas as pd
import string
import import_ipynb
from collections import defaultdict
from emotionalDictionaryExtractor import get_positive_emotional_tokens
from emotionalDictionaryExtractor import get_negative_emotional_tokens

In [117]:
#第一次使用jieba套件時要執行以下程式碼(需先取消上方 import logging 的註解)
# jieba.set_dictionary('chinese_tokens_for_jieba/dict.txt.big') #讓jieba使用繁體詞庫來斷詞
# jieba.setLogLevel(logging.ERROR)  #jieba跑程式時不會出現一堆log輸出(如jieba Prefix dict has been built successfully)

In [118]:
# Global variables: sentiment vocabularies loaded once from the project's
# emotionalDictionaryExtractor helpers and read by
# calculate_tf_in_news_on_specific_topic when scoring articles.
# NOTE(review): the container type returned by these helpers is not visible here — confirm.
good_word = get_positive_emotional_tokens() # all positive sentiment tokens
bad_word = get_negative_emotional_tokens()  # all negative sentiment tokens

In [119]:
# Full-width / CJK punctuation tokens that chineseWordTokenizer discards.
# Built once as a frozenset so every membership test is a hash lookup instead of
# rebuilding and linearly scanning a list on each call.
_CHINESE_PUNCTUATIONS = frozenset([
    '＂', '＃', '＄', '％', '＆', '＇', '（', '）', '＊', '＋', '，', '－', '／', '：', '；', '＜',
    '＝', '＞', '＠', '［', '＼', '］', '＾', '＿', '｀', '｛', '｜', '｝', '～', '｟', '｠', '｢',
    '｣', '､', '\u3000', '、', '〃', '〈', '〉', '《', '》', '「', '」', '『', '』', '【', '】', '〔',
    '〕', '〖', '〗', '〘', '〙', '〚', '〛', '〜', '〝', '〞', '〟', '〰', '〾', '〿', '–', '—',
    '‘', '’', '‛', '“', '”', '„', '‟', '…', '‧', '﹏', '﹑', '﹔', '·', '！', '？', '｡', '。',
])


# Tokenize the body text of a news article into Chinese words.
def chineseWordTokenizer(newsText:str) -> list:
    """Split ``newsText`` into tokens with jieba and drop punctuation tokens.

    Args:
        newsText: Raw article text to segment.

    Returns:
        The tokens produced by ``jieba.lcut`` in their original order, minus any
        token that is exactly one of the entries in ``_CHINESE_PUNCTUATIONS``.
        Only the punctuation marks listed there are filtered; every other token
        is returned unchanged.
    """
    # Let jieba cut the text, then filter out the punctuation tokens.
    return [token for token in jieba.lcut(newsText) if token not in _CHINESE_PUNCTUATIONS]

In [120]:
# Read the media, title and body text of every article on one topic from the
# JSON file, and cut each body text into tokens.
def get_allRelatedNewsOnSpecificTopic(topicNum:int, newsJsonPath:str='newsDatas.json') -> list:
    """Load all articles belonging to ``topicNum`` and tokenize their text.

    Args:
        topicNum: Value matched against the ``topicNum`` column of the JSON data.
        newsJsonPath: Path of the JSON file to read. Defaults to
            ``'newsDatas.json'``, the path that was previously hard-coded.

    Returns:
        A list with one dict per article, holding the keys ``docID``, ``media``,
        ``title`` and ``tokens`` (the output of ``chineseWordTokenizer`` on the
        article's ``text``). An empty list is returned when the topic is not
        found or when reading/parsing fails, so callers can always iterate over
        the result.
    """
    try:
        df = pd.read_json(newsJsonPath)
        topicRows = df[df['topicNum'] == topicNum]
        if topicRows.empty:
            # Previously this surfaced as an opaque IndexError from .iloc[0].
            print("讀取某主題的所有新聞時發生錯誤: 找不到 topicNum={} 的主題".format(topicNum))
            return []

        # Only the first matching row is used, as before.
        return [
            {
                'docID': news['docID'],
                'media': news['media'],
                'title': news['title'],
                'tokens': chineseWordTokenizer(news['text']),
            }
            for news in topicRows.iloc[0]['allRelatedNews']
        ]
    except Exception as e:
        # Deliberately best-effort: report the problem and hand back an empty
        # result instead of an implicit None, which made callers fail on iteration.
        print("讀取某主題的所有新聞時發生錯誤: {}".format(e))
        return []

In [121]:
# Count how many tokens of a document belong to a word list.
def calculate_tf(word_list, document):
    """Return the number of tokens in ``document`` that appear in ``word_list``.

    Every occurrence counts, so a token that appears three times in
    ``document`` and is present in ``word_list`` contributes 3.

    Args:
        word_list: Collection of words to look for.
        document: Iterable of tokens to scan.

    Returns:
        The count of matching tokens as an int (0 when nothing matches).
    """
    lookup = word_list
    if isinstance(word_list, (list, tuple)):
        # A hash-based lookup avoids rescanning the whole word list for every
        # token. Tokens are expected to be hashable (they are strings when they
        # come from chineseWordTokenizer).
        try:
            lookup = frozenset(word_list)
        except TypeError:
            # Unhashable entries: keep the original linear membership test.
            lookup = word_list
    return sum(1 for token in document if token in lookup)

# Compute the sentiment score of every news article on one topic.
def calculate_tf_in_news_on_specific_topic(topicNum):
    """Score all articles of a topic and print a summary.

    An article's score is the number of its tokens found in the global
    ``good_word`` collection minus the number found in ``bad_word`` (both
    counted with ``calculate_tf``).

    Three things are printed:
      * ``inv_list`` — one ``(docID,score)`` pair per article whose score is not 0;
      * the article(s) sharing the highest score;
      * the article(s) sharing the lowest score.

    The highest/lowest sections consider every article, including those whose
    score is 0, so they are never empty while at least one article exists.

    Args:
        topicNum: Topic number passed to ``get_allRelatedNewsOnSpecificTopic``.

    Returns:
        None. The results are only printed.
    """
    # The loader may hand back None after a read failure; treat that as "no articles".
    allRelatedNews = get_allRelatedNewsOnSpecificTopic(topicNum) or []

    # Compute each article's score.
    scored_news = []
    for news in allRelatedNews:
        tokens = news['tokens']
        score = calculate_tf(good_word, tokens) - calculate_tf(bad_word, tokens)
        scored_news.append((news, score))

    # Determine the extremes once instead of on every loop iteration.
    scores = [score for _, score in scored_news]
    highest_score = max(scores) if scores else None
    lowest_score = min(scores) if scores else None

    inv_entries = []
    highest_entries = []
    lowest_entries = []
    for news, score in scored_news:
        doc_title = news["title"]
        doc_name = news["docID"]
        doc_media = news["media"]

        # Only articles with a non-zero score go into the inverted list.
        if score != 0:
            inv_entries.append(f"({doc_name},{score})")

        # The extreme checks are intentionally outside the non-zero test so an
        # extreme score of 0 is still reported.
        summary = f"新聞標題:{doc_title}, 媒體:{doc_media}, 文章代號:{doc_name}, 分數:{score}\n"
        if score == highest_score:
            highest_entries.append(summary)
        if score == lowest_score:
            lowest_entries.append(summary)

    inv_list = "".join(inv_entries)
    highest_score_list = "".join(highest_entries)
    lowest_score_list = "".join(lowest_entries)
    print(f"inv_list={inv_list}")
    print(f"最高分 :\n {highest_score_list}")
    print(f"最低分 :\n {lowest_score_list}")
    


In [122]:
# Smoke test
# Compute and print the article scores for topic numbers 1 through 9
# (range(1, 10) excludes 10).
for i in range(1, 10):
    calculate_tf_in_news_on_specific_topic(topicNum = i)
# Debug helper: uncomment to inspect the tokenized articles of topic 4.
#print(get_allRelatedNewsOnSpecificTopic(4))

inv_list=(4-1,-1)(4-2,-5)(4-3,-2)(4-4,5)(4-5,-2)(4-6,6)(4-7,5)(4-8,-2)(4-9,15)(4-10,-5)(4-11,1)(4-12,-2)(4-13,-6)(4-14,4)(4-15,-1)(4-16,-7)(4-17,-7)
最高分 :
 新聞標題:韓家軍今造勢挺韓國瑜選總統 前韓市府局長曹桓榮將參加, 媒體:自由時報, 文章代號:4-9, 分數:15

最低分 :
 新聞標題:數千韓粉岡山造勢挺韓國瑜 籲國民黨列入總統民調, 媒體:經濟日報, 文章代號:4-16, 分數:-7
新聞標題:數千韓粉岡山造勢挺韓國瑜 籲國民黨列入總統民調, 媒體:Yahoo奇摩新聞, 文章代號:4-17, 分數:-7

