In [20]:
# 說明:
# 本ipynb利用情緒詞字典幫新聞文章算出正負面的分數

In [21]:
import json
import pandas as pd
import import_ipynb
from collections import defaultdict
from chinese_tokenizer import chineseWordTokenizer
from emotionalDictionaryExtractor import get_positive_emotional_tokens
from emotionalDictionaryExtractor import get_negative_emotional_tokens

In [22]:
# Module-level globals shared by the scoring functions in this notebook.
good_word = get_positive_emotional_tokens() # all positive emotion words
bad_word = get_negative_emotional_tokens()  # all negative emotion words

In [23]:
# Load the news data from the JSON file into the module-level `all_news`.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding (e.g. cp950 on Windows), which can fail or
# garble non-ASCII text, whereas JSON interchange text is UTF-8.
with open('newsDatas.json', encoding='utf-8') as f:
    all_news = json.load(f)

In [24]:
# Look up one topic in the already-loaded JSON data and collect, for each of
# its news articles, the media, title and tokenized title / body text.
def get_allRelatedNewsOnSpecificTopic(topicNum:int):
    """Return the tokenized news articles that belong to one topic.

    Scans the module-level ``all_news`` for the first entry whose
    ``'topicNum'`` equals ``topicNum`` and converts every item of its
    ``'allRelatedNews'`` list into a dict with the keys ``'docID'``,
    ``'media'``, ``'title'``, ``'title_tokens'`` and ``'text_tokens'``
    (the last two produced by ``chineseWordTokenizer``).

    Returns:
        A list of such dicts (empty when no topic matches), or ``None`` when
        any exception is raised while reading; the error is printed first.
    """
    try:
        # Only the first matching topic is used, mirroring a scan that stops
        # at the first hit.
        matched_topic = next(
            (entry for entry in all_news if entry['topicNum'] == topicNum),
            None,
        )
        if matched_topic is None:
            return []

        collected = []
        for article in matched_topic['allRelatedNews']:
            tokenized_title = chineseWordTokenizer(article['title'])
            tokenized_body = chineseWordTokenizer(article['text'])
            collected.append({
                'docID': article['docID'],
                'media': article['media'],
                'title': article['title'],
                'title_tokens': tokenized_title,
                'text_tokens': tokenized_body,
            })
        return collected
    except Exception as err:
        print("讀取某主題的所有新聞時發生錯誤: {}".format(err))
        return None


In [25]:
# Raw term frequency: how many tokens of a document are found in a word list.
def calculate_tf(word_list, document):
    """Count the items of ``document`` that are contained in ``word_list``.

    Every occurrence counts, so a token repeated in ``document`` is counted
    once per repetition. Membership is tested with the ``in`` operator.

    Args:
        word_list: container the tokens are looked up in.
        document: iterable of tokens to examine.

    Returns:
        The number of matching tokens as an int (0 for an empty document).
    """
    hits = 0
    for token in document:
        if token in word_list:
            hits += 1
    return hits

# Score every news article of one topic with the emotion-word lists.
def calculate_tf_in_news_on_specific_topic(topicNum, title_weight=2):
    """Score each article of a topic by emotion-word hits and print a summary.

    An article's score is
    ``title_weight * (positive - negative)`` hits in its title tokens plus
    ``(positive - negative)`` hits in its body tokens, where hits are counted
    with ``calculate_tf`` against the module-level ``good_word`` and
    ``bad_word``.

    Three things are printed:
      * ``inv_list``: ``(docID,score)`` for every article whose score is not 0
        (a score of 0 is taken to mean the article may have no stance);
      * the articles whose score equals the maximum over all articles;
      * the articles whose score equals the minimum over all articles.
    Zero-score articles are never listed, so the highest / lowest section is
    empty when the corresponding extreme is 0.

    Args:
        topicNum: topic number passed to ``get_allRelatedNewsOnSpecificTopic``.
        title_weight: multiplier applied to the title score. Defaults to 2
            because the title is weighted more heavily than the body.

    Returns:
        None. Nothing is printed here when the news could not be loaded.
    """
    allRelatedNews = get_allRelatedNewsOnSpecificTopic(topicNum)
    if allRelatedNews is None:
        # The loader returns None after printing its own error message;
        # iterating over None would raise TypeError, so stop here instead.
        return

    # Pair each article with its total score, keeping the original order.
    scored_news = []
    for news in allRelatedNews:
        title_tokens = news['title_tokens']
        text_tokens = news['text_tokens']
        score_title = title_weight * (
            calculate_tf(good_word, title_tokens) - calculate_tf(bad_word, title_tokens)
        )
        score_text = calculate_tf(good_word, text_tokens) - calculate_tf(bad_word, text_tokens)
        scored_news.append((news, score_title + score_text))

    # Compute the extremes once instead of on every loop iteration.
    scores = [score for _, score in scored_news]
    highest_score = max(scores) if scores else None
    lowest_score = min(scores) if scores else None

    inv_entries = []
    highest_entries = []
    lowest_entries = []
    for news, score in scored_news:
        if score == 0:
            # 0 means the article may have no stance; leave it out entirely.
            continue
        doc_title = news['title']
        doc_name = news['docID']
        doc_media = news['media']
        inv_entries.append(f"({doc_name},{score})")
        summary_line = f"新聞標題:{doc_title}, 媒體:{doc_media}, 文章代號:{doc_name}, 分數:{score}\n"
        if score == highest_score:
            highest_entries.append(summary_line)
        if score == lowest_score:
            lowest_entries.append(summary_line)

    inv_list = "".join(inv_entries)
    highest_score_list = "".join(highest_entries)
    lowest_score_list = "".join(lowest_entries)
    print(f"inv_list={inv_list}")
    print(f"最高分 :\n {highest_score_list}")
    print(f"最低分 :\n {lowest_score_list}")
    


In [None]:
# Test run:
# compute the article scores for every topic in the loaded data.

for topic in all_news:
    topic_number = topic['topicNum']
    calculate_tf_in_news_on_specific_topic(topic_number)
# Debugging aid: print(get_allRelatedNewsOnSpecificTopic(4))