In [24]:
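#Overview
#This notebook looks up the topic that a given news title belongs to, scores every news article on that topic by counting positive/negative emotion words, and appends the titles whose stance is similar to / different from the given news to a text file.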

In [25]:
import json
import string
import pandas as pd
import import_ipynb
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import re
from chinese_tokenizer import chineseWordTokenizer
from emotionalDictionaryExtractor import get_positive_emotional_tokens
from emotionalDictionaryExtractor import get_negative_emotional_tokens
from calculate_news_scores import get_allRelatedNewsOnSpecificTopic

In [26]:
# Global variables: sentiment lexicons used by the scoring functions in this notebook.
good_word = get_positive_emotional_tokens() # all positive-emotion tokens
bad_word = get_negative_emotional_tokens()  # all negative-emotion tokens

In [27]:
# Load the news data from the JSON file into the module-level `all_news`.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding (e.g. cp950 / cp1252 on Windows), which raises or
# garbles text when the JSON file is UTF-8 and contains Chinese characters.
with open('newsDatas.json', encoding='utf-8') as f:
    all_news = json.load(f)

In [28]:
def get_all_same_topic_news(title:str):
    """Return the topic number of the topic that contains a news item titled `title`.

    Despite its name, this function returns a single ``topicNum`` value, not a
    list of news. It scans the module-level ``all_news`` in order and stops at
    the first news item whose ``'title'`` equals `title`.

    Returns:
        The matching topic's ``'topicNum'``; ``None`` when no news item has
        that title; ``0`` when any exception is raised during the scan (the
        error is printed rather than propagated).
    """
    try:
        # Lazily walk every (topic, news) pair; next() stops at the first hit
        # and falls back to None when the title is not present anywhere.
        matching_topic_numbers = (
            topic_entry['topicNum']
            for topic_entry in all_news
            for news_item in topic_entry['allRelatedNews']
            if news_item['title'] == title
        )
        return next(matching_topic_numbers, None)
    except Exception as e:
        print("尋找同主題的新聞標題時發生錯誤: {}".format(e))
        return 0

In [29]:
# Count how many tokens of a document belong to a word list.
def calculate_tf(word_list, document):
    """Return the number of items in `document` that are contained in `word_list`.

    Every occurrence is counted, so a token that appears twice in `document`
    and is in `word_list` contributes 2. An empty `document` yields 0.
    """
    hits = 0
    for token in document:
        if token in word_list:
            hits += 1
    return hits

# Score every news article of one topic and record which ones share the stance of `title`.
def calculate_tf_in_news_on_specific_topic(title,topicNum):
    """Classify a topic's news by sentiment score and append the result to a text file.

    Each news item returned by ``get_allRelatedNewsOnSpecificTopic(topicNum)``
    is scored as::

        2 * (positive hits - negative hits in 'title_tokens')
          + (positive hits - negative hits in 'text_tokens')

    where hits are counted with ``calculate_tf`` against the module-level
    ``good_word`` / ``bad_word`` lists. Items with a score below 0 are treated
    as negative, all others (including 0) as positive.

    The titles on the same side as the news titled `title` are written as
    "similar stance", the others as "different stance". If no item is titled
    `title`, only the section headers are written. When several items share
    that title, the last one decides the stance.

    Args:
        title: Title of the reference news item.
        topicNum: Topic identifier passed on to
            ``get_allRelatedNewsOnSpecificTopic``.

    Returns:
        None. The report is appended to "相似(異)立場文章.txt" in the current
        working directory.
    """
    allRelatedNews = get_allRelatedNewsOnSpecificTopic(topicNum)

    pos_news = []  # titles whose total score is >= 0
    neg_news = []  # titles whose total score is < 0
    p_n = 0        # stance of `title`: 1 = non-negative, -1 = negative, 0 = not found

    for news in allRelatedNews:
        title_tokens = news['title_tokens']
        text_tokens = news['text_tokens']

        # The title carries more weight, so its contribution is doubled.
        score_title = 2*calculate_tf(good_word, title_tokens) - 2*calculate_tf(bad_word, title_tokens)
        score_text = calculate_tf(good_word, text_tokens) - calculate_tf(bad_word, text_tokens)
        total_score = score_title + score_text

        is_negative = total_score < 0
        (neg_news if is_negative else pos_news).append(news['title'])
        if news['title'] == title:
            p_n = -1 if is_negative else 1

    if p_n == 1:
        similar, different = pos_news, neg_news
    elif p_n == -1:
        similar, different = neg_news, pos_news
    else:
        # Reference title not found in this topic: emit the headers only.
        similar, different = [], []

    # `with` guarantees the handle is closed even if a write fails, and the
    # explicit UTF-8 encoding keeps the Chinese text intact regardless of the
    # platform's locale encoding.
    with open("相似(異)立場文章.txt", "a", encoding="utf-8") as txt_file:
        txt_file.write(f"與「{title}」立場相似的新聞:\n")
        txt_file.write("=============================\n")
        txt_file.writelines(f"{news_title}\n" for news_title in similar)
        txt_file.write("-----------------------------\n")
        txt_file.write(f"與 「{title}」立場相異的新聞:\n")
        txt_file.write("=============================\n")
        txt_file.writelines(f"{news_title}\n" for news_title in different)
        txt_file.write("****************************\n")
    
    


In [30]:
def get_all_news(title):
    """Look up the topic of the news titled `title` and run the stance report for it.

    The topic number comes from ``get_all_same_topic_news(title)`` and is
    passed, together with `title`, to
    ``calculate_tf_in_news_on_specific_topic``. Returns None.
    """
    topic_number = get_all_same_topic_news(title)
    calculate_tf_in_news_on_specific_topic(title, topic_number)
    

In [32]:
# Run the stance report for each sample headline, in order.
sample_titles = (
    "民眾黨募3500萬秒開缺！網見「1職缺」狠吐槽：沒人才了？",
    "侯友宜最強副手出爐？郭正亮點名1 人斷言：可接年輕人地氣",
)
for sample_title in sample_titles:
    get_all_news(sample_title)
