# Crawler

In [1]:
"""
crawl the yahoo news and get the articles titles, contents, and pictures
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Yahoo News URL

url = "https://tw.news.yahoo.com/"

# Fetch the page content

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

# Scrape headlines and links

topics = soup.find_all("li", class_="Pos(r) Lh(1.5) H(24px) Mb(8px)")

print(topics)

articles = []

for topic in topics:
    topic_title = topic.find("a").string
    topic_link = topic.find("a")["href"]
        
    # Fetch the article content
    
    article_response = requests.get(topic_link)
    
    article_soup = BeautifulSoup(article_response.content, 'html.parser')
    
    content = "\n".join([p.get_text(strip=True) for p in article_soup.find_all('p') if not p.get_text(strip=True).startswith('更多')])
    
    # Fetch the article image
    
    image_tag = article_soup.find("img", loading_="lazy")
    
    if image_tag:
        image_url = image_tag["src"]
        print("img saved!")
    else:
        image_url = None
    
    # Append to the list
    
    articles.append({"Title": topic_title, "Link": topic_link, "Content": content, "Image": image_url})

# Save to csv
df = pd.DataFrame(articles)
df.to_csv("yahooNews.csv", index=False)


[<li class="Pos(r) Lh(1.5) H(24px) Mb(8px)"><svg class="H(24px) Mend(8px) Cur(p)" data-icon="moon-new-moon" height="3" style="vertical-align:top;fill:#979ba7;stroke:#979ba7;stroke-width:0" viewbox="0 0 48 48" width="3"><path d="M45.75 24.87c0 11.802-9.574 21.376-21.376 21.376C12.57 46.246 3 36.676 3 24.87 3 13.068 12.57 3.497 24.374 3.497c11.802.002 21.376 9.57 21.376 21.375"></path></svg><a class="D(ib) Ov(h) Whs(nw) C($c-fuji-grey-l) C($c-fuji-blue-1-c):h Td(n) Fz(16px) Tov(e) Fw(700)" href="https://tw.news.yahoo.com/%E5%81%A5%E4%BF%9D%E5%81%9C%E5%BE%A9%E4%BF%9D1223%E5%BB%A2%E6%AD%A2%EF%BC%81%E5%81%A5%E4%BF%9D%E5%81%9C%E5%BE%A9%E4%BF%9D%E6%96%B0%E5%88%B6%E3%80%81%E5%BD%B1%E9%9F%BF6%E5%A4%A7qa%E4%B8%80%E6%AC%A1%E7%9C%8B-095603090.html" style="max-width:265px;width:calc(100% - 11px - 0px)">健保停復保今起廢止 影響一次看</a></li>, <li class="Pos(r) Lh(1.5) H(24px) Mb(8px)"><svg class="H(24px) Mend(8px) Cur(p)" data-icon="moon-new-moon" height="3" style="vertical-align:top;fill:#979ba7;stroke:#979ba7;s

In [2]:
import pandas as pd 

# Read from csv
df = pd.read_csv("yahooNews.csv")
sample = df.sample(1)

link = sample["Link"].values[0]
article_response = requests.get(link)
article_soup = BeautifulSoup(article_response.content, 'html.parser')
print(link)
title = article_soup.find('h1').get_text(strip=True)
caasbody = article_soup.find('div', class_='caas-body')
artical_content = ""
caasbody = caasbody.find_all(['p', 'img'])
filtered_data = []

for element in caasbody:
    if element.name == 'img':  # 如果是 <img> 標籤
        if element.has_attr('src'):  # 確保有 src 屬性
            img_data = {
                "type": "image",
                "src": element['src'],
                "alt": element.get('alt', '無描述')  # 如果有 alt 屬性，提取它
            }
            filtered_data.append(img_data)

    elif element.name == 'p':  # 如果是 <p> 標籤
        text = element.get_text(strip=True)  # 提取段落文字
        if not (text.startswith("看更多") or text.startswith("更多")):  # 過濾不需要的段落
            paragraph_data = {
                "type": "paragraph",
                "text": text
            }
            filtered_data.append(paragraph_data)

    
news = {"Title": title, "Content": filtered_data}
news

https://tw.news.yahoo.com/%E9%A2%A8%E8%A9%95-%E8%B3%B4%E6%B8%85%E5%BE%B7%E4%B8%8D%E6%92%9E%E5%8D%97%E7%89%86%E4%B8%8D%E5%9B%9E%E9%A0%AD-%E6%92%9E%E4%BA%86%E5%8D%97%E7%89%86%E4%B9%8B%E5%BE%8C%E5%91%A2-235001252.html


{'Title': '風評：賴清德不撞南牆不回頭，撞了南牆之後呢？',
 'Content': [{'type': 'image',
   'src': 'https://s.yimg.com/ny/api/res/1.2/XthQMbWvrCQJ8uNTv8DYxw--/YXBwaWQ9aGlnaGxhbmRlcjt3PTk2MDtoPTY0MA--/https://media.zenfs.com/ko/stormmediagroup.com/8eb8f976e2ea35a54ffab42e5e05c9b3',
   'alt': ''},
  {'type': 'paragraph',
   'text': '在場內打架場外抗議聲中，立法院連日、徹夜表決通過三項法案，民進黨府、院、黨團皆發出聲明表達遺憾，並強調將「窮盡一切救濟手段」，「依照憲法所賦予的權力，守護自由民主的憲政體制。」意思是，三項法案爭議並未畫上點，相反的，又要重新步上覆議、聲請釋憲、甚至不公布實施的覆轍。'},
  {'type': 'paragraph',
   'text': '做為少數總統，賴清德始終不肯正視少數執政必須面對的現實，他顧念場外被動員號召而來的「青鳥」，却無視立法院六成非民進黨立委代表的民意和訴求；三項法案審議過程中，民進黨從未認真思考協商的可能性，遑論提出對案版本，即使民進黨過去也曾有過相似的主張；即使三項法案三讀通過，究其內容，亦絕非民進黨中中「毀憲亂政」、「國家存亡」、「剝奪人民權益」、危害國家安全」，事實上，通過的版本於現狀影響實在不大。'},
  {'type': 'paragraph',
   'text': '以選罷法修正為例，只通過了罷免連署應附上身份證影本的規定，基本合理，就像獨立參選總統連署也要身份證影本；至於提高罷免門檻（罷免票數高於當選票數）因為民眾黨並不同意，此案還在朝野協商冷凍期，一個月後立法院休會，何時能見天日尚未可知，就算拉出來，民眾黨不同意還是過不了，民進黨大駡藍白政黨剝奪公民罷免權益的同時，大概忘了多數執政時期硬生生取消公投綁大選，還強硬規定兩年一公投的限期，才是紮紮實實剝奪了直接民意表達的生機。'},
  {'type': 'paragraph',
   'text': '憲訴法修正同樣未依照釋憲判決最嚴格的國民黨（翁曉玲）版本，而是採取

# YOLO

# App view

In [3]:
import sys
import requests
from io import BytesIO
from PyQt5.QtWidgets import (
    QApplication, QMainWindow, QVBoxLayout, QLabel, QPushButton,
    QScrollArea, QWidget, QHBoxLayout
)
from PyQt5.QtGui import QPixmap
from PyQt5.QtCore import Qt

class NewsApp(QMainWindow):
    def __init__(self):
        super().__init__()
        
        self.df = pd.read_csv("yahooNews.csv")

        # 窗口設置
        self.setWindowTitle("Yahoo新聞資料庫")
        self.resize(1200, 600)

        # 主容器
        main_widget = QWidget()
        main_layout = QHBoxLayout(main_widget)

        # 左側控制區域
        control_layout = QVBoxLayout()
        btn_random = QPushButton("Random News")
        btn_store = QPushButton("Store To Database")
        control_layout.addWidget(btn_random)
        control_layout.addWidget(btn_store)
        control_layout.addStretch()  # 增加空白區域
        main_layout.addLayout(control_layout, 1)
        
        btn_random.clicked.connect(self.load_news)
        btn_store.clicked.connect(self.store_to_database)

        # 右側新聞顯示區域
        self.scroll_area = QScrollArea()
        self.scroll_area.setWidgetResizable(True)
        self.scroll_content = QWidget()
        self.scroll_layout = QVBoxLayout(self.scroll_content)
        self.scroll_area.setWidget(self.scroll_content)
        main_layout.addWidget(self.scroll_area, 4)

        self.setCentralWidget(main_widget)
        self.news = news
        # 加載新聞
        self.load_news(self.news)

    def load_news(self, news):
        
        sample = df.sample(1)

        link = sample["Link"].values[0]
        article_response = requests.get(link)
        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        #print(link)
        title = article_soup.find('h1').get_text(strip=True)
        caasbody = article_soup.find('div', class_='caas-body')
        caasbody = caasbody.find_all(['p', 'img'])
        filtered_data = []

        for element in caasbody:
            if element.name == 'img':  # 如果是 <img> 標籤
                if element.has_attr('src'):  # 確保有 src 屬性
                    img_data = {
                        "type": "image",
                        "src": element['src'],
                        "alt": element.get('alt', '無描述')  # 如果有 alt 屬性，提取它
                    }
                    filtered_data.append(img_data)

            elif element.name == 'p':  # 如果是 <p> 標籤
                text = element.get_text(strip=True)  # 提取段落文字
                if not (text.startswith("看更多") or text.startswith("更多")):  # 過濾不需要的段落
                    paragraph_data = {
                        "type": "paragraph",
                        "text": text
                    }
                    filtered_data.append(paragraph_data)

            
        self.news = {"Title": title, "Content": filtered_data}       
        
        # 清空舊內容
        for i in reversed(range(self.scroll_layout.count())):
            widget = self.scroll_layout.itemAt(i).widget()
            if widget is not None:
                widget.deleteLater()

        # 添加新聞標題
        title_label = QLabel(self.news["Title"])
        title_label.setStyleSheet("font-size: 18px; font-weight: bold;")
        self.scroll_layout.addWidget(title_label)

        # 添加新聞內容
        for content in self.news["Content"]:
            if content["type"] == "paragraph":
                self.add_paragraph(content["text"])
            elif content["type"] == "image":
                self.add_image(content["src"], content.get("alt", ""))

        # 添加空白填充，防止滾動條過早出現
        self.scroll_layout.addStretch()

    def add_paragraph(self, text):
        """添加段落文字"""
        paragraph_label = QLabel(text)
        paragraph_label.setWordWrap(True)  # 自動換行
        paragraph_label.setStyleSheet("font-size: 14px; margin: 10px 0;")
        self.scroll_layout.addWidget(paragraph_label)

    def add_image(self, src, alt):
        """添加圖片及其描述"""
        try:
            # 獲取圖片
            response = requests.get(src)
            pixmap = QPixmap()
            pixmap.loadFromData(BytesIO(response.content).read())

            # 添加圖片
            image_label = QLabel()
            image_label.setPixmap(pixmap.scaledToWidth(750, Qt.SmoothTransformation))
            self.scroll_layout.addWidget(image_label)

            # 添加圖片描述
            if alt:
                alt_label = QLabel(alt)
                alt_label.setStyleSheet("font-size: 12px; color: gray;")
                self.scroll_layout.addWidget(alt_label)
        except Exception as e:
            error_label = QLabel("圖片載入失敗")
            error_label.setStyleSheet("color: red;")
            self.scroll_layout.addWidget(error_label)
            
    def store_to_database(self):
        """
        將目前新聞儲存至MongoDb
        """
        from pymongo import MongoClient
        client = MongoClient("mongodb://localhost:27017/")
        db = client["news_database"]
        collection = db["news_collection"]
        
        # 取得目前新��
        
        pass

if __name__ == "__main__":
    app = QApplication(sys.argv)
    window = NewsApp()
    window.show()
    sys.exit(app.exec_())


  class NewsApp(QMainWindow):


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [1]:
"""
將目前新聞儲存至MongoDb
"""
from pymongo import MongoClient

client = MongoClient("mongodb+srv://yuanchan1837:yuanchan1837@cluster0.dc6xs.mongodb.net/")

db = client["yahoo_news_database"]

collection = db["yahoo_news_collection"]


collection.insert_one(news)

print("新聞已儲存至 MongoDB")

client.close()


NameError: name 'news' is not defined