In [None]:
# Run parameters for this notebook.
# NOTE(review): presumably overridden by whatever schedules/executes the notebook — confirm.
# Deployment phase label; not referenced by any other cell visible in this notebook.
deploy_phase = "local"
# ISO 8601 timestamp with a UTC offset; parsed into `today_datetime` in a later cell.
execution_date = '2024-07-06T00:00:00+09:00'
# News search phrases. The last space-separated word of each phrase is stored
# as the `category` of every article scraped for that phrase.
queries = ["서울 지하철 파업", "서울 지하철 연착", "서울 지하철 지연", "서울 지하철 사고", "서울 지하철 연장"]

# Notebook Initialization

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys

In [None]:
# sys.path.insert(0, "/workspace/ojitong")

# Global Const

In [None]:
from datetime import datetime

def convert_to_datetime(date_str: str) -> datetime:
    """Parse an ISO 8601 formatted string into a ``datetime``.

    Args:
        date_str: Timestamp text accepted by ``datetime.fromisoformat``,
            e.g. ``'2024-07-06T00:00:00+09:00'``.

    Returns:
        The parsed ``datetime``. It is timezone-aware when ``date_str``
        carries a UTC offset and naive otherwise.

    Raises:
        ValueError: If ``date_str`` is not a valid ISO format string.
    """
    parsed = datetime.fromisoformat(date_str)
    return parsed

# Parse the run's execution timestamp once; later cells pass it to the scraping pipeline.
today_datetime = convert_to_datetime(execution_date)
# Bare expression so the notebook displays the parsed value.
today_datetime

# Knowledge Generation

1) 뉴스 URL 가져오기
  - 네이버 뉴스에서 주어진 검색어로 뉴스 기사의 URL을 가져옵니다.

2) 뉴스 기사 스크래핑
  - 추출한 URL에서 뉴스 기사의 세부 정보를 스크래핑합니다.

3) 파일 S3 업로드

In [None]:
from typing import List
import numpy as np
import pandas as pd
from datetime import datetime
from workflow.data_preparation.news_scraper.data_enrichment import update_article_texts, update_kbs_titles, fill_missing_publish_dates
from workflow.data_preparation.news_scraper.news_url_fetcher import format_today_date, create_search_url, fetch_html, get_news_urls
from workflow.data_preparation.news_scraper.news_scraper import scrape_news_articles
from workflow.data_preparation.news_scraper.webdriver_util import initialize_webdriver
from data.utils.file import generate_s3_path

def fetch_news_articles(queries, today_datetime):
    """Collect scraped article records for every search query.

    For each query, the news URLs are looked up with ``get_news_urls`` and
    handed to ``scrape_news_articles``. Every returned record is tagged in
    place with a ``'category'`` entry holding the last space-separated word
    of the query.

    Args:
        queries: Iterable of search phrases.
        today_datetime: Date reference forwarded unchanged to
            ``get_news_urls``.

    Returns:
        A single list containing the records of all queries, in query order.
    """
    collected = []
    for query in queries:
        scraped = scrape_news_articles(get_news_urls(query, today_datetime))
        for record in scraped:
            # The trailing word of the search phrase doubles as the category label.
            category = query.split(" ")[-1]
            record['category'] = category
        collected.extend(scraped)
    return collected

def initialize_dataframe(articles_data):
    """Build a DataFrame from article records, marking empty strings as missing.

    Args:
        articles_data: Record collection accepted by ``pd.DataFrame``
            (e.g. a list of dicts).

    Returns:
        A new DataFrame in which every ``''`` value is replaced by ``NaN``.
    """
    # Empty strings become NaN so that missing fields are detectable via isna().
    return pd.DataFrame(articles_data).replace('', np.nan)

def fetch_and_process_data(queries: List[str], today_datetime: datetime):
    """Scrape news for the queries, fill in missing fields, and save a CSV.

    The pipeline runs in three stages, printing a banner and a preview of
    the data along the way:

    1. Scrape the articles for every query and load them into a DataFrame.
    2. With a single webdriver session, run the enrichment passes
       ``update_article_texts``, ``fill_missing_publish_dates`` and
       ``update_kbs_titles``. Publish dates that are still missing after the
       second pass fall back to ``today_datetime``.
    3. Write the result as CSV to the path from ``generate_s3_path``.

    Args:
        queries: Search phrases to scrape.
        today_datetime: Date reference for the run; used for the URL lookup,
            the publish-date fallback and the output path.

    Returns:
        The enriched DataFrame that was written out.
    """
    # Stage 1: fetch the news articles.
    print("*** 1. 뉴스 기사 패치 ***")
    frame = initialize_dataframe(fetch_news_articles(queries, today_datetime))
    print(frame.head())

    # Stage 2: fill in missing values.
    print("*** 2. 결측치 보강 ***")

    def _fill_publish_dates(data, browser):
        # Look the dates up first, then default whatever is still missing
        # to the run's own timestamp.
        data = fill_missing_publish_dates(data, browser)
        data["publish_date"] = data["publish_date"].fillna(today_datetime.strftime('%Y-%m-%d %H:%M:%S%z'))
        return data

    enrichment_steps = (
        ("*** 2-1. update_article_texts ***", update_article_texts),
        ("*** 2-2. fill_missing_publish_dates ***", _fill_publish_dates),
        ("*** 2-3. update_kbs_titles ***", update_kbs_titles),
    )

    browser = initialize_webdriver()
    try:
        for banner, step in enrichment_steps:
            print(banner)
            frame = step(frame, browser)
            print(frame.head())
    finally:
        # Always release the webdriver, even when an enrichment pass fails.
        browser.quit()

    # Stage 3: upload the file to S3.
    destination = generate_s3_path(today_datetime=today_datetime)
    frame.to_csv(destination, index=False)
    print(f"*** Data saved to {destination} ***")
    return frame

In [None]:
# Run the whole pipeline (scrape, enrich, save) for the configured queries and date.
df = fetch_and_process_data(queries, today_datetime)

In [None]:
# Bare expression so the notebook renders the resulting DataFrame.
df