In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import pandas as pd
from datetime import datetime, timedelta
import time
import os

In [35]:
# Start a Chrome session through Selenium; the scraping cells below drive this `browser` object.
browser = webdriver.Chrome()

### 1. 뉴스 정보(제목, url, 날짜, 분류) 수집

In [None]:
# First weekly search window and the date format used both for parsing and for the search URL.
start_day, end_day = "2017-01-20", "2017-01-26"
date_format = "%Y-%m-%d"
start_day = datetime.strptime(start_day, date_format)
end_day = datetime.strptime(end_day, date_format)

info_columns = ['title', 'link', 'date', 'section']

# Repeat for 209 consecutive weeks.
for i in range(209):
    # Label the output file with the week that is actually searched. It is taken before the
    # window is advanced and formatted as a plain date, so the file name carries the right week
    # and contains no spaces or colons (str(datetime) would add " 00:00:00").
    week_label = start_day.strftime(date_format)
    url = f'https://www.nytimes.com/search?dropmab=false&endDate={end_day.strftime(date_format)}&query=trump&sort=newest&startDate={start_day.strftime(date_format)}&types=article'
    browser.get(url)
    start_day += timedelta(days=7)
    end_day += timedelta(days=7)

    # Read the reported number of results and cap it at 100 articles per week.
    # Thousands separators are stripped so a count such as "1,234" still parses.
    result_count_text = browser.find_element(By.CLASS_NAME, 'css-nayoou').text.split(' ')[1]
    article_num = min(int(result_count_text.replace(',', '')), 100)

    # Collect rows in a plain list and build the DataFrame once at the end
    # (growing a DataFrame with pd.concat inside the loop is quadratic).
    records = []
    clicked_show_more = False
    while len(records) < article_num:
        # Scrape whatever is rendered before clicking "show more", so the results shown on the
        # initial page load are not skipped. len(records) is the number of article elements
        # already consumed, so the slice yields only the newly loaded ones.
        articles = browser.find_elements(By.CLASS_NAME, 'css-1l4w6pd')
        new_articles = articles[len(records):]
        if clicked_show_more and not new_articles:
            # The last click loaded nothing new; stop instead of looping forever.
            break
        for article in new_articles:
            title = article.find_element(By.CLASS_NAME, 'css-2fgx4k').text
            link = article.find_element(By.TAG_NAME, 'a').get_attribute('href')
            date = article.find_element(By.CLASS_NAME, 'css-17ubb9w').text
            section = article.find_element(By.CLASS_NAME, 'css-myxawk').text
            records.append([title, link, date, section])
            # Progress output
            print(f'{len(records)} / {article_num}')
        if len(records) >= article_num:
            break
        try:
            browser.find_element(By.XPATH, '//*[@id="site-content"]/div/div[2]/div[2]/div/button').click()
        except Exception as e:
            print(f"Error clicking button: {e}")
            break
        clicked_show_more = True
        # Give the page time to render the next batch of results.
        time.sleep(4)

    # Clean up and save. reset_index() (without drop=True) and the default to_csv index are kept
    # on purpose: the merge step drops the resulting 'index' and 'Unnamed: 0' columns.
    news_info = (
        pd.DataFrame(records, columns=info_columns)
        .drop_duplicates()
        .dropna()
        .reset_index()
    )
    news_info.to_csv(f'../collect_data/{week_label}_trump_news.csv')

#### csv파일 통합

In [5]:
# Merge the weekly news-info CSV files found in the current directory.
path = '.'
# Only pick up the weekly scrape outputs ("<date>_trump_news.csv"). Matching every '.csv' would
# also re-read this cell's own output (combined_csv.csv) and any other CSV saved in this
# directory when the cell is run again. Sorting makes the read order deterministic, because
# os.listdir returns names in arbitrary order.
csv_files = sorted(f for f in os.listdir(path) if f.endswith('_trump_news.csv'))
if not csv_files:
    raise FileNotFoundError(f"No '*_trump_news.csv' files found in {os.path.abspath(path)}")

data_frames = [pd.read_csv(os.path.join(path, file)) for file in csv_files]

# Clean up and save.
combined_df = pd.concat(data_frames, ignore_index=True)
# 'Unnamed: 0' (the CSV index) and 'index' (from reset_index) are leftovers of the weekly save step.
combined_df = combined_df.drop(columns=['Unnamed: 0', 'index'])
# Parse the dates before sorting so the order is chronological rather than lexicographic on
# the raw strings; a stable sort keeps the per-file order for articles sharing a date.
combined_df['date'] = pd.to_datetime(combined_df['date'])
combined_df = combined_df.sort_values(by='date', kind='stable').reset_index(drop=True)
combined_df['date'] = combined_df['date'].dt.strftime('%Y/%m/%d')
combined_df.to_csv('combined_csv.csv', index=False)


### 2. 뉴스 기사 본문 수집

In [None]:
data = []
text_df = pd.DataFrame()
# `cnt` is the number of links already processed and doubles as the resume position.
# To resume after an error or interruption, set `cnt` to a multiple of 100 (the checkpoint
# interval) before running this cell. If it was never set, start from the beginning
# instead of failing with a NameError.
try:
    cnt
except NameError:
    cnt = 0

total = len(combined_df)

# Visit every article through the `link` column and collect its body text.
for link in combined_df['link'][cnt:]:
    cnt += 1
    time.sleep(0.5)
    browser.get(link)
    # Concatenate the text of every body paragraph, each followed by a single space.
    paragraphs = browser.find_elements(By.CLASS_NAME, 'css-at9mc1')
    main_text = ''.join(paragraph.text + ' ' for paragraph in paragraphs)

    data.append({'text': main_text})
    # Progress (absolute count and fraction)
    print(f'{cnt}/{total} \n{cnt/total}')

    # Checkpoint every 100 articles. text_df accumulates everything collected since this
    # cell was started, so each checkpoint file contains all rows of the current run.
    if (len(data) % 100) == 0:
        text_df = pd.concat([text_df, pd.DataFrame(data)], ignore_index=True)
        data = []
        text_df.to_csv(f'./text_df{cnt}.csv')

# Flush the final partial batch. Append it to the accumulated frame (instead of replacing
# the frame with only the leftovers) and save it so the last rows are not lost.
if data:
    text_df = pd.concat([text_df, pd.DataFrame(data)], ignore_index=True)
    data = []
    text_df.to_csv(f'./text_df{cnt}.csv')

In [28]:
# Load one saved text CSV and display it as the cell output
# (presumably a checkpoint from the body-text scraping step; verify the file's origin).
pd.read_csv('./text_df6.csv')

Unnamed: 0.1,Unnamed: 0,text
0,0,"First, Pete Souza became an Instagram celebrit..."
1,1,[Jessica Walter has died at 80. Read her obitu...
2,2,A federal regulator on Wednesday encouraged ba...
3,3,"Through Sunday. Derek Eller, 300 Broome Street..."
4,4,"HEMPSTEAD, N.Y. — With the might of incumbency..."
...,...,...
2395,2395,
2396,2396,
2397,2397,
2398,2398,


In [33]:
# Manual resume position for the body-text scraping loop, which iterates combined_df['link'][cnt:].
# NOTE(review): this cell must be run before that loop; its position in the notebook does not reflect that.
cnt=9000

In [6]:
# Display the merged news-info table (title, link, date, section).
combined_df

Unnamed: 0,title,link,date,section
0,Text to Text: On Empathy and ‘Moral Imagination’,https://www.nytimes.com/2017/01/25/learning/le...,2017/01/25,LESSON PLANS
1,Greenpeace Activists Arrested After Hanging ‘R...,https://www.nytimes.com/2017/01/25/us/greenpea...,2017/01/25,U.S.
2,Trump Orders Mexican Border Wall to Be Built a...,https://www.nytimes.com/2017/01/25/us/politics...,2017/01/25,POLITICS
3,Trump Prepares Orders Aiming at Global Funding...,https://www.nytimes.com/2017/01/25/us/politics...,2017/01/25,POLITICS
4,A High School Defaced With ‘Trump’ and Swastikas,https://www.nytimes.com/2017/01/25/opinion/a-h...,2017/01/25,OPINION
...,...,...,...,...
20865,Congress Grants Waiver to Austin to Serve as D...,https://www.nytimes.com/2021/01/21/us/politics...,2021/01/21,POLITICS
20866,Biden’s Inauguration Scores Bigger TV Ratings ...,https://www.nytimes.com/2021/01/21/business/me...,2021/01/21,MEDIA
20867,Biden Rolls Back the Trump Legacy,https://www.nytimes.com/2021/01/21/us/politics...,2021/01/21,POLITICS
20868,Biden’s First Day,https://www.nytimes.com/2021/01/21/briefing/ex...,2021/01/21,BRIEFING


In [44]:
# Merge the body-text checkpoint files (text_df<N>.csv) found in the current directory.
path = '.'
checkpoint_prefix, checkpoint_suffix = 'text_df', '.csv'


def _checkpoint_order(file_name):
    """Sort key: numeric checkpoint counter first, then any non-numeric names alphabetically."""
    stem = file_name[len(checkpoint_prefix):-len(checkpoint_suffix)]
    return (0, int(stem), file_name) if stem.isdigit() else (1, 0, file_name)


# Match only 'text_df*.csv'. A bare startswith('text') would also pick up this cell's own
# output (text_combined_csv.csv) when the cell is run again. os.listdir returns names in
# arbitrary order, so sort by the numeric suffix to keep rows in scraping order.
csv_files = sorted(
    (f for f in os.listdir(path) if f.startswith(checkpoint_prefix) and f.endswith(checkpoint_suffix)),
    key=_checkpoint_order,
)
if not csv_files:
    raise FileNotFoundError(f"No 'text_df*.csv' files found in {os.path.abspath(path)}")

data_frames = [pd.read_csv(os.path.join(path, file)) for file in csv_files]

# The 'Unnamed: 0' column written by the checkpoints is kept; later inspection cells drop it explicitly.
text_combined_df = pd.concat(data_frames, ignore_index=True)

text_combined_df.to_csv('text_combined_csv.csv', index=False)

In [73]:
# Drop the 'Unnamed: 0' column, remove duplicate rows, and show the last remaining row.
text_combined_df.drop(columns='Unnamed: 0').drop_duplicates()[-1:]

Unnamed: 0,text
9199,WASHINGTON — In his first full cabinet meeting...


In [61]:
# Show the article URL stored under index label 9193 of the `link` column.
combined_df.link[9193]

'https://www.nytimes.com/2018/10/25/style/halloween-music-best-spotify.html?searchResultPosition=66'