In [None]:
import os

import requests

# Smoke test: request the first result page for one keyword and list the article URLs.
api_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
params = {
    "q": "education",
    # The key is read from the NYT_API_KEY environment variable so it is never stored
    # in the notebook. If it is unset, the request is rejected and reported below.
    "api-key": os.environ.get("NYT_API_KEY", ""),
    "page": 0,
    "pageSize": 10
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(api_url, params=params, timeout=30)

if response.status_code == 200:
    data = response.json()
    articles = data.get("response", {}).get("docs", [])
    print(f"找到 {len(articles)} 篇文章")
    for article in articles:
        print(article.get("web_url"))
else:
    print(f"请求失败，状态码: {response.status_code}")

## Fetch News Pipeline

In [None]:
!pip install news-please

In [None]:
import os
import time

import requests
from newsplease import NewsPlease

# The NYTimes API key is read from the NYT_API_KEY environment variable so that it is
# never stored in the notebook. An empty key makes the API reject every request.
API_KEY = os.environ.get("NYT_API_KEY", "")
BASE_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

def fetch_news(query="education", total_articles=20, max_retries=8,
               retry_delay=200, page_delay=10, timeout=30):
    """
    Fetch article metadata from the NYTimes Article Search API and scrape each article's text.

    Result pages are requested one at a time starting at page 0, using the module-level
    API_KEY and BASE_URL. A page whose request or JSON decoding fails is retried up to
    max_retries times; if it still fails, or a page comes back empty, fetching stops and
    whatever was collected so far is returned.

    Args:
        query (str): The search keyword.
        total_articles (int): Maximum number of articles to collect.
        max_retries (int): Maximum number of attempts per page.
        retry_delay (float): Seconds to wait between two attempts for the same page.
        page_delay (float): Seconds to wait before requesting the next page.
        timeout (float): Timeout in seconds for each API request.

    Returns:
        dict: Parallel lists under the keys 'id', 'title', 'url', 'section' and 'content'.
            'content' holds "No content available" when the parser returned no main text
            and "Failed to fetch content." when scraping the article raised an error.
    """
    dataset = {
        "id": [],
        "title": [],
        "url": [],
        "section": [],
        "content": []
    }

    current_articles = 0
    current_page = 0

    while current_articles < total_articles:
        articles = None
        fetched = False

        # Try the current page; the attempt counter restarts for every page.
        for attempt in range(1, max_retries + 1):
            params = {
                "q": query,
                "api-key": API_KEY,
                "page": current_page,
            }
            try:
                response = requests.get(BASE_URL, params=params, timeout=timeout)
                response.raise_for_status()
                data = response.json()
                # `or` guards against explicit nulls in the payload.
                articles = (data.get("response") or {}).get("docs") or []
                fetched = True
                break
            except requests.exceptions.RequestException as e:
                print(f"⚠️ Request failed for page {current_page} (retry {attempt}/{max_retries}): {e}")
            except ValueError as e:
                print(f"⚠️ JSON parsing failed for page {current_page} (retry {attempt}/{max_retries}): {e}")

            # Only wait when another attempt will actually follow.
            if attempt < max_retries:
                time.sleep(retry_delay)

        if not fetched:
            print(f"❌ Failed to fetch page {current_page} after {max_retries} retries.")
            break

        if not articles:
            print(f"❌ No more articles found on page {current_page}.")
            break

        stored_from_page = 0
        for article in articles:
            dataset["id"].append(article.get("_id"))
            headline = article.get("headline") or {}
            dataset["title"].append(headline.get("main") or "No Title")
            url = article.get("web_url", "No URL")
            dataset["url"].append(url)
            dataset["section"].append(article.get("section_name", "Unknown"))

            # Scrape the article body.
            try:
                parsed_article = NewsPlease.from_url(url)
                # `maintext` can exist but be None; store the placeholder in that case
                # so the row can later be found by comparing against that string.
                content = getattr(parsed_article, "maintext", None) or "No content available"
            except Exception as e:
                print(f"⚠️ Failed to fetch content from {url}: {e}")
                content = "Failed to fetch content."

            dataset["content"].append(content)

            current_articles += 1
            stored_from_page += 1
            if current_articles >= total_articles:
                break

        print(f"✅ Fetched {stored_from_page} articles from page {current_page}.")
        current_page += 1
        # Throttle only when another page will be requested.
        if current_articles < total_articles:
            time.sleep(page_delay)

    print(f"✅ Total fetched articles: {current_articles}")
    return dataset

In [None]:
import pandas as pd


# Topic keywords noted for the crawl:
#   technology, education, business, environment, economy, sports, health, medical,
#   science, international, national, entertainment, society, culture, law, food, space,
#   internet, fashion
# Only the keywords listed in `queries` are fetched when this cell runs.
queries = ["society", "culture", "law", "food", "space", "internet", "fashion"]
for query in queries:
    news_dataset = fetch_news(query=query, total_articles=1000)
    df = pd.DataFrame(news_dataset)

    # One CSV per keyword, written relative to the current working directory.
    csv_name = f"{query}_news_data.csv"
    df.to_csv(csv_name, index=False)

    print(f"✅ Dataset Saved to {query}_news_data.csv")

In [None]:
# Print title and body of up to five articles from the dataset left by the last crawl.
preview_count = min(5, len(news_dataset["content"]))
for i in range(preview_count):
    print(f"{i+1}. {news_dataset['title'][i]}\n{news_dataset['content'][i]}")

## Concat

In [None]:
import os
import pandas as pd

folder_path = "/kaggle/input/rag-dataset"

dfs = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order of the
# merged file reproducible between runs.
# NOTE(review): every .csv in the folder is merged. Other cells read
# merged_data_updated.csv and nytimes_cleaned_data_2020.csv from this same folder —
# confirm those are not meant to be excluded here.
for file in sorted(os.listdir(folder_path)):
    if file.endswith(".csv"):
        file_path = os.path.join(folder_path, file)

        df = pd.read_csv(file_path)

        # The file name without its extension replaces whatever was in `section`.
        section_name = os.path.splitext(file)[0]

        df["section"] = section_name

        dfs.append(df)

if dfs:
    merged_df = pd.concat(dfs, ignore_index=True)

    output_path = os.path.join("/kaggle/working/", "New_York_Times.csv")
    merged_df.to_csv(output_path, index=False)
    print(f"合并后的文件已保存至: {output_path}")
else:
    # `merged_df` is not defined on this path, so cells that use it will fail.
    print("未找到 CSV 文件")

In [None]:
# Sanity check of the merged frame: (rows, columns), then the first rows as rich output.
print(merged_df.shape)
merged_df.head()

In [None]:
# Plain-text dump of the `url` column (pandas truncates long Series when printing).
print(merged_df["url"])

## Fix the Error Fetching Problem

In [None]:
# !pip install lxml_html_clean
!pip install --no-cache-dir newspaper3k==0.2.8

In [None]:
!pip install lxml_html_clean

In [None]:
import os
import pandas as pd
import time
import random
from newspaper import Article

folder_path = "/kaggle/working"

merged_file_path = os.path.join("/kaggle/input/rag-dataset", "merged_data_updated.csv")
merged_df = pd.read_csv(merged_file_path)

# Rows to retry: the explicit placeholder, plus empty cells, which read_csv loads as NaN
# and which an equality test against the placeholder would miss.
mask = merged_df["content"].isna() | (merged_df["content"] == "No content available")
print(f"🔎 需要重新爬取 {mask.sum()} 篇文章")

for idx in merged_df[mask].index:
    url = merged_df.at[idx, "url"]
    try:
        article = Article(url)

        article.download()
        article.parse()

        content = article.text if article.text else "No content available"

        if content != "No content available":
            print(f"✅ 成功爬取 {url}")
        else:
            print(f"⚠️ No content available agiain: {url}")

    except Exception as e:
        # Best effort: tag the row as failed and keep going with the remaining URLs.
        print(f"❌ Fail to fetch content again {url}: {e}")
        content = "Failed to fetch content."

    merged_df.at[idx, "content"] = content
    # Random pause between requests.
    time.sleep(random.uniform(2, 5))

updated_file_path = os.path.join("/kaggle/working/", "merged_data_updated.csv")
merged_df.to_csv(updated_file_path, index=False)

print(f"🎉 更新后的数据已保存至: {updated_file_path}")

In [None]:
# Writes `merged_df` to the same path, with the same arguments, as the end of the
# previous cell; lets the current in-memory frame be saved without re-running the crawl.
updated_file_path = os.path.join("/kaggle/working/", "merged_data_updated.csv")
merged_df.to_csv(updated_file_path, index=False)

print(f"🎉 更新后的数据已保存至: {updated_file_path}")

In [None]:
import os
import pandas as pd
from newsplease import NewsPlease
import time
import random
from scrapy.settings import Settings

# NOTE(review): `custom_settings` is built here but never handed to NewsPlease below, so
# this User-Agent has no visible effect on the requests made in this cell — confirm intent.
custom_settings = Settings()
custom_settings.set("DEFAULT_REQUEST_HEADERS", {
    "User-Agent": "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36"
})

folder_path = "/kaggle/working"

merged_file_path = os.path.join("/kaggle/working", "merged_data_updated.csv")
merged_df = pd.read_csv(merged_file_path)

# Rows to retry: the explicit placeholder, plus empty cells, which read_csv loads as NaN
# and which an equality test against the placeholder would miss.
mask = merged_df["content"].isna() | (merged_df["content"] == "No content available")
print(mask.sum())

for idx in merged_df[mask].index:
    url = merged_df.at[idx, "url"]
    try:
        parsed_article = NewsPlease.from_url(url)
        # A parsed article can carry `maintext=None`; treat that, like a missing result,
        # as "no content" so it is neither stored as None nor reported as a success.
        content = getattr(parsed_article, "maintext", None) or "No content available"
        if content != "No content available":
            print(f"✅ Fetched content for {url}")
        else:
            print(f"⚠️ Failed to fetch content again.")
    except Exception as e:
        # Best effort: tag the row as failed and keep going with the remaining URLs.
        print(f"⚠️ Failed to fetch content from {url}: {e}")
        content = "Failed to fetch content."

    merged_df.at[idx, "content"] = content
    # Random pause between requests.
    time.sleep(random.uniform(2, 6))

updated_file_path = os.path.join("/kaggle/working/", "merged_data_updated.csv")
merged_df.to_csv(updated_file_path, index=False)

print(f"🎉 更新后的数据已保存至: {updated_file_path}")

In [None]:
# Writes `merged_df` to the same path, with the same arguments, as the end of the
# previous cell; lets the current in-memory frame be saved without re-running the crawl.
updated_file_path = os.path.join("/kaggle/working/", "merged_data_updated.csv")
merged_df.to_csv(updated_file_path, index=False)

print(f"🎉 更新后的数据已保存至: {updated_file_path}")

## Data Cleaning

In [None]:
import pandas as pd
import os

# Load the merged articles for cleaning; every cell in this section mutates `df`.
# NOTE(review): the re-crawl cells save to /kaggle/working, while this reads the copy
# under /kaggle/input/rag-dataset — confirm that copy is the up-to-date file.
path = os.path.join("/kaggle/input/rag-dataset", "merged_data_updated.csv")
df = pd.read_csv(path)
df.head()

In [None]:
# Remove rows whose content is the "fetch failed" placeholder.
failed_mask = df["content"] == "Failed to fetch content."
print("Error page count: ", len(df[failed_mask]))

error_indices = df.loc[failed_mask].index
df.drop(error_indices, inplace=True)

print("Remaining page count: ", len(df))

In [None]:
# Low-quality content: rows titled "Entertainment Events Today" are inspected, then dropped.
entertainment_df = df[df["title"] == "Entertainment Events Today"]

# Print the second matching article as a sample, but only when it exists; indexing it
# unconditionally raises when fewer than two rows match.
if len(entertainment_df) > 1:
    print(entertainment_df["content"].iloc[1])
display(entertainment_df)

error_indices = entertainment_df.index
df.drop(error_indices, inplace=True)

print("Remaining page count: ", len(df))

In [None]:
# De-duplicate on the 'content' column.

# Show every row whose content occurs more than once (all copies included).
shares_content = df.duplicated(subset="content", keep=False)
duplicate_df = df.loc[shares_content]
print("Rows with duplicate 'content':")
display(duplicate_df)

# Every copy after the first occurrence of a given content is removed.
is_repeat = df.duplicated(subset="content", keep="first")
duplicate_indices = df.loc[is_repeat].index
df.drop(duplicate_indices, inplace=True)

print("Remaining page count: ", len(df))

In [None]:
# Rows with a missing value in any critical field are shown and then removed.
critical_fields = ["content"]

has_missing = df[critical_fields].isna().any(axis="columns")
missing_df = df.loc[has_missing]
print("Rows with missing critical fields:")
display(missing_df)

missing_indices = missing_df.index
df.drop(index=missing_indices, inplace=True)

print("Remaining page count after removing rows with missing critical fields:", len(df))

In [None]:
# Whitespace-separated word count per article, then the mean over all articles.
df["word_count"] = [len(text.split()) for text in df["content"]]
average_word_count = df["word_count"].mean()

print("Average word count:", average_word_count)

In [None]:
# Save the cleaned DataFrame to a CSV file without the index column.
# The path is relative, so the file lands in the kernel's current working directory.
df.to_csv("nytimes_cleaned_data.csv", index=False)
print("DataFrame has been saved as 'nytimes_cleaned_data.csv'.")

## Further Insight

In [None]:
import pandas as pd
import os

path = "/kaggle/input/rag-dataset/nytimes_cleaned_data_2020.csv"
df = pd.read_csv(path)
# A bare `df.head()` in the middle of a cell renders nothing; display() shows it.
display(df.head())

# Print up to the first three article bodies; .head(3) also copes with shorter frames.
for text in df["content"].head(3):
    print(text)
    print("=" * 50)

In [None]:
# Fixed block of text to strip from article bodies.
extra_text = (
    "Thank you for your patience while we verify access. If you are in Reader mode please exit and log into your Times account, or subscribe for all of The Times.\n"
    "Thank you for your patience while we verify access.\n"
    "Already a subscriber? Log in.\n"
    "Want all of The Times? Subscribe."
)


def remove_extra_text(content):
    """
    Return `content` truncated at the first occurrence of `extra_text`.

    Args:
        content: Article body. Non-string values (for example the NaN that pandas uses
            for empty cells) are returned unchanged instead of raising.

    Returns:
        Everything before the first occurrence of `extra_text`, or `content` itself when
        the notice is absent or `content` is not a string.
    """
    if not isinstance(content, str):
        return content
    # partition() yields the whole string as the head when the separator is not found.
    head, _, _ = content.partition(extra_text)
    return head

# Apply the function to every value in the DataFrame's "content" column.
df["content"] = df["content"].apply(remove_extra_text)

In [None]:
import random

# Number of randomly chosen articles to print (drawn with replacement).
n = 3

for i in range(n):
    idx = random.randint(0, len(df) - 1)
    # `idx` is a row position, so use .iloc; label-based lookup would only work while
    # the index happens to be 0..len(df)-1.
    print("Title:", df["title"].iloc[idx])
    print(df["content"].iloc[idx])
    print("=" * 50)

In [None]:
import re
import pandas as pd

def extract_date(url):
    """
    Pull the date out of a URL that contains a /YYYY/MM/DD/ path segment.

    Args:
        url (str): URL to search, e.g. one containing "/2025/01/28/".

    Returns:
        str | None: The date as "YYYY-MM-DD", or None when the URL has no such segment.
    """
    found = re.search(r"/(\d{4})/(\d{2})/(\d{2})/", url)
    if found is None:
        return None
    return "-".join(found.groups())

# Apply the function to the DataFrame's `url` column to create a new `date` column.
df["date"] = df["url"].apply(extract_date)

# Convert to datetime. errors="coerce" turns digit groups that do not form a real
# calendar date into NaT instead of aborting the cell, matching the conversion used
# in the "drop data before 2020" cell.
df["date"] = pd.to_datetime(df["date"], errors="coerce")

display(df.head())

In [None]:
# Distinct section labels, then the number of rows per label.
print(df["section"].unique())
print(df["section"].value_counts())

In [None]:
# Keep only articles dated 2020 or later.

# Rebuild the `date` column from the URL; values that cannot be parsed become NaT.
df["date"] = pd.to_datetime(df["url"].apply(extract_date), errors="coerce")

# NaT rows compare False here, so rows without a usable date are dropped as well.
is_recent = df["date"].dt.year >= 2020
df = df[is_recent]
print(len(df))

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Histogram of article dates; matplotlib splits the date range into 25 bins.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(df["date"], bins=25, color="skyblue", edgecolor="black")
ax.set_title("Date Distribution")
ax.set_xlabel("Date")
ax.set_ylabel("Frequency")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Save the cleaned DataFrame to a CSV file without the index column.
# The path is relative, so the file lands in the kernel's current working directory.
# NOTE(review): the first cell of this section reads a file with this same name from
# /kaggle/input/rag-dataset — presumably an earlier export of this output; confirm.
df.to_csv("nytimes_cleaned_data_2020.csv", index=False)
print("DataFrame has been saved as 'nytimes_cleaned_data_2020.csv'.")

## Check `the guardian` dataset

In [None]:
import pandas as pd
import os

# Load the comparison dataset for this section and show its first rows.
path = "/kaggle/input/si630-ai-generated-news-detection/train_news_real_df.csv"
guadian_df = pd.read_csv(path)
guadian_df.head()

In [None]:
def extract_date_from_passage_id(passage_id):
    """
    Parse the date embedded in a passage id such as
    "education/2025/jan/31/some-title...".

    The id is searched (case-insensitively) for a 4-digit year, a 3-letter month
    abbreviation and a 2-digit day separated by slashes.

    Returns:
        The parsed timestamp, or pd.NaT when no such segment is found or when the
        matched text cannot be converted to a date.
    """
    found = re.search(r'(\d{4})/([a-z]{3})/(\d{2})', passage_id, re.IGNORECASE)
    if not found:
        return pd.NaT

    year, month_abbr, day = found.groups()
    # Capitalise the abbreviation ("jan" -> "Jan") before handing it to the %b parser.
    candidate = f"{year}/{month_abbr.capitalize()}/{day}"
    try:
        return pd.to_datetime(candidate, format="%Y/%b/%d")
    except Exception:
        return pd.NaT

# Apply the function to the DataFrame's passage_id column to create a new date column.
guadian_df["date"] = guadian_df["passage_id"].apply(extract_date_from_passage_id)

# Check the result.
display(guadian_df[["passage_id", "date"]].head())

In [None]:
# Distinct section labels, then the number of rows per label.
print(guadian_df["section"].unique())
print(guadian_df["section"].value_counts())

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Histogram of article dates; matplotlib splits the date range into 25 bins.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(guadian_df["date"], bins=25, color="skyblue", edgecolor="black")
ax.set_title("Date Distribution")
ax.set_xlabel("Date")
ax.set_ylabel("Frequency")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()