# Real News Collection

In [None]:
import csv
import os
import time

import numpy as np
import pandas as pd
import requests

# Guardian content API credentials and endpoint.
# Set the GUARDIAN_API_KEY environment variable to supply your own key; the
# literal below is kept only as a backward-compatible default.
# NOTE(review): a key committed to source should be treated as leaked - rotate it.
# Alternate key left by the original author: "49bc4923-5543-4f48-975c-48c6336fd8d0"
API_KEY = os.environ.get("GUARDIAN_API_KEY", "a71a3a15-ba85-407b-a8d5-cd6deccbe7fe")
BASE_URL = "https://content.guardianapis.com/search"

## API Check

In [None]:
def fetch_guardian_articles(page=1, section="business", timeout=30):
    """
    Fetches one page of article metadata from the Guardian API.

    Errors are reported with `print` and signalled by returning None, so the
    caller must check the result before using it.

    Args:
        page (int): The page number to fetch.
        section (str): The section to fetch articles from (default is "business").
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: The "response" object of the API payload, or None if the request
        fails, the body is not valid JSON, or the payload does not report
        status "ok".
    """
    params = {
        "section": section,
        "api-key": API_KEY,
        "page": page,
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(BASE_URL, params=params, timeout=timeout)
        response.raise_for_status()  # Turn HTTP 4xx/5xx into RequestException
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {str(e)}")
        return None
    except ValueError as e:
        print(f"JSON parsing failed: {str(e)}")
        return None

    # Validate the shape instead of indexing blindly: an unexpected payload
    # should produce the documented None, not an uncaught KeyError/TypeError.
    payload = data.get("response") if isinstance(data, dict) else None
    if not isinstance(payload, dict) or payload.get("status") != "ok":
        # Look for the message next to the status first, then at top level
        # (where the original code looked).
        message = None
        if isinstance(payload, dict):
            message = payload.get("message")
        if message is None and isinstance(data, dict):
            message = data.get("message")
        if message is None:
            message = "Unknown error"
        print(f"API returned an error: {message}")
        return None

    return payload

In [None]:
# Smoke test: request the first page of the "business" section.
# `response` is None when the fetch fails (see fetch_guardian_articles).
response = fetch_guardian_articles(page=1, section="business")

In [None]:
# Print id, title and URL for every article on the fetched page.
# Iterate over the results actually returned rather than `pageSize`: a page
# can hold fewer entries than the nominal page size, and `response` is None
# when the fetch failed (in which case nothing is printed).
for result in (response or {}).get('results', []):
    print("id: ", result['id'])
    print("Title: ", result['webTitle'])
    print("url: ", result['webUrl'])
    print("------")

## Text Clean

In [None]:
# Notebook shell command: install the package that provides the `newsplease`
# module imported in the next cell.
!pip install news-please

In [None]:
from newsplease import NewsPlease

# Try the extractor on a single known article and print its main body text.
sample_url = 'https://www.theguardian.com/business/2025/jan/31/openai-to-release-new-artificial-intelligence-model-for-free'
article = NewsPlease.from_url(sample_url)
print(article.maintext)

## Collection

In [None]:
def news_scratch(section_name, total_len, passage):
    """
    Collects articles of one Guardian section and appends them to `passage`.

    Pages are fetched with `fetch_guardian_articles` until `total_len`
    articles have been collected or the section runs out of pages. For each
    article the body text is downloaded with `NewsPlease.from_url`.

    Args:
        section_name (str): Guardian section to collect from.
        total_len (int): Maximum number of articles to collect for the section.
        passage (dict): Column-oriented accumulator with list values under the
            keys "id", "passage_id", "title", "content", "url", "section" and
            "label". It is mutated in place; "id" restarts at 0 for every call
            and "label" is always 0.

    Returns:
        None
    """
    current_news = 0
    current_page = 1
    # Provisional bound; replaced by the page count the API reports.
    total_pages = 10

    while current_page <= total_pages and current_news < total_len:
        if current_page % 10 == 0:
            print(f"Fetching page {current_page}...")
        response = fetch_guardian_articles(page=current_page, section=section_name)

        # Check for failure BEFORE touching the payload: a failed fetch
        # returns None. Skip the page and try the next one.
        if not response:
            print(f"ERROR: Unable to Fetch page {current_page}!!")
            time.sleep(0.2)
            current_page += 1
            continue

        # Iterate the results actually present; the last page may hold fewer
        # entries than `pageSize`.
        for result in response.get("results", []):
            if current_news >= total_len:
                break  # do not overshoot the requested number of articles

            # Download the body first so that an error here cannot leave the
            # columns of `passage` with different lengths.
            article = NewsPlease.from_url(result['webUrl'])
            # The extractor may not return an article object; store None then.
            content = getattr(article, "maintext", None)

            passage["id"].append(current_news)
            passage["passage_id"].append(result['id'])
            passage["title"].append(result['webTitle'])
            passage["url"].append(result['webUrl'])
            passage["section"].append(section_name)
            passage["label"].append(0)
            passage["content"].append(content)

            current_news += 1

        # Update total pages from the API response, capped at 10000.
        total_pages = min(response.get("pages", 1), 10000)

        time.sleep(0.2)  # brief pause between page requests
        current_page += 1
    return

In [None]:
# Column-oriented accumulator filled by news_scratch: one fresh list per column.
passage = {
    column: []
    for column in ("id", "passage_id", "title", "content", "url", "section", "label")
}

# Guardian sections collected for each split.
train_sections = ["education", "society", "sport", "environment"]
test_sections = ["technology", "artanddesign", "business", "politics"]

# Number of articles requested per section, in the same order as the section
# lists above. The commented-out lists are larger alternatives kept for reference.
# train_total_len = [5900, 8300, 18900, 5800]
# test_total_len = [1160, 600, 3100, 2400]
train_total_len = [800, 1100, 2400, 800]
test_total_len = [200, 100, 450, 350]

In [None]:
# Collect every training section into the shared `passage` accumulator,
# pairing each section with its requested article count.
for sec, total_len in zip(train_sections, train_total_len):
    news_scratch(sec, total_len, passage)
    print(f"\nSection {sec} finished! \n")

In [None]:
# Turn the collected columns into a DataFrame and show it in the notebook.
train_passage_df = pd.DataFrame(passage)
display(train_passage_df)

In [None]:
# Persist the raw training articles to the current working directory.
# NOTE(review): no `index=False` is passed, so pandas' default of also writing
# the row index applies - confirm that is wanted, since this file is read back
# with `pd.read_csv` in the EDA section.
train_passage_df.to_csv("train_news_df.csv")

In [None]:
# import gc

# # To Save RAM
# del passage, train_passage_df
# _ = gc.collect()

In [None]:
# Start from an empty accumulator so the test split does not include the
# training articles collected earlier.
passage = {
    column: []
    for column in ("id", "passage_id", "title", "content", "url", "section", "label")
}

# Collect every test section, pairing each with its requested article count.
for sec, total_len in zip(test_sections, test_total_len):
    news_scratch(sec, total_len, passage)
    print(f"\nSection {sec} finished! \n")

In [None]:
# Turn the collected test columns into a DataFrame and show it in the notebook.
test_passage_df = pd.DataFrame(passage)
display(test_passage_df)

In [None]:
# Persist the raw test articles to the current working directory.
# NOTE(review): no `index=False` is passed, so pandas' default of also writing
# the row index applies - confirm that is wanted, since this file is read back
# with `pd.read_csv` in the EDA section.
test_passage_df.to_csv("test_news_df.csv")

## EDA

In [None]:
# Reload the saved training articles and report (rows, columns).
# NOTE(review): this reads an absolute /kaggle/working path while the save
# cell writes a relative path - presumably the notebook runs with
# /kaggle/working as its working directory; confirm.
train_passage_df = pd.read_csv("/kaggle/working/train_news_df.csv")
print("train_size: ", train_passage_df.shape)

In [None]:
# Reload the saved test articles and report (rows, columns).
# NOTE(review): this reads an absolute /kaggle/working path while the save
# cell writes a relative path - presumably the notebook runs with
# /kaggle/working as its working directory; confirm.
test_passage_df = pd.read_csv("/kaggle/working/test_news_df.csv")
print("test_size: ", test_passage_df.shape)

### Remove `\n`

In [None]:
# Replace every newline in the article text with a single space (literal
# match, not a regex), then print the first training article as a spot check.
train_passage_df["content"] = train_passage_df["content"].str.replace("\n", " ", regex=False)
test_passage_df["content"] = test_passage_df["content"].str.replace("\n", " ", regex=False)
print(train_passage_df["content"][0])

### Word Count

In [None]:
# Count whitespace-separated tokens per training article and print the mean.
train_passage_df["word_count"] = train_passage_df["content"].str.split().str.len()
print("average word count: ", train_passage_df["word_count"].mean())

In [None]:
# Count whitespace-separated tokens per test article and print the mean.
test_passage_df["word_count"] = test_passage_df["content"].str.split().str.len()
print("average word count: ", test_passage_df["word_count"].mean())

### Basic Check

In [None]:
# Number of distinct article bodies in the training set; compare with the row
# count printed earlier to spot duplicated articles.
print(train_passage_df["content"].nunique())

### Save

In [None]:
# Write the cleaned frames (newlines removed, `word_count` added) to new files
# in the current working directory.
# NOTE(review): no `index=False` is passed, so pandas' default of also writing
# the row index applies - confirm that is wanted by downstream readers.
train_passage_df.to_csv("train_news_real_df.csv")
test_passage_df.to_csv("test_news_real_df.csv")