<a href="https://colab.research.google.com/github/StarWanderer1337/NewScrapper/blob/main/news_scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NewsScrapper

> Query and fetch results from the News API (newsapi.org).

In [None]:
from google.colab import userdata
from google.auth import default
from google.colab import drive
from google.colab import auth

from datetime import timedelta
from datetime import datetime

import pandas as pd

import requests
import gspread
import json

In [None]:
def extract_source_name(source_dict):
    """
    Pull the 'name' entry out of an article's source dictionary.

    Args:
        source_dict (dict): Source information for one article, expected to
            carry a 'name' key.

    Returns:
        str or None: The value stored under 'name', or None when the input is
                     not a dictionary or has no 'name' key.
    """

    # Anything that is not a dict (None, NaN, a plain string, ...) has no name.
    if not isinstance(source_dict, dict):
        return None
    # dict.get already yields None when the 'name' key is absent.
    return source_dict.get("name")


def get_news_dataframe(keyword: str, snapshot_date: str, language: str = "en", days_ago: int = 2):
    """
    Retrieves news articles from the News API and returns them as a pandas DataFrame.

    Fetches articles based on the provided keyword, language, and date range.
    It also adds columns for the search keyword and snapshot date,
    and extracts the source name from the source information.

    The API key is read from the module-level ``API_KEY`` variable, which must be
    defined before this function is called.

    Args:
        keyword (str): The search term for news articles (e.g., "technology").
        snapshot_date (str): Label written to the 'snapshotDate' column of every row
            (the date of the extract).
        language (str, optional): The language of the news articles (e.g., "en" for English, "fr" for French). Defaults to "en".
        days_ago (int, optional): The number of past days to retrieve articles from. Defaults to 2.

    Returns:
        pd.DataFrame: A DataFrame containing the news articles with columns
                      like 'author', 'title', 'description', 'url', 'urlToImage',
                      'publishedAt', 'content', 'keyword', 'snapshotDate', and 'source' (extracted name).
                      Returns an empty DataFrame if no articles are found, or None if an error occurs
                      during the API request or JSON processing.
    """
    # Function-scope import: the notebook's import cell only brings in
    # `datetime` and `timedelta`.
    from datetime import timezone

    # Format with strftime rather than slicing isoformat(): isoformat() omits the
    # fractional part when microsecond == 0, so a fixed slice would cut into the
    # seconds and produce an invalid timestamp.
    window_start = datetime.now(timezone.utc) - timedelta(days=days_ago)
    from_date = window_start.strftime("%Y-%m-%dT%H:%M:%SZ")

    # Let requests build and percent-encode the query string so keywords with
    # spaces, '&', '#' or accented characters are transmitted intact.
    query = {
        "q": keyword,
        "language": language,
        "from": from_date,
        "sortBy": "publishedAt",
    }
    # Send the key as a header instead of a query parameter: the exception text
    # printed below contains the request URL and would otherwise expose the key.
    headers = {"X-Api-Key": API_KEY}

    try:
        response = requests.get(
            "https://newsapi.org/v2/everything",
            params=query,
            headers=headers,
            timeout=30,  # never hang the notebook on a stalled connection
        )
        response.raise_for_status()
        data = response.json()
    # Listed first: in recent requests releases the JSON decode error also
    # derives from RequestException, which would otherwise shadow this branch.
    except json.JSONDecodeError:
        print("Invalid JSON response from the API.")
        return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    # Guard against payloads that are not the expected {"status": ..., ...} object.
    if not isinstance(data, dict) or data.get("status") != "ok":
        message = data.get("message", "unknown error") if isinstance(data, dict) else "unexpected response format"
        print(f"Error: {message}")
        return None

    articles = data.get("articles")
    if not articles:
        return pd.DataFrame()  # Return an empty dataframe instead of None

    df = pd.DataFrame(articles)
    # add columns for Looker
    df["keyword"] = keyword
    df["snapshotDate"] = snapshot_date
    # Replace the nested source dict ({"id": ..., "name": ...}) by its name only.
    df["source"] = df["source"].apply(extract_source_name)
    return df

In [None]:
# Authenticate and authorize with Google Sheets API
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Mount Google Drive.
# NOTE(review): nothing in this cell reads from the mounted drive - confirm
# whether the mount is still required.
drive.mount("/content/drive", force_remount=True)

# Retrieve the News API key from user data (replace 'api-key-news' with your key's name if different)
API_KEY = userdata.get("api-key-news")

# Define the title of the Google Spreadsheet and the name of the worksheet
spreadsheet_title = "news-scrapper"
worksheet_title = "main"
# Connect to the specified Google Sheet and worksheet
sheet = gc.open(title=spreadsheet_title).worksheet(title=worksheet_title)

# Define parameters for the news retrieval
keywords = ["santé", "science", "environnement"]  # List of keywords to search
snapshot_date = str(datetime.now())  # date of the extract
language = "fr"  # Language of the news articles (French in this case)
days_ago = 2  # Number of days in the past to retrieve articles from

# Collect one DataFrame per keyword and concatenate once at the end; this avoids
# re-copying the accumulated frame on every iteration and avoids concatenating
# onto an empty placeholder frame.
frames = []
failed_keywords = []  # keywords whose request returned None (i.e. failed)
for keyword in keywords:
    # Get news articles for the current keyword
    tmp_df = get_news_dataframe(keyword=keyword, snapshot_date=snapshot_date, language=language, days_ago=days_ago)
    if tmp_df is None:
        # pd.concat would silently drop a None entry, hiding the failure.
        failed_keywords.append(keyword)
    elif not tmp_df.empty:
        frames.append(tmp_df)

df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Report fetch failures explicitly instead of relying on df being None,
# which can never happen after concatenation.
if failed_keywords:
    print("An error occurred while fetching news articles.")
    print(f"Failed keywords: {', '.join(failed_keywords)}")

# Process the collected news articles
if not df.empty:
    # Display the DataFrame of news articles
    display(df)
    # NaN is not valid JSON; turn missing values into None so they are sent as
    # empty cells rather than making the Sheets request fail.
    rows = df.astype(object).where(df.notna(), None).values.tolist()
    # Append the data from the DataFrame to the Google Sheet
    sheet.append_rows(values=rows)
elif not failed_keywords:
    print("No articles found for the specified keywords and parameters.")

Mounted at /content/drive


Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,keyword,snapshotDate
0,20 Minutes,20 Minutes avec AFP,Un hôpital touché par des frappes israéliennes...,L’armée israélienne a annoncé une frappe contr...,https://www.20minutes.fr/monde/4144900-2025032...,https://img.20mn.fr/BtKMYQ5nStqAeX_FiWBvYCk/14...,2025-03-23T21:19:35Z,Depuis la reprise des opérations militaires is...,santé,2025-03-24 21:39:01.678390
1,Libération,Dov Alfon,"Chahinez Daoud, de victime à symbole","L’ouverture, ce lundi 24 mars, du procès pour ...",https://www.liberation.fr/idees-et-debats/edit...,,2025-03-23T20:27:37Z,"Ce lundi 24 mars, trois femmes en France seron...",santé,2025-03-24 21:39:01.678390
2,Francetvinfo.fr,,"Agression du rabbin d'Orléans, Négociations su...","Tous les jours, les informés débattent de l'ac...",https://www.francetvinfo.fr/replay-radio/les-i...,https://www.francetvinfo.fr/pictures/riXAyyArg...,2025-03-23T20:25:58Z,Les thèmes :\r\nAgression du rabbin d'Orléans ...,santé,2025-03-24 21:39:01.678390
3,Le HuffPost,"Pauline Brault, avec AFP",David Belliard choisi par les écolos comme can...,"Au sein du paysage politique parisien, l’adjoi...",https://www.huffingtonpost.fr/politique/articl...,https://focus.huffingtonpost.fr/2025/03/23/485...,2025-03-23T20:15:10Z,POLITIQUE - Il avait déjà porté les couleurs v...,santé,2025-03-24 21:39:01.678390
4,Lalibre.be,Vincent Braun,"Israël organise le ""départ volontaire"" des Gaz...",L’armée israélienne a lancé dimanche une offen...,https://www.lalibre.be/international/moyen-ori...,https://www.lalibre.be/resizer/v2/ZK6L3G3DGZH6...,2025-03-23T20:07:35Z,Le gouvernement Netanyahou persiste dans ses a...,santé,2025-03-24 21:39:01.678390
...,...,...,...,...,...,...,...,...,...,...
143,L'Express,Aurore Gayte,Câbles sous-marins en mer Baltique : face aux ...,Les incidents sur les câbles internet se multi...,https://www.lexpress.fr/economie/high-tech/cab...,https://www.lexpress.fr/resizer/v2/ZRMVTQSFWJF...,2025-03-23T06:45:00Z,"Depuis novembre 2022, dans la mer Baltique, pl...",environnement,2025-03-24 21:39:01.678390
144,Le Monde,Perrine Mouterde,"Dans les Vosges, la biodiversité est l’affaire...",Lauréate du concours organisé par l’Office fra...,https://www.lemonde.fr/planete/article/2025/03...,https://img.lemde.fr/2025/03/21/0/0/5976/3984/...,2025-03-23T04:30:33Z,"Crapaudrome de Dracourt, dans les Vosges, 9 h ...",environnement,2025-03-24 21:39:01.678390
145,La Tribune.fr,Sophie Iborra,La chronique de Sophie Iborra. « Les femmes au...,"Chaque mois, Sophie iborra rencontre une femme...",https://www.latribune.fr/la-tribune-dimanche/o...,https://pictures.latribune.fr/cdn-cgi/image//7...,2025-03-23T04:15:00Z,« Elle n'a pas peur d'enfiler les gants ni de ...,environnement,2025-03-24 21:39:01.678390
146,Le Monde,Stéphane Foucart,« Les ONG participent du bon fonctionnement de...,"Depuis plusieurs semaines, des eurodéputés de ...",https://www.lemonde.fr/planete/article/2025/03...,https://img.lemde.fr/2024/10/08/0/0/5280/3520/...,2025-03-23T04:00:04Z,"Depuis plusieurs mois, les organisations non g...",environnement,2025-03-24 21:39:01.678390
