In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write CSV files
# under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install or upgrade the mediacloud client library imported by the collection cell.
# NOTE(review): the version is not pinned; the recorded run of this cell
# installed mediacloud 4.4.1 — consider pinning for reproducibility.
!pip install --upgrade mediacloud

Collecting mediacloud
  Downloading mediacloud-4.4.1-py3-none-any.whl.metadata (4.3 kB)
Downloading mediacloud-4.4.1-py3-none-any.whl (24 kB)
Installing collected packages: mediacloud
Successfully installed mediacloud-4.4.1


In [None]:
import logging
import datetime
import csv
import sys
import time
import mediacloud.api
import shutil  # for file copying

# ----------------------------------------------------------------------------
# CONFIGURATION
# ----------------------------------------------------------------------------

import os  # local to the config cell; used only for the environment lookup below

# NOTE(review): an API key is committed in this notebook. Treat it as leaked:
# rotate it and provide the new key through the MEDIACLOUD_API_KEY environment
# variable. The literal is kept only as a fallback so existing runs behave as
# before.
API_KEY = os.environ.get("MEDIACLOUD_API_KEY") or "0204d14c4f0a2d0f2a73238a48baafeeed105b5d"

# 1) Search Keywords (list of strings to simulate complex queries)
SEARCH_KEYWORDS = [
    #"artificial intelligence",
    #"intelligenza artificiale",
    #"künstliche Intelligenz",
    #"intelligence artificielle",
    #"人工知能",
    #"人工智能", #simplified
    "人工智慧", #traditional
]

# FINAL_CSV_FILENAME below is built from the first keyword, so fail early with
# a clear message if every entry has been commented out.
if not SEARCH_KEYWORDS:
    raise ValueError("SEARCH_KEYWORDS must contain at least one uncommented keyword.")

# 2) Location (for reference in naming)
SEARCH_LOCATION = "china"

# 3) Overall Time Range
START_DATE_STR = "2024-06-01"
END_DATE_STR   = "2025-06-01"

# 4) Collection IDs for the given location
COLLECTION_IDS = [
    #34412234, # United States - National
    #38379429, # United States - State & Local
    #34412372, # Italy - National
    #38380117, # Italy - State & Local
    #34412409, # Germany - National
    #38379816, # Germany - State & Local
    #34412146, # France - National
    #38379799, # France - State & Local
    34412193, # China - National
    38379438, # China - State & Local
    #34412056, # Japan - National
    #38380157, # Japan - State & Local
]



# Further collections kept for reference (not part of COLLECTION_IDS):
#34412476, # United Kingdom - National
#38381111, # United Kingdom - State & Local

# United Kingdom - National
# https://search.mediacloud.org/collections/34412476

# United Kingdom - State & Local
# https://search.mediacloud.org/collections/38381111



# 5) Google Drive folder path
GDRIVE_PATH = "/content/drive/MyDrive/Mediacloud/China"

# Final CSV filename (for the entire range) with "mediacloud" in the name.
# Built from the first keyword, the location and the overall date range.
FINAL_CSV_FILENAME = (
    f"{SEARCH_KEYWORDS[0].lower().replace(' ', '_')}_"
    f"{SEARCH_LOCATION.lower()}_"
    f"{START_DATE_STR}_{END_DATE_STR}_mediacloud.csv"
)

# ----------------------------------------------------------------------------
# LOGGING SETUP
# ----------------------------------------------------------------------------

# Notebook kernels (Colab included) typically attach a handler to the root
# logger before user code runs, in which case a plain basicConfig() call does
# nothing: the recorded output of this cell shows errors in the default
# "ERROR:root:..." layout and no INFO lines. force=True (Python 3.8+) removes
# the pre-existing root handlers so that this configuration actually applies
# and INFO messages reach stdout in the format below.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    force=True,
)

# ----------------------------------------------------------------------------
# HELPER FUNCTIONS
# ----------------------------------------------------------------------------

def parse_date(date_str):
    """
    Turn a 'YYYY-MM-DD' string into a datetime.date.

    A string that does not match that layout makes strptime raise ValueError,
    which is passed on to the caller.
    """
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return parsed.date()

def last_day_of_month(any_date):
    """
    Return the final day of the month that 'any_date' falls in.

    Day 28 exists in every month, and 28 + 4 days always lands in the
    following month; stepping back by that landing day's day-of-month gives
    the last day of the original month.
    """
    anchor = any_date.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)

def generate_monthly_date_ranges(start_date_str, end_date_str):
    """
    Split the span between two 'YYYY-MM-DD' strings into per-month pieces.

    Returns a list of (chunk_start, chunk_end) date tuples. The first chunk
    begins on the start date, every later chunk begins on the first day of a
    month, and no chunk runs past the end date. The list is empty when the
    start date is after the end date.
    """
    range_start = parse_date(start_date_str)
    range_end = parse_date(end_date_str)
    one_day = datetime.timedelta(days=1)

    chunks = []
    cursor = range_start
    while cursor <= range_end:
        # Clip the month's last day to the overall end date
        chunk_end = min(last_day_of_month(cursor), range_end)
        chunks.append((cursor, chunk_end))
        # The next chunk starts the day after this one ends
        cursor = chunk_end + one_day

    return chunks

def generate_daily_date_ranges(start_date, end_date):
    """
    Build one (day, day) tuple for every day from start_date through
    end_date, both included.

    Each tuple carries the same date twice because a daily interval starts
    and ends on the same day. The result is empty when start_date is after
    end_date.
    """
    day_count = (end_date - start_date).days + 1
    days = (start_date + datetime.timedelta(days=offset) for offset in range(day_count))
    return [(day, day) for day in days]

def fetch_stories(mc_client, query, start_date, end_date, collection_ids,
                  max_retries=3, retry_wait=55, page_wait=55):
    """
    Fetch all stories matching 'query' from 'start_date' to 'end_date' (inclusive),
    within any of the specified collection_ids.

    Results are requested in pages of up to 1000 stories, pausing 'page_wait'
    seconds between pages to respect rate limits. A failed request is retried
    up to 'max_retries' times, waiting 'retry_wait' seconds before each retry,
    so that a transient failure (dropped connection, temporary server error)
    does not silently discard the whole date range. If a page still fails
    after all retries, the error is logged and the stories collected so far
    are returned.

    Args:
        mc_client: client object exposing story_list(...), which returns a
            (stories, next_pagination_token) pair.
        query: search query string.
        start_date, end_date: inclusive date range passed to the API.
        collection_ids: collection ids to search; also stored on every story
            under the "collection_ids" key.
        max_retries: retries allowed per page after the first failed attempt.
        retry_wait: seconds to wait before retrying a failed request.
        page_wait: seconds to wait between successive pages.

    Returns:
        A list of story dictionaries (possibly partial or empty on failure).
    """
    all_stories = []
    pagination_token = None
    failed_attempts = 0  # consecutive failures for the page currently requested

    while True:
        try:
            # Request up to 1000 stories per page
            page_stories, next_token = mc_client.story_list(
                query=query,
                start_date=start_date,
                end_date=end_date,
                collection_ids=collection_ids,
                pagination_token=pagination_token,
                page_size=1000
            )
        except Exception as e:
            failed_attempts += 1
            if failed_attempts > max_retries:
                logging.error(f"Error during API fetch (range {start_date} - {end_date}): {e}")
                logging.error(
                    f"Giving up on range {start_date} - {end_date} after "
                    f"{failed_attempts} attempt(s); keeping {len(all_stories)} stories fetched so far."
                )
                break
            logging.warning(
                f"API fetch failed (range {start_date} - {end_date}), "
                f"attempt {failed_attempts} of {max_retries + 1}: {e}. "
                f"Retrying in {retry_wait} seconds."
            )
            time.sleep(retry_wait)
            # Retry the same page: pagination_token has not been advanced
            continue

        failed_attempts = 0

        # Attach collection_ids to each story for reference
        for story in page_stories:
            story["collection_ids"] = collection_ids

        all_stories.extend(page_stories)

        # Print/log how many articles just fetched, total so far
        logging.info(
            f"Fetched {len(page_stories)} stories in this page; "
            f"{len(all_stories)} total so far (range {start_date} - {end_date})."
        )
        print(
            f"Fetched {len(page_stories)} stories in this request; "
            f"{len(all_stories)} total so far (range {start_date} - {end_date})."
        )

        # If there are no more pages, stop
        if next_token is None:
            break

        # Pause before fetching the next page to respect rate limits
        logging.info(f"Pausing for {page_wait} seconds to respect rate limits (multi-page).")
        print(f"Pausing for {page_wait} seconds to respect rate limits (multi-page)...")
        time.sleep(page_wait)

        # Prepare for next iteration
        pagination_token = next_token

    return all_stories

def save_stories_to_csv(stories, filename, query):
    """
    Save the story data to a CSV file, including a 'collection_ids' column and the 'query'.

    Args:
        stories: iterable of story dictionaries; keys missing from a story
            are written as empty cells. The dictionaries are not modified.
        filename: path of the CSV file to create (overwritten if it exists).
        query: value written to the 'query' column of every row.

    Errors while writing are logged, not raised (best-effort save).
    """
    fieldnames = [
        "stories_id",
        "title",
        "publish_date",
        "url",
        "language",
        "media_id",
        "media_name",
        "query",
        "collection_ids"
    ]

    try:
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()

            for story in stories:
                # Build the row without writing into the caller's dictionary
                row_data = {field: story.get(field, "") for field in fieldnames}

                # The recorded exports show 'stories_id' empty on every row.
                # NOTE(review): Search API story records appear to carry their
                # identifier under 'id' — confirm against the mediacloud docs.
                if row_data["stories_id"] in ("", None):
                    row_data["stories_id"] = story.get("id", "")

                # Include the query in each row
                row_data["query"] = query
                writer.writerow(row_data)

        logging.info(f"Successfully wrote {len(stories)} stories to {filename}.")
    except Exception as e:
        logging.error(f"Error writing to CSV '{filename}': {e}")

def copy_to_drive(filename, destination_folder):
    """
    Attempt to copy the file from Colab's /content/ to the given Google Drive folder.

    The file is looked up at /content/<filename> first; if it is not there,
    'filename' itself is used as the path, since the CSV writer in this
    notebook creates files relative to the current working directory. The
    destination folder is created when missing: shutil.copy would otherwise
    write a regular file named like the folder.

    Failures are logged, not raised (best-effort backup).
    """
    import os  # local import: only this helper needs it

    colab_path = f"/content/{filename}"
    source_path = colab_path if os.path.exists(colab_path) else filename
    try:
        os.makedirs(destination_folder, exist_ok=True)
        shutil.copy(source_path, destination_folder)
        logging.info(f"File '{filename}' copied to Google Drive at '{destination_folder}'.")
    except Exception as e:
        logging.error(f"Could not copy file '{filename}' to Google Drive: {e}")

# ----------------------------------------------------------------------------
# MAIN EXECUTION LOGIC
# ----------------------------------------------------------------------------

def main():
    """
    Collect stories for every keyword in SEARCH_KEYWORDS over the configured
    date range, one day per request, and export them to CSV.

    Steps:
      1. Probe the API with a one-day test query; abort if it fails.
      2. For each keyword, fetch every day of the range (pausing 55 seconds
         after each day), write a per-keyword CSV and copy it to Drive.
      3. Write a combined CSV of all keywords and copy it to Drive.

    FINAL_CSV_FILENAME is built from the first keyword, so it normally equals
    that keyword's own CSV name. To avoid overwriting the per-keyword file,
    the combined file gets a "combined_" prefix when several keywords are
    configured, and is skipped as redundant when there is only one.
    """
    mc_client = mediacloud.api.SearchApi(API_KEY)

    try:
        mc_client.story_list(query="test", start_date=parse_date(START_DATE_STR), end_date=parse_date(START_DATE_STR), page_size=1)
        logging.info("MediaCloud API connection successful.")
    except Exception as e:
        logging.error(f"Could not validate MediaCloud API connection: {e}")
        return

    # The date ranges do not depend on the keyword, so compute them once
    monthly_chunks = generate_monthly_date_ranges(START_DATE_STR, END_DATE_STR)

    all_stories = []
    query_csv_filenames = []

    for keyword in SEARCH_KEYWORDS:
        logging.info(f"Starting query: {keyword}")

        query_stories = []

        for (month_start, month_end) in monthly_chunks:
            logging.info(f"Processing month {month_start} to {month_end} for query: {keyword}")
            daily_chunks = generate_daily_date_ranges(month_start, month_end)

            for (day_start, day_end) in daily_chunks:
                logging.info(f"  Fetching stories for {day_start} (one-day range)...")
                chunk_stories = fetch_stories(mc_client, keyword, day_start, day_end, COLLECTION_IDS)
                query_stories.extend(chunk_stories)
                logging.info(f"Pausing 55 seconds at the end of day {day_start}.")
                time.sleep(55)

        # Save each query's results to its own CSV. The label is already
        # lower-cased, so no separate handling of an upper-case "AND" is needed.
        query_label = keyword.lower().replace(" ", "_").replace('"', "")
        query_csv_filename = f"{query_label}_{SEARCH_LOCATION.lower()}_{START_DATE_STR}_{END_DATE_STR}_mediacloud.csv"
        save_stories_to_csv(query_stories, query_csv_filename, keyword)
        copy_to_drive(query_csv_filename, GDRIVE_PATH)
        query_csv_filenames.append(query_csv_filename)

        all_stories.extend(query_stories)

    # Decide where the combined file goes without clobbering a per-query file
    combined_filename = FINAL_CSV_FILENAME
    if combined_filename in query_csv_filenames:
        if len(query_csv_filenames) == 1:
            # Single keyword: the per-query CSV already holds exactly this data
            combined_filename = None
        else:
            combined_filename = f"combined_{FINAL_CSV_FILENAME}"

    if combined_filename is None:
        logging.info(f"Combined CSV is identical to '{FINAL_CSV_FILENAME}'; not writing it twice.")
    else:
        # Save combined file
        logging.info(f"Saving final combined CSV for all queries...")
        save_stories_to_csv(all_stories, combined_filename, "; ".join(SEARCH_KEYWORDS))
        copy_to_drive(combined_filename, GDRIVE_PATH)

    logging.info(f"Done! Retrieved {len(all_stories)} total stories from {START_DATE_STR} to {END_DATE_STR}.")


# ----------------------------------------------------------------------------
# RUN
# ----------------------------------------------------------------------------

# Start the collection run when this code is executed as the main module.
if __name__ == "__main__":
    main()

Fetched 8 stories in this request; 8 total so far (range 2024-06-01 - 2024-06-01).
Fetched 4 stories in this request; 4 total so far (range 2024-06-02 - 2024-06-02).
Fetched 9 stories in this request; 9 total so far (range 2024-06-03 - 2024-06-03).
Fetched 2 stories in this request; 2 total so far (range 2024-06-04 - 2024-06-04).
Fetched 5 stories in this request; 5 total so far (range 2024-06-05 - 2024-06-05).
Fetched 8 stories in this request; 8 total so far (range 2024-06-06 - 2024-06-06).
Fetched 6 stories in this request; 6 total so far (range 2024-06-07 - 2024-06-07).
Fetched 6 stories in this request; 6 total so far (range 2024-06-08 - 2024-06-08).
Fetched 2 stories in this request; 2 total so far (range 2024-06-09 - 2024-06-09).
Fetched 4 stories in this request; 4 total so far (range 2024-06-10 - 2024-06-10).
Fetched 7 stories in this request; 7 total so far (range 2024-06-11 - 2024-06-11).
Fetched 5 stories in this request; 5 total so far (range 2024-06-12 - 2024-06-12).
Fetc

ERROR:root:Error during API fetch (range 2025-04-24 - 2025-04-24): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Fetched 5 stories in this request; 5 total so far (range 2025-04-25 - 2025-04-25).
Fetched 5 stories in this request; 5 total so far (range 2025-04-26 - 2025-04-26).
Fetched 6 stories in this request; 6 total so far (range 2025-04-27 - 2025-04-27).
Fetched 6 stories in this request; 6 total so far (range 2025-04-28 - 2025-04-28).
Fetched 10 stories in this request; 10 total so far (range 2025-04-29 - 2025-04-29).
Fetched 9 stories in this request; 9 total so far (range 2025-04-30 - 2025-04-30).
Fetched 4 stories in this request; 4 total so far (range 2025-05-01 - 2025-05-01).
Fetched 4 stories in this request; 4 total so far (range 2025-05-02 - 2025-05-02).
Fetched 3 stories in this request; 3 total so far (range 2025-05-03 - 2025-05-03).
Fetched 3 stories in this request; 3 total so far (range 2025-05-04 - 2025-05-04).
Fetched 1 stories in this request; 1 total so far (range 2025-05-05 - 2025-05-05).
Fetched 0 stories in this request; 0 total so far (range 2025-05-06 - 2025-05-06).
Fe

In [None]:
import pandas as pd

# Load an exported CSV from the Drive folder and preview its first rows.
# NOTE(review): this filename contains "_traditional", which the collection
# cell above does not generate — presumably the file was renamed by hand in
# Drive; confirm before re-running.
df = pd.read_csv('/content/drive/MyDrive/Mediacloud/China/人工智慧_traditional_china_2024-06-01_2025-06-01_mediacloud.csv')
df.head()

Unnamed: 0,stories_id,title,publish_date,url,language,media_id,media_name,query,collection_ids
0,,心相近｜共谱发展新篇 续写“中阿情缘”,2024-06-01,http://www.81.cn/xx_207779/16313075.html,zh,,81.cn,人工智慧,"[34412193, 38379438]"
1,,《中华优秀传统文化少儿绘本大系》在京首发,2024-06-01,http://www.81.cn/yd_208600/jdt_208601/16313633...,zh,,81.cn,人工智慧,"[34412193, 38379438]"
2,,共产党进城后，从城市管理到城市治理的转型发展,2024-06-01,http://www.81.cn/yd_208600/16313591.html,zh,,81.cn,人工智慧,"[34412193, 38379438]"
3,,开放提速 创新潮涌——来自第十三届中部博览会的观察,2024-06-01,http://www.81.cn/ss_208539/16313160.html,zh,,81.cn,人工智慧,"[34412193, 38379438]"
4,,赓续中华文脉 厚植文化自信——写在文化传承发展座谈会召开一周年之际,2024-06-01,http://www.81.cn/yw_208727/16313143.html,zh,,81.cn,人工智慧,"[34412193, 38379438]"


In [None]:
# Number of rows in the loaded CSV (one row per exported story).
len(df)

1856

In [None]:
# ========================================
#  1) SETUP: Google Drive & Installations
# ========================================
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write CSV files
# under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# If you're in Google Colab or a Jupyter environment, run this cell to install/upgrade mediacloud
# NOTE(review): the version is not pinned; the recorded run of this cell
# installed mediacloud 4.4.0 — consider pinning for reproducibility.
!pip install --upgrade mediacloud

Collecting mediacloud
  Downloading mediacloud-4.4.0-py3-none-any.whl.metadata (4.3 kB)
Downloading mediacloud-4.4.0-py3-none-any.whl (24 kB)
Installing collected packages: mediacloud
Successfully installed mediacloud-4.4.0


In [None]:
import logging
import datetime
import csv
import sys
import time
import mediacloud.api
import shutil  # for file copying

# ----------------------------------------------------------------------------
# CONFIGURATION
# ----------------------------------------------------------------------------

import os  # local to the config cell; used only for the environment lookup below

# NOTE(review): an API key is committed in this notebook. Treat it as leaked:
# rotate it and provide the new key through the MEDIACLOUD_API_KEY environment
# variable. The literal is kept only as a fallback so existing runs behave as
# before.
API_KEY = os.environ.get("MEDIACLOUD_API_KEY") or "0204d14c4f0a2d0f2a73238a48baafeeed105b5d"

# 1) Search Keyword
SEARCH_KEYWORD = "intelligence artificielle"
# Alternative keywords used for other locations:
#intelligenza artificiale
#artificial intelligence
#هوش مصنوعی
#искусственный интеллект
#inteligência artificial
#künstliche intelligenz
#intelligence artificielle
#人工知能
#कृत्रिम होशियारी
#人工智能

# 2) Location (for reference in naming)
SEARCH_LOCATION = "france"

# 3) Overall Time Range
START_DATE_STR = "2024-06-01"
END_DATE_STR   = "2025-06-01"

# 4) Collection IDs for the given location
COLLECTION_IDS = [
    #34412372, #Italy National
    #38380117, #Italy State & Local
    #34412476, #UK National
    #38381111, #UK State & Local
    #34412284, #Iran - National
    #38380055, #Iran - State & Local
    #34412232, #Russia - National
    #38380780, #Russia - State & Local
    #34412257, #Brazil - National
    #38379250, #Brazil - State & Local
    #34412409, #Germany - National
    #38379816, #Germany - State & Local
    #34412056, #Japan - National
    #38380157, #Japan - State & Local
    #34412118, #India - National
    #38379954, #India - State & Local
    #34412234, #US - National
    #38379429, #US - State & Local
    #34412193, #China - National
    #38379438, #China - State & Local
    34412146, #France - National
    38379799, #France - State & Local
]



# United Kingdom - National
# https://search.mediacloud.org/collections/34412476

# United Kingdom - State & Local
# https://search.mediacloud.org/collections/38381111



# 5) Google Drive folder path
GDRIVE_PATH = "/content/drive/MyDrive/Mediacloud/France"

# Final CSV filename (for the entire range) with "mediacloud" in the name.
# Built from the keyword, the location and the overall date range.
FINAL_CSV_FILENAME = (
    f"{SEARCH_KEYWORD.lower().replace(' ', '_')}_"
    f"{SEARCH_LOCATION.lower()}_"
    f"{START_DATE_STR}_{END_DATE_STR}_mediacloud.csv"
)

# ----------------------------------------------------------------------------
# LOGGING SETUP
# ----------------------------------------------------------------------------

# Notebook kernels (Colab included) typically attach a handler to the root
# logger before user code runs, in which case a plain basicConfig() call does
# nothing: the recorded output of this cell shows errors in the default
# "ERROR:root:..." layout and no INFO lines. force=True (Python 3.8+) removes
# the pre-existing root handlers so that this configuration actually applies
# and INFO messages reach stdout in the format below.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    force=True,
)

# ----------------------------------------------------------------------------
# HELPER FUNCTIONS
# ----------------------------------------------------------------------------

def parse_date(date_str):
    """
    Turn a 'YYYY-MM-DD' string into a datetime.date.

    A string that does not match that layout makes strptime raise ValueError,
    which is passed on to the caller.
    """
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return parsed.date()

def last_day_of_month(any_date):
    """
    Return the final day of the month that 'any_date' falls in.

    Day 28 exists in every month, and 28 + 4 days always lands in the
    following month; stepping back by that landing day's day-of-month gives
    the last day of the original month.
    """
    anchor = any_date.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)

def generate_monthly_date_ranges(start_date_str, end_date_str):
    """
    Split the span between two 'YYYY-MM-DD' strings into per-month pieces.

    Returns a list of (chunk_start, chunk_end) date tuples. The first chunk
    begins on the start date, every later chunk begins on the first day of a
    month, and no chunk runs past the end date. The list is empty when the
    start date is after the end date.
    """
    range_start = parse_date(start_date_str)
    range_end = parse_date(end_date_str)
    one_day = datetime.timedelta(days=1)

    chunks = []
    cursor = range_start
    while cursor <= range_end:
        # Clip the month's last day to the overall end date
        chunk_end = min(last_day_of_month(cursor), range_end)
        chunks.append((cursor, chunk_end))
        # The next chunk starts the day after this one ends
        cursor = chunk_end + one_day

    return chunks

def generate_daily_date_ranges(start_date, end_date):
    """
    Build one (day, day) tuple for every day from start_date through
    end_date, both included.

    Each tuple carries the same date twice because a daily interval starts
    and ends on the same day. The result is empty when start_date is after
    end_date.
    """
    day_count = (end_date - start_date).days + 1
    days = (start_date + datetime.timedelta(days=offset) for offset in range(day_count))
    return [(day, day) for day in days]

def fetch_stories(mc_client, query, start_date, end_date, collection_ids,
                  max_retries=3, retry_wait=55, page_wait=55):
    """
    Fetch all stories matching 'query' from 'start_date' to 'end_date' (inclusive),
    within any of the specified collection_ids.

    Results are requested in pages of up to 1000 stories, pausing 'page_wait'
    seconds between pages to respect rate limits. A failed request is retried
    up to 'max_retries' times, waiting 'retry_wait' seconds before each retry,
    so that a transient failure (dropped connection, temporary server error)
    does not silently discard the whole date range. If a page still fails
    after all retries, the error is logged and the stories collected so far
    are returned.

    Args:
        mc_client: client object exposing story_list(...), which returns a
            (stories, next_pagination_token) pair.
        query: search query string.
        start_date, end_date: inclusive date range passed to the API.
        collection_ids: collection ids to search; also stored on every story
            under the "collection_ids" key.
        max_retries: retries allowed per page after the first failed attempt.
        retry_wait: seconds to wait before retrying a failed request.
        page_wait: seconds to wait between successive pages.

    Returns:
        A list of story dictionaries (possibly partial or empty on failure).
    """
    all_stories = []
    pagination_token = None
    failed_attempts = 0  # consecutive failures for the page currently requested

    while True:
        try:
            # Request up to 1000 stories per page
            page_stories, next_token = mc_client.story_list(
                query=query,
                start_date=start_date,
                end_date=end_date,
                collection_ids=collection_ids,
                pagination_token=pagination_token,
                page_size=1000
            )
        except Exception as e:
            failed_attempts += 1
            if failed_attempts > max_retries:
                logging.error(f"Error during API fetch (range {start_date} - {end_date}): {e}")
                logging.error(
                    f"Giving up on range {start_date} - {end_date} after "
                    f"{failed_attempts} attempt(s); keeping {len(all_stories)} stories fetched so far."
                )
                break
            logging.warning(
                f"API fetch failed (range {start_date} - {end_date}), "
                f"attempt {failed_attempts} of {max_retries + 1}: {e}. "
                f"Retrying in {retry_wait} seconds."
            )
            time.sleep(retry_wait)
            # Retry the same page: pagination_token has not been advanced
            continue

        failed_attempts = 0

        # Attach collection_ids to each story for reference
        for story in page_stories:
            story["collection_ids"] = collection_ids

        all_stories.extend(page_stories)

        # Print/log how many articles just fetched, total so far
        logging.info(
            f"Fetched {len(page_stories)} stories in this page; "
            f"{len(all_stories)} total so far (range {start_date} - {end_date})."
        )
        print(
            f"Fetched {len(page_stories)} stories in this request; "
            f"{len(all_stories)} total so far (range {start_date} - {end_date})."
        )

        # If there are no more pages, stop
        if next_token is None:
            break

        # Pause before fetching the next page to respect rate limits
        logging.info(f"Pausing for {page_wait} seconds to respect rate limits (multi-page).")
        print(f"Pausing for {page_wait} seconds to respect rate limits (multi-page)...")
        time.sleep(page_wait)

        # Prepare for next iteration
        pagination_token = next_token

    return all_stories

def save_stories_to_csv(stories, filename, query):
    """
    Save the story data to a CSV file, including a 'collection_ids' column and the 'query'.

    Args:
        stories: iterable of story dictionaries; keys missing from a story
            are written as empty cells. The dictionaries are not modified.
        filename: path of the CSV file to create (overwritten if it exists).
        query: value written to the 'query' column of every row.

    Errors while writing are logged, not raised (best-effort save).
    """
    fieldnames = [
        "stories_id",
        "title",
        "publish_date",
        "url",
        "language",
        "media_id",
        "media_name",
        "query",
        "collection_ids"
    ]

    try:
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()

            for story in stories:
                # Build the row without writing into the caller's dictionary
                row_data = {field: story.get(field, "") for field in fieldnames}

                # The recorded exports show 'stories_id' empty on every row.
                # NOTE(review): Search API story records appear to carry their
                # identifier under 'id' — confirm against the mediacloud docs.
                if row_data["stories_id"] in ("", None):
                    row_data["stories_id"] = story.get("id", "")

                # Include the query in each row
                row_data["query"] = query
                writer.writerow(row_data)

        logging.info(f"Successfully wrote {len(stories)} stories to {filename}.")
    except Exception as e:
        logging.error(f"Error writing to CSV '{filename}': {e}")

def copy_to_drive(filename, destination_folder):
    """
    Attempt to copy the file from Colab's /content/ to the given Google Drive folder.

    The file is looked up at /content/<filename> first; if it is not there,
    'filename' itself is used as the path, since the CSV writer in this
    notebook creates files relative to the current working directory. The
    destination folder is created when missing: shutil.copy would otherwise
    write a regular file named like the folder.

    Failures are logged, not raised (best-effort backup).
    """
    import os  # local import: only this helper needs it

    colab_path = f"/content/{filename}"
    source_path = colab_path if os.path.exists(colab_path) else filename
    try:
        os.makedirs(destination_folder, exist_ok=True)
        shutil.copy(source_path, destination_folder)
        logging.info(f"File '{filename}' copied to Google Drive at '{destination_folder}'.")
    except Exception as e:
        logging.error(f"Could not copy file '{filename}' to Google Drive: {e}")

# ----------------------------------------------------------------------------
# MAIN EXECUTION LOGIC
# ----------------------------------------------------------------------------

def main():
    """
    Collect stories for SEARCH_KEYWORD over the configured date range.

    The API is probed first with a one-day test query; the run stops if that
    fails. The range is then walked month by month and, inside each month,
    day by day, pausing 55 seconds after every day. Each month is written to
    its own CSV and copied to Drive; a CSV covering the entire range is
    written and copied at the end.
    """
    # Build the MediaCloud Search API client
    client = mediacloud.api.SearchApi(API_KEY)

    # Probe the API before starting the long run
    try:
        client.story_list(
            query="test",
            start_date=parse_date(START_DATE_STR),
            end_date=parse_date(START_DATE_STR),
            page_size=1,
        )
        logging.info("MediaCloud API connection successful.")
    except Exception as err:
        logging.error(f"Could not validate MediaCloud API connection: {err}")
        return

    # Month-sized pieces of the overall range
    months = generate_monthly_date_ranges(START_DATE_STR, END_DATE_STR)
    logging.info(f"Will process data in {len(months)} monthly chunk(s).")

    # File-name parts shared by every monthly CSV
    keyword_slug = SEARCH_KEYWORD.lower().replace(' ', '_')
    location_slug = SEARCH_LOCATION.lower()

    collected = []

    for month_start, month_end in months:
        logging.info(f"Processing month {month_start} to {month_end}...")

        stories_this_month = []

        # One request (plus any extra pages) per day of the month
        for day_start, day_end in generate_daily_date_ranges(month_start, month_end):
            logging.info(f"  Fetching stories for {day_start} (one-day range)...")
            stories_this_month.extend(
                fetch_stories(client, SEARCH_KEYWORD, day_start, day_end, COLLECTION_IDS)
            )

            # The pause happens after every day, however many pages it took
            logging.info(f"Pausing 55 seconds at the end of day {day_start}.")
            print(f"Pausing 55 seconds at the end of day {day_start}...")
            time.sleep(55)

        # Monthly file: "keyword_location_YYYY-MM_mediacloud.csv"
        month_label = f"{month_start.year}-{month_start.month:02d}"
        monthly_csv_filename = f"{keyword_slug}_{location_slug}_{month_label}_mediacloud.csv"

        save_stories_to_csv(stories_this_month, monthly_csv_filename, SEARCH_KEYWORD)
        # Back the monthly file up to Google Drive
        copy_to_drive(monthly_csv_filename, GDRIVE_PATH)

        # Keep the month's stories for the full-range export
        collected.extend(stories_this_month)

    # Full-range export once every month has been processed
    logging.info(f"Now saving final CSV for the entire range {START_DATE_STR} - {END_DATE_STR}...")
    save_stories_to_csv(collected, FINAL_CSV_FILENAME, SEARCH_KEYWORD)
    copy_to_drive(FINAL_CSV_FILENAME, GDRIVE_PATH)

    logging.info(
        f"Done! Retrieved {len(collected)} total stories "
        f"from {START_DATE_STR} to {END_DATE_STR}."
    )


# ----------------------------------------------------------------------------
# RUN
# ----------------------------------------------------------------------------

# Start the collection run when this code is executed as the main module.
if __name__ == "__main__":
    main()


Fetched 12 stories in this request; 12 total so far (range 2024-06-01 - 2024-06-01).
Pausing 55 seconds at the end of day 2024-06-01...


ERROR:root:Error during API fetch (range 2024-06-02 - 2024-06-02): API Server Error 403. Params: {'start': '2024-06-02', 'end': '2024-06-02', 'q': 'intelligence artificielle', 'platform': 'onlinenews-mediacloud', 'cs': ('34412146,38379799',), 'page_size': 1000}


Pausing 55 seconds at the end of day 2024-06-02...
Fetched 24 stories in this request; 24 total so far (range 2024-06-03 - 2024-06-03).
Pausing 55 seconds at the end of day 2024-06-03...
Fetched 49 stories in this request; 49 total so far (range 2024-06-04 - 2024-06-04).
Pausing 55 seconds at the end of day 2024-06-04...
Fetched 30 stories in this request; 30 total so far (range 2024-06-05 - 2024-06-05).
Pausing 55 seconds at the end of day 2024-06-05...
Fetched 24 stories in this request; 24 total so far (range 2024-06-06 - 2024-06-06).
Pausing 55 seconds at the end of day 2024-06-06...
Fetched 18 stories in this request; 18 total so far (range 2024-06-07 - 2024-06-07).
Pausing 55 seconds at the end of day 2024-06-07...
Fetched 15 stories in this request; 15 total so far (range 2024-06-08 - 2024-06-08).
Pausing 55 seconds at the end of day 2024-06-08...
Fetched 6 stories in this request; 6 total so far (range 2024-06-09 - 2024-06-09).
Pausing 55 seconds at the end of day 2024-06-09...

In [None]:
import pandas as pd

# Load the full-range CSV from the France Drive folder and preview its first
# rows. The filename matches the one the collection cell builds from the
# keyword, location and date range.
df = pd.read_csv('/content/drive/MyDrive/Mediacloud/France/intelligence_artificielle_france_2024-06-01_2025-06-01_mediacloud.csv')
df.head()

Unnamed: 0,stories_id,title,publish_date,url,language,media_id,media_name,query,collection_ids
0,,"Paul Jorion, psychanalyste – Mise à jour – Blo...",2024-06-01,https://www.pauljorion.com/blog/2024/06/01/pau...,fr,,pauljorion.com,intelligence artificielle,"[34412146, 38379799]"
1,,Les carnets du psychanalyste – Veiller – Blog ...,2024-06-01,https://www.pauljorion.com/blog/2024/06/01/les...,fr,,pauljorion.com,intelligence artificielle,"[34412146, 38379799]"
2,,L'IA se prépare pour aller à la guerre,2024-06-01,https://www.sciencesetavenir.fr/high-tech/inte...,fr,,sciencesetavenir.fr,intelligence artificielle,"[34412146, 38379799]"
3,,N'attendez pas pour profiter de cette offre su...,2024-06-01,https://www.bfmtv.com/tech/bons-plans/n-attend...,fr,,bfmtv.com,intelligence artificielle,"[34412146, 38379799]"
4,,Ce smartphone Xiaomi à moins de 110 euros fait...,2024-06-01,https://www.bfmtv.com/tech/bons-plans/ce-smart...,fr,,bfmtv.com,intelligence artificielle,"[34412146, 38379799]"


In [None]:
# Number of rows in the loaded CSV (one row per exported story).
len(df)

8172