In [None]:
# ========================================
#  SETUP: Google Drive & Installations
# ========================================
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used in later cells
# all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install news-please
!pip install tinysegmenter
#!pip install newspaper4k[all]

Collecting news-please
  Downloading news_please-1.6.16-py3-none-any.whl.metadata (2.8 kB)
Collecting Scrapy>=1.1.0 (from news-please)
  Downloading scrapy-2.13.3-py3-none-any.whl.metadata (4.4 kB)
Collecting PyMySQL>=0.7.9 (from news-please)
  Downloading pymysql-1.1.2-py3-none-any.whl.metadata (4.3 kB)
Collecting psycopg2-binary>=2.8.4 (from news-please)
  Downloading psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting hjson>=1.5.8 (from news-please)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting elasticsearch>=2.4 (from news-please)
  Downloading elasticsearch-9.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting readability-lxml>=0.6.2 (from news-please)
  Downloading readability_lxml-0.8.4.1-py3-none-any.whl.metadata (4.0 kB)
Collecting langdetect>=1.0.7 (from news-please)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m 

In [None]:
import pandas as pd
from newsplease import NewsPlease

def process_csv_with_newsplease(input_df, url_column, output_csv_path=None):
    """
    Enrich a DataFrame of URLs with article data extracted by news-please.

    Every row whose ``url_column`` value is non-empty is fetched with
    ``NewsPlease.from_url`` and the extracted fields are written into the
    columns title, maintext, authors, date_publish, description, language and
    image_url of a copy of ``input_df``. Rows whose URL is missing, could not
    be extracted, or raised an error keep their existing values.

    Parameters
    ----------
    input_df : pd.DataFrame
        A DataFrame that must have a column with URLs. It is not modified.
    url_column : str
        The name of the column in 'input_df' that contains the URLs.
    output_csv_path : str, optional
        If provided, the updated DataFrame will be saved to this path as CSV
        (without the index).

    Returns
    -------
    pd.DataFrame
        A copy of ``input_df`` with the article columns added or updated.
    """

    # Work on a copy so the caller's DataFrame is left untouched
    df = input_df.copy()

    # Article metadata fields to store; each name is both the DataFrame column
    # and the attribute read from the news-please article object.
    new_columns = [
        "title",
        "maintext",
        "authors",
        "date_publish",
        "description",
        "language",
        "image_url"
    ]

    # Ensure these columns exist in the DataFrame (fill with None if missing)
    for col in new_columns:
        if col not in df.columns:
            df[col] = None

    for idx, row in df.iterrows():
        url = row.get(url_column, None)

        # Skip rows without a usable URL (None, NaN or empty string)
        if not url or pd.isna(url):
            continue

        try:
            article = NewsPlease.from_url(url)

            # A failed download can come back as None *or* as an empty dict,
            # so test for any falsy result instead of only `is None`;
            # otherwise the attribute access below fails with
            # "'dict' object has no attribute 'title'".
            if not article:
                print(f"Warning: Could not extract article for URL: {url}")
                continue

            # Read every field first so a missing attribute cannot leave the
            # row half-updated, then write them into the row.
            extracted = {col: getattr(article, col) for col in new_columns}
            for col, value in extracted.items():
                df.at[idx, col] = value

        except Exception as e:
            # Best effort: report the failure and carry on with the next URL
            print(f"Error processing URL ({url}): {e}")
            continue

    # If requested, save the updated DataFrame to CSV
    if output_csv_path:
        df.to_csv(output_csv_path, index=False)
        print(f"Updated DataFrame saved to: {output_csv_path}")

    return df


# ------------------------ EXAMPLE USAGE IN COLAB ------------------------ #
# 1) Import your CSV into a DataFrame (e.g., from Google Drive):
#    import pandas as pd
#    df_news = pd.read_csv("/content/drive/MyDrive/path_to_your_file.csv")
#
# 2) Call the function, specifying:
#    - The DataFrame (df_news)
#    - The column name containing URLs (e.g., "url")
#    - The desired output CSV path (optional)
#
#    updated_df = process_csv_with_newsplease(df_news, "url", "updated_articles.csv")
#
# 3) 'updated_df' now contains new columns (title, maintext, etc.) in the same rows.
#    Inspect the result:
#    updated_df.head()


In [None]:
import pandas as pd
from newsplease import NewsPlease
import threading
import time

def process_url_with_timeout(url, timeout=180):
    """
    Extract one article with news-please, giving up after ``timeout`` seconds.

    The extraction runs in a background thread that is joined for at most
    ``timeout`` seconds. A thread cannot be stopped from outside, so on
    timeout the worker is abandoned and whatever it produces later is ignored.

    Parameters
    ----------
    url : str
        The article URL to fetch.
    timeout : float, optional
        Maximum number of seconds to wait for the extraction (default 180).

    Returns
    -------
    tuple
        ``(article_data, exception)``:

        * ``(dict, None)`` with the keys title, maintext, authors,
          date_publish, description, language and image_url on success;
        * ``({}, None)`` when news-please returned nothing for the URL;
        * ``({}, exc)`` when the extraction raised ``exc``;
        * ``(None, None)`` when the timeout elapsed first.
    """
    article_data = {}
    exception = None

    def target():
        # Runs in the worker thread; reports back through the enclosing locals.
        nonlocal article_data, exception
        try:
            article = NewsPlease.from_url(url)
            if article:
                article_data = {
                    "title": article.title,
                    "maintext": article.maintext,
                    "authors": article.authors,
                    "date_publish": article.date_publish,
                    "description": article.description,
                    "language": article.language,
                    "image_url": article.image_url
                }
            else:
                print(f"Warning: Could not extract article for URL: {url}")
        except Exception as e:
            exception = e
            print(f"Error processing URL ({url}): {e}")

    # daemon=True: an abandoned worker (one that outlived its timeout) must
    # not keep the interpreter alive or block shutdown while it hangs on a
    # dead connection.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        print(f"Warning: Timeout processing URL: {url}")
        # The worker is still running, i.e. the processing timed out.
        # It cannot be forcefully stopped, so its eventual result is skipped.
        return None, None

    return article_data, exception


def process_csv_with_newsplease_timeout(input_df, url_column, output_csv_path=None, timeout_seconds=180):
    """
    Enrich a DataFrame of URLs with news-please article data, bounding the
    time spent on each URL.

    Each non-empty URL is handed to ``process_url_with_timeout``; when that
    call yields article data, the fields are written into the matching row of
    a copy of ``input_df``. Rows without data are left as they were.

    Parameters
    ----------
    input_df : pd.DataFrame
        A DataFrame that must have a column with URLs. It is not modified.
    url_column : str
        The name of the column in 'input_df' that contains the URLs.
    output_csv_path : str, optional
        If provided, the resulting DataFrame is written to this path as CSV
        (without the index).
    timeout_seconds : int, optional
        Maximum number of seconds to wait for a single URL.

    Returns
    -------
    pd.DataFrame
        A copy of ``input_df`` with the article columns added or updated.
    """
    article_fields = (
        "title",
        "maintext",
        "authors",
        "date_publish",
        "description",
        "language",
        "image_url",
    )

    # Operate on a copy; the caller's frame stays as it was.
    enriched = input_df.copy()

    # Add any article column that is not there yet, initialised to None.
    for field in article_fields:
        if field not in enriched.columns:
            enriched[field] = None

    for label, record in enriched.iterrows():
        link = record.get(url_column, None)

        # Nothing to fetch for a missing / NaN / empty URL.
        if not link or pd.isna(link):
            continue

        # The second element (the exception, if any) is not used here.
        fetched, _ = process_url_with_timeout(link, timeout=timeout_seconds)
        if not fetched:
            continue

        for field in article_fields:
            enriched.at[label, field] = fetched.get(field)

    # Optionally persist the result.
    if output_csv_path:
        enriched.to_csv(output_csv_path, index=False)
        print(f"Updated DataFrame saved to: {output_csv_path}")

    return enriched

# ------------------------ EXAMPLE USAGE IN COLAB ------------------------ #
# 1) Import your CSV into a DataFrame (e.g., from Google Drive):
#    import pandas as pd
#    df_news = pd.read_csv("/content/drive/MyDrive/path_to_your_file.csv")
#
# 2) Call the function, specifying:
#    - The DataFrame (df_news)
#    - The column name containing URLs (e.g., "url")
#    - The desired output CSV path (optional)
#    - The timeout in seconds (optional, defaults to 180)
#
#    updated_df = process_csv_with_newsplease_timeout(df_news, "url", "updated_articles_with_timeout.csv", timeout_seconds=120)
#
# 3) 'updated_df' now contains new columns (title, maintext, etc.) in the same rows.
#    Inspect the result:
#    updated_df.head()

In [None]:
import pandas as pd

# Load the MediaCloud story list stored under Mediacloud/Russia on Drive.
# The commented-out line below is the equivalent load for the UK file.
df = pd.read_csv('/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia_2024-06-01_2025-06-01_mediacloud.csv')
#df = pd.read_csv('/content/drive/MyDrive/Mediacloud/UK/artificial_intelligence_uk_2024-06-01_2025-06-01_mediacloud.csv')
len(df)

7812

In [None]:
# Drop the last 4312 rows and renumber the index from 0.
# NOTE: this rebinds `df`, so re-running the cell trims another 4312 rows.
df = df.iloc[:-4312].reset_index(drop=True)
len(df)

3500

In [None]:
# Drop the first 2500 rows and renumber the index from 0.
# NOTE: this rebinds `df`, so re-running the cell skips another 2500 rows.
df = df.iloc[2500:].reset_index(drop=True)
len(df)

1000

In [None]:
# Draw 1000 rows at random; random_state=42 makes the draw repeatable for the
# same input frame. The sampled rows keep their original index labels.
df = df.sample(n=1000, random_state=42)
len(df)

1000

In [None]:
import pandas as pd

# Reload the full story list from Mediacloud/Russia, replacing whatever `df`
# held before.
df = pd.read_csv('/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia_2024-06-01_2025-06-01_mediacloud.csv')
len(df)

7812

In [None]:
# Draw 2000 rows at random; random_state=42 makes the draw repeatable for the
# same input frame. The sampled rows keep their original index labels.
df = df.sample(n=2000, random_state=42)
len(df)

2000

In [None]:
import pandas as pd

# Load four previously saved CSVs from Mediacloud/Japan: the "cleaned ... frames"
# file into `df` and the three "addition" files into df2-df4.
# NOTE(review): judging by the file names, only the first file has been
# frame-coded — confirm that df2-df4 carry a "frame" column before filtering on it.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_cleaned_2024-06-01_2025-06-01_frames_newsplease_mediacloud_gemini_2-5.csv")
df2 = pd.read_csv("/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_addition_2024-06-01_2025-06-01_newsplease_mediacloud.csv")
df3 = pd.read_csv("/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_addition2_2024-06-01_2025-06-01_newsplease_mediacloud.csv")
df4 = pd.read_csv("/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_addition3_2024-06-01_2025-06-01_newsplease_mediacloud.csv")
len(df)

975

In [None]:
# Stack the four frames into one with a fresh 0..n-1 index.
# NOTE: this rebinds `df`, so re-running the cell appends df2-df4 again.
df = pd.concat([df, df2, df3, df4], ignore_index=True)
len(df)

5815

In [None]:
# Discard rows whose frame label is "error" or "Not AI related".
df = df[~df["frame"].isin(["error", "Not AI related"])]
# Shortfall from 1000 rows, multiplied by 8.
# NOTE(review): presumably the factor 8 oversamples to allow for failed
# downloads — confirm.
missing = 8 * (1000 - len(df))
len(df)

975

In [None]:
# Undo the *8 scaling applied when `missing` was computed, showing the plain
# shortfall from 1000 rows.
missing/8

25.0

In [None]:
# Load the complete (unsampled) story list from Mediacloud/Japan.
df_full = pd.read_csv('/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_2024-06-01_2025-06-01_mediacloud.csv')
len(df_full)

102144

In [None]:
# Keep one row per title from the full list.
df_full_unique = df_full.drop_duplicates(subset=["title"])
# Titles already present in `df`; these are excluded from the candidates.
used_articles = set(df["title"].values)
df_candidates = df_full_unique[~df_full_unique["title"].isin(used_articles)]

# Draw `missing` not-yet-used rows with a fixed seed so the draw is repeatable.
# sample() raises ValueError if fewer than `missing` candidates remain.
df_new = df_candidates.sample(n=missing, random_state=42)
len(df_new)

200

In [None]:
# Run the timeout-guarded extraction over the "url" column of `df`, waiting at
# most 180 seconds per URL. The CSV is written to a relative path, i.e. into
# the current working directory, only after every URL has been processed.
df_updated = process_csv_with_newsplease_timeout(df, "url", "artificial_intelligence_2024-06-01_2025-06-01_newsplease_mediacloud.csv", 180)

ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 403








ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 403




ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://www.tvc.ru/news/307094 HTTPSConnectionPool(host='www.tvc.ru', port=443): Max retries exceeded with url: /news/307094 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963fc52ae70>, 'Connection to www.tvc.ru timed out. (connect timeout=None)'))




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://www.tvc.ru/news/307550 HTTPSConnectionPool(host='www.tvc.ru', port=443): Max retries exceeded with url: /news/307550 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963f78a7f50>, 'Connection to www.tvc.ru timed out. (connect timeout=None)'))




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 403




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 403




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://www.tvc.ru/news/305273 HTTPSConnectionPool(host='www.tvc.ru', port=443): Max retries exceeded with url: /news/305273 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963fc1d99a0>, 'Connection to www.tvc.ru timed out. (connect timeout=None)'))




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 504




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 403




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 403




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 403




ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://orenburzhie.ru/news/dve-tysyachi-par-obuvi-konfiskovano-v-orenburgskix-magazinax-posle-proverki-rospotrebnadzora/ HTTPSConnectionPool(host='orenburzhie.ru', port=443): Max retries exceeded with url: /news/dve-tysyachi-par-obuvi-konfiskovano-v-orenburgskix-magazinax-posle-proverki-rospotrebnadzora/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963fff3bf80>, 'Connection to orenburzhie.ru timed out. (connect timeout=None)'))




ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://www.sovsibir.ru/news/178890 HTTPSConnectionPool(host='www.sovsibir.ru', port=443): Max retries exceeded with url: /news/178890 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963f78d9c10>, 'Connection to www.sovsibir.ru timed out. (connect timeout=None)'))




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404




ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://orenburzhie.ru/news/obyavleny-itogi-xiii-konkursa-vmeste-v-cifrovoe-budushhee/ HTTPSConnectionPool(host='orenburzhie.ru', port=443): Max retries exceeded with url: /news/obyavleny-itogi-xiii-konkursa-vmeste-v-cifrovoe-budushhee/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963f764ab40>, 'Connection to orenburzhie.ru timed out. (connect timeout=None)'))




ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://www.tvc.ru/news/315054 HTTPSConnectionPool(host='www.tvc.ru', port=443): Max retries exceeded with url: /news/315054 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963f71f8ce0>, 'Connection to www.tvc.ru timed out. (connect timeout=None)'))




ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404








ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://www.tvc.ru/news/306451 HTTPSConnectionPool(host='www.tvc.ru', port=443): Max retries exceeded with url: /news/306451 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963f6f85c70>, 'Connection to www.tvc.ru timed out. (connect timeout=None)'))




ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://www.tvc.ru/news/306231 HTTPSConnectionPool(host='www.tvc.ru', port=443): Max retries exceeded with url: /news/306231 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7963f6819310>, 'Connection to www.tvc.ru timed out. (connect timeout=None)'))


In [None]:
# Run the extraction without a per-URL timeout over the "url" column of `df`.
# The CSV is written to a relative path (current working directory) only after
# every URL has been processed, so interrupting the cell saves nothing and
# leaves `df_updated` at its previous value.
df_updated = process_csv_with_newsplease(df, "url", "artificial_intelligence_2024-06-01_2025-06-01_newsplease_mediacloud.csv")

ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404


Error processing URL (https://izvmor.ru/novosti-partnerov/sber-otkryl-priyom-zayavok-na-sorevnovanie-ai-journey-contest-s-prizovym-fondom-85-mln-rublej/): 'dict' object has no attribute 'title'


ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404


Error processing URL (https://kamchatka.aif.ru/society/dushevno-i-po-delu-bank-psb-na-kamchatke-delaet-stavku-na-zhivoe-obshchenie?erid=LjN8KQZxa): 'dict' object has no attribute 'title'


ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404


Error processing URL (https://www.avtoradio.ru/news/uid/463278): 'dict' object has no attribute 'title'


ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404


Error processing URL (https://mkset.ru/news/2024-12-24/novyy-metod-diagnostiki-raka-grudi-otkryvaet-novye-vozmozhnosti-lecheniya-5283516): 'dict' object has no attribute 'title'


ERROR:newsplease.crawler.simple_crawler:not a 200 response: 404


Error processing URL (https://www.avtoradio.ru/news/uid/463298): 'dict' object has no attribute 'title'


ERROR:newsplease.crawler.simple_crawler:not a 200 response: 504


Error processing URL (https://oblgazeta.ru/society/education/2024/10/67379/): 'dict' object has no attribute 'title'


ERROR:newsplease.crawler.simple_crawler:not a 200 response: 502


Error processing URL (https://oblgazeta.ru/industry-and-economy/economy/2024/07/56487/): 'dict' object has no attribute 'title'


ERROR:newsplease.crawler.simple_crawler:not a 200 response: 502


Error processing URL (https://oblgazeta.ru/pressreleases/2025/05/99425/): 'dict' object has no attribute 'title'


KeyboardInterrupt: 

In [None]:
# Number of rows in the extraction result.
len(df_updated)

2000

In [None]:
# Discard rows whose frame label is "error" or "Not AI related".
df = df[~df["frame"].isin(["error", "Not AI related"])]
len(df)

335

In [None]:
# Keep only rows for which an article body ('maintext') is present.
df_cleaned = df_updated.dropna(subset=['maintext'])
len(df_cleaned)

1948

In [None]:
# Remove rows whose 'maintext' repeats an earlier row, keeping the first occurrence.
df_cleaned = df_cleaned.drop_duplicates(subset=['maintext'], keep='first')
len(df_cleaned)

1889

In [None]:
# Sizes of the newly cleaned rows and of `df`, then their sum, one per line.
print(len(df_cleaned), len(df), len(df) + len(df_cleaned), sep="\n")

1889
335
2224


In [None]:
# Stack `df` and the newly cleaned rows, then keep only rows that have an
# article body and whose body has not already appeared.
df_final = (
    pd.concat([df, df_cleaned], ignore_index=True)
    .dropna(subset=['maintext'])
    .drop_duplicates(subset=['maintext'], keep='first')
)
len(df_final)

2224

In [None]:
# Drop rows where 'maintext' is None or NaN.
# NOTE: this rebinds `df_cleaned` from `df_updated`, discarding any earlier
# de-duplication of `df_cleaned`.
df_cleaned = df_updated.dropna(subset=['maintext'])

len(df_cleaned)

1750

In [None]:
# Drop duplicate rows based on the 'maintext' column, keeping the first
# occurrence of each article body.
df_cleaned = df_cleaned.drop_duplicates(subset=['maintext'], keep='first')

len(df_cleaned)

1413

In [None]:
#df_cleaned = df_cleaned.sample(n=1000, random_state=42)
#len(df_cleaned)

1000

In [None]:
# Write the cleaned rows (without the index) to a relative path, i.e. into the
# current working directory of the notebook.
df_cleaned.to_csv("cleaned_artificial_intelligence_2024-06-01_2025-06-01_newsplease_mediacloud.csv", index=False)

In [None]:
import shutil

# Define source (in Colab environment) and destination (in your Google Drive).
# NOTE(review): source_path assumes the cleaned CSV was written into /content,
# i.e. that /content is the notebook's working directory — confirm.
source_path = '/content/cleaned_artificial_intelligence_2024-06-01_2025-06-01_newsplease_mediacloud.csv'
destination_path = '/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia_2024-06-01_2025-06-01_newsplease_mediacloud.csv'
#destination_path = '/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_addition4_2024-06-01_2025-06-01_newsplease_mediacloud.csv'
#destination_path = '/content/drive/MyDrive/Mediacloud/UK/artificial_intelligence_uk_full2_2024-06-01_2025-06-01_newsplease_mediacloud.csv'

# Copy the file; an existing file at destination_path is overwritten.
shutil.copy(source_path, destination_path)

'/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia_2024-06-01_2025-06-01_newsplease_mediacloud.csv'

In [None]:
# Load the five numbered batch files (russia1 .. russia5) saved under
# Mediacloud/Russia on Drive.
df1 = pd.read_csv('/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia1_2024-06-01_2025-06-01_newsplease_mediacloud.csv')
df2 = pd.read_csv('/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia2_2024-06-01_2025-06-01_newsplease_mediacloud.csv')
df3 = pd.read_csv('/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia3_2024-06-01_2025-06-01_newsplease_mediacloud.csv')
df4 = pd.read_csv('/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia4_2024-06-01_2025-06-01_newsplease_mediacloud.csv')
df5 = pd.read_csv('/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia5_2024-06-01_2025-06-01_newsplease_mediacloud.csv')
# Row count of each batch, one per line, followed by the combined total.
print(*(len(batch) for batch in (df1, df2, df3, df4, df5)), sep="\n")
print(sum(len(batch) for batch in (df1, df2, df3, df4, df5)))

264
376
438
262
429
1769


In [None]:
# Stack the five batches, then keep only rows that have an article body and
# whose body has not already appeared in an earlier row.
df = (
    pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
    .dropna(subset=['maintext'])
    .drop_duplicates(subset=['maintext'], keep='first')
)
len(df)

1765

In [None]:
# Write the merged frame (without the index) to Drive.
# NOTE: this is the same path an earlier cell copies the single-batch cleaned
# CSV to, so whichever cell runs last determines the file's contents.
output_path_cleaned = "/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia_2024-06-01_2025-06-01_newsplease_mediacloud.csv"
df.to_csv(output_path_cleaned, index=False)
len(df)

1765

In [None]:
# List the column names of the current frame.
df.columns

Index(['stories_id', 'title', 'publish_date', 'url', 'language', 'media_id',
       'media_name', 'query', 'collection_ids', 'maintext', 'authors',
       'date_publish', 'description', 'image_url'],
      dtype='object')

In [None]:
# Preview the first five rows of the current frame.
df.head()

Unnamed: 0,stories_id,title,publish_date,url,language,media_id,media_name,query,collection_ids,maintext,authors,date_publish,description,image_url
0,,‘We made the Maldives from a hotel in Heathrow...,2024-07-06,https://www.theguardian.com/film/article/2024/...,en,,theguardian.com,artificial intelligence,"[34412476, 38381111]",The script called for a tree: a magical kind t...,"['Rebecca Liu', 'www.theguardian.com', 'rebecc...",2024-07-06 10:55:35,Globe-trotting in search of picture-perfect sc...,https://i.guim.co.uk/img/media/2a79ee79f476f64...
1,,More cuts and a merger with Channel 4: the BBC...,2024-08-30,https://www.theguardian.com/media/article/2024...,en,,theguardian.com,artificial intelligence,"[34412476, 38381111]",Within days of her appointment as culture secr...,"['Jim Waterson', 'www.theguardian.com', 'jim-w...",2024-08-30 14:00:07,"With 500,000 households a year cancelling thei...",https://i.guim.co.uk/img/media/3618d814963e9f5...
2,,Robot rugby may be a way off but art of coachi...,2025-03-25,https://www.theguardian.com/sport/2025/mar/25/...,en,,theguardian.com,artificial intelligence,"[34412476, 38381111]",Once upon a time coaching sport was deceptivel...,"['Robert Kitson', 'www.theguardian.com']",2025-03-25 10:38:28,As data reshapes coaching and tactics at club ...,https://i.guim.co.uk/img/media/6cb3935abb0b09b...
3,,Client Challenge,2024-12-10,https://www.ft.com/content/cf319e9a-575a-4c27-...,en,,ft.com,artificial intelligence,"[34412476, 38381111]",A required part of this site couldn’t load. Th...,[],,,https://www.ft.com/content/cf319e9a-575a-4c27-...
4,,Inside the hunt for 'Britain's Atlantis' lost ...,2025-03-03,https://www.thesun.co.uk/tech/33533471/britain...,en,,thesun.co.uk,artificial intelligence,"[34412476, 38381111]",BRITAIN wasn't always an island. At the end of...,"['Millie Turner', 'www.facebook.com']",2025-03-03 15:57:40,BRITAIN wasn't always an island. At the end of...,https://www.thesun.co.uk/wp-content/uploads/20...


In [None]:
from newsplease import NewsPlease

# Choose any news article URL
url = "https://www.bbc.com/news/articles/c05768jmm11o"

# Fetch the article data
article = NewsPlease.from_url(url)

# A failed download yields a falsy result (None or an empty dict), on which
# vars() would raise TypeError, so report that case instead of crashing.
if not article:
    print(f"Warning: Could not extract article for URL: {url}")
else:
    # Print all the keys/values in the article object
    print("The article object has these attributes:\n")
    for key, value in vars(article).items():
        print(f"{key}:", value)


The article object has these attributes:

authors: ['Kayla Epstein']
date_download: 2025-05-23 14:13:01
date_modify: None
date_publish: 2025-05-22 18:20:28
description: The move escalates the administration's row with America's oldest university over hiring, admissions and teaching practices.
filename: https%3A%2F%2Fwww.bbc.com%2Fnews%2Farticles%2Fc05768jmm11o.json
image_url: https://ichef.bbci.co.uk/news/1024/branded_news/9503/live/5954d570-373d-11f0-a5ca-3b23cbd75bec.jpg
language: en
localpath: None
title: Trump administration ends Harvard's ability to enrol international students
title_page: None
title_rss: None
source_domain: www.bbc.com
maintext: "We are fully committed to maintaining Harvard's ability to host our international students and scholars, who hail from more than 140 countries and enrich the University – and this nation – immeasurably," the university responded.
"We are working quickly to provide guidance and support to members of our community. This retaliatory action 