In [5]:
# Set up your API key in .env file, and import needed things
import os
import datetime as dt
from importlib.metadata import version

import mediacloud.api
from dotenv import load_dotenv
import pandas as pd
import difflib

# Sentiment Analysis
import nltk
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# URL-related libraries
from newspaper import Article
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
import re
import concurrent.futures


load_dotenv()
api_key = os.getenv("MC_API_KEY")
search_api = mediacloud.api.SearchApi(api_key)
directory_api = mediacloud.api.DirectoryApi(api_key)
f"Using Media Cloud python client v{version('mediacloud')}"

'Using Media Cloud python client v4.4.0'

# Querying boston globe climate solutions journalism article from url

In [2]:
my_query = 'url:"https://www.bostonglobe.com/2025/03/03/world/south-sudan-coffee-beans-climate-change/"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6, 10)
search_api.story_list(my_query, start_date, end_date)

([{'id': 'bbe590fcc23edb5d310ef99dd1dbddeb1223ff634588d2043ba18e80789700cf',
   'indexed_date': datetime.datetime(2025, 3, 3, 11, 28, 11, 454314, tzinfo=datetime.timezone.utc),
   'language': 'en',
   'media_name': 'bostonglobe.com',
   'media_url': 'bostonglobe.com',
   'publish_date': datetime.date(2025, 3, 3),
   'title': 'As warming climate hammers coffee crops, this rare bean may someday be your brew',
   'url': 'https://www.bostonglobe.com/2025/03/03/world/south-sudan-coffee-beans-climate-change/'}],
 None)

# Extracting fulltext data of the boston globe article

In [3]:
boston_globe_article = search_api.story("bbe590fcc23edb5d310ef99dd1dbddeb1223ff634588d2043ba18e80789700cf")["text"]
with open("../data/boston_globe_article.txt", "w") as file:
    file.write(boston_globe_article)

# Sentiment analysis

In [4]:
with open("../data/boston_globe_article.txt", "r") as f:
    text = f.read()

blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Polarity: {sentiment.polarity:.2f}")

print(f"Subjectivity: {sentiment.subjectivity:.2f}")

try:
    analyzer = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon")
    analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores(text)

# Print the scores
print(scores)

# Interpret the compound score
compound_score = scores["compound"]
if compound_score >= 0.05:
    sentiment = "Positive"
elif compound_score <= -0.05:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

print(f"Overall sentiment: {sentiment}")

Polarity: 0.08
Subjectivity: 0.40
{'neg': 0.082, 'neu': 0.83, 'pos': 0.088, 'compound': 0.2392}
Overall sentiment: Positive


# Repeating for Yale 360 article, but using Newspaper3k to scrape, because it was not found using the search API

In [5]:
url = "https://e360.yale.edu/features/eric-nost-interview"
article = Article(url)
article.download()
article.parse()

with open("../data/yale_360_article.txt", "w", encoding="utf-8") as f:
    f.write(article.title + "\n\n")
    f.write(article.text)

blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Polarity: {sentiment.polarity:.2f}")

print(f"Subjectivity: {sentiment.subjectivity:.2f}")

# nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores(text)

# Print the scores
print(scores)

# Interpret the compound score
compound_score = scores["compound"]
if compound_score >= 0.05:
    sentiment = "Positive"
elif compound_score <= -0.05:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

print(f"Overall sentiment: {sentiment}")

Polarity: 0.08
Subjectivity: 0.40
{'neg': 0.082, 'neu': 0.83, 'pos': 0.088, 'compound': 0.2392}
Overall sentiment: Positive


# Repeating for BBC article

In [6]:
# url = 'https://www.bbc.com/news/articles/ce82p6yq061o'
# url = 'https://www.bbc.com/news/articles/c1dekp93l6po'
url = "https://www.bbc.com/news/articles/cvgdyl817p1o"

my_query = f'url:"{url}"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6, 10)
story_info = search_api.story_list(my_query, start_date, end_date)[0][0]
story_id = story_info["id"]

bbc_article = search_api.story(story_id)["text"]

with open("../data/bbc_article.txt", "w") as file:
    file.write(bbc_article)

with open("../data/bbc_article.txt", "r") as f:
    text = f.read()

blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Polarity: {sentiment.polarity:.2f}")

print(f"Subjectivity: {sentiment.subjectivity:.2f}")


# nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores(text)

# Print the scores
print(scores)

# Interpret the compound score
compound_score = scores["compound"]
if compound_score >= 0.05:
    sentiment = "Positive"
elif compound_score <= -0.05:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

print(f"Overall sentiment: {sentiment}")

Polarity: 0.05
Subjectivity: 0.42
{'neg': 0.076, 'neu': 0.797, 'pos': 0.127, 'compound': 0.996}
Overall sentiment: Positive


From this, we can see that it is not necessarily always true that solutions journalism = positive sentiment and non-solutions journalism = negative content, as we have found a sample of non-solutions journalism that has positive sentiment. But we may need to consider sentiment as a feature when creating our classifier pipeline.

# Modularising - extract full text and sentiment analysis as two separate funtions

In [7]:
my_query = 'url:"https://www.bbc.com/news/articles/cvgdyl817p1o"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6, 10)
print(search_api.story_list(my_query, start_date, end_date)[0])

my_query = 'url:"https://e360.yale.edu/features/eric-nost-interview"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6, 10)
x = search_api.story_list(my_query, start_date, end_date)
if len(x[0]) == 0:
    print("not found")

[{'id': 'a4b9491b4f15c523ddd4121978664fd4c7a93d3909f99ba1ecfc9aa45b497675', 'indexed_date': datetime.datetime(2025, 6, 10, 21, 40, 17, 122666, tzinfo=datetime.timezone.utc), 'language': 'en', 'media_name': 'bbc.com', 'media_url': 'bbc.com', 'publish_date': datetime.date(2025, 6, 9), 'title': 'Homeowners warned over green energy scammers', 'url': 'https://www.bbc.com/news/articles/cvgdyl817p1o'}]
not found


In [8]:
def extract_full_text(url, output_file_name):
    my_query = f'url:"{url}"'
    start_date = dt.date(2025, 1, 1)
    end_date = dt.date(2025, 6, 10)

    article_text = ""

    try:
        print(search_api.story_list(my_query, start_date, end_date)[0][0])
        result = search_api.story_list(my_query, start_date, end_date)[0][0]
    except Exception as e:
        print(f"Search API failed: {e}")
        result = None

    if result:
        try:
            article_text = search_api.story(result["id"])["text"]
        except Exception as e:
            print(f"Error fetching from Media Cloud: {e}")
    else:
        try:
            article = Article(url)
            article.download()
            article.parse()
            article_text = article.title + "\n\n" + article.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
        except Exception as e:
            print(f"Error parsing article: {e}")

    out_file = f"{output_file_name}.txt"
    with open(out_file, "w", encoding="utf-8") as file:
        file.write(article_text)

    print(f"Article saved to {out_file}")


def compare_files(file1_path, file2_path):
    """Compares two text files and prints the differences."""
    with open(file1_path, "r") as file1, open(file2_path, "r") as file2:
        file1_lines = file1.readlines()
        file2_lines = file2.readlines()

    diff = difflib.unified_diff(file1_lines, file2_lines, fromfile=file1_path, tofile=file2_path)

    for line in diff:
        print(line, end="")


# def sentiment_analysis(file):
# VADER - pos/neg/neutral
# textblob - Polarity/Subjectivity

# Verifying that extract_full_text is working as expected

In [9]:
extract_full_text(
    "https://www.bostonglobe.com/2025/03/03/world/south-sudan-coffee-beans-climate-change/",
    "../data/boston_globe_article_f",
)
compare_files("../data/boston_globe_article_f.txt", "../data/boston_globe_article.txt")

{'id': 'bbe590fcc23edb5d310ef99dd1dbddeb1223ff634588d2043ba18e80789700cf', 'indexed_date': datetime.datetime(2025, 3, 3, 11, 28, 11, 454314, tzinfo=datetime.timezone.utc), 'language': 'en', 'media_name': 'bostonglobe.com', 'media_url': 'bostonglobe.com', 'publish_date': datetime.date(2025, 3, 3), 'title': 'As warming climate hammers coffee crops, this rare bean may someday be your brew', 'url': 'https://www.bostonglobe.com/2025/03/03/world/south-sudan-coffee-beans-climate-change/'}
Article saved to ../data/boston_globe_article_f.txt


In [10]:
extract_full_text("https://e360.yale.edu/features/eric-nost-interview", "../data/yale_360_article_f")
compare_files("../data/yale_360_article_f.txt", "../data/yale_360_article.txt")

Search API failed: list index out of range
Article saved to ../data/yale_360_article_f.txt


In [11]:
extract_full_text("https://www.bbc.com/news/articles/cvgdyl817p1o", "../data/bbc_article_f")
compare_files("../data/bbc_article_f.txt", "../data/bbc_article.txt")

{'id': 'a4b9491b4f15c523ddd4121978664fd4c7a93d3909f99ba1ecfc9aa45b497675', 'indexed_date': datetime.datetime(2025, 6, 10, 21, 40, 17, 122666, tzinfo=datetime.timezone.utc), 'language': 'en', 'media_name': 'bbc.com', 'media_url': 'bbc.com', 'publish_date': datetime.date(2025, 6, 9), 'title': 'Homeowners warned over green energy scammers', 'url': 'https://www.bbc.com/news/articles/cvgdyl817p1o'}
Article saved to ../data/bbc_article_f.txt


# Listing sources

In [12]:
my_query = "All"
start_date = dt.date(2024, 1, 1)
end_date = dt.date(2025, 6, 10)
source_list = search_api.sources(my_query, start_date, end_date)
for i in source_list:
    print(i["source"])

ttblogs.com
blogdomago.com
amarujala.com
yahoo.com
indiatimes.com
latestnigeriannews.com
benzinga.com
dailymail.co.uk
yardbarker.com
365project.org
hindustantimes.com
mirror.co.uk
globenewswire.com
thesun.co.uk
kdhnews.com
sktoday.com
noticiasya.com
freerepublic.com
india.com
express.co.uk
independent.co.uk
finanznachrichten.de
prnewswire.com
screenrant.com
forbes.com
wtop.com
metroseoul.co.kr
wvnews.com
cbsnews.com
niagarafallsreview.ca
wellandtribune.ca
thespec.com
thehindu.com
therecord.com
the-messenger.com
ctvnews.ca
indianexpress.com
upstract.com
nst.com.my
einpresswire.com
theguardian.com
kesq.com
sportingnews.com
livemint.com
keyt.com
gazette.com
news18.com
ktvz.com
dailyrecord.co.uk
kion546.com
thestar.com
abc17news.com
independent.ie
citynews.ca
newsday.com
nytimes.com
mediaindonesia.com
seznam.cz
krdo.com
nypost.com
localnews8.com
biztoc.com
the-sun.com
urdupoint.com
winnipegfreepress.com
zawya.com
newsweek.com
cbssports.com
regionalmedianews.com
manchestereveningnews.co.uk


## Digging deeper into sources where only particular sections are SoJo

In [13]:
"""my_query = 'url:"https://www.vox.com/future-perfect/*"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6, 10)

search_api.story_list(my_query, start_date, end_date)


my_query = 'url:"https://www.theguardian.com/world/series/the-upside/*"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6, 10)

search_api.story_list(my_query, start_date, end_date)

my_query = 'url:"https://www.hcn.org/topic/what-works/*"'
start_date = dt.date(2021, 1, 1)
end_date = dt.date(2025, 6, 10)

my_query = 'url:"https://montanafreepress.org/series/long-streets/*"'
start_date = dt.date(2021, 1, 1)
end_date = dt.date(2025, 6, 10)"""

'my_query = \'url:"https://www.vox.com/future-perfect/*"\'\nstart_date = dt.date(2025, 1, 1)\nend_date = dt.date(2025, 6, 10)\n\nsearch_api.story_list(my_query, start_date, end_date)\n\n\nmy_query = \'url:"https://www.theguardian.com/world/series/the-upside/*"\'\nstart_date = dt.date(2025, 1, 1)\nend_date = dt.date(2025, 6, 10)\n\nsearch_api.story_list(my_query, start_date, end_date)\n\nmy_query = \'url:"https://www.hcn.org/topic/what-works/*"\'\nstart_date = dt.date(2021, 1, 1)\nend_date = dt.date(2025, 6, 10)\n\nmy_query = \'url:"https://montanafreepress.org/series/long-streets/*"\'\nstart_date = dt.date(2021, 1, 1)\nend_date = dt.date(2025, 6, 10)'

In [14]:
def crawl_section_urls(base_url, max_pages=5):
    """
    Crawl article URLs from a section URL.
    Supports Vox, Guardian, and HCN based on URL patterns.
    max_pages limits pagination depth (adjust as needed).
    """
    article_urls = set()
    for page in range(1, max_pages + 1):
        # Construct page URL depending on site
        if "vox.com" in base_url:
            # Vox paginates with ?page=2 etc
            url = f"{base_url}?page={page}" if page > 1 else base_url
        elif "theguardian.com" in base_url:
            # Guardian paginates with /page/2 etc
            url = f"{base_url}/page/{page}" if page > 1 else base_url
        elif "hcn.org" in base_url:
            # HCN uses ?page=2 etc
            url = f"{base_url}?page={page}" if page > 1 else base_url
        else:
            # Unknown site — no pagination
            url = base_url

        print(f"Crawling page {page} at {url}")
        resp = requests.get(url)
        if resp.status_code != 200:
            print(f"Failed to load page: {url}")
            break

        soup = BeautifulSoup(resp.text, "html.parser")

        # Extract article links based on site
        links = []
        if "vox.com" in base_url:
            links = [a["href"] for a in soup.select("a[href*='/future-perfect/']") if a.has_attr("href")]
        elif "theguardian.com" in base_url:
            links = []
            links += [a["href"] for a in soup.select("a.js-headline-text")]
            links += [a["href"] for a in soup.select("a.u-faux-block-link__overlay")]
            links += [a["href"] for a in soup.select("div.fc-item__content a[href]")]
            links = list(set(links))
        elif "hcn.org" in base_url:
            links = [a["href"] for a in soup.select("h2.entry-title a")]

        # Clean and filter full URLs
        for link in links:
            if link.startswith("/"):
                # Make full URL
                if "vox.com" in base_url:
                    full_url = "https://www.vox.com" + link
                elif "theguardian.com" in base_url:
                    full_url = "https://www.theguardian.com" + link
                elif "hcn.org" in base_url:
                    full_url = "https://www.hcn.org" + link
                else:
                    full_url = link
            else:
                full_url = link

            article_urls.add(full_url)

        # Stop if no new links found on this page (end pagination)
        if not links:
            break

    return sorted(article_urls)


def fetch_and_filter_articles(urls, keywords, start_date=None, end_date=None):
    """
    Download articles, parse publish date, filter by keywords and date range.
    Returns list of dict with title, url, publish_date, and snippet.
    """
    results = []
    for url in urls:
        try:
            art = Article(url)
            art.download()
            art.parse()

            # Only text-based articles (no video/audio-only)
            if len(art.text) < 200:
                # Skip if text too short (likely not a full article)
                continue

            pub_date = art.publish_date
            if pub_date is None:
                # Try to parse date from meta tags manually
                # or skip article if no date
                pub_date = None
            else:
                # Normalize to date only
                pub_date = pub_date.date()

            # Filter by date
            if start_date and pub_date and pub_date < start_date:
                continue
            if end_date and pub_date and pub_date > end_date:
                continue

            # Keyword filter in title or text (case-insensitive)
            text_lower = art.text.lower()
            title_lower = (art.title or "").lower()
            if not any(kw.lower() in text_lower or kw.lower() in title_lower for kw in keywords):
                continue

            # Save minimal info
            snippet = art.text[:300].replace("\n", " ") + "..."
            results.append({"title": art.title, "url": url, "publish_date": pub_date, "snippet": snippet})

        except Exception as e:
            print(f"Failed to process {url}: {e}")
    print(f"Found {len(results)} articles")
    return results

In [15]:
# Your section URLs:
vox_section = "https://www.vox.com/future-perfect"
guardian_section = "https://www.theguardian.com/world/series/the-upside"
hcn_section = "https://www.hcn.org/topic/what-works/"
mfp_section = "https://montanafreepress.org/series/long-streets/"

# Keywords to filter by:
keywords = ["climate", "climate change", "health", "healthcare"]

# Date range for filtering articles:
start_date = dt.date(2024, 1, 1)
end_date = dt.date.today()

# Crawl article URLs (limit max_pages to control scraping load)
print("Crawling Vox articles...")
vox_urls = crawl_section_urls(vox_section, max_pages=5)
print(f"🔗 Vox URLs found: {len(vox_urls)}")

print("Crawling Guardian articles...")
guardian_urls = crawl_section_urls(guardian_section, max_pages=5)
print(f"🔗 Guardian URLs found: {len(guardian_urls)}")

print("Crawling HCN articles...")
hcn_urls = crawl_section_urls(hcn_section, max_pages=3)
print(f"🔗 HCN URLs found: {len(hcn_urls)}")

print("Crawling MFP articles...")
mfp_urls = crawl_section_urls(mfp_section, max_pages=10)
print(f"🔗 MFP URLs found: {len(mfp_urls)}")

Crawling Vox articles...
Crawling page 1 at https://www.vox.com/future-perfect
Crawling page 2 at https://www.vox.com/future-perfect?page=2
Crawling page 3 at https://www.vox.com/future-perfect?page=3
Crawling page 4 at https://www.vox.com/future-perfect?page=4
Crawling page 5 at https://www.vox.com/future-perfect?page=5
🔗 Vox URLs found: 36
Crawling Guardian articles...
Crawling page 1 at https://www.theguardian.com/world/series/the-upside
🔗 Guardian URLs found: 0
Crawling HCN articles...
Crawling page 1 at https://www.hcn.org/topic/what-works/
Crawling page 2 at https://www.hcn.org/topic/what-works/?page=2
Crawling page 3 at https://www.hcn.org/topic/what-works/?page=3
🔗 HCN URLs found: 20
Crawling MFP articles...
Crawling page 1 at https://montanafreepress.org/series/long-streets/
🔗 MFP URLs found: 0


In [16]:
keywords = ["climate", "climate change"]
# Fetch articles and filter by keywords and date
print("Filtering Vox articles...")
vox_matches = fetch_and_filter_articles(vox_urls, keywords, start_date, end_date)
print("Filtering Guardian articles...")
guardian_matches = fetch_and_filter_articles(guardian_urls, keywords, start_date, end_date)
print("Filtering HCN articles...")
hcn_matches = fetch_and_filter_articles(hcn_urls, keywords, start_date, end_date)

# Print summary
print("\n=== Vox matches ===")
for article in vox_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

print("\n=== Guardian matches ===")
for article in guardian_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

print("\n=== HCN matches ===")
for article in hcn_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

Filtering Vox articles...
Found 16 articles
Filtering Guardian articles...
Found 0 articles
Filtering HCN articles...
Found 3 articles

=== Vox matches ===
- The best plant-based meat products, according to a huge blind taste test
  https://www.vox.com/future-perfect/411819/best-plant-based-meat-impossible-beyond-gardein-tofurky

- How to find a meaningful job: try “moral ambition,” says Rutger Bregman
  https://www.vox.com/future-perfect/412698/rutger-bregman-moral-ambition-meaningful-career-doing-good

- How we stretched our aviation system to the brink
  https://www.vox.com/future-perfect/413228/plane-crashes-safety-boeing-newark-aviation-system

- One chilling forecast of our AI future is getting wide attention. How realistic is it?
  https://www.vox.com/future-perfect/414087/artificial-intelligence-openai-ai-2027-china

- Bill Gates shows what the end of perpetual philanthropy looks like
  https://www.vox.com/future-perfect/414135/bill-gates-foundation-philanthropy-elon-musk-billi

In [17]:
keywords = ["health", "healthcare"]
# Fetch articles and filter by keywords and date
print("Filtering Vox articles...")
vox_matches = fetch_and_filter_articles(vox_urls, keywords, start_date, end_date)
print("Filtering Guardian articles...")
guardian_matches = fetch_and_filter_articles(guardian_urls, keywords, start_date, end_date)
print("Filtering HCN articles...")
hcn_matches = fetch_and_filter_articles(hcn_urls, keywords, start_date, end_date)

# Print summary
print("\n=== Vox matches ===")
for article in vox_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

print("\n=== Guardian matches ===")
for article in guardian_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

print("\n=== HCN matches ===")
for article in hcn_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

Filtering Vox articles...
Found 18 articles
Filtering Guardian articles...
Found 0 articles
Filtering HCN articles...
Found 1 articles

=== Vox matches ===
- You’re being lied to about protein
  https://www.vox.com/future-perfect/410565/protein-muscle-gain-weightlifting-plant-based-vegan

- My family has money but doesn’t give to charity. How do I challenge them without being weird?
  https://www.vox.com/future-perfect/410573/family-friends-charity-donations-communication-defensiveness

- How to save 400,000 babies a year
  https://www.vox.com/future-perfect/412847/neonatal-sepsis-infant-death-foreign-aid-test

- The massive stakes of the Trump administration’s plans to end animal testing
  https://www.vox.com/future-perfect/412854/trump-animal-welfare-research-nih-fda-epa

- How switching to a flip phone deepened my friendships
  https://www.vox.com/future-perfect/413657/iphone-detox-flip-phone-friendships

- Bill Gates shows what the end of perpetual philanthropy looks like
  https:/

In [18]:
url = "https://www.theguardian.com/world/series/the-upside"
resp = requests.get(url)
soup = BeautifulSoup(resp.text, "html.parser")

# Look for relative URLs starting with /world/series/the-upside/
pattern = re.compile(r"^/world/series/the-upside/.*")

links = []
for a in soup.find_all("a", href=True):
    href = a["href"]
    if pattern.match(href):
        full_url = f"https://www.theguardian.com{href}"
        links.append(full_url)

unique_links = list(set(links))
print(f"Found {len(unique_links)} unique links:")
for link in unique_links:
    print(link)

Found 20 unique links:
https://www.theguardian.com/world/series/the-upside/2022/apr/21/all
https://www.theguardian.com/world/series/the-upside/2021/may/14/all
https://www.theguardian.com/world/series/the-upside/2021/feb/15/all
https://www.theguardian.com/world/series/the-upside/2021/jun/18/all
https://www.theguardian.com/world/series/the-upside/2021/jul/09/all
https://www.theguardian.com/world/series/the-upside/2021/apr/23/all
https://www.theguardian.com/world/series/the-upside/2021/mar/19/all
https://www.theguardian.com/world/series/the-upside/2024/aug/11/all
https://www.theguardian.com/world/series/the-upside/2021/may/03/all
https://www.theguardian.com/world/series/the-upside/2021/jun/11/all
https://www.theguardian.com/world/series/the-upside/2021/may/21/all
https://www.theguardian.com/world/series/the-upside/2021/jun/25/all
https://www.theguardian.com/world/series/the-upside/2021/feb/26/all
https://www.theguardian.com/world/series/the-upside/2021/may/07/all
https://www.theguardian.c

In [19]:
guardian_urls = [
    "https://www.theguardian.com/world/series/the-upside/2021/apr/02/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/18/all",
    "https://www.theguardian.com/world/series/the-upside/2022/apr/21/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/07/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/26/all",
    "https://www.theguardian.com/world/series/the-upside/2021/apr/16/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/25/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jul/09/all",
    "https://www.theguardian.com/world/series/the-upside/2021/apr/23/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jul/16/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/21/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/12/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/19/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/11/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/15/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/14/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/03/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/12/all",
    "https://www.theguardian.com/world/series/the-upside/2024/aug/11/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/26/all",
]

keywords = ["climate", "environment", "energy", "policy"]  # Replace with your keywords


def check_article(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.lower()
        title = article.title.lower()
        if any(k.lower() in text or k.lower() in title for k in keywords):
            return url, article.title
        else:
            return None
    except Exception as e:
        print(f"Failed to process {url}: {e}")
        return None


results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(check_article, url) for url in guardian_urls]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        if result:
            results.append(result)

print("Filtered Guardian matches:")
for url, title in results:
    print(f"- {title}\n  {url}")

Filtered Guardian matches:


In [20]:
guardian_urls = [
    "https://www.theguardian.com/world/series/the-upside/2021/apr/02/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/18/all",
    "https://www.theguardian.com/world/series/the-upside/2022/apr/21/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/07/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/26/all",
    "https://www.theguardian.com/world/series/the-upside/2021/apr/16/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/25/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jul/09/all",
    "https://www.theguardian.com/world/series/the-upside/2021/apr/23/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jul/16/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/21/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/12/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/19/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/11/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/15/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/14/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/03/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/12/all",
    "https://www.theguardian.com/world/series/the-upside/2024/aug/11/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/26/all",
]

keywords = ["health", "healthcare"]  # Replace with your keywords


def check_article(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.lower()
        title = article.title.lower()
        if any(k.lower() in text or k.lower() in title for k in keywords):
            return url, article.title
        else:
            return None
    except Exception as e:
        print(f"Failed to process {url}: {e}")
        return None


results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(check_article, url) for url in guardian_urls]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        if result:
            results.append(result)

print("Filtered Guardian matches:")
for url, title in results:
    print(f"- {title}\n  {url}")

Filtered Guardian matches:


# Creating a dataframe for Sojo sources and articles

In [21]:
print(os.getcwd())
df = pd.read_csv("../data/SoJosources.csv")
df.head()

/Users/cmagapu/MediaCloud/mc-classifier-pipeline/notebooks


Unnamed: 0,SoJo source,Homepage URL,Present in Media Cloud (yes/no),Has health stories (yes/no),Has climate stories(yes/no),Notes/Questions
0,BBC: “People Fixing the World”,https://www.bbc.co.uk/programmes/p04d42vf,,,,This is a podcast/youtube series
1,BBC: “My Perfect Country”,https://www.bbc.co.uk/programmes/p03gsc50,,,,This is a podcast/youtube series
2,BBC: “Crossing Divides”,https://www.bbc.com/news/topics/czpqp1q456vt,,,,This is a podcast/youtube series
3,BBC: “Future Planet”,https://www.bbc.co.uk/future/future-planet,No,Yes,Yes,
4,POLITICO Magazine: “What Works” + “What Works ...,https://www.politico.com/magazine/what-works-n...,No,no,no,


In [22]:
df_present = df[df["Present in Media Cloud (yes/no)"].str.strip().str.lower().str.contains("^yes", na=False)]

df_present.head()

Unnamed: 0,SoJo source,Homepage URL,Present in Media Cloud (yes/no),Has health stories (yes/no),Has climate stories(yes/no),Notes/Questions
5,The Seattle Times: “Education Lab” + “Traffic ...,https://www.seattletimes.com/education-lab/,Yes,Yes,No,
8,The New York Times: “Fixes”,https://www.nytimes.com/column/fixes,Yes,Yes,Yes,
10,Boston Globe: “Things That Work”,https://apps.bostonglobe.com/metro/graphics/20...,yes,yes,no,
13,Montana Free Press: “Long Streets”,https://montanafreepress.org/,yes,yes,yes,"although it has health and climate stories, th..."
16,Milwaukee Journal Sentinel: “Wisconsin Ideas Lab”,https://www.jsonline.com/,yes,yes,no,not sojo


In [23]:
df_present = df_present.copy()

df_present.loc[:, "clean_domain"] = df_present["Homepage URL"].apply(
    lambda url: urlparse(url).netloc.replace("www.", "") if pd.notnull(url) else None
)


def get_source_id(domain):
    if not domain:
        return None
    response = directory_api.source_list(name=domain)
    if response["count"] > 0:
        return response["results"][0].get("id")
    return None


df_present.loc[:, "source_id"] = df_present["clean_domain"].apply(get_source_id)

# Convert to numeric with coercion, store in a separate variable
converted_source_id = pd.to_numeric(df_present["source_id"], errors="coerce").astype("Int64")

# Then assign back the converted series
df_present.loc[:, "source_id"] = converted_source_id

[  24940,       1,    <NA>,  368086,      36,   18482,  712531,  214610,
   25249,  370820,       3,   30766,  448086,  745626,  287350,   98379,
  538233,  902437, 1402887,  194496,  104828,   98421,  300560,  223331,
  711091,    <NA>,   27343,  717756,  278039,   65721,   18014,   28710]
Length: 32, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_present.loc[:, "source_id"] = converted_source_id


In [24]:
df_present.head()

Unnamed: 0,SoJo source,Homepage URL,Present in Media Cloud (yes/no),Has health stories (yes/no),Has climate stories(yes/no),Notes/Questions,clean_domain,source_id
5,The Seattle Times: “Education Lab” + “Traffic ...,https://www.seattletimes.com/education-lab/,Yes,Yes,No,,seattletimes.com,24940.0
8,The New York Times: “Fixes”,https://www.nytimes.com/column/fixes,Yes,Yes,Yes,,nytimes.com,1.0
10,Boston Globe: “Things That Work”,https://apps.bostonglobe.com/metro/graphics/20...,yes,yes,no,,apps.bostonglobe.com,
13,Montana Free Press: “Long Streets”,https://montanafreepress.org/,yes,yes,yes,"although it has health and climate stories, th...",montanafreepress.org,368086.0
16,Milwaukee Journal Sentinel: “Wisconsin Ideas Lab”,https://www.jsonline.com/,yes,yes,no,not sojo,jsonline.com,36.0


In [25]:
# directory_api.source_list(name='bostonglobe.com')['results']
nan_count = df_present["source_id"].isna().sum()
print(f"Number of NaNs in source_id column: {nan_count}")

Number of NaNs in source_id column: 2


In [26]:
nan_rows = df_present[df_present["source_id"].isna()]
nan_rows_domain = df_present[df_present["clean_domain"].isna()]
print(f"Number of NaNs in source_id column: {len(nan_rows)}")
display(nan_rows)

Number of NaNs in source_id column: 2


Unnamed: 0,SoJo source,Homepage URL,Present in Media Cloud (yes/no),Has health stories (yes/no),Has climate stories(yes/no),Notes/Questions,clean_domain,source_id
10,Boston Globe: “Things That Work”,https://apps.bostonglobe.com/metro/graphics/20...,yes,yes,no,,apps.bostonglobe.com,
40,Mongabay: “Environment and Her”,https://india.mongabay.com/series/environment-...,yes,no,yes,"Environment and her is from Mongabay india, bu...",india.mongabay.com,


In [27]:
row = df_present[df_present["SoJo source"] == "Vox: “Future Perfect”"]
display(row)
print(directory_api.source_list(name="https://vox.com/future-perfect")["results"])
print(directory_api.source_list(name="vox.com")["results"][0]["id"])

Unnamed: 0,SoJo source,Homepage URL,Present in Media Cloud (yes/no),Has health stories (yes/no),Has climate stories(yes/no),Notes/Questions,clean_domain,source_id
34,Vox: “Future Perfect”,https://www.vox.com/future-perfect,yes?,yes,yes,"my_query = 'url:""vox.com/future-perfect/*""' ga...",vox.com,104828


[]
104828


Maybe because india.mongabay.com and apps.bostonglobe.com aren't recognised by the query engine. Also vox and vox future perfect have the same domain, so it's the same source id. This will count all vox articles as SoJo when it's just the future perfect ones. This is a problem we need to solve when attempting to create a collection on Media Cloud for Solutions Journalism (creating child sources)

## Querying from our Media Cloud collection: https://search.mediacloud.org/collections/262985244/edit

In [28]:
my_query = '"climate change"'
SOJO_COLLECTION = 262985244
results = search_api.story_count(my_query, start_date, end_date, collection_ids=[SOJO_COLLECTION])
print(results)

{'relevant': 622, 'total': 116981}


In [29]:
my_query = "healthcare"
SOJO_COLLECTION = 262985244
results = search_api.story_count(my_query, start_date, end_date, collection_ids=[SOJO_COLLECTION])
print(results)

{'relevant': 262, 'total': 116981}


In [6]:
my_query = "healthcare OR health OR diseases OR enrollees OR clinics OR confidential OR medicare OR nursing OR robert OR mental OR credits OR prevention OR enrolled OR clinic OR diagnosis OR disease OR cancer OR medication OR patient OR disabilities OR guidance OR wellbeing"
SOJO_COLLECTION = 262985244
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 7, 10)
results = search_api.story_count(my_query, start_date, end_date, collection_ids=[SOJO_COLLECTION])
print(results)

{'relevant': 2905, 'total': 43713}
