In [1]:
# Set up your API key and import needed things
import os, mediacloud.api
from importlib.metadata import version
from dotenv import load_dotenv
import datetime as dt
from IPython.display import JSON
import bokeh.io
bokeh.io.reset_output()
bokeh.io.output_notebook()
MC_API_KEY = 'fd180b0f9ce16f29121870731d39725c24f094e5'
search_api = mediacloud.api.SearchApi(MC_API_KEY)
f'Using Media Cloud python client v{version("mediacloud")}'

'Using Media Cloud python client v4.3.0'

# Querying boston globe climate solutions journalism article from url

In [None]:
my_query = 'url:"https://www.bostonglobe.com/2025/03/03/world/south-sudan-coffee-beans-climate-change/"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6,10)
search_api.story_list(my_query,start_date,end_date)

# Extracting fulltext data of the boston globe article

In [3]:
boston_globe_article = search_api.story('bbe590fcc23edb5d310ef99dd1dbddeb1223ff634588d2043ba18e80789700cf')['text']
with open("boston_globe_article.txt", "w") as file:
    file.write(boston_globe_article)

# Sentiment analysis

In [None]:
#%pip install textblob
from textblob import TextBlob

with open('boston_globe_article.txt', 'r') as f:
    text = f.read()

blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Polarity: {sentiment.polarity:.2f}")

print(f"Subjectivity: {sentiment.subjectivity:.2f}")

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores(text)

# Print the scores
print(scores)

# Interpret the compound score
compound_score = scores['compound']
if compound_score >= 0.05:
    sentiment = "Positive"
elif compound_score <= -0.05:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

print(f"Overall sentiment: {sentiment}")


# Repeating for Yale 360 article, but using Newspaper3k to scrape, because it was not found using the search API

In [None]:
#%pip install newspaper3k
#%pip install lxml_html_clean
from newspaper import Article

url = 'https://e360.yale.edu/features/eric-nost-interview'
article = Article(url)
article.download()
article.parse()

with open("yale_360_article.txt", "w", encoding="utf-8") as f:
    f.write(article.title + "\n\n")
    f.write(article.text)

blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Polarity: {sentiment.polarity:.2f}")

print(f"Subjectivity: {sentiment.subjectivity:.2f}")

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores(text)

# Print the scores
print(scores)

# Interpret the compound score
compound_score = scores['compound']
if compound_score >= 0.05:
    sentiment = "Positive"
elif compound_score <= -0.05:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

print(f"Overall sentiment: {sentiment}")

# Repeating for BBC article

In [None]:
import requests
from bs4 import BeautifulSoup

#url = 'https://www.bbc.com/news/articles/ce82p6yq061o' 
#url = 'https://www.bbc.com/news/articles/c1dekp93l6po'
url = 'https://www.bbc.com/news/articles/cvgdyl817p1o'

article = Article(url)
article.download()
article.parse()

with open("bbc_article.txt", "w", encoding="utf-8") as f:
    f.write(article.title + "\n\n")
    f.write(article.text)

with open('bbc_article.txt', 'r') as f:
    text = f.read()

blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Polarity: {sentiment.polarity:.2f}")

print(f"Subjectivity: {sentiment.subjectivity:.2f}")

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores(text)

# Print the scores
print(scores)

# Interpret the compound score
compound_score = scores['compound']
if compound_score >= 0.05:
    sentiment = "Positive"
elif compound_score <= -0.05:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

print(f"Overall sentiment: {sentiment}")

From this, we can see that it is not necessarily always true that solutions journalism = positive sentiment and non-solutions journalism = negative content, as we have found a sample of non-solutions journalism that has positive sentiment. But we may need to consider sentiment as a feature when creating our classifier pipeline.

# Modularising - extract full text and sentiment analysis as two separate funtions

In [None]:
my_query = 'url:"https://www.bbc.com/news/articles/cvgdyl817p1o"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6,10)
print(search_api.story_list(my_query,start_date,end_date)[0])

my_query = 'url:"https://e360.yale.edu/features/eric-nost-interview"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6,10)
x = search_api.story_list(my_query,start_date,end_date)
if len(x[0]) == 0:
    print("not found")

In [8]:

from newspaper import Article
import datetime as dt
import requests

def extract_full_text(url, output_file_name):
    my_query = f'url:"{url}"'
    start_date = dt.date(2025, 1, 1)
    end_date = dt.date(2025, 6, 10)

    article_text = ""

    try:
        result = search_api.story_list(my_query, start_date, end_date)[0]
    except Exception as e:
        print(f"Search API failed: {e}")
        result = None

    if result:
        try:
            article_text = search_api.story(result['id'])['text']
        except Exception as e:
            print(f"Error fetching from Media Cloud: {e}")
    else:
        try:
            article = Article(url)
            article.download()
            article.parse()
            article_text = article.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
        except Exception as e:
            print(f"Error parsing article: {e}")

    out_file = f"{output_file_name}.txt"
    with open(out_file, "w", encoding='utf-8') as file:
        file.write(article_text)

    print(f"Article saved to {out_file}")


import difflib

def compare_files(file1_path, file2_path):
    """Compares two text files and prints the differences."""
    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
        file1_lines = file1.readlines()
        file2_lines = file2.readlines()

    diff = difflib.unified_diff(file1_lines, file2_lines, fromfile=file1_path, tofile=file2_path)

    for line in diff:
        print(line, end="")

#def sentiment_analysis(file):
    #VADER - pos/neg/neutral
    #textblob - Polarity/Subjectivity

# Verifying that extract_full_text is working as expected

In [None]:
extract_full_text("https://www.bostonglobe.com/2025/03/03/world/south-sudan-coffee-beans-climate-change/","boston_globe_article_f")
compare_files('boston_globe_article_f.txt', 'boston_globe_article.txt')

In [None]:
extract_full_text("https://e360.yale.edu/features/eric-nost-interview","yale_360_article_f")
compare_files('yale_360_article_f.txt','yale_360_article.txt')

In [None]:
extract_full_text("https://www.bbc.com/news/articles/cvgdyl817p1o","bbc_article_f")
compare_files('bbc_article_f.txt','bbc_article.txt')

# Listing sources

In [None]:
my_query = 'All'
start_date = dt.date(2024, 1, 1)
end_date = dt.date(2025, 6,10)
source_list = search_api.sources(my_query,start_date,end_date)
import re
for i in source_list:
    print(i['source'])

In [2]:
my_query = 'url:"https://www.vox.com/future-perfect/*"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6, 10)

search_api.story_list(my_query, start_date, end_date)


my_query = 'url:"https://www.theguardian.com/world/series/the-upside/*"'
start_date = dt.date(2025, 1, 1)
end_date = dt.date(2025, 6, 10)

search_api.story_list(my_query, start_date, end_date)

my_query = 'url:"https://www.hcn.org/topic/what-works/*"'
start_date = dt.date(2021, 1, 1)
end_date = dt.date(2025, 6, 10)

print(search_api.story_list(my_query, start_date, end_date))

([], None)


In [49]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import dateutil.parser
import datetime as dt

def crawl_section_urls(base_url, max_pages=5):
    """
    Crawl article URLs from a section URL.
    Supports Vox, Guardian, and HCN based on URL patterns.
    max_pages limits pagination depth (adjust as needed).
    """
    article_urls = set()
    for page in range(1, max_pages + 1):
        # Construct page URL depending on site
        if "vox.com" in base_url:
            # Vox paginates with ?page=2 etc
            url = f"{base_url}?page={page}" if page > 1 else base_url
        elif "theguardian.com" in base_url:
            # Guardian paginates with /page/2 etc
            url = f"{base_url}/page/{page}" if page > 1 else base_url
        elif "hcn.org" in base_url:
            # HCN uses ?page=2 etc
            url = f"{base_url}?page={page}" if page > 1 else base_url
        else:
            # Unknown site — no pagination
            url = base_url

        print(f"Crawling page {page} at {url}")
        resp = requests.get(url)
        if resp.status_code != 200:
            print(f"Failed to load page: {url}")
            break

        soup = BeautifulSoup(resp.text, "html.parser")

        # Extract article links based on site
        links = []
        if "vox.com" in base_url:
            links = [a['href'] for a in soup.select("a[href*='/future-perfect/']") if a.has_attr('href')]
        elif "theguardian.com" in base_url:
            links = []
            links += [a['href'] for a in soup.select("a.js-headline-text")]
            links += [a['href'] for a in soup.select("a.u-faux-block-link__overlay")]
            links += [a['href'] for a in soup.select("div.fc-item__content a[href]")]
            links = list(set(links))
        elif "hcn.org" in base_url:
            links = [a['href'] for a in soup.select("h2.entry-title a")]

        # Clean and filter full URLs
        for link in links:
            if link.startswith('/'):
                # Make full URL
                if "vox.com" in base_url:
                    full_url = "https://www.vox.com" + link
                elif "theguardian.com" in base_url:
                    full_url = "https://www.theguardian.com" + link
                elif "hcn.org" in base_url:
                    full_url = "https://www.hcn.org" + link
                else:
                    full_url = link
            else:
                full_url = link

            article_urls.add(full_url)

        # Stop if no new links found on this page (end pagination)
        if not links:
            break

    return sorted(article_urls)

def fetch_and_filter_articles(urls, keywords, start_date=None, end_date=None):
    """
    Download articles, parse publish date, filter by keywords and date range.
    Returns list of dict with title, url, publish_date, and snippet.
    """
    results = []
    for url in urls:
        try:
            art = Article(url)
            art.download()
            art.parse()

            # Only text-based articles (no video/audio-only)
            if len(art.text) < 200:
                # Skip if text too short (likely not a full article)
                continue

            pub_date = art.publish_date
            if pub_date is None:
                # Try to parse date from meta tags manually
                # or skip article if no date
                pub_date = None
            else:
                # Normalize to date only
                pub_date = pub_date.date()

            # Filter by date
            if start_date and pub_date and pub_date < start_date:
                continue
            if end_date and pub_date and pub_date > end_date:
                continue

            # Keyword filter in title or text (case-insensitive)
            text_lower = art.text.lower()
            title_lower = (art.title or "").lower()
            if not any(kw.lower() in text_lower or kw.lower() in title_lower for kw in keywords):
                continue

            # Save minimal info
            snippet = art.text[:300].replace('\n', ' ') + '...'
            results.append({
                "title": art.title,
                "url": url,
                "publish_date": pub_date,
                "snippet": snippet
            })

        except Exception as e:
            print(f"Failed to process {url}: {e}")
    print(f"Found {len(results)} articles")
    return results

In [None]:
import datetime as dt

# Your section URLs:
vox_section = "https://www.vox.com/future-perfect"
guardian_section = "https://www.theguardian.com/world/series/the-upside"
hcn_section = "https://www.hcn.org/topic/what-works/"

# Keywords to filter by:
keywords = ["climate", "climate change", "health", "healthcare"]

# Date range for filtering articles:
start_date = dt.date(2024, 1, 1)
end_date = dt.date.today()

# Crawl article URLs (limit max_pages to control scraping load)
print("Crawling Vox articles...")
vox_urls = crawl_section_urls(vox_section, max_pages=5)
print(f"🔗 Vox URLs found: {len(vox_urls)}")

print("Crawling Guardian articles...")
guardian_urls = crawl_section_urls(guardian_section, max_pages=5)
print(f"🔗 Guardian URLs found: {len(guardian_urls)}")

print("Crawling HCN articles...")
hcn_urls = crawl_section_urls(hcn_section, max_pages=3)
print(f"🔗 HCN URLs found: {len(hcn_urls)}")

In [None]:
keywords = ["climate", "climate change"]
# Fetch articles and filter by keywords and date
print("Filtering Vox articles...")
vox_matches = fetch_and_filter_articles(vox_urls, keywords, start_date, end_date)
print("Filtering Guardian articles...")
guardian_matches = fetch_and_filter_articles(guardian_urls, keywords, start_date, end_date)
print("Filtering HCN articles...")
hcn_matches = fetch_and_filter_articles(hcn_urls, keywords, start_date, end_date)

# Print summary
print("\n=== Vox matches ===")
for article in vox_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

print("\n=== Guardian matches ===")
for article in guardian_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

print("\n=== HCN matches ===")
for article in hcn_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

In [None]:
keywords = ["health", "healthcare"]
# Fetch articles and filter by keywords and date
print("Filtering Vox articles...")
vox_matches = fetch_and_filter_articles(vox_urls, keywords, start_date, end_date)
print("Filtering Guardian articles...")
guardian_matches = fetch_and_filter_articles(guardian_urls, keywords, start_date, end_date)
print("Filtering HCN articles...")
hcn_matches = fetch_and_filter_articles(hcn_urls, keywords, start_date, end_date)

# Print summary
print("\n=== Vox matches ===")
for article in vox_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

print("\n=== Guardian matches ===")
for article in guardian_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

print("\n=== HCN matches ===")
for article in hcn_matches:
    print(f"- {article['title']}\n  {article['url']}\n")

In [None]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://www.theguardian.com/world/series/the-upside"
resp = requests.get(url)
soup = BeautifulSoup(resp.text, "html.parser")

# Look for relative URLs starting with /world/series/the-upside/
pattern = re.compile(r'^/world/series/the-upside/.*')

links = []
for a in soup.find_all("a", href=True):
    href = a['href']
    if pattern.match(href):
        full_url = f"https://www.theguardian.com{href}"
        links.append(full_url)

unique_links = list(set(links))
print(f"Found {len(unique_links)} unique links:")
for link in unique_links:
    print(link)

In [None]:
from newspaper import Article
import concurrent.futures

guardian_urls = [
    "https://www.theguardian.com/world/series/the-upside/2021/apr/02/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/18/all",
    "https://www.theguardian.com/world/series/the-upside/2022/apr/21/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/07/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/26/all",
    "https://www.theguardian.com/world/series/the-upside/2021/apr/16/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/25/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jul/09/all",
    "https://www.theguardian.com/world/series/the-upside/2021/apr/23/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jul/16/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/21/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/12/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/19/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/11/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/15/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/14/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/03/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/12/all",
    "https://www.theguardian.com/world/series/the-upside/2024/aug/11/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/26/all",
]

keywords = ["climate", "environment", "energy", "policy"]  # Replace with your keywords

def check_article(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.lower()
        title = article.title.lower()
        if any(k.lower() in text or k.lower() in title for k in keywords):
            return url, article.title
        else:
            return None
    except Exception as e:
        print(f"Failed to process {url}: {e}")
        return None

results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(check_article, url) for url in guardian_urls]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        if result:
            results.append(result)

print("Filtered Guardian matches:")
for url, title in results:
    print(f"- {title}\n  {url}")

In [None]:
from newspaper import Article
import concurrent.futures

guardian_urls = [
    "https://www.theguardian.com/world/series/the-upside/2021/apr/02/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/18/all",
    "https://www.theguardian.com/world/series/the-upside/2022/apr/21/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/07/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/26/all",
    "https://www.theguardian.com/world/series/the-upside/2021/apr/16/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/25/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jul/09/all",
    "https://www.theguardian.com/world/series/the-upside/2021/apr/23/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jul/16/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/21/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/12/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/19/all",
    "https://www.theguardian.com/world/series/the-upside/2021/jun/11/all",
    "https://www.theguardian.com/world/series/the-upside/2021/feb/15/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/14/all",
    "https://www.theguardian.com/world/series/the-upside/2021/may/03/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/12/all",
    "https://www.theguardian.com/world/series/the-upside/2024/aug/11/all",
    "https://www.theguardian.com/world/series/the-upside/2021/mar/26/all",
]

keywords = ["health", "healthcare"]  # Replace with your keywords

def check_article(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.lower()
        title = article.title.lower()
        if any(k.lower() in text or k.lower() in title for k in keywords):
            return url, article.title
        else:
            return None
    except Exception as e:
        print(f"Failed to process {url}: {e}")
        return None

results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(check_article, url) for url in guardian_urls]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        if result:
            results.append(result)

print("Filtered Guardian matches:")
for url, title in results:
    print(f"- {title}\n  {url}")
