Scripts

In [2]:
from newsapi import NewsApiClient
import datetime
import time
import pandas as pd
import os
from newspaper import Article
import sys

In [None]:
# Initialize NewsAPI Client
# Read the key from the NEWSAPI_KEY environment variable so the secret never
# has to be pasted into (and saved with) the notebook. When the variable is
# unset the value falls back to the empty string.
api_key = os.environ.get("NEWSAPI_KEY", "")
newsapi = NewsApiClient(api_key=api_key)

# Input and output file paths
# input_file holds the article metadata returned by NewsAPI;
# full_content_file holds the scraped article bodies keyed by URL.
input_file = "news_articles_election_candidates_expanded.csv"
full_content_file = "news_articles_election_candidates_full_content.csv"

# Function to fetch full article content using newspaper3k
def fetch_full_content(article_url):
    """Download and parse one article and return its body text.

    Args:
        article_url: Address of the article page to scrape.

    Returns:
        The ``text`` attribute of the parsed ``Article``, or ``None`` when
        constructing, downloading or parsing the article raises any
        ``Exception``. Failures are deliberately swallowed so that one bad
        URL does not stop a batch run.
    """
    try:
        page = Article(article_url)
        page.download()
        page.parse()
        body_text = page.text
    except Exception:
        # Best effort: the caller treats a falsy result as "could not fetch".
        return None
    return body_text

# Set new date range
# The fetch loop walks backwards in time, so start_date is the most recent
# day and end_date is the oldest day to collect.
start_date = datetime.date(year=2024, month=10, day=12)  # Continue from where the previous script left off
end_date = datetime.date(year=2024, month=8, day=1)  # Adjust end date as needed for backward collection

# Prepare to store results (one dict per newly seen article)
articles_data = list()

# Expanded search queries
queries = [
    "2024 Presidential election",
    "US election AND (Donald Trump OR Kamala Harris)",
    "Biden administration AND 2024 election",
    "(Donald Trump OR Trump) AND 2024 election",
    "(Kamala Harris OR Harris) AND 2024 election",
    "Campaign financing AND 2024 election",
    "Voter turnout AND 2024 election",
    "Presidential debate AND 2024 election",
    "(Donald Trump OR Trump) AND rally AND 2024",
    "(Kamala Harris OR Harris) AND speech AND 2024",
    "(Donald Trump OR Kamala Harris) AND 2024 election",
    "(Trump OR Harris) AND campaign AND 2024 election",
]

# Track API request count to avoid exceeding limits
max_requests = 100  # Free-tier daily API limit
request_count = 0

# Load existing expanded and full content CSV files
# A missing file is treated as "nothing collected yet" (empty DataFrame).
if os.path.exists(input_file):
    existing_expanded_data = pd.read_csv(input_file)
else:
    existing_expanded_data = pd.DataFrame()

if os.path.exists(full_content_file):
    existing_full_content_data = pd.read_csv(full_content_file)
else:
    existing_full_content_data = pd.DataFrame()

# Track URLs to avoid duplicates
# existing_urls: articles whose metadata is already stored.
# processed_urls: articles whose full text has already been scraped.
existing_urls = set() if existing_expanded_data.empty else set(existing_expanded_data["url"])
processed_urls = set() if existing_full_content_data.empty else set(existing_full_content_data["url"])

# Fetch new articles from NewsAPI, walking backwards one day at a time from
# start_date down to end_date (inclusive).
current_date = start_date
stop_fetching = False  # Set once the request budget is spent or the API reports an error
while current_date >= end_date:
    # Convert date to string for API
    date_str = current_date.strftime('%Y-%m-%d')
    print(f"Fetching articles for {date_str}...")

    for query in queries:
        # Check if API limit is reached. Leave the loops instead of calling
        # sys.exit(): exiting here would skip the CSV export that follows
        # this loop and throw away every article collected so far.
        if request_count >= max_requests:
            print("Reached API limit for the day. Stopping collection.")
            stop_fetching = True
            break

        try:
            # Count the attempt before issuing it, so calls that raise still
            # advance the counter and the loop cannot exceed max_requests
            # attempts when the service keeps failing.
            request_count += 1

            # Fetch articles for the current query and date
            response = newsapi.get_everything(
                q=query,
                from_param=date_str,
                to=date_str,
                language="en",
                sort_by="relevancy",  # Fetch relevant articles
                page_size=100  # Max articles per API call
            )

            if response.get('status') != 'ok':
                # Stop asking for more, but keep what was already gathered.
                print(f"API error: {response.get('message')}")
                stop_fetching = True
                break

            # Process the articles ('articles' may be missing or empty)
            for article in response.get('articles') or []:
                # Only add new articles that are not already saved
                if article['url'] in existing_urls:
                    continue
                articles_data.append({
                    "query": query,  # Include the query used for tracking
                    "source": article['source']['name'],
                    "author": article['author'],
                    "title": article['title'],
                    "description": article['description'],
                    "url": article['url'],
                    "published_at": article['publishedAt'],
                    "content": article['content']
                })
                # Remember the URL so later queries do not add it again
                existing_urls.add(article['url'])
        except Exception as e:
            # Log the error to a file and carry on with the next query
            with open("error_log.txt", "a") as log_file:
                log_file.write(f"Error fetching articles for {query} on {date_str}: {e}\n")
            print(f"Error fetching articles for {query} on {date_str}: {e}")

    if stop_fetching:
        # Fall through to the code after the loop so collected data is saved.
        break

    # Move to the previous day
    current_date -= datetime.timedelta(days=1)

    # Pause briefly before starting the next day's batch of requests
    time.sleep(1)

# Save new articles to the expanded CSV file
if articles_data:
    new_data_df = pd.DataFrame(articles_data)
    # Only emit the header row when the file is being created.
    write_header = not os.path.exists(input_file)
    new_data_df.to_csv(input_file, mode='a', header=write_header, index=False)
    print(f"Appended {len(new_data_df)} new articles to '{input_file}'.")

# Extract full content for new articles only
new_urls = set(article["url"] for article in articles_data)  # URLs of newly fetched articles
urls_to_process = new_urls - processed_urls  # Exclude already processed URLs

if not urls_to_process:
    print("No new articles to process for full content.")
else:
    full_content_data = []
    for url in urls_to_process:
        print(f"Fetching full content for {url}...")
        full_content = fetch_full_content(url)
        if not full_content:
            # None (error) and empty text are both treated as a failure.
            print(f"Failed to fetch full content for {url}")
            continue
        full_content_data.append({"url": url, "full_content": full_content})

    # Append full content to the full content CSV file
    if full_content_data:
        full_content_df = pd.DataFrame(full_content_data)
        write_header = not os.path.exists(full_content_file)
        full_content_df.to_csv(full_content_file, mode='a', header=write_header, index=False)
        print(f"Appended {len(full_content_df)} new full content articles to '{full_content_file}'.")



In [None]:
# Load the CSV file
file_path = 'news_articles_election_candidates_expanded.csv'
data = pd.read_csv(file_path)

# info() prints its summary itself, whereas head() only produces a value and
# a notebook cell renders just the value of its last expression. Keep head()
# last so the preview is actually displayed instead of being discarded.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9610 entries, 0 to 9609
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   query         9610 non-null   object
 1   source        9610 non-null   object
 2   author        9093 non-null   object
 3   title         9607 non-null   object
 4   description   9598 non-null   object
 5   url           9610 non-null   object
 6   published_at  9610 non-null   object
 7   content       9610 non-null   object
dtypes: object(8)
memory usage: 600.8+ KB


In [None]:
# Load the full-content CSV and print a per-column summary (dtypes and
# non-null counts) to see how many rows actually have scraped text.
# NOTE: this rebinds file_path, which an earlier cell pointed at the expanded CSV.
file_path = 'news_articles_election_candidates_full_content.csv'
data_full_content = pd.read_csv(file_path)

data_full_content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9610 entries, 0 to 9609
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   query         9610 non-null   object
 1   source        9610 non-null   object
 2   author        9093 non-null   object
 3   title         9607 non-null   object
 4   description   9598 non-null   object
 5   url           9610 non-null   object
 6   published_at  9610 non-null   object
 7   content       9610 non-null   object
 8   full_content  7556 non-null   object
dtypes: object(9)
memory usage: 675.8+ KB


Remove entries with missing full content

START

In [3]:
# Rebind data_full_content to the "cleaned" CSV.
# NOTE(review): presumably this is the export with rows lacking full_content
# removed; the cell that wrote the file is not shown in this notebook - confirm.
data_full_content = pd.read_csv('news_articles_election_candidates_full_content_cleaned.csv')

In [8]:
# Preview the first rows of the article table (rendered as the cell output)
data_full_content.head()

Unnamed: 0,query,source,author,title,description,url,published_at,content,full_content
0,2024 Presidential election,Wired,"Lily Hay Newman, Tess Owen",Russia Is Going All Out on Election Day Interf...,Along with other foreign influence operations—...,https://www.wired.com/story/russia-election-di...,2024-11-05T21:04:35Z,As the 2024 US presidential election comes to ...,As the 2024 US presidential election comes to ...
1,2024 Presidential election,The Verge,Justine Calma,Apple News will let you watch election results...,"On Election Day in the US, Apple News is rolli...",https://www.theverge.com/2024/11/5/24288777/el...,2024-11-05T16:34:12Z,Image: Cath Virginia / The Verge\r\n\n \n\n Fo...,For anyone obsessively watching election resul...
2,2024 Presidential election,NPR,Megan Pratz,Here's how NPR will report the 2024 election r...,NPR relies on results and race calls from The ...,https://www.npr.org/2024/11/04/g-s1-31268/2024...,2024-11-05T10:00:00Z,"Voters have been voting, ballots will be count...",Here's how NPR will report the 2024 election r...
3,2024 Presidential election,Business Insider,"insider@insider.com (John L. Dorman,Kelsey Vla...",The 2024 presidential election may come down t...,Election results in the swing states of Arizon...,https://www.businessinsider.com/what-are-2024-...,2024-11-05T22:13:18Z,"Over the past two decades, the road to the Whi...","On Election Day 2024, both parties are eyeing ..."
4,2024 Presidential election,CNET,Thomas Kika,How Do I Keep Track of Official Election Resul...,An avalanche of Election Day 2024 coverage is ...,https://www.cnet.com/tech/services-and-softwar...,2024-11-05T17:45:00Z,As if the 2024 election reporting could be any...,All anyone is talking about today is the presi...


In [6]:
# Report the dimensions of the article table
n_rows, n_cols = data_full_content.shape
print("Number of rows (articles):", n_rows)
print("Number of variables (columns):", n_cols)


Number of rows (articles): 7556
Number of variables (columns): 9


In [7]:
# Count how many distinct values appear in the 'source' column
source_column = data_full_content['source']
unique_media_outlets = source_column.nunique()

# Show the result
print(f"Number of unique media outlets: {unique_media_outlets}")


Number of unique media outlets: 454


Number of nodes: 842
Number of edges: 34386
Sample nodes: [('Donald Trump', {'articles': [0, 1, 3, 4, 5, 6, 7, 9, 10, 12, 13, 15, 17, 18, 19, 20, 21, 24, 25, 27, 29, 30, 33, 34, 38, 40, 42, 43, 45, 49, 50, 51, 52, 53, 55, 57, 58, 64, 65, 67, 70, 71, 74, 75, 76, 77, 78, 82, 83, 84, 86, 91, 92, 93, 94, 95, 96, 97, 98, 99]}), ('Hillary Clinton', {'articles': [0, 33, 34, 3, 7, 12, 17, 49, 19, 82, 21, 92, 95]}), ('Clinton', {'articles': [0, 58, 3]}), ('Brad Raffensperger', {'articles': [0]}), ('Cait Conley', {'articles': [0]})]
Sample edges: [('Donald Trump', 'Brad Raffensperger', {'weight': 1}), ('Donald Trump', 'Tim Walz', {'weight': 7}), ('Donald Trump', 'Hillary Clinton', {'weight': 13}), ('Donald Trump', 'Adrian Fontes', {'weight': 2}), ('Donald Trump', 'Cait Conley', {'weight': 1})]


# NER

In [4]:
import networkx as nx
import spacy
from collections import defaultdict
import requests
import spacy.cli

# Download the en_core_web_trf model only when it is not installed yet.
# spacy.cli.download runs a pip install each time it is called, so calling it
# unconditionally repeats a large download on every execution of this cell.
if not spacy.util.is_package("en_core_web_trf"):
    spacy.cli.download("en_core_web_trf")

# Process only the first 100 articles for testing
data_sample = data_full_content.head(100)

# Load the spaCy English model
nlp = spacy.load("en_core_web_trf")

# Collect authors' names to exclude them from entities.
# Some articles list several authors separated by commas, so each byline is
# split on ',' and every piece is stripped and lowercased for consistent
# comparison with lowercased entity text later on.
authors_set = {
    name.strip().lower()
    for byline in data_sample['author'].dropna()
    for name in byline.split(',')
}

# Function to retrieve U.S. politicians and their aliases
def get_current_us_congress_members():
    """Query Wikidata for holders of the two listed positions and their aliases.

    Sends one SPARQL query to the Wikidata query service. The query selects
    people with a P39 position statement for either listed position that has
    a start time and no end time, plus their optional English alt-labels.

    Returns:
        dict mapping each person's label to a set of names: the label itself
        plus every alias returned for that person.

    Raises:
        Exception: if the HTTP response status is not 200.
    """
    endpoint = 'https://query.wikidata.org/sparql'
    sparql = '''
    SELECT DISTINCT ?person ?personLabel ?aliasLabel WHERE {
      VALUES ?position { wd:Q13217683 wd:Q13218630 }  # U.S. Senator and Representative
      ?person p:P39 ?positionStatement.
      ?positionStatement ps:P39 ?position;
                         pq:P580 ?startTime.
      FILTER NOT EXISTS { ?positionStatement pq:P582 ?endTime. }  # Position with no end time
      OPTIONAL { ?person skos:altLabel ?aliasLabel FILTER (LANG(?aliasLabel) = "en") }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    '''
    reply = requests.get(
        endpoint,
        params={'query': sparql},
        headers={'Accept': 'application/sparql-results+json'},
        timeout=60,
    )
    if reply.status_code != 200:
        raise Exception(f"SPARQL query failed with status {reply.status_code}: {reply.text}")
    bindings = reply.json()['results']['bindings']

    politician_aliases = {}
    for binding in bindings:
        person_label = binding['personLabel']['value']
        # First sighting of a person seeds the set with the label itself.
        known_names = politician_aliases.setdefault(person_label, {person_label})
        # aliasLabel is OPTIONAL in the query, so a row may not carry one.
        alias_value = binding.get('aliasLabel', {}).get('value')
        if alias_value:
            known_names.add(alias_value)
    return politician_aliases


# Retrieve the U.S. politicians and their aliases
politician_aliases_raw = get_current_us_congress_members()

# Hand-maintained entries for figures the lookup above is not relied on to return
additional_politicians = {
    "Joe Biden": {"Joe Biden", "Joseph Biden", "Biden", "President Biden"},
    "Donald Trump": {"Donald Trump", "Trump", "President Trump"},
    "Kamala Harris": {"Kamala Harris", "Harris", "Vice President Harris"},
    # Add other candidates and relevant figures
}

# Manual entries replace any retrieved entry stored under the same name
for manual_name, manual_aliases in additional_politicians.items():
    politician_aliases_raw[manual_name] = manual_aliases

# Build the alias_to_canonical mapping (keys are lowercased aliases; when two
# people share an alias, the one that comes later in the dict wins)
alias_to_canonical = {
    alias.lower(): canonical_name
    for canonical_name, aliases in politician_aliases_raw.items()
    for alias in aliases
}

# Initialize mappings for mentions
article_mentions = defaultdict(set)  # Maps article index to mentioned politicians
politician_mentions = defaultdict(set)  # Maps politician to articles they're mentioned in

# Perform NER and normalize entity names
# For every sampled article, run the spaCy pipeline over its full text and
# record which known politicians are named in it.
for idx, row in data_sample.iterrows():
    doc = nlp(row['full_content'])
    # Only PERSON entities are of interest
    person_names = (ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON")
    for entity_name in person_names:
        lookup_key = entity_name.lower()
        # Exclude authors
        if lookup_key in authors_set:
            continue
        # Map entity name to canonical politician name; skip unknown names
        canonical_name = alias_to_canonical.get(lookup_key)
        if not canonical_name:
            continue
        article_mentions[idx].add(canonical_name)
        politician_mentions[canonical_name].add(idx)


Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  model.load_state_dict(torch.load(filelike, map_location=device))
  with torch.cuda.amp.autocast(self._mixed_precision):


# Creating the network

In [5]:
# Create the network graph
G = nx.Graph()

# One node per politician, carrying the indices of the articles mentioning them
for politician, articles in politician_mentions.items():
    G.add_node(politician, articles=list(articles))

# Add edges based on co-mentions in articles: every unordered pair of
# politicians named in the same article gets an edge, and the edge weight
# counts how many articles the pair shares.
for article_idx, mentioned_politicians in article_mentions.items():
    mentioned_politicians = list(mentioned_politicians)
    for position, p1 in enumerate(mentioned_politicians):
        for p2 in mentioned_politicians[position + 1:]:
            if not G.has_edge(p1, p2):
                G.add_edge(p1, p2, weight=1)
            else:
                G[p1][p2]['weight'] += 1

# Output graph information
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

print("Sample nodes with attributes:")
sample_nodes = list(G.nodes(data=True))[:5]
for node, attrs in sample_nodes:
    print(f"{node}: {attrs}")

print("Sample edges with weights:")
sample_edges = list(G.edges(data=True))[:5]
for u, v, attrs in sample_edges:
    print(f"{u} - {v}: {attrs}")

Number of nodes: 14
Number of edges: 36
Sample nodes with attributes:
Donald Trump: {'articles': [0, 1, 3, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 34, 38, 40, 42, 43, 45, 48, 49, 50, 51, 52, 53, 55, 57, 58, 64, 65, 67, 69, 70, 71, 74, 75, 76, 77, 78, 82, 83, 84, 85, 86, 91, 92, 93, 94, 95, 96, 97, 98, 99]}
Kamala Harris: {'articles': [1, 3, 4, 5, 6, 7, 9, 10, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 33, 34, 35, 38, 40, 42, 45, 49, 50, 51, 52, 53, 57, 58, 64, 65, 67, 70, 74, 75, 82, 83, 84, 85, 86, 91, 92, 93, 94, 95, 96, 97, 98, 99]}
Joe Biden: {'articles': [3, 5, 7, 9, 10, 11, 12, 17, 18, 19, 21, 24, 28, 33, 42, 45, 49, 74, 82, 92, 94, 96, 97]}
Ruben Gallego: {'articles': [3]}
Dina Titus: {'articles': [3]}
Sample edges with weights:
Donald Trump - Kamala Harris: {'weight': 55}
Donald Trump - Ruben Gallego: {'weight': 1}
Donald Trump - Steven Horsford: {'weight': 1}
Donald Trump - Dina Titus: {'weight': 1}
Donald Trump -