Scripts

In [2]:
from newsapi import NewsApiClient
import datetime
import time
import pandas as pd
import os
from newspaper import Article
import sys

In [None]:
# Initialize NewsAPI Client
api_key = ""  # Replace with API key
newsapi = NewsApiClient(api_key=api_key)

# Input and output file paths
input_file = "news_articles_election_candidates_expanded.csv"
full_content_file = "news_articles_election_candidates_full_content.csv"

# Function to fetch full article content using newspaper3k
def fetch_full_content(article_url):
    try:
        article = Article(article_url)
        article.download()
        article.parse()
        return article.text  # Return the full article text
    except Exception as e:
        return None  # Return None if there is an error

# Set new date range
start_date = datetime.date(2024, 10, 12)  # Continue from where the previous script left off
end_date = datetime.date(2024, 8, 1)  # Adjust end date as needed for backward collection

# Prepare to store results
articles_data = []

# Expanded search queries
queries = [
    "2024 Presidential election",
    "US election AND (Donald Trump OR Kamala Harris)",
    "Biden administration AND 2024 election",
    "(Donald Trump OR Trump) AND 2024 election",
    "(Kamala Harris OR Harris) AND 2024 election",
    "Campaign financing AND 2024 election",
    "Voter turnout AND 2024 election",
    "Presidential debate AND 2024 election",
    "(Donald Trump OR Trump) AND rally AND 2024",
    "(Kamala Harris OR Harris) AND speech AND 2024",
    "(Donald Trump OR Kamala Harris) AND 2024 election",
    "(Trump OR Harris) AND campaign AND 2024 election"
]

# Track API request count to avoid exceeding limits
request_count = 0
max_requests = 100  # Free-tier daily API limit

# Load existing expanded and full content CSV files
existing_expanded_data = pd.read_csv(input_file) if os.path.exists(input_file) else pd.DataFrame()
existing_full_content_data = pd.read_csv(full_content_file) if os.path.exists(full_content_file) else pd.DataFrame()

# Track URLs to avoid duplicates
existing_urls = set(existing_expanded_data["url"]) if not existing_expanded_data.empty else set()
processed_urls = set(existing_full_content_data["url"]) if not existing_full_content_data.empty else set()

# Fetch new articles from NewsAPI
current_date = start_date
while current_date >= end_date:
    # Convert date to string for API
    date_str = current_date.strftime('%Y-%m-%d')
    print(f"Fetching articles for {date_str}...")

    for query in queries:
        try:
            # Check if API limit is reached
            if request_count >= max_requests:
                print("Reached API limit for the day. Exiting script.")
                sys.exit()

            # Fetch articles for the current query and date
            response = newsapi.get_everything(
                q=query,
                from_param=date_str,
                to=date_str,
                language="en",
                sort_by="relevancy",  # Fetch relevant articles
                page_size=100  # Max articles per API call
            )

            # Increment request count
            request_count += 1

            if response.get('status') != 'ok':
                print(f"API error: {response.get('message')}")
                sys.exit()

            # Process the articles
            if response.get('articles'):
                for article in response['articles']:
                    # Only add new articles that are not already saved
                    if article['url'] not in existing_urls:
                        articles_data.append({
                            "query": query,  # Include the query used for tracking
                            "source": article['source']['name'],
                            "author": article['author'],
                            "title": article['title'],
                            "description": article['description'],
                            "url": article['url'],
                            "published_at": article['publishedAt'],
                            "content": article['content']
                        })
                        # Add the URL to the set of existing URLs
                        existing_urls.add(article['url'])
        except Exception as e:
            # Log the error to a file
            with open("error_log.txt", "a") as log_file:
                log_file.write(f"Error fetching articles for {query} on {date_str}: {e}\n")
            print(f"Error fetching articles for {query} on {date_str}: {e}")
    
    # Move to the previous day
    current_date -= datetime.timedelta(days=1)

    # Avoid hitting API limits by adding a small delay between requests
    time.sleep(1)

# Save new articles to the expanded CSV file
if articles_data:
    new_data_df = pd.DataFrame(articles_data)
    new_data_df.to_csv(input_file, mode='a', header=not os.path.exists(input_file), index=False)
    print(f"Appended {len(new_data_df)} new articles to '{input_file}'.")

# Extract full content for new articles only
new_urls = {article["url"] for article in articles_data}  # URLs of newly fetched articles
urls_to_process = new_urls - processed_urls  # Exclude already processed URLs

if urls_to_process:
    full_content_data = []
    for url in urls_to_process:
        print(f"Fetching full content for {url}...")
        full_content = fetch_full_content(url)
        if full_content:
            full_content_data.append({
                "url": url,
                "full_content": full_content
            })
        else:
            print(f"Failed to fetch full content for {url}")

    # Append full content to the full content CSV file
    if full_content_data:
        full_content_df = pd.DataFrame(full_content_data)
        full_content_df.to_csv(full_content_file, mode='a', header=not os.path.exists(full_content_file), index=False)
        print(f"Appended {len(full_content_df)} new full content articles to '{full_content_file}'.")
else:
    print("No new articles to process for full content.")



Remove entries with missing full content

START

In [2]:
import pandas as pd
data_full_content = pd.read_csv('news_articles_election_candidates_full_content_cleaned.csv')

# NER

In [3]:
import networkx as nx
import spacy
from collections import defaultdict
import requests
import spacy.cli
import json

# Download the en_core_web_trf model
spacy.cli.download("en_core_web_trf")

# Process only the first 100 articles for testing
data_sample = data_full_content.head(100)

# Load the spaCy English model
nlp = spacy.load("en_core_web_trf")

# Collect authors' names to exclude them from entities
authors_set = set()
for author in data_sample['author'].dropna():
    # Some articles may have multiple authors separated by commas
    authors = [a.strip() for a in author.split(',')]
    authors_set.update(authors)

# Lowercase authors' names for consistent comparison
authors_set = {author.lower() for author in authors_set}

# Function to retrieve U.S. politicians and their aliases
def get_current_us_congress_members():
    url = 'https://query.wikidata.org/sparql'
    query = '''
    SELECT DISTINCT ?person ?personLabel ?aliasLabel WHERE {
      VALUES ?position { wd:Q13217683 wd:Q13218630 }  # U.S. Senator and Representative
      ?person p:P39 ?positionStatement.
      ?positionStatement ps:P39 ?position;
                         pq:P580 ?startTime.
      FILTER NOT EXISTS { ?positionStatement pq:P582 ?endTime. }  # Position with no end time
      OPTIONAL { ?person skos:altLabel ?aliasLabel FILTER (LANG(?aliasLabel) = "en") }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    '''
    headers = {'Accept': 'application/sparql-results+json'}
    response = requests.get(url, params={'query': query}, headers=headers, timeout=60)
    if response.status_code != 200:
        raise Exception(f"SPARQL query failed with status {response.status_code}: {response.text}")
    data = response.json()

    politician_aliases = {}
    for item in data['results']['bindings']:
        canonical_name = item['personLabel']['value']
        alias = item.get('aliasLabel', {}).get('value')
        if canonical_name not in politician_aliases:
            politician_aliases[canonical_name] = set()
            politician_aliases[canonical_name].add(canonical_name)
        if alias:
            politician_aliases[canonical_name].add(alias)
    return politician_aliases


# Retrieve the U.S. politicians and their aliases
politician_aliases_raw = get_current_us_congress_members()

# Load manual politician data from JSON file
def load_manual_politicians(file_path="manual_politicians.json"):
    try:
        with open(file_path, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {file_path}. Using an empty dictionary.")
        return {}
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return {}

# Load manual politicians
manual_politicians = load_manual_politicians()

# Add manual politicians to the alias dictionary
politician_aliases_raw.update(manual_politicians)

# Logging function to verify additions
def log_aliases(aliases):
    print("Manual politician aliases added:")
    for canonical_name, aliases_set in aliases.items():
        print(f"{canonical_name}: {', '.join(aliases_set)}")

# Log the manual politician data being added
log_aliases(manual_politicians)


# Build the alias_to_canonical mapping
alias_to_canonical = {}
for canonical_name, aliases in politician_aliases_raw.items():
    for alias in aliases:
        alias_to_canonical[alias.lower()] = canonical_name

# Initialize mappings for mentions
article_mentions = defaultdict(set)  # Maps article index to mentioned politicians
politician_mentions = defaultdict(set)  # Maps politician to articles they're mentioned in

# Perform NER and normalize entity names
# Process articles
for idx, row in data_sample.iterrows():
    content = row['full_content']
    doc = nlp(content)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entity_name = ent.text.strip()
            entity_name_lower = entity_name.lower()
            # Exclude authors
            if entity_name_lower in authors_set:
                continue
            # Map entity name to canonical politician name
            canonical_name = alias_to_canonical.get(entity_name_lower)
            if canonical_name:
                article_mentions[idx].add(canonical_name)
                politician_mentions[canonical_name].add(idx)


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:05[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


  model.load_state_dict(torch.load(filelike, map_location=device))


Manual politician aliases added:
Joe Biden: Joe Biden, Joseph Biden, Biden, President Biden
Donald Trump: Donald Trump, Trump, President Trump
Kamala Harris: Kamala Harris, Harris, Vice President Harris
Barack Obama: Barack Obama, Obama, President Obama
Hillary Clinton: Hillary Clinton, Hillary, Secretary Clinton, Clinton
Bill Clinton: Bill Clinton, President Clinton
George W. Bush: George W. Bush, Bush, President Bush
Nancy Pelosi: Nancy Pelosi, Pelosi, Speaker Pelosi
Mitch McConnell: Mitch McConnell, McConnell, Senator McConnell
Bernie Sanders: Bernie Sanders, Sanders, Senator Sanders
Elizabeth Warren: Elizabeth Warren, Warren, Senator Warren
Kevin McCarthy: Kevin McCarthy, McCarthy, Speaker McCarthy
Alexandria Ocasio-Cortez: Alexandria Ocasio-Cortez, AOC, Ocasio-Cortez
Ted Cruz: Ted Cruz, Cruz, Senator Cruz
Marco Rubio: Marco Rubio, Rubio, Senator Rubio
Chuck Schumer: Chuck Schumer, Schumer, Senator Schumer, Majority Leader Schumer
Ron DeSantis: Ron DeSantis, DeSantis, Governor DeSa

### 1. Cell for Setup (Run Once)
This cell performs tasks that only need to be done once, such as downloading the spaCy model and loading manual politician data.

In [4]:
# Import necessary libraries
import spacy
import requests
import json

# Download the spaCy model (Run this only once)
try:
    import spacy.cli
    spacy.cli.download("en_core_web_trf")
except Exception as e:
    print("Model download skipped or already complete:", e)

# Load the spaCy English model
nlp = spacy.load("en_core_web_trf")

# Load manual politician data from JSON file
def load_manual_politicians(file_path="manual_politicians.json"):
    try:
        with open(file_path, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {file_path}. Using an empty dictionary.")
        return {}
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return {}

# Load manual politicians (only needs to be loaded once)
manual_politicians = load_manual_politicians()


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### 2. Cell for Data Retrieval and Processing
Use this cell for tasks that depend on external data or need to be re-executed when the data changes (e.g., fetching politician aliases).

In [5]:
from collections import defaultdict

# Function to retrieve U.S. politicians and their aliases
def get_current_us_congress_members():
    url = 'https://query.wikidata.org/sparql'
    query = '''
    SELECT DISTINCT ?person ?personLabel ?aliasLabel WHERE {
      VALUES ?position { wd:Q13217683 wd:Q13218630 }  # U.S. Senator and Representative
      ?person p:P39 ?positionStatement.
      ?positionStatement ps:P39 ?position;
                         pq:P580 ?startTime.
      FILTER NOT EXISTS { ?positionStatement pq:P582 ?endTime. }  # Position with no end time
      OPTIONAL { ?person skos:altLabel ?aliasLabel FILTER (LANG(?aliasLabel) = "en") }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    '''
    headers = {'Accept': 'application/sparql-results+json'}
    response = requests.get(url, params={'query': query}, headers=headers, timeout=60)
    if response.status_code != 200:
        raise Exception(f"SPARQL query failed with status {response.status_code}: {response.text}")
    data = response.json()

    politician_aliases = {}
    for item in data['results']['bindings']:
        canonical_name = item['personLabel']['value']
        alias = item.get('aliasLabel', {}).get('value')
        if canonical_name not in politician_aliases:
            politician_aliases[canonical_name] = set()
            politician_aliases[canonical_name].add(canonical_name)
        if alias:
            politician_aliases[canonical_name].add(alias)
    return politician_aliases

# Fetch U.S. politician aliases
politician_aliases_raw = get_current_us_congress_members()

# Merge with manual politicians
politician_aliases_raw.update(manual_politicians)

# Build alias-to-canonical mapping
alias_to_canonical = {}
for canonical_name, aliases in politician_aliases_raw.items():
    for alias in aliases:
        alias_to_canonical[alias.lower()] = canonical_name


### Cell for Article Processing
This cell processes your dataset (data_sample) and performs named entity recognition (NER). Run it whenever you need to process or analyze new articles.

In [9]:
import pandas as pd
import os
import pickle
from collections import defaultdict

# Assume data_full_content is already loaded
data_sample = data_full_content.head(8000)

# Collect authors' names to exclude them from entities
authors_set = set()
for author in data_sample['author'].dropna():
    authors = [a.strip() for a in author.split(',')]
    authors_set.update(authors)

# Lowercase authors' names for consistent comparison
authors_set = {author.lower() for author in authors_set}

# Initialize mappings for mentions
article_mentions = defaultdict(set)  # Maps article index to mentioned politicians
politician_mentions = defaultdict(set)  # Maps politician to articles they're mentioned in

# Define paths to save progress
article_mentions_file = "article_mentions.pkl"
politician_mentions_file = "politician_mentions.pkl"
error_log_file = "error_log.txt"

# Load intermediate progress if available
if os.path.exists(article_mentions_file):
    with open(article_mentions_file, "rb") as f:
        article_mentions = pickle.load(f)
if os.path.exists(politician_mentions_file):
    with open(politician_mentions_file, "rb") as f:
        politician_mentions = pickle.load(f)

# Initialize or load error log
error_log = []
if os.path.exists(error_log_file):
    with open(error_log_file, "r") as f:
        error_log = f.readlines()

# Process articles
for idx, row in data_sample.iterrows():
    try:
        # Skip already processed articles
        if idx in article_mentions:
            continue

        content = row['full_content']
        doc = nlp(content)

        for ent in doc.ents:
            if ent.label_ == "PERSON":
                entity_name = ent.text.strip()
                entity_name_lower = entity_name.lower()
                # Exclude authors
                if entity_name_lower in authors_set:
                    continue
                # Map entity name to canonical politician name
                canonical_name = alias_to_canonical.get(entity_name_lower)
                if canonical_name:
                    article_mentions[idx].add(canonical_name)
                    politician_mentions[canonical_name].add(idx)

        # Periodically save progress
        if idx % 100 == 0:
            with open(article_mentions_file, "wb") as f:
                pickle.dump(article_mentions, f)
            with open(politician_mentions_file, "wb") as f:
                pickle.dump(politician_mentions, f)
            print(f"Progress saved at article {idx}")

    except Exception as e:
        # Log the error
        error_message = f"Error processing article {idx}: {str(e)}\n"
        error_log.append(error_message)
        with open(error_log_file, "a") as f:
            f.write(error_message)
        print(error_message)

# Final save of progress
with open(article_mentions_file, "wb") as f:
    pickle.dump(article_mentions, f)
with open(politician_mentions_file, "wb") as f:
    pickle.dump(politician_mentions, f)

print("Processing complete.")


Progress saved at article 0
Progress saved at article 100


KeyboardInterrupt: 

In [8]:
import pandas as pd

# Assume data_full_content is already loaded
data_sample = data_full_content.head(1000)

# Collect authors' names to exclude them from entities
authors_set = set()
for author in data_sample['author'].dropna():
    authors = [a.strip() for a in author.split(',')]
    authors_set.update(authors)

# Lowercase authors' names for consistent comparison
authors_set = {author.lower() for author in authors_set}

# Initialize mappings for mentions
article_mentions = defaultdict(set)  # Maps article index to mentioned politicians
politician_mentions = defaultdict(set)  # Maps politician to articles they're mentioned in

# Process articles
for idx, row in data_sample.iterrows():
    content = row['full_content']
    doc = nlp(content)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entity_name = ent.text.strip()
            entity_name_lower = entity_name.lower()
            # Exclude authors
            if entity_name_lower in authors_set:
                continue
            # Map entity name to canonical politician name
            canonical_name = alias_to_canonical.get(entity_name_lower)
            if canonical_name:
                article_mentions[idx].add(canonical_name)
                politician_mentions[canonical_name].add(idx)


KeyboardInterrupt: 

# Creating the network

In [10]:
# Create the network graph
G = nx.Graph()

# Add nodes with attributes (politicians and articles)
for politician, articles in politician_mentions.items():
    G.add_node(politician, articles=list(articles))

# Add edges based on co-mentions in articles
for article_idx, mentioned_politicians in article_mentions.items():
    mentioned_politicians = list(mentioned_politicians)
    for i in range(len(mentioned_politicians)):
        for j in range(i + 1, len(mentioned_politicians)):
            p1, p2 = mentioned_politicians[i], mentioned_politicians[j]
            if G.has_edge(p1, p2):
                G[p1][p2]['weight'] += 1
            else:
                G.add_edge(p1, p2, weight=1)

# Output graph information
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
print("Sample nodes with attributes:")
for node, attrs in list(G.nodes(data=True))[:5]:
    print(f"{node}: {attrs}")
print("Sample edges with weights:")
for u, v, attrs in list(G.edges(data=True))[:5]:
    print(f"{u} - {v}: {attrs}")

Number of nodes: 55
Number of edges: 322
Sample nodes with attributes:
Donald Trump: {'articles': [0, 1, 3, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 34, 38, 40, 42, 43, 45, 48, 49, 50, 51, 52, 53, 55, 57, 58, 64, 65, 67, 69, 70, 71, 74, 75, 76, 77, 78, 82, 83, 84, 85, 86, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147]}
Hillary Clinton: {'articles': [0, 128, 3, 7, 137, 138, 139, 12, 141, 17, 146, 19, 21, 33, 34, 49, 58, 82, 92, 95, 105]}
Kamala Harris: {'articles': [1, 3, 4, 5, 6, 7, 9, 10, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 28, 30, 33, 34, 35, 38, 40, 42, 45, 49, 50, 51, 52, 53, 57, 58, 64, 65, 67, 70, 74, 75, 82, 83, 84, 85, 86, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,