### Loading Data

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import networkx as nx
import spacy
import os
import pickle
from collections import defaultdict
import requests
import spacy.cli
import json
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import re
import matplotlib.dates as mdates
from sklearn.metrics.pairwise import cosine_similarity
from community import community_louvain
import matplotlib.cm as cm
from networkx.algorithms.community.quality import modularity


In [None]:
# Article dataset: read the cleaned CSV into a DataFrame
data_full_content = pd.read_csv('news_articles_election_candidates_full_content_cleaned.csv')

# spaCy pipeline: try to load it; if loading raises OSError, download it and retry
SPACY_MODEL_NAME = "en_core_web_trf"
try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    spacy.cli.download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

# VADER: fetch the lexicon quietly, then build the analyzer used by later cells
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

# Load manual politician data from JSON file
def load_manual_politicians(file_path="manual_politicians.json"):
    """Read the hand-maintained politician data from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist or does not contain valid JSON (a message is printed in both
        cases instead of raising).
    """
    try:
        # JSON is UTF-8 by specification; relying on the locale default can
        # mis-decode non-ASCII names on platforms whose default is not UTF-8.
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {file_path}. Using an empty dictionary.")
        return {}
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return {}

# Load manual politicians
manual_politicians = load_manual_politicians()

# Function to retrieve U.S. politicians and their aliases
def get_current_us_congress_members():
    """Query the Wikidata SPARQL endpoint for office holders and their aliases.

    The query selects people holding one of the two positions listed in the
    VALUES clause (commented in the query as U.S. Senator and Representative)
    whose position statement has a start time and no end time, together with
    their English alternative labels.

    Returns:
        dict mapping each person's label to a set of names; the set always
        contains the label itself plus any aliases returned.

    Raises:
        Exception: if the endpoint answers with a status code other than 200.
    """
    endpoint = 'https://query.wikidata.org/sparql'
    query = '''
    SELECT DISTINCT ?person ?personLabel ?aliasLabel WHERE {
      VALUES ?position { wd:Q13217683 wd:Q13218630 }  # U.S. Senator and Representative
      ?person p:P39 ?positionStatement.
      ?positionStatement ps:P39 ?position;
                         pq:P580 ?startTime.
      FILTER NOT EXISTS { ?positionStatement pq:P582 ?endTime. }  # Position with no end time
      OPTIONAL { ?person skos:altLabel ?aliasLabel FILTER (LANG(?aliasLabel) = "en") }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    '''
    response = requests.get(
        endpoint,
        params={'query': query},
        headers={'Accept': 'application/sparql-results+json'},
        timeout=60,
    )
    if response.status_code != 200:
        raise Exception(f"SPARQL query failed with status {response.status_code}: {response.text}")

    politician_aliases = {}
    for binding in response.json()['results']['bindings']:
        name = binding['personLabel']['value']
        # First sighting seeds the set with the canonical name itself
        known_names = politician_aliases.setdefault(name, {name})
        # aliasLabel is OPTIONAL in the query, so it may be absent from a row
        alias = binding.get('aliasLabel', {}).get('value')
        if alias:
            known_names.add(alias)
    return politician_aliases

# Fetch U.S. politician aliases
politician_aliases_raw = get_current_us_congress_members()

# Merge with manual politicians (a manual entry replaces the fetched entry
# for the same canonical name)
politician_aliases_raw.update(manual_politicians)

# Build alias-to-canonical mapping (keys are lower-cased for case-insensitive lookup)
alias_to_canonical = {}
for canonical_name, aliases in politician_aliases_raw.items():
    # Fetched entries always list the canonical name among their aliases, but a
    # manual entry may not; map the canonical name explicitly so it is always
    # recognised.
    for alias in (canonical_name, *aliases):
        alias_to_canonical[alias.lower()] = canonical_name

### Loading saved article mentions and politician mentions

In [None]:
# Pickle files holding the previously extracted mentions
article_mentions_file = "article_mentions.pkl"
politician_mentions_file = "politician_mentions.pkl"

# NOTE(review): pickle.load can execute code embedded in the file; only load
# files that were produced by this project.
# Later cells use article_mentions as {article index -> mentioned politicians}
with open(article_mentions_file, "rb") as f:
    article_mentions = pickle.load(f)

# Later cells use politician_mentions as {politician -> article indices}
with open(politician_mentions_file, "rb") as f:
    politician_mentions = pickle.load(f)

### Saving and loading sentiment scores

In [None]:
# Cache file for the per-article VADER scores
sentiment_scores_file = "sentiment_scores.pkl"

if not os.path.exists(sentiment_scores_file):
    # No cache yet: score every article and persist the result
    print("Computing sentiment scores...")
    sentiment_scores = {
        # Non-string content (e.g. missing text) gets a neutral compound score
        row_id: sia.polarity_scores(text) if isinstance(text, str) else {'compound': 0}
        for row_id, text in data_full_content['full_content'].items()
    }
    with open(sentiment_scores_file, "wb") as f:
        pickle.dump(sentiment_scores, f)
    # Attach the score dicts to the frame, keyed by row index
    data_full_content['sentiment_scores'] = data_full_content.index.map(sentiment_scores)
    print("Sentiment scores computed and saved.")
else:
    # Cache hit: reuse the stored {row index -> score dict} mapping
    with open(sentiment_scores_file, "rb") as f:
        sentiment_scores = pickle.load(f)
    data_full_content['sentiment_scores'] = data_full_content.index.map(sentiment_scores)
    print("Loaded sentiment scores from file.")

### Saving and loading graph for network 1

In [None]:
# Path to save the graph
graph_file = "politician_mention_graph.graphml"

# Reuse the saved co-mention graph when present; otherwise build and save it
if os.path.exists(graph_file):
    # Load the saved graph
    G = nx.read_graphml(graph_file)
    print("Loaded graph G from file.")
else:
    # Create the network graph
    G = nx.Graph()

    # One node per politician, carrying the articles that mention them
    for politician, articles in politician_mentions.items():
        G.add_node(politician, articles=list(articles))

    # Connect every pair of politicians mentioned in the same article;
    # the edge weight counts the articles in which the pair co-occurs
    for mentioned_politicians in article_mentions.values():
        mentioned_politicians = list(mentioned_politicians)
        for i, p1 in enumerate(mentioned_politicians):
            for p2 in mentioned_politicians[i + 1:]:
                if G.has_edge(p1, p2):
                    G[p1][p2]['weight'] += 1
                else:
                    G.add_edge(p1, p2, weight=1)

    # Convert list attributes to strings for GraphML compatibility.
    # add_edge creates nodes for politicians missing from politician_mentions,
    # and those nodes have no 'articles' attribute, so default to empty.
    for node, data in G.nodes(data=True):
        data['articles'] = ",".join(map(str, data.get('articles', [])))
    # Save the graph
    nx.write_graphml(G, graph_file)
    print("Graph G created and saved.")

In [None]:
# Summary of graph G: size, then the first five nodes and edges
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

print("Sample nodes with attributes:")
sample_nodes = list(G.nodes(data=True))[:5]
for name, node_attrs in sample_nodes:
    print(f"{name}: {node_attrs}")

print("Sample edges with weights:")
sample_edges = list(G.edges(data=True))[:5]
for source, target, edge_attrs in sample_edges:
    print(f"{source} - {target}: {edge_attrs}")

### Saving and loading bipartite graph 

In [None]:
# Path to save the bipartite graph
bipartite_graph_file = "bipartite_graph.graphml"

# Reuse the saved media/politician graph when present; otherwise build and save it
if os.path.exists(bipartite_graph_file):
    # Load the saved bipartite graph
    B = nx.read_graphml(bipartite_graph_file)
    print("Loaded bipartite graph B from file.")
else:
    # Create a Bipartite Graph
    B = nx.Graph()

    # Add meta-nodes for media categories
    B.add_node("Democratic Media", bipartite=0, type="Media")
    B.add_node("Republican Media", bipartite=0, type="Media")

    # Per category: politician -> list of compound sentiments, one per mentioning article
    democratic_mentions = defaultdict(list)
    republican_mentions = defaultdict(list)
    mentions_by_category = {
        "Democratic Media": democratic_mentions,
        "Republican Media": republican_mentions,
    }

    # Define media outlet categories
    democratic_outlets = [
        "cnn", "msnbc", "new york times", "nyt", "npr", "guardian", "huffpost", "huffington post",
        "slate", "vox", "politico", "buzzfeed", "buzzfeed news"
    ]
    republican_outlets = [
        "fox news", "fox", "breitbart", "wall street journal", "wsj", "washington times",
        "national review", "daily caller", "blaze", "newsmax", "federalist", "oann",
        "one america news network"
    ]

    # Function to map source names to media categories
    def get_media_category(source):
        """Return 'Democratic Media', 'Republican Media' or None for a source name.

        The name is lower-cased and lightly normalised, then matched by
        substring against the outlet lists; the Democratic list is checked first.
        """
        source = str(source).strip().lower()
        # Remove common prefixes and suffixes
        source = re.sub(r'^(the|www\.)\s+', '', source)
        source = re.sub(r'\.com$', '', source)
        source = source.replace('-', ' ')
        source = source.replace('.', ' ')
        source = source.replace(',', '')

        if any(pattern in source for pattern in democratic_outlets):
            return 'Democratic Media'
        if any(pattern in source for pattern in republican_outlets):
            return 'Republican Media'
        return None  # Unknown media category

    # Add a new column for media category
    data_full_content['media_category'] = data_full_content['source'].apply(get_media_category)

    # Process articles
    for idx, row in tqdm(data_full_content.iterrows(), total=data_full_content.shape[0], desc="Building Bipartite Graph"):
        category = row.get('media_category')
        if not category:
            continue

        # The score may be missing or NaN for a row; treat that as neutral
        # instead of failing on a non-dict value.
        scores = row.get('sentiment_scores')
        sentiment = scores.get('compound', 0) if isinstance(scores, dict) else 0

        category_mentions = mentions_by_category[category]
        for politician in article_mentions.get(idx, set()):
            category_mentions[politician].append(sentiment)

    # Add politician nodes and one edge per media category that mentions them
    for politician in set(democratic_mentions).union(republican_mentions):
        B.add_node(politician, bipartite=1, type="Politician")

        for media_node, category_mentions in mentions_by_category.items():
            if politician in category_mentions:
                # weight = number of mentioning articles, sentiment = their mean compound score
                mention_count = len(category_mentions[politician])
                avg_sentiment = sum(category_mentions[politician]) / mention_count
                B.add_edge(media_node, politician, weight=mention_count, sentiment=avg_sentiment)

    # Save the bipartite graph
    nx.write_graphml(B, bipartite_graph_file)
    print("Bipartite graph B created and saved.")

# Derive the politician set from B itself so that it exists on both the
# load-from-file and the build path; later cells iterate over it.
all_politicians = {
    node for node, attrs in B.nodes(data=True) if attrs.get("type") == "Politician"
}

In [None]:
# Summary of bipartite graph B: size, then the first five nodes and edges
print(f"Number of nodes: {B.number_of_nodes()}")
print(f"Number of edges: {B.number_of_edges()}")

print("Sample nodes with attributes:")
sample_nodes = list(B.nodes(data=True))[:5]
for name, node_attrs in sample_nodes:
    print(f"{name}: {node_attrs}")

print("Sample edges with weights:")
sample_edges = list(B.edges(data=True))[:5]
for source, target, edge_attrs in sample_edges:
    print(f"{source} - {target}: {edge_attrs}")

### BERT transformer analysis

In [None]:
# Tokenizer and classifier used by get_sentiment_score.
# Per the original note, this checkpoint outputs ratings from 1 (very negative)
# to 5 (very positive).
SENTIMENT_MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_NAME)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def get_sentiment_score(text):
    """Rate the sentiment of ``text`` with the module-level transformer model.

    Args:
        text: Article text. Anything that is not a ``str`` is treated as missing.

    Returns:
        The predicted class index plus one (an int), or ``None`` when
        ``text`` is not a string.
    """
    # Ensure the text is a string
    if not isinstance(text, str):
        return None  # or you can return a default value like 3 (neutral)

    # Call the tokenizer directly (encode_plus is deprecated in favour of it).
    # A single sequence needs no padding, so only truncation is requested.
    inputs = tokenizer(
        text,
        return_tensors='pt',      # Return PyTorch tensors
        truncation=True,          # Truncate long texts to model's max length
        max_length=512,           # Max length for BERT
    )

    # Move inputs to device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(**inputs).logits

    # softmax is monotonic, so the arg-max of the logits is the predicted class
    predicted_class = torch.argmax(logits, dim=1).item()

    # Class indices start at 0; shift so the rating starts at 1
    return predicted_class + 1

In [None]:
# Cache file for the per-article transformer ratings
transformer_sentiment_file = "transformer_sentiment_scores.pkl"

if not os.path.exists(transformer_sentiment_file):
    # No cache yet: rate every article (with a progress bar) and persist the result
    print("Computing transformer sentiment scores...")
    transformer_sentiment_scores = {
        row_id: get_sentiment_score(text)
        for row_id, text in tqdm(data_full_content['full_content'].items(), desc="Sentiment Analysis")
    }
    with open(transformer_sentiment_file, "wb") as f:
        pickle.dump(transformer_sentiment_scores, f)
    # Attach the ratings to the frame, keyed by row index
    data_full_content['transformer_sentiment'] = data_full_content.index.map(transformer_sentiment_scores)
    print("Transformer sentiment scores computed and saved.")
else:
    # Cache hit: reuse the stored {row index -> rating} mapping
    with open(transformer_sentiment_file, "rb") as f:
        transformer_sentiment_scores = pickle.load(f)
    data_full_content['transformer_sentiment'] = data_full_content.index.map(transformer_sentiment_scores)
    print("Loaded transformer sentiment scores from file.")

### Sentiment over time

In [None]:
# Path the aggregated daily sentiment is pickled to
sentiment_over_time_file = "sentiment_over_time.pkl"

# Ensure 'published_at' column is in datetime format (unparseable values become NaT)
data_full_content['published_at'] = pd.to_datetime(data_full_content['published_at'], errors='coerce')

# Drop rows with NaT in 'published_at'; copy so the column assignments below
# act on an independent frame
data_full_content = data_full_content.dropna(subset=['published_at']).copy()

# Extract the date from the timestamp
data_full_content['date'] = data_full_content['published_at'].dt.date

# One scalar sentiment per article, computed once for the whole frame rather
# than once per politician.
if 'sentiment_score' not in data_full_content.columns:
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sia = SentimentIntensityAnalyzer()

    def _compound_score(scores, content):
        """Return the cached compound score, else score the text, else 0."""
        if isinstance(scores, dict) and 'compound' in scores:
            return scores['compound']
        if isinstance(content, str):
            return sia.polarity_scores(content)['compound']
        return 0

    if 'sentiment_scores' in data_full_content.columns:
        cached_scores = data_full_content['sentiment_scores']
    else:
        cached_scores = [None] * len(data_full_content)
    data_full_content['sentiment_score'] = [
        _compound_score(scores, content)
        for scores, content in zip(cached_scores, data_full_content['full_content'])
    ]

## Aggregate Sentiments Over Time
# Compute the average sentiment per day for each politician
sentiment_over_time = []

for politician, indices in politician_mentions.items():
    # Subset data for articles mentioning the politician. A boolean mask is
    # used because some mentioned rows may have been dropped above, and
    # label-based .loc would raise KeyError for them.
    politician_data = data_full_content[data_full_content.index.isin(list(indices))]

    if not politician_data.empty:
        # Group by date and compute average sentiment
        daily_sentiment = (
            politician_data.groupby('date')['sentiment_score']
            .mean()
            .reset_index()
        )
        daily_sentiment['normalized_sentiment'] = (daily_sentiment['sentiment_score'] - 0) / 1  # Adjust if needed
        daily_sentiment['politician'] = politician
        sentiment_over_time.append(daily_sentiment)

# Combine all politicians' sentiment data into a single DataFrame
if sentiment_over_time:
    sentiment_over_time_df = pd.concat(sentiment_over_time, ignore_index=True)
else:
    # Keep the expected columns so the plotting cells can still filter on them
    sentiment_over_time_df = pd.DataFrame(
        columns=['date', 'sentiment_score', 'normalized_sentiment', 'politician']
    )
    print("No sentiment data available.")

# Save the DataFrame
sentiment_over_time_df.to_pickle(sentiment_over_time_file)
print("Sentiment over time data computed and saved.")

# Now proceed with visualization
# Display the resulting DataFrame
print(sentiment_over_time_df.head())

### Visualize 

In [None]:
## Visualize Sentiment Trends

# Politicians whose daily sentiment is drawn (also reused by the key-events cell)
top_politicians = ['Joe Biden', 'Donald Trump', 'Kamala Harris']

plt.figure(figsize=(12, 6))

# One line per politician; report politicians that have no rows
for politician in top_politicians:
    series = sentiment_over_time_df.loc[sentiment_over_time_df['politician'] == politician]
    if series.empty:
        print(f"No data available for {politician}")
        continue
    plt.plot(series['date'], series['normalized_sentiment'], label=politician)

# Titles, legend and grid
plt.title('Sentiment Over Time for Selected Politicians')
plt.xlabel('Date')
plt.ylabel('Normalized Sentiment')
plt.legend()
plt.grid(True)

# Date axis: a tick every 7 days, formatted as YYYY-MM-DD and rotated
axis = plt.gca()
axis.xaxis.set_major_locator(mdates.DayLocator(interval=7))  # Every 7 days
axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

### Key events plotted

In [None]:
## Identify Key Events and Correlate with Sentiment Changes

# Key events keyed by ISO date
# Source: https://en.wikipedia.org/wiki/Timeline_of_the_2024_United_States_presidential_election
key_events = {
    # '2024-09-10': 'ABC presidential debate',
    '2024-10-27': 'Trump Rally at Madison Square Garden',
    # '2024-10-01': 'CBS vice presidential debate',
    '2024-10-25': 'Washington Post announces they will not endorse',
    '2024-11-01': 'Green Party endorses Kamala Harris',
    # Add more events as needed...
}

plt.figure(figsize=(12, 6))

# One line per politician; report politicians that have no rows
for politician in top_politicians:
    series = sentiment_over_time_df.loc[sentiment_over_time_df['politician'] == politician]
    if series.empty:
        print(f"No data available for {politician}")
        continue
    plt.plot(series['date'], series['normalized_sentiment'], label=politician)

# Mark each event with a dashed vertical line and a rotated label anchored
# at the current top of the y-axis
for date_str, event in key_events.items():
    event_date = pd.to_datetime(date_str).date()
    plt.axvline(x=event_date, color='grey', linestyle='--', alpha=0.7)
    y_top = plt.ylim()[1]
    plt.text(event_date, y_top, event, rotation=90, verticalalignment='top', fontsize=8)

# Titles, legend and grid
plt.title('Sentiment Over Time with Key Events')
plt.xlabel('Date')
plt.ylabel('Normalized Sentiment')
plt.legend()
plt.grid(True)

# Date axis: a tick every 7 days, formatted as YYYY-MM-DD and rotated
axis = plt.gca()
axis.xaxis.set_major_locator(mdates.DayLocator(interval=7))  # Every 7 days
axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

### Community Detection

#### Sentiment profiles for politicians 

In [None]:
# Build one sentiment profile per politician:
# {politician: {"Democratic Media": avg sentiment, "Republican Media": avg sentiment}}
sentiment_profiles = {}

media_nodes = ("Democratic Media", "Republican Media")

# Read the politicians from B itself (nodes tagged type="Politician") so this
# cell also works when B was loaded from the GraphML file and the variables of
# the graph-building branch do not exist.
politician_nodes = [
    node for node, attrs in B.nodes(data=True) if attrs.get("type") == "Politician"
]

for politician in politician_nodes:
    sentiment_profiles[politician] = {
        # A missing edge means that media category never mentioned the politician
        media: B[media][politician]["sentiment"] if B.has_edge(media, politician) else 0
        for media in media_nodes
    }

#### Create similarity based graph

In [None]:
# Convert sentiment profiles to a DataFrame (rows: politicians, columns: media categories)
profiles_df = pd.DataFrame.from_dict(sentiment_profiles, orient="index").fillna(0)

# Compute pairwise similarity between sentiment profiles
similarity_matrix = cosine_similarity(profiles_df)

# Create a graph where edge weights are similarity scores
P = nx.Graph()
politicians = profiles_df.index.tolist()

# P is undirected, so visit each unordered pair once (upper triangle, which
# also skips self-loops) instead of inserting every edge twice.
# Only positive similarities become edges, so a politician with no positive
# similarity to anyone does not appear in P.
for i, pol1 in enumerate(politicians):
    for j in range(i + 1, len(politicians)):
        # Read the (j, i) entry: the value the second insertion of the
        # original double loop left on the edge
        similarity = similarity_matrix[j, i]
        if similarity > 0:
            P.add_edge(pol1, politicians[j], weight=similarity)

#### Apply community detection

In [None]:
# Louvain partition of P using the similarity weights: {politician -> community id}
partition = community_louvain.best_partition(P, weight="weight")

# Invert the partition into {community id -> list of politicians}
communities = defaultdict(list)
for member, label in partition.items():
    communities[label].append(member)

# Report the size and members of each community
for label, members in communities.items():
    print(f"Community {label}: {len(members)} members")
    print(f"Politicians: {members}\n")

#### Analyse and visualize communities 

In [None]:
# Print the mean sentiment profile (column-wise average) of every community
for label, group in communities.items():
    print(f"Community {label}:")
    community_profiles = profiles_df.loc[group].mean()
    print("Average Sentiment Profile:")
    print(community_profiles)
    print("\n")

In [None]:
# Assign colors based on community: one colormap entry per community id.
# plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated and has
# been removed from recent Matplotlib releases; default=0 keeps an empty
# partition from raising in max().
n_communities = max(partition.values(), default=0) + 1
cmap = plt.get_cmap("viridis", n_communities)
node_colors = [cmap(partition[node]) for node in P.nodes()]

# Draw the graph
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(P, seed=42)  # Fixed seed keeps the layout reproducible
nx.draw(P, pos, with_labels=True, node_color=node_colors, node_size=300, font_size=8, alpha=0.8)
plt.title("Community Network Based on Sentiment Similarity")
plt.show()

#### Compute modularity

In [None]:
# Modularity expects a sequence of node collections: one member list per community
formatted_communities = list(communities.values())

In [None]:
# Compute modularity score
# Scores the grouping in formatted_communities on graph P, using the
# "weight" edge attribute as the edge weight.
modularity_score = modularity(P, formatted_communities, weight="weight")

print(f"Modularity Score: {modularity_score}")