In [None]:
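# [REDACTED] An API key was pasted here in plain text. Revoke that key and load credentials from an environment variable or a secrets manager instead of storing them in the notebook.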

In [1]:
!pip install --upgrade feedparser pandas numpy matplotlib beautifulsoup4 networkx python-dotenv google-generativeai


Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m695.7 kB/s[0m eta [36m0:00:00[0m
Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━

In [5]:
import feedparser
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import requests
import time
import re
from bs4 import BeautifulSoup
import os
import json
import networkx as nx
from networkx.algorithms import community
import google.generativeai as genai

# Configure the Gemini API. The key is read from the GEMINI_API_KEY environment
# variable so that no credential is ever stored in the notebook itself. The
# empty-string fallback keeps the module importable when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    print("Warning: GEMINI_API_KEY is not set; Gemini requests will fail until it is provided.")
genai.configure(api_key=GEMINI_API_KEY)

# Model configuration - using Gemini 2.0 Flash
model = genai.GenerativeModel('gemini-2.0-flash')

# RSS sources to poll, keyed by a human-readable source name (value is the feed URL).
rss_feeds = dict([
    ("Reuters Business", "http://feeds.reuters.com/reuters/businessNews"),
    ("BBC World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
    ("Al Jazeera", "https://www.aljazeera.com/xml/rss/all.xml"),
])

class NewsCollector:
    """Collect recent articles from a set of RSS feeds into a pandas DataFrame.

    The resulting frame always has the columns listed in ``ARTICLE_COLUMNS``,
    even when no article was collected, so it can be saved and reloaded safely.
    """

    # Declared once so an empty collection still carries the expected schema.
    ARTICLE_COLUMNS = ['title', 'summary', 'link', 'published_date', 'source']

    def __init__(self, feeds_dict, days_limit=14, request_delay=1.0):
        """
        Initialize the NewsCollector with RSS feeds and time limit

        Args:
            feeds_dict: Dictionary of RSS feeds with name as key and URL as value
            days_limit: Number of days to look back for articles
            request_delay: Seconds to pause after each feed is fetched
        """
        self.feeds_dict = feeds_dict
        self.days_limit = days_limit
        self.request_delay = request_delay
        self.articles_df = None

    @staticmethod
    def _entry_date(entry):
        """Return the entry's published (else updated) time as a naive datetime, or None.

        The attribute can exist but be None when the feed's date string could not
        be parsed, so the value itself is checked rather than just its presence.
        """
        for attr in ('published_parsed', 'updated_parsed'):
            parsed = getattr(entry, attr, None)
            if not parsed:
                continue
            try:
                return datetime(*parsed[:6])
            except (TypeError, ValueError):
                continue
        return None

    def fetch_articles(self):
        """Fetch articles from all feeds and filter by date.

        Entries without a usable date are skipped. A failure while handling one
        feed is reported and does not prevent the remaining feeds from being read.

        Returns:
            DataFrame with one row per article and the ``ARTICLE_COLUMNS`` columns.
        """
        all_articles = []
        # feedparser exposes *_parsed dates in UTC, so the cutoff is built from
        # the current UTC time in the same way the entry dates are built.
        cutoff_date = datetime(*time.gmtime()[:6]) - timedelta(days=self.days_limit)

        for source_name, feed_url in self.feeds_dict.items():
            try:
                print(f"Fetching articles from {source_name}...")
                feed = feedparser.parse(feed_url)

                for entry in feed.entries:
                    pub_date = self._entry_date(entry)
                    if pub_date is None or pub_date < cutoff_date:
                        continue

                    # Missing fields become empty strings instead of aborting the feed.
                    all_articles.append({
                        'title': getattr(entry, 'title', ''),
                        'summary': self._clean_html(getattr(entry, 'summary', "")),
                        'link': getattr(entry, 'link', ''),
                        'published_date': pub_date,
                        'source': source_name
                    })

                # Be nice to the servers
                if self.request_delay and self.request_delay > 0:
                    time.sleep(self.request_delay)

            except Exception as e:
                print(f"Error fetching {source_name}: {str(e)}")

        # Convert to dataframe (explicit columns keep the schema when nothing was found)
        self.articles_df = pd.DataFrame(all_articles, columns=self.ARTICLE_COLUMNS)
        print(f"Collected {len(self.articles_df)} articles")
        return self.articles_df

    def _clean_html(self, text):
        """Remove HTML tags from text (None is treated as an empty string)"""
        return BeautifulSoup(text or "", 'html.parser').get_text()

    def save_articles(self, filename="collected_news_articles.csv"):
        """Save articles to CSV file"""
        if self.articles_df is not None:
            self.articles_df.to_csv(filename, index=False)
            print(f"Articles saved to {filename}")
        else:
            print("No articles to save. Run fetch_articles() first.")

    def load_articles(self, filename="collected_news_articles.csv"):
        """Load articles from CSV file

        Raises whatever ``pandas.read_csv`` raises when the file is missing or
        unreadable, and ``KeyError`` when it has no ``published_date`` column.
        """
        self.articles_df = pd.read_csv(filename)
        # Convert date string back to datetime
        self.articles_df['published_date'] = pd.to_datetime(self.articles_df['published_date'])
        print(f"Loaded {len(self.articles_df)} articles from {filename}")
        return self.articles_df

class NLPProcessor:
    """Extract entities, key events and sentiment from articles using a Gemini model.

    Every extraction result is normalized to the same structure (see
    ``_empty_result``) so downstream code can index it without checking keys.
    """

    # Entity categories requested from the model, in the order used by the prompt.
    ENTITY_CATEGORIES = ("organizations", "persons", "locations", "products", "events")

    def __init__(self, model):
        """
        Initialize the NLP processor with a Gemini model

        Args:
            model: Initialized Gemini model
        """
        self.model = model
        self.processed_data = []

    @classmethod
    def _empty_result(cls):
        """Return the neutral, empty extraction structure used as a fallback."""
        return {
            "entities": {category: [] for category in cls.ENTITY_CATEGORIES},
            "key_events": [],
            "sentiment": {
                "overall": "neutral",
                "entity_sentiments": []
            }
        }

    @staticmethod
    def _extract_json_text(response_text):
        """Strip an optional markdown code fence (``` or ```json) from the model reply."""
        fenced = re.search(r"```(?:json)?\s*(.*?)(?:```|\Z)", response_text, re.DOTALL | re.IGNORECASE)
        if fenced:
            return fenced.group(1).strip()
        return response_text.strip()

    @classmethod
    def _normalize_result(cls, data):
        """Coerce parsed model output into the full expected structure.

        Missing or wrongly-typed parts fall back to the empty defaults, entity
        lists keep only non-blank strings, and the overall sentiment label is
        lower-cased so comparisons against 'negative' etc. work reliably.
        """
        result = cls._empty_result()
        if not isinstance(data, dict):
            return result

        entities = data.get("entities")
        if isinstance(entities, dict):
            for category, names in entities.items():
                if isinstance(names, list):
                    result["entities"][category] = [
                        name for name in names if isinstance(name, str) and name.strip()
                    ]

        key_events = data.get("key_events")
        if isinstance(key_events, list):
            result["key_events"] = key_events

        sentiment = data.get("sentiment")
        if isinstance(sentiment, dict):
            overall = sentiment.get("overall")
            if isinstance(overall, str) and overall.strip():
                result["sentiment"]["overall"] = overall.strip().lower()
            entity_sentiments = sentiment.get("entity_sentiments")
            if isinstance(entity_sentiments, list):
                result["sentiment"]["entity_sentiments"] = [
                    item for item in entity_sentiments if isinstance(item, dict)
                ]

        # Keep any extra top-level keys the model returned.
        for key, value in data.items():
            result.setdefault(key, value)
        return result

    @staticmethod
    def _as_text(value):
        """Return value as a string, mapping None/NaN (e.g. empty CSV cells) to ''."""
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return ""
        return str(value)

    def extract_entities_events(self, text):
        """
        Extract named entities, events, and sentiment using Gemini

        Args:
            text: Article text (title + summary)

        Returns:
            Dictionary with "entities", "key_events" and "sentiment" keys. When the
            model call or JSON parsing fails, the empty/neutral structure is returned.
        """
        prompt = f"""
        Analyze this news article text and extract the following information in JSON format:

        Text: {text}

        Return a JSON object with these keys:
        1. "entities": Extract named entities as an object with these categories as keys:
           - "organizations": List of organization names
           - "persons": List of person names
           - "locations": List of location names
           - "products": List of product names (if any)
           - "events": List of event types/names (if any)

        2. "key_events": List of main events or actions described (verb-focused phrases, 2-5 items)

        3. "sentiment": Object with:
           - "overall": Overall sentiment of the article ("positive", "negative", or "neutral")
           - "entity_sentiments": Array of objects with "entity" and "sentiment" for main entities

        Return ONLY the JSON with no additional text.
        """

        try:
            response = self.model.generate_content(prompt)
            # The model may wrap its JSON in a markdown code block
            json_text = self._extract_json_text(response.text)
            extracted_data = json.loads(json_text)
        except Exception as e:
            print(f"Error in NLP processing: {str(e)}")
            # Return empty structure if analysis fails
            return self._empty_result()

        return self._normalize_result(extracted_data)

    def process_articles(self, articles_df, delay=0.5):
        """
        Process all articles in the dataframe

        Args:
            articles_df: DataFrame with articles ('title', 'summary', 'source',
                'published_date' and 'link' columns are read)
            delay: Seconds to pause after each article to respect API rate limits

        Returns:
            List of dictionaries, one per processed article, each holding the
            article metadata and its 'extracted_info'
        """
        self.processed_data = []

        for article_id, article in articles_df.iterrows():
            # Titles/summaries loaded from CSV can be NaN; treat those as empty text.
            title = self._as_text(article['title'])
            summary = self._as_text(article['summary'])

            # Combine title and summary for better context
            full_text = f"{title} {summary}"

            print(f"Processing article: {title[:40]}...")

            try:
                # Extract information using NLP
                extracted_info = self.extract_entities_events(full_text)

                # Store processed data
                processed_article = {
                    'article_id': article_id,
                    'title': title,
                    'source': article['source'],
                    'published_date': article['published_date'],
                    'link': article['link'],
                    'extracted_info': extracted_info
                }

                self.processed_data.append(processed_article)

                # Be nice to the API rate limits
                if delay and delay > 0:
                    time.sleep(delay)

            except Exception as e:
                print(f"Error processing article: {str(e)}")

        print(f"Processed {len(self.processed_data)} articles")
        return self.processed_data

    def save_processed_data(self, filename="processed_articles.json"):
        """Save processed data to JSON file"""
        # Build the serializable copy before opening the file, so a failure here
        # cannot leave a truncated file behind.
        serializable_data = []
        for item in self.processed_data:
            serialized_item = item.copy()
            published = serialized_item.get('published_date')
            # Timestamps/datetimes are stored as ISO-8601 strings
            if hasattr(published, 'isoformat'):
                serialized_item['published_date'] = published.isoformat()
            serializable_data.append(serialized_item)

        with open(filename, 'w') as f:
            # default=str covers remaining non-JSON types (e.g. numpy scalars)
            json.dump(serializable_data, f, indent=2, default=str)
        print(f"Processed data saved to {filename}")

    def load_processed_data(self, filename="processed_articles.json"):
        """Load processed data from JSON file"""
        with open(filename, 'r') as f:
            self.processed_data = json.load(f)
        print(f"Loaded {len(self.processed_data)} processed articles from {filename}")
        return self.processed_data

class KnowledgeGraphBuilder:
    """Build an undirected NetworkX graph linking articles to the entities they mention.

    Node ids are ``article_<id>`` for articles and ``<lowercased_name>_<Type>``
    for entities. Entities appearing in the same article are joined by weighted
    ``co_occurs_with`` edges.
    """

    def __init__(self):
        """Initialize the knowledge graph builder"""
        self.G = nx.Graph()
        self.entity_types = {
            "organizations": "Organization",
            "persons": "Person",
            "locations": "Location",
            "products": "Product",
            "events": "Event"
        }
        self.sentiment_colors = {
            "positive": "green",
            "neutral": "gray",
            "negative": "red"
        }

    @staticmethod
    def _entity_id(name, type_label):
        """Return the graph node id for an entity name and its type label."""
        return f"{name.lower().replace(' ', '_')}_{type_label}"

    @staticmethod
    def _predominant_sentiment(sentiments_str):
        """Return the most frequent label in a comma-separated sentiment history.

        Ties are resolved in the order positive, negative, neutral. A history
        containing none of the three known labels is reported as 'neutral'.
        """
        labels = [part.strip().lower() for part in sentiments_str.split(',')]
        pos_count = labels.count('positive')
        neg_count = labels.count('negative')
        neu_count = labels.count('neutral')

        if pos_count == 0 and neg_count == 0 and neu_count == 0:
            return 'neutral'
        if pos_count >= neg_count and pos_count >= neu_count:
            return 'positive'
        if neg_count >= pos_count and neg_count >= neu_count:
            return 'negative'
        return 'neutral'

    @staticmethod
    def _gexf_path(filename):
        """Return the companion GEXF path for a graph file, never equal to filename."""
        root, _ext = os.path.splitext(filename)
        candidate = root + '.gexf'
        if candidate == filename:
            candidate = filename + '.gexf'
        return candidate

    def _collect_entities(self, entities_by_type):
        """Flatten extracted entities into unique (node_id, name, type_label) tuples.

        Unknown categories, non-list values and non-string or blank names are
        ignored. Repeated mentions of the same entity are kept only once so an
        article cannot create self-loops or count a co-occurrence twice.
        """
        collected = []
        seen_ids = set()
        if not isinstance(entities_by_type, dict):
            return collected

        for entity_type, entities in entities_by_type.items():
            if entity_type not in self.entity_types or not isinstance(entities, list):
                continue
            type_label = self.entity_types[entity_type]
            for entity in entities:
                if not isinstance(entity, str) or not entity.strip():
                    continue
                entity_id = self._entity_id(entity, type_label)
                if entity_id in seen_ids:
                    continue
                seen_ids.add(entity_id)
                collected.append((entity_id, entity, type_label))
        return collected

    def build_graph(self, processed_data):
        """
        Build knowledge graph from processed article data

        Args:
            processed_data: List of processed articles with extracted information

        Returns:
            The newly built NetworkX graph (also stored on ``self.G``)
        """
        # Reset graph
        self.G = nx.Graph()

        for article in processed_data:
            article_node = f"article_{article['article_id']}"
            # Tolerate partially-filled extraction results (e.g. older cache files)
            extracted = article.get('extracted_info') or {}
            sentiment_info = extracted.get('sentiment') or {}

            # Add article node
            self.G.add_node(
                article_node,
                type="Article",
                title=article['title'],
                source=article['source'],
                date=article['published_date'],
                link=article['link'],
                sentiment=sentiment_info.get('overall') or 'neutral'
            )

            # Add entity nodes and connect to article
            mentioned = self._collect_entities(extracted.get('entities') or {})
            for entity_id, name, type_label in mentioned:
                if not self.G.has_node(entity_id):
                    self.G.add_node(entity_id, name=name, type=type_label)
                self.G.add_edge(entity_id, article_node, type="mentioned_in")

            # Connect co-occurring entities (each unordered pair once per article)
            entity_ids = [entry[0] for entry in mentioned]
            for i, entity1 in enumerate(entity_ids):
                for entity2 in entity_ids[i+1:]:
                    if self.G.has_edge(entity1, entity2):
                        # Increment weight if edge exists
                        edge_data = self.G[entity1][entity2]
                        edge_data['weight'] = edge_data.get('weight', 0) + 1
                    else:
                        # Create new edge
                        self.G.add_edge(entity1, entity2, type="co_occurs_with", weight=1)

            # Add sentiment information to entities
            for entity_sentiment in sentiment_info.get('entity_sentiments') or []:
                if not isinstance(entity_sentiment, dict):
                    continue
                entity = entity_sentiment.get('entity')
                sentiment = entity_sentiment.get('sentiment')
                if not isinstance(entity, str) or not entity or not sentiment:
                    continue

                label = str(sentiment).strip().lower()

                # The sentiment entry carries no type, so try each known type in turn
                for type_label in self.entity_types.values():
                    entity_id = self._entity_id(entity, type_label)
                    if self.G.has_node(entity_id):
                        node_data = self.G.nodes[entity_id]
                        # Sentiment history is kept as a comma-separated string
                        # so it stays serializable to GraphML.
                        previous = node_data.get('sentiments_str')
                        node_data['sentiments_str'] = previous + "," + label if previous else label
                        node_data['predominant_sentiment'] = self._predominant_sentiment(
                            node_data['sentiments_str']
                        )
                        break

        print(f"Built graph with {self.G.number_of_nodes()} nodes and {self.G.number_of_edges()} edges")
        return self.G

    def save_graph(self, filename="news_knowledge_graph.graphml"):
        """Save graph to GraphML file, plus a GEXF copy next to it.

        Timestamp and list attributes are converted to strings in place on
        ``self.G`` before writing.
        """
        # Convert non-serializable attributes
        for node, data in self.G.nodes(data=True):
            for key, value in list(data.items()):
                if isinstance(value, (pd.Timestamp, datetime)):
                    data[key] = str(value)
                # Convert any remaining lists to strings to avoid GraphML errors
                elif isinstance(value, list):
                    data[key] = ','.join(map(str, value))

        # Also check edge attributes
        for u, v, data in self.G.edges(data=True):
            for key, value in list(data.items()):
                if isinstance(value, list):
                    data[key] = ','.join(map(str, value))

        # Save graph
        nx.write_graphml(self.G, filename)
        print(f"Graph saved to {filename}")

        # Also save as GEXF for Gephi compatibility. The path is derived from the
        # file extension so it can never overwrite the GraphML file just written.
        gexf_file = self._gexf_path(filename)
        nx.write_gexf(self.G, gexf_file)
        print(f"Graph also saved to {gexf_file}")

    def load_graph(self, filename="news_knowledge_graph.graphml"):
        """Load graph from GraphML file"""
        self.G = nx.read_graphml(filename)
        print(f"Loaded graph with {self.G.number_of_nodes()} nodes and {self.G.number_of_edges()} edges")
        return self.G

class RiskAnalyzer:
    """Derive risk signals from a news knowledge graph.

    The graph is expected to contain article nodes (``type == 'Article'``, with
    a ``sentiment`` attribute) and entity nodes connected to the articles that
    mention them.
    """

    # Fixed seed so sampled centrality and community detection are repeatable.
    RANDOM_SEED = 42
    # Number of pivot nodes sampled for betweenness centrality on larger graphs.
    BETWEENNESS_SAMPLES = 10
    # Maximum number of articles pulled into a risk subgraph.
    MAX_SUBGRAPH_ARTICLES = 5

    def __init__(self, graph):
        """
        Initialize the risk analyzer

        Args:
            graph: NetworkX graph with news entities and articles
        """
        self.G = graph
        self.risk_signals = []

    @staticmethod
    def _node_is_article(node, attrs):
        """Decide whether a node is an article from its attributes.

        The ``type`` attribute is authoritative; the ``article_`` id prefix is
        only a fallback for nodes without one. Relying on the prefix alone would
        misclassify entities such as "Article 370" (id ``article_370_Event``).
        """
        node_type = attrs.get('type')
        if node_type is not None:
            return node_type == 'Article'
        return str(node).startswith('article_')

    def _is_article(self, node):
        """Return True when ``node`` is an article node of ``self.G``."""
        return self._node_is_article(node, self.G.nodes[node])

    def calculate_node_metrics(self):
        """Calculate centrality measures for all nodes

        Returns:
            Dictionary with 'degree_centrality' and 'betweenness_centrality'
            mappings (node -> value); both are also stored as node attributes.
        """
        # Degree centrality
        degree_cent = nx.degree_centrality(self.G)

        # Betweenness centrality is expensive, so large graphs are sampled.
        # The sample size must not exceed the node count, so small graphs are
        # computed exactly (k=None) instead.
        node_count = self.G.number_of_nodes()
        if node_count == 0:
            betweenness_cent = {}
        else:
            sample_size = self.BETWEENNESS_SAMPLES if node_count > self.BETWEENNESS_SAMPLES else None
            betweenness_cent = nx.betweenness_centrality(self.G, k=sample_size, seed=self.RANDOM_SEED)

        # Add metrics to graph
        for node in self.G.nodes():
            self.G.nodes[node]['degree_centrality'] = degree_cent[node]
            self.G.nodes[node]['betweenness_centrality'] = betweenness_cent[node]

        return {
            'degree_centrality': degree_cent,
            'betweenness_centrality': betweenness_cent
        }

    def find_communities(self):
        """Find communities in the graph using Louvain method

        Each node gets a ``community`` attribute holding the index of its
        community in the returned list.
        """
        communities = community.louvain_communities(self.G, seed=self.RANDOM_SEED)

        # Add community membership to nodes
        for i, comm in enumerate(communities):
            for node in comm:
                self.G.nodes[node]['community'] = i

        return communities

    def identify_risk_signals(self):
        """
        Identify potential risk signals in the graph

        Returns:
            List of risk signals with details
        """
        self.risk_signals = []

        # 1. Identify highly connected entities involved in negative sentiment articles
        for node in self.G.nodes():
            if self._is_article(node):
                continue

            connected_articles = [n for n in self.G.neighbors(node) if self._is_article(n)]
            if len(connected_articles) < 2:  # Entity must appear in multiple articles
                continue

            negative_articles = [
                art for art in connected_articles
                if self.G.nodes[art].get('sentiment') == 'negative'
            ]
            if len(negative_articles) < 1:
                continue

            self.risk_signals.append({
                'entity': node,
                'entity_name': self.G.nodes[node].get('name', node),
                'entity_type': self.G.nodes[node].get('type', 'Unknown'),
                'degree_centrality': self.G.nodes[node].get('degree_centrality', 0),
                'connected_articles': len(connected_articles),
                'negative_articles': len(negative_articles),
                'negative_ratio': len(negative_articles) / len(connected_articles),
                'risk_type': 'High-visibility entity with negative sentiment'
            })

        # 2. Find clusters of entities that co-occur often with negative sentiment
        communities = self.find_communities()

        for i, comm in enumerate(communities):
            # Filter for non-article nodes
            entity_nodes = [n for n in comm if not self._is_article(n)]

            if len(entity_nodes) < 3:  # Only consider substantial communities
                continue

            # Find all articles connected to this community
            community_articles = set()
            for entity in entity_nodes:
                for neighbor in self.G.neighbors(entity):
                    if self._is_article(neighbor):
                        community_articles.add(neighbor)

            # Count negative sentiment articles
            negative_articles = [
                art for art in community_articles
                if self.G.nodes[art].get('sentiment') == 'negative'
            ]

            # negative_articles is a subset of community_articles, so the
            # division is only reached when community_articles is non-empty.
            if len(negative_articles) >= 2 and len(negative_articles) / len(community_articles) >= 0.3:
                # Rank by degree (name as tie-break) so the reported "top"
                # entities are meaningful and stable between runs.
                ranked_entities = sorted(entity_nodes, key=lambda n: (-self.G.degree(n), str(n)))
                self.risk_signals.append({
                    'community_id': i,
                    'entity_count': len(entity_nodes),
                    'entities': [self.G.nodes[n].get('name', n) for n in ranked_entities[:5]],  # Top 5 entities
                    'connected_articles': len(community_articles),
                    'negative_articles': len(negative_articles),
                    'negative_ratio': len(negative_articles) / len(community_articles),
                    'risk_type': 'Entity cluster with negative sentiment'
                })

        print(f"Identified {len(self.risk_signals)} potential risk signals")
        return self.risk_signals

    def extract_risk_subgraph(self, risk_signal, max_nodes=20):
        """
        Extract a subgraph related to a specific risk signal

        Args:
            risk_signal: Risk signal dictionary (entity-based signals carry an
                'entity' key, community-based ones a 'community_id' key)
            max_nodes: Maximum number of nodes in the subgraph

        Returns:
            NetworkX subgraph (empty when the signal has neither key)
        """
        nodes = set()

        if 'entity' in risk_signal:
            # Case 1: Risk based on a specific entity
            entity = risk_signal['entity']
            nodes.add(entity)

            # Add connected articles
            connected_articles = [n for n in self.G.neighbors(entity) if self._is_article(n)]
            nodes.update(connected_articles)

            # Add other entities connected to these articles (limited to maintain clarity)
            for article in connected_articles[:self.MAX_SUBGRAPH_ARTICLES]:
                for neighbor in self.G.neighbors(article):
                    if not self._is_article(neighbor) and len(nodes) < max_nodes:
                        nodes.add(neighbor)

        elif 'community_id' in risk_signal:
            # Case 2: Risk based on a community
            community_id = risk_signal['community_id']

            # Get entities in this community
            for node, data in self.G.nodes(data=True):
                if data.get('community') == community_id and not self._is_article(node):
                    nodes.add(node)
                    if len(nodes) >= max_nodes * 0.5:  # Limit to half the max nodes
                        break

            # Add connected articles, stopping for good once the cap is reached
            articles = set()
            for entity in list(nodes):
                if len(articles) >= self.MAX_SUBGRAPH_ARTICLES:
                    break
                for neighbor in self.G.neighbors(entity):
                    if self._is_article(neighbor):
                        articles.add(neighbor)
                        if len(articles) >= self.MAX_SUBGRAPH_ARTICLES:
                            break

            nodes.update(articles)

            # Add some more entities from these articles
            for article in articles:
                for neighbor in self.G.neighbors(article):
                    if not self._is_article(neighbor) and len(nodes) < max_nodes:
                        nodes.add(neighbor)

        # Create subgraph
        subgraph = self.G.subgraph(nodes).copy()
        return subgraph

    def visualize_risk_subgraph(self, subgraph, title="Risk Signal Subgraph", filename="risk_subgraph.png"):
        """
        Visualize a risk subgraph

        Args:
            subgraph: NetworkX subgraph
            title: Plot title
            filename: Output filename
        """
        plt.figure(figsize=(12, 10))

        # Create position layout
        pos = nx.spring_layout(subgraph, seed=42)

        # Articles are colored by sentiment, entities by type
        article_colors = {'positive': 'green', 'negative': 'red'}
        entity_colors = {
            'Organization': 'blue',
            'Person': 'orange',
            'Location': 'purple',
            'Product': 'cyan',
            'Event': 'magenta'
        }

        node_colors = []
        node_sizes = []
        labels = {}

        for node in subgraph.nodes():
            attrs = subgraph.nodes[node]
            if self._node_is_article(node, attrs):
                node_colors.append(article_colors.get(attrs.get('sentiment', 'neutral'), 'gray'))
                node_sizes.append(600)
                # Truncate article titles for readability. A separate name is
                # used so the plot's own `title` argument is not overwritten.
                article_title = str(attrs.get('title', ''))
                labels[node] = article_title[:20] + '...' if len(article_title) > 20 else article_title
            else:
                node_colors.append(entity_colors.get(attrs.get('type', 'Unknown'), 'yellow'))
                node_sizes.append(300)
                # Use entity names
                labels[node] = attrs.get('name', node.split('_')[0])

        # Draw nodes
        nx.draw_networkx_nodes(
            subgraph, pos,
            node_color=node_colors,
            node_size=node_sizes,
            alpha=0.8
        )

        # Draw edges
        nx.draw_networkx_edges(
            subgraph, pos,
            edge_color='gray',
            width=1.0,
            alpha=0.5
        )

        # Draw labels
        nx.draw_networkx_labels(
            subgraph, pos,
            labels=labels,
            font_size=8,
            font_color='black'
        )

        # Create legend
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=10, label='Negative Article'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='green', markersize=10, label='Positive Article'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='gray', markersize=10, label='Neutral Article'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='blue', markersize=10, label='Organization'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='orange', markersize=10, label='Person'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='purple', markersize=10, label='Location'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='cyan', markersize=10, label='Product'),
            plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='magenta', markersize=10, label='Event')
        ]
        plt.legend(handles=legend_elements, loc='upper right')

        plt.title(title)
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Visualization saved to {filename}")

    def generate_risk_report(self, top_n=5, filename="risk_analysis_report.json"):
        """
        Generate a comprehensive risk report based on the analysis

        Args:
            top_n: Number of top risk signals to include in the report
            filename: Output filename for the report JSON

        Returns:
            The report dictionary that was written to ``filename``
        """
        # Sort risk signals by negative_ratio * connected_articles (impact metric)
        for signal in self.risk_signals:
            signal['impact_score'] = signal.get('negative_ratio', 0) * signal.get('connected_articles', 0)

        sorted_signals = sorted(self.risk_signals, key=lambda x: x.get('impact_score', 0), reverse=True)
        top_signals = sorted_signals[:top_n]

        # Generate visualizations for top signals
        for i, signal in enumerate(top_signals):
            try:
                # Extract and visualize subgraph
                subgraph = self.extract_risk_subgraph(signal)

                # Generate title based on signal type
                if 'entity' in signal:
                    title = f"Risk Signal: {signal['entity_name']} ({signal['entity_type']})"
                else:
                    title = f"Risk Signal: Community Cluster {signal['community_id']}"

                vis_filename = f"risk_signal_{i+1}.png"
                self.visualize_risk_subgraph(subgraph, title=title, filename=vis_filename)

                # Add visualization filename to the signal
                signal['visualization'] = vis_filename

            except Exception as e:
                print(f"Error generating visualization for risk signal {i+1}: {str(e)}")

        # average_clustering divides by the node count, so guard the empty graph
        node_count = self.G.number_of_nodes()
        average_clustering = nx.average_clustering(self.G) if node_count else 0.0

        # Compile report
        report = {
            'timestamp': datetime.now().isoformat(),
            'total_risk_signals': len(self.risk_signals),
            'top_risk_signals': top_signals,
            'graph_metrics': {
                'total_nodes': node_count,
                'total_edges': self.G.number_of_edges(),
                'density': nx.density(self.G),
                'average_clustering': average_clustering,
            }
        }

        # Save report
        with open(filename, 'w') as f:
            json.dump(report, f, indent=2, default=str)

        print(f"Risk analysis report saved to {filename}")
        return report

def main():
    """Main function to run the news analysis system

    Runs the four pipeline stages in order: collect articles, extract
    entities/sentiment, build the knowledge graph, analyze it for risk signals.
    Cached articles and cached processed data are reused when they can be loaded.
    """
    print("Starting News Analysis System...")

    # Step 1: Collect news articles
    collector = NewsCollector(rss_feeds, days_limit=14)

    try:
        # Try to load existing articles first
        articles_df = collector.load_articles()
    except Exception as exc:
        # Cache missing or unreadable: report why, then fetch new articles.
        # (Exception, not a bare except, so Ctrl-C still interrupts the run.)
        print(f"Could not load cached articles ({exc}); fetching from RSS feeds...")
        articles_df = collector.fetch_articles()
        collector.save_articles()

    # Step 2: Process articles with NLP
    processor = NLPProcessor(model)

    try:
        # Try to load existing processed data
        processed_data = processor.load_processed_data()
    except Exception as exc:
        # Cache missing or unreadable: report why, then process articles
        print(f"Could not load cached processed data ({exc}); processing articles...")
        processed_data = processor.process_articles(articles_df)
        processor.save_processed_data()

    # Step 3: Build knowledge graph
    graph_builder = KnowledgeGraphBuilder()
    graph = graph_builder.build_graph(processed_data)

    # Nothing was collected or extracted: stop before the graph analysis,
    # which has nothing meaningful to compute on an empty graph.
    if graph.number_of_nodes() == 0:
        print("No articles or entities available; skipping risk analysis.")
        return

    graph_builder.save_graph()

    # Step 4: Analyze graph for risk signals
    risk_analyzer = RiskAnalyzer(graph)
    risk_analyzer.calculate_node_metrics()
    risk_signals = risk_analyzer.identify_risk_signals()
    risk_report = risk_analyzer.generate_risk_report()

    print("\nAnalysis complete!")
    print(f"Found {len(risk_signals)} potential risk signals")
    print("Check the output files for detailed results.")

    # Print summary of top risk signals
    print("\nTop Risk Signals Summary:")
    for i, signal in enumerate(risk_report['top_risk_signals'][:3]):
        print(f"\n{i+1}. ", end="")
        if 'entity' in signal:
            print(f"Entity: {signal['entity_name']} ({signal['entity_type']})")
            print(f"   Connected to {signal['connected_articles']} articles ({signal['negative_articles']} negative)")
            print(f"   Impact score: {signal['impact_score']:.2f}")
        else:
            print(f"Community cluster with {signal['entity_count']} entities")
            print(f"   Key entities: {', '.join(signal['entities'][:3])}")
            print(f"   Connected to {signal['connected_articles']} articles ({signal['negative_articles']} negative)")
            print(f"   Impact score: {signal['impact_score']:.2f}")

if __name__ == "__main__":
    main()

Starting News Analysis System...
Fetching articles from Reuters Business...
Fetching articles from BBC World...
Fetching articles from Al Jazeera...
Collected 47 articles
Articles saved to collected_news_articles.csv
Processing article: European leaders pressure Russia over 30...
Processing article: Trump administration considers suspendin...
Processing article: Maga says Pope Leo may be American, but ...
Processing article: Taylor Swift criticises Lively-Baldoni c...
Processing article: How will Pope Leo lead? His first days m...
Processing article: Elton John and Dua Lipa seek protection ...
Processing article: Mexico sues Google over 'Gulf of America...
Processing article: Turkish Tufts University student release...
Processing article: US confirms plan for private firms to de...
Processing article: Xi shows he wants to be close to Putin -...
Processing article: No water, no power - Port Sudan reeling ...
Processing article: Moon dust 'rarer than gold' arrives in U...
Processing arti