# Imports (some are unused or duplicated and can be deleted)

In [4]:
import os
import re
import json
import collections
import pandas as pd
import marvin
import fitz
import argparse
import gradio as gr
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict
from tabulate import tabulate
from scipy.spatial.distance import cosine
from difflib import SequenceMatcher
from typing import List, Dict, Union, Optional
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents.base import Document
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from difflib import SequenceMatcher
from Classes.Chatbot import SummarizerGPT, ChatGPT, BigSummarizerGPT
import logging
import traceback
import ast
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import base64
import io
import mammoth

# Route DEBUG-and-above log records to app.log; filemode='w' means the file is
# overwritten (not appended to) each time this configuration runs.
logging.basicConfig(level=logging.DEBUG, filename='app.log', filemode='w',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# (notebook output residue, not code) from .autonotebook import tqdm as notebook_tqdm


# Set variables

In [5]:
# Free-text description of the subject under analysis.
topic = "VodafoneZiggo's acquisition of UEFA broadcasting rights and the public backlash concerning accessibility for non-subscribers, particularly among older audiences unfamiliar with streaming technology. Issues with account registration, app functionality, and reliance on YouTube for viewing have fueled criticism, while a pending complaint by KPN may further escalate the situation."
# Search query (Dutch) used to collect web sources.
query = "Ziggo UEFA uitzendrechten"
# Root folder for this analysis's files.
main_folder = "KnowledgeBase/TopicalAnalysis/ZiggoBroadcast"
# One instruction per analysis to produce. The third entry previously contained
# a raw line break inside the string literal (a SyntaxError) and mis-decoded
# apostrophes; it is now split with implicit string concatenation and uses
# plain ASCII apostrophes like the other entries.
analysis_targets = [
    "detailed issue analysis focused on VodafoneZiggo's acquisition of UEFA broadcasting rights. Identify and summarize the primary concerns raised. Analyze specific problems related to the registration process for the Ziggo Sport Free account, technical issues with the Ziggo-GO app the broader sentiment toward streaming requirements and accessibility challenges for non-subscribers and older audiences. Highlight recurring themes, technical bottlenecks, and the impact of these issues on public perception of VodafoneZiggo's reputation. Present the findings as a structured report with clear insights into the key problems and their underlying causes.",
    "stakeholder mapping analysis related to VodafoneZiggo's acquisition of UEFA broadcasting rights. Extract all relevant stakeholders mentioned in the input coverage, including organizations, individuals, customer groups, and regulatory bodies. Categorize these stakeholders based on their roles (e.g., customers, competitors, regulators, media outlets) and their level of influence or interest in the issue. For each stakeholder, provide an analysis of their priorities, concerns, and potential impact on VodafoneZiggo's reputation and operations. Include details about their expressed opinions, involvement in the issue, and any public or formal statements they have made. Summarize the findings in a structured format, highlighting key stakeholders and their relevance to the current broadcasting rights scenario.",
    (
        "Conduct a messaging analysis of VodafoneZiggo's UEFA broadcasting rights narrative. Examine the key messages communicated by VodafoneZiggo regarding these rights, focusing on the themes, tone, and promises conveyed. Evaluate how these messages align with the identified priorities and concerns of key stakeholders, including customers, regulators, and the general public. "
        "Assess the consistency of the UEFA narrative with VodafoneZiggo's broader brand positioning in sports, entertainment, and other market segments. Identify any gaps or discrepancies in the messaging and highlight opportunities to improve alignment with stakeholder expectations and the company's overall strategic goals. Provide a structured report with insights into the effectiveness of the current messaging and actionable recommendations for refinement."
    ),
]

## New google search result extraction

In [6]:
from urllib.parse import urlparse
from datetime import datetime
import re
from bs4 import BeautifulSoup
import requests
from urllib.parse import quote_plus
import time
import random

def setup_directory_structure(main_folder):
    """
    Build the output folder tree used by the analysis.

    Under ``main_folder`` this creates ``MediaArticles`` and ``Websearch``,
    each containing ``IndividualOutputs`` and ``FinalOutputs``. Folders that
    already exist are left as they are.

    Args:
        main_folder (str): Path to the main folder

    Returns:
        dict: Maps 'main', every subfolder name, and every
            '<subfolder>_<inner folder>' key to the path that was created

    Raises:
        Exception: Any error hit while creating folders is printed and re-raised
    """
    top_level_names = ('MediaArticles', 'Websearch')
    inner_names = ('IndividualOutputs', 'FinalOutputs')

    try:
        # The root comes first so the nested makedirs calls below only ever
        # add one level at a time.
        os.makedirs(main_folder, exist_ok=True)
        created = {'main': main_folder}

        for top_name in top_level_names:
            top_path = os.path.join(main_folder, top_name)
            os.makedirs(top_path, exist_ok=True)
            created[top_name] = top_path

            for inner_name in inner_names:
                leaf_path = os.path.join(top_path, inner_name)
                os.makedirs(leaf_path, exist_ok=True)
                created[f"{top_name}_{inner_name}"] = leaf_path

        print(f"Directory structure created successfully under {main_folder}")
        return created

    except Exception as e:
        print(f"Error creating directory structure: {str(e)}")
        raise

def _first_meta_content(soup, lookups):
    """Return the content of the first <meta> tag matching any lookup, else None."""
    for find_kwargs in lookups:
        tag = soup.find('meta', **find_kwargs)
        if tag and tag.get('content'):
            return tag.get('content')
    return None


def _parse_publication_date(raw_date):
    """Normalise a date string to 'YYYY-MM-DD'; return None if it cannot be parsed."""
    candidate = raw_date.strip()

    # ISO 8601 first: this covers fractional seconds and negative UTC offsets.
    # A trailing 'Z' is rewritten because fromisoformat only accepts it on
    # newer Python versions.
    iso_candidate = candidate
    if candidate.endswith(('Z', 'z')):
        iso_candidate = candidate[:-1] + '+00:00'
    try:
        return datetime.fromisoformat(iso_candidate).strftime('%Y-%m-%d')
    except ValueError:
        pass

    # Fallback: the explicit formats, applied to the text before any '+' offset.
    without_offset = candidate.split('+')[0]
    for date_format in ('%Y-%m-%d', '%Y/%m/%d', '%Y-%m-%dT%H:%M:%S'):
        try:
            return datetime.strptime(without_offset, date_format).strftime('%Y-%m-%d')
        except ValueError:
            continue
    return None


def _find_author(soup):
    """Return the raw author text from meta tags, a rel=author link or a byline element."""
    author = _first_meta_content(soup, [
        {'property': 'author'},
        {'attrs': {'name': 'author'}},
        {'property': 'article:author'},
    ])
    if author:
        return author

    author_link = soup.find('a', attrs={'rel': 'author'})
    if author_link:
        author = author_link.get_text().strip()
        if author:
            return author

    # Last resort: any element whose class name mentions an author or byline.
    for pattern in ('author', 'byline'):
        author_elem = soup.find(class_=re.compile(pattern, re.I))
        if author_elem:
            return author_elem.get_text().strip()
    return None


def extract_metadata(soup, url, response):
    """
    Extract additional metadata from the webpage.

    Each piece of metadata is extracted on a best-effort basis: a failure is
    printed and the remaining fields are still attempted. In particular, an
    unparseable Last-Modified header no longer prevents the publication date
    from being extracted.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content
        url (str): URL of the page
        response (requests.Response): Response object containing headers

    Returns:
        dict: Always contains 'full_domain' and 'top_level_domain' (the last
            two dot-separated labels of the host). May also contain
            'last_modified' and 'publication_date' (both 'YYYY-MM-DD') and
            'author'. No word count is added here because this function never
            receives the page text.
    """
    metadata = {}

    # Extract domain information
    parsed_url = urlparse(url)
    metadata['full_domain'] = parsed_url.netloc
    metadata['top_level_domain'] = '.'.join(parsed_url.netloc.split('.')[-2:])

    # Last-Modified header, handled separately from the publication date so
    # that one bad value cannot suppress the other.
    try:
        last_modified = response.headers.get('last-modified')
        if last_modified:
            metadata['last_modified'] = datetime.strptime(
                last_modified, '%a, %d %b %Y %H:%M:%S %Z'
            ).strftime('%Y-%m-%d')
    except Exception as e:
        print(f"Error extracting dates from {url}: {str(e)}")

    # Publication date from meta tags: property, then name, then itemprop
    try:
        pub_date = _first_meta_content(soup, [
            {'property': 'article:published_time'},
            {'property': 'publication_date'},
            {'property': 'og:published_time'},
            {'attrs': {'name': 'publication_date'}},
            {'attrs': {'name': 'date'}},
            {'attrs': {'name': 'datePublished'}},
            {'attrs': {'itemprop': 'datePublished'}},
        ])
        if pub_date:
            parsed_date = _parse_publication_date(pub_date)
            if parsed_date:
                metadata['publication_date'] = parsed_date
    except Exception as e:
        print(f"Error extracting dates from {url}: {str(e)}")

    # Extract author information
    try:
        author = _find_author(soup)
        if author:
            # Drop a leading "By" / "Author:" label and collapse whitespace
            author = re.sub(r'^By\s+|^Author:\s+', '', author, flags=re.I)
            author = re.sub(r'\s+', ' ', author).strip()
            metadata['author'] = author
    except Exception as e:
        print(f"Error extracting author from {url}: {str(e)}")

    return metadata

def count_words(text):
    """
    Return how many words the text contains.

    A word is a maximal run of word characters (letters, digits, underscore)
    delimited by word boundaries, so punctuation such as an apostrophe splits
    a token in two.

    Parameters:
        text (str): Text content to analyze

    Returns:
        int: Word count
    """
    lowered = text.lower()
    return sum(1 for _ in re.finditer(r'\b\w+\b', lowered))

def google_search_scraper(query, num_results=120, language='en', country='us', safe_search='off', fetch_content=True):
    """
    Scrape Google search results and optionally fetch content from each result.

    Parameters:
        query (str): Search query
        num_results (int): Number of results to request (default: 120)
        language (str): Search language (default: 'en')
        country (str): Country code (default: 'us')
        safe_search (str): Safe search setting ('off', 'moderate', 'strict')
        fetch_content (bool): Whether to fetch full content for each result (default: True)

    Returns:
        list: List of dictionaries with 'title', 'link' and 'snippet'. When
            fetch_content is True and the page could be retrieved, each entry
            also carries 'content', 'word_count' (only for non-empty content)
            and the keys produced by extract_metadata. An empty list is
            returned if the search request itself fails.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': f'{language}-{country},en;q=0.5',
        'Referer': 'https://www.google.com/',
        'DNT': '1',
    }

    base_url = 'https://www.google.com/search'
    params = {
        'q': query,
        'num': num_results,
        'hl': language,
        'gl': country,
        'safe': safe_search,
    }

    try:
        # requests URL-encodes the query string itself. The timeout stops a
        # stalled connection from blocking the whole run indefinitely.
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    search_results = []
    # NOTE(review): 'div.g' / 'div.VwiC3b' depend on Google's current markup;
    # if it changes, this loop simply yields no results.
    for result in soup.select('div.g'):
        try:
            title_elem = result.select_one('h3')
            link_elem = result.select_one('a')
            snippet_elem = result.select_one('div.VwiC3b')
            href = link_elem.get('href') if link_elem else None

            # Skip blocks that are not complete organic results
            if not (title_elem and href and snippet_elem):
                continue

            result_dict = {
                'title': title_elem.get_text(),
                'link': href,
                'snippet': snippet_elem.get_text(),
            }

            if fetch_content:
                # Random pause between page requests
                time.sleep(random.uniform(1, 3))
                try:
                    page_response = requests.get(result_dict['link'], headers=headers, timeout=10)
                    page_response.raise_for_status()
                    page_soup = BeautifulSoup(page_response.text, 'html.parser')

                    # NOTE(review): fetch_page_content downloads the same URL
                    # a second time; consider sharing one response.
                    result_dict['content'] = fetch_page_content(result_dict['link'])

                    # Add word count
                    if result_dict['content']:
                        result_dict['word_count'] = count_words(result_dict['content'])

                    # Extract additional metadata
                    metadata = extract_metadata(page_soup, result_dict['link'], page_response)
                    result_dict.update(metadata)

                except Exception as e:
                    # Best effort: keep the title/link/snippet without content
                    print(f"Error fetching content for {result_dict['link']}: {e}")

            search_results.append(result_dict)
        except Exception as e:
            print(f"Error processing result: {e}")
            continue

    return search_results

def fetch_page_content(url):
    """
    Download a page and reduce it to plain text.

    Script and style elements are removed, then the remaining text is split
    into lines and double-space-separated phrases, each stripped, and the
    non-empty pieces are joined with single spaces.

    Parameters:
        url (str): URL to fetch content from

    Returns:
        str: Extracted text content from the page, or "" if anything fails
            (the error is printed)
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()

        parsed = BeautifulSoup(page.text, 'html.parser')

        # Script and style bodies are not readable content
        for unwanted in parsed(['script', 'style']):
            unwanted.decompose()

        fragments = []
        for raw_line in parsed.get_text().splitlines():
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)

        return ' '.join(fragments)

    except Exception as e:
        print(f"Error fetching content from {url}: {e}")
        return ""

def filter_search_results(search_results):
    """
    Filter search results based on content length and type criteria.

    A result is kept only if it has non-empty 'content' that is between 500
    and 30000 characters long, contains none of the PDF/binary markers below,
    and consists of at most 10% non-printable characters. Printability is
    decided with str.isprintable (newlines and tabs are also allowed), so
    accented and non-Latin text is no longer mistaken for binary data.

    Parameters:
        search_results (list): List of dictionaries containing search results with content

    Returns:
        list: Filtered list of search results (same dict objects, original order)
    """
    # Common PDF binary markers and metadata patterns
    pdf_indicators = (
        '%PDF-',             # PDF header
        'stream\n',          # PDF stream marker
        'endstream',         # PDF stream end marker
        'obj\n',             # PDF object marker
        'endobj',            # PDF object end marker
        '/Type/',            # PDF type marker
        '/Pages',            # PDF pages marker
        '/Contents',         # PDF contents marker
        '\x00\x01\x02\x03',  # Binary data patterns
        '\\x',               # Escaped binary data
        'JVBERi0',           # Base64 encoded PDF header
    )

    def is_valid_content(content):
        # Work on the string form throughout so the length and the ratio
        # below refer to the same text.
        content_str = str(content)

        if not 500 <= len(content_str) <= 30000:
            return False

        if any(indicator in content_str for indicator in pdf_indicators):
            return False

        # Reject text with more than 10% control / non-printable characters
        non_printable_count = sum(
            1 for char in content_str
            if not (char.isprintable() or char in '\n\t')
        )
        return non_printable_count / len(content_str) <= 0.1

    filtered_results = [
        result for result in search_results
        if result.get('content') and is_valid_content(result['content'])
    ]

    # Print filtering statistics
    print(f"Original results: {len(search_results)}")
    print(f"Filtered results: {len(filtered_results)}")
    print(f"Removed {len(search_results) - len(filtered_results)} results")

    return filtered_results

def generate_summaries(filtered_results, topic_of_interest, output_folder, analysis_target):
    """
    Generate summaries for filtered search results.

    Each result with content is sent to the chat model with a fixed output
    template. The answer is written to its own markdown file in
    ``output_folder`` and appended to a compiled document saved as
    ``compiled_summaries.md`` in a sibling ``FinalOutputs`` folder. A result
    that fails is reported and skipped; the compiled file is always written.

    Parameters:
        filtered_results (list): List of dictionaries containing search results
        topic_of_interest (str): Main topic to focus the analysis on
        output_folder (str): Folder to save individual summaries
        analysis_target (str): Specific analysis target

    Returns:
        str: Path of the compiled summary file
    """
    # Create individual outputs folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Create main summary file in FinalOutputs folder
    final_outputs_folder = os.path.join(os.path.dirname(output_folder), "FinalOutputs")
    os.makedirs(final_outputs_folder, exist_ok=True)
    main_summary_path = os.path.join(final_outputs_folder, "compiled_summaries.md")

    system_prompt = f"""You are an expert analyst focusing on {topic_of_interest}. Your task is to analyze content and extract key insights related to this topic. You must be factual, specific, and concise while capturing the most relevant information from the source material.

Important guidelines:
1. Focus only on information directly related to {topic_of_interest}
2. Produce insights which comprehensively describe contained information about the topic of interest.
3. Avoid generic statements or common knowledge. Make your insights tangible and specific to the situation.
4. Be objective and analytical in your assessment. Use a maximum of information which comes from the articles content which you are processing.
5. Ultimately your output will be used to perform this following task: {analysis_target}. Make sure to include elements that are relevant to this task.

Your response must strictly follow the specified format to ensure consistency."""

    # Collected pieces of the compiled document; joined once at the end
    compiled_parts = [
        f"# Content Analysis: {topic_of_interest}\nGenerated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
    ]

    for idx, result in enumerate(filtered_results, 1):
        print(f"Processing result {idx}/{len(filtered_results)}")

        if not result.get('content'):
            continue

        # Create user prompt for each article. The "## Title:" header uses a
        # plain space; it previously contained a mis-encoded non-breaking space.
        user_prompt = f"""
Analyze the following content in the context of {topic_of_interest}. 
        
Source Information:
- Title: {result.get('title', 'Unknown Title')}
- Domain: {result.get('full_domain', 'Unknown Source')}
- Date: {result.get('publication_date', 'Unknown Date')}
- Author: {result.get('author', 'Unknown Author')}

Content:
{result.get('content')}

Provide your analysis in exactly this format:

## Title: {result.get('title', 'Unknown Title')}
## Domain: {result.get('full_domain', 'Unknown Source')}
## Date: {result.get('publication_date', 'Unknown Date')}
## Author: {result.get('author', 'Unknown Author')}

SUMMARY:
[3-4 sentences capturing the main points in relation to {topic_of_interest}]

KEY INSIGHTS:
1. [First key insight - few descriptive sentences]

2. [Second key insight - few descriptive sentences]
...
X. [Optional Xth insights if highly relevant]

RELEVANCE ASSESSMENT:
- Explanation: [1-2 sentences on why this content matters for {topic_of_interest}]

Keep your response focused and concise. Include only the most relevant information to {topic_of_interest}."""

        try:
            # A new chatbot is created per article, as in the original flow.
            # NOTE(review): presumably this keeps articles independent of each
            # other - confirm against the ChatGPT class.
            chatbot = ChatGPT(
                system_prompt=system_prompt,
                model_name="gpt-4o-mini",
                temperature=0,
                max_tokens=1200,
            )

            # Generate summary using your ChatGPT class
            response = chatbot.ask(user_prompt)
            print(response)

            # Build a filesystem-safe file name; fall back to a numbered name
            # when the title has no usable characters (otherwise ".md").
            safe_title = "".join(
                x for x in result.get('title', f'article_{idx}')
                if x.isalnum() or x in (' ', '-', '_')
            )
            file_stem = safe_title[:100].strip() or f"article_{idx}"
            file_path = os.path.join(output_folder, f"{file_stem}.md")

            # Write individual summary
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"# [{result.get('title', 'Untitled')}]({result.get('link', '#')})\n")
                f.write(f"Source: {result.get('full_domain', 'Unknown Source')}\n")
                f.write(f"Date: {result.get('publication_date', 'Unknown Date')}\n")
                f.write(f"Author: {result.get('author', 'Unknown Author')}\n\n")
                f.write(response)

            # Add to compiled summary
            compiled_parts.append(response)
            compiled_parts.append("\n\n---\n\n")

        except Exception as e:
            print(f"Error processing result {idx}: {str(e)}")
            continue

    # Write compiled summary
    with open(main_summary_path, 'w', encoding='utf-8') as f:
        f.write("".join(compiled_parts))

    print(f"\nProcessing complete. Summaries saved to {output_folder}")
    print(f"Compiled summary saved as {main_summary_path}")

    return main_summary_path

def extract_bullet_points(main_folder, topic, analysis_target, query, bullet_points_path):
    """
    Condense the compiled summaries into a single bullet-point list.

    The function looks for ``compiled_summaries.md`` in the folder that
    contains ``bullet_points_path``, asks the summarizer model to extract
    bullet points from it, keeps every answer line that starts with "- ", and
    writes the result (preceded by a short metadata header) to
    ``BulletPointList.md`` in that same folder.

    Parameters:
        main_folder (str): Not used by this function
        topic (str): Topic the extracted findings must relate to
        analysis_target (str): Task the findings will be used for
        query (str): Search query, recorded in the output header
        bullet_points_path (str): Path whose parent folder holds the input
            and receives the output

    Returns:
        str or None: Path of the written file, or None if anything failed
            (the error is printed)
    """

    working_dir = os.path.dirname(bullet_points_path)
    input_path = os.path.join(working_dir, "compiled_summaries.md")
    output_path = os.path.join(working_dir, "BulletPointList.md")
    
    # Header of the output document
    bullet_points_content = f"""# Bullet point list of relevant facts and information from the Web searches.
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Search Query: {query}

## Analysis Focus
{topic}

## Extracted Insights
"""

    # Model used for the extraction
    summarizer = BigSummarizerGPT(
        model_name="chatgpt-4o-latest",
        temperature=0,
        max_tokens=4000
    )

    # Instructions plus a worked example of the input/output format
    instructions = f"""Review the provided content and extract key findings as a comprehensive bullet point list. 

Focus on information that relates to: {topic}

These findings will be used to: {analysis_target}

For each point:
- Include specific details (dates, companies, regulators, outcomes) when available. 
- Focus on concrete examples and evidence from the text
- Extract quantitative data where available
- Make sure it is related to : {topic}

Format your response as a simple bullet point list:
- Start each point with "- "
- Make each point detailed and self-contained
- Include only information supported by the source material
- At the end of your bullet point, you must add the related domain into brackets.

Here is an example input:
## Title: Leidraad Duurzaamheidsclaims | ACM.nl
## Domain: www.acm.nl
## Date: Unknown Date
## Author: Unknown Author

SUMMARY:
The updated "Leidraad Duurzaamheidsclaims" by the Autoriteit Consument & Markt (ACM) provides guidelines for companies on formulating sustainability claims, emphasizing the need for accuracy and transparency. As consumers increasingly demand reliable information regarding the sustainability of products, the ACM's guidelines aim to combat greenwashing by establishing clear rules for claims, which could lead to significant shifts in corporate messaging strategies. The enforcement actions taken against companies for misleading claims indicate a growing regulatory scrutiny that may deter aspirational sustainability goals if companies fear repercussions for ambitious statements.

KEY INSIGHTS:
1. The ACM has established five key rules for sustainability claims, which require companies to provide accurate, specific, and substantiated information. This regulatory framework is designed to enhance consumer trust and reduce instances of greenwashing, thereby holding companies accountable for their environmental communications.

2. Recent enforcement actions by the ACM, including the cessation of misleading claims by major companies like Vattenfall and Albert Heijn, illustrate the active role of regulatory bodies in monitoring corporate sustainability messaging. These actions not only correct misleading practices but also signal to other companies the importance of compliance, potentially leading to a more cautious approach in future sustainability claims.

3. The guidelines encourage companies to describe future sustainability ambitions in concrete and measurable terms, which may lead to a more realistic portrayal of corporate sustainability efforts. However, this could also result in a chilling effect where companies may hesitate to set ambitious sustainability goals for fear of regulatory backlash if they fail to meet them.

RELEVANCE ASSESSMENT:
- Explanation: This content is crucial as it highlights the regulatory landscape in the Netherlands regarding corporate environmental claims, showcasing how enforcement actions can reshape corporate behavior and communication strategies. The emphasis on transparency and accountability in sustainability claims is vital for fostering consumer trust and ensuring that companies do not engage in misleading practices, which is essential for the integrity of sustainability initiatives.

---

## Title: Duurzaamheidsclaims
## Domain: www.afm.nl
## Date: Unknown Date
## Author: Unknown Author

SUMMARY:
The content outlines the regulatory framework established by the Dutch Authority for the Financial Markets (AFM) regarding sustainability claims made by financial market participants. It emphasizes the necessity for companies to substantiate their environmental claims, thereby addressing potential instances of greenwashing. The guidelines aim to ensure transparency and accountability in corporate communications, which may lead to a shift in messaging strategies as companies adapt to avoid regulatory repercussions. This scrutiny could also create a chilling effect on aspirational sustainability goals if firms perceive the risk of enforcement actions as outweighing the benefits of ambitious claims.

KEY INSIGHTS:
1. The AFM has set clear guidelines for sustainability claims, mandating that companies must provide verifiable evidence for their environmental assertions. This requirement is designed to combat greenwashing and enhance the credibility of corporate communications.

2. The emphasis on substantiation and clarity in sustainability messaging may lead companies to adopt more conservative approaches in their marketing strategies, potentially stifling innovative and aspirational sustainability initiatives due to fear of regulatory backlash.

3. The regulatory framework aligns with broader European regulations, such as the Sustainable Finance Disclosure Regulation (SFDR) and the Corporate Sustainability Reporting Directive (CSRD), indicating a comprehensive approach to sustainability accountability across the financial sector.

RELEVANCE ASSESSMENT:
- Explanation: This content is significant as it highlights the regulatory landscape in the Netherlands that directly influences how companies communicate their environmental commitments. The AFM's guidelines not only aim to prevent misleading claims but also shape corporate behavior and strategy in the context of sustainability, impacting both market practices and consumer trust.

Here is the expected output:
- The ACM (Autoriteit Consument & Markt) has outlined five key rules for sustainability claims to ensure accuracy, specificity, and substantiation, aiming to combat greenwashing and enhance consumer trust. These rules target corporate accountability for environmental communications. [www.acm.nl]

- Enforcement actions by the ACM, such as those against Vattenfall and Albert Heijn, show active monitoring of corporate sustainability claims, correcting misleading practices and signaling the importance of compliance to other companies. [www.acm.nl]

....

- Enforcement cases illustrate how greenwashing allegations lead to significant shifts in corporate messaging. Companies aim to balance regulatory compliance with maintaining consumer trust and brand integrity. [www.acm.nl]

- The "chilling effect" on aspirational sustainability goals, caused by fear of regulatory repercussions, presents a challenge for companies attempting to set innovative or long-term environmental targets. [www.afm.nl, www.acm.nl]

- The Dutch regulatory approach aligns with European Union frameworks, ensuring consistency across industries while setting a high standard for environmental claim verifications. This integration influences both local and international corporate strategies. [www.afm.nl]

"""

    try:
        # Ask the model, passing the compiled summaries file as its source
        response = summarizer.ask(instructions, input_path)
        print(response)
        
        # Keep only the lines that are formatted as bullet points
        matches = re.findall(r'(?:^|\n)- .+', response)
        
        if not matches:
            bullet_points_content += "\n- No relevant findings extracted from the provided content. Further research recommended.\n"
        else:
            cleaned = [match.strip() for match in matches if match.strip()]
            bullet_points_content += "\n" + "\n".join(cleaned)
        
        # Write the final output
        with open(output_path, "w", encoding='utf-8') as file:
            file.write(bullet_points_content)
        
        print(f"Analysis complete. Results saved to: {output_path}")
        return output_path
        
    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        return None
    
    
def generate_comprehensive_analysis(main_folder, topic, analysis_target, bullet_points_path):
    """
    Generate a comprehensive analysis from bullet points and identify missing information.

    Two model calls are made. The first turns the bullet point list into a
    structured analysis; the second compares that analysis against the bullet
    points and lists anything that was left out. Both responses are written,
    in that order, to ``TopicalAnalysis.md`` in the same directory as the
    bullet point file.

    Args:
        main_folder: Not used by this function; kept so existing callers that
            pass it keep working.
        topic (str): Topic text inserted into both prompts.
        analysis_target (str): Goal description inserted into both prompts.
        bullet_points_path (str): Path of the bullet point file to read.

    Returns:
        str or None: Path of the written analysis file, or None when an
        exception is raised while generating or writing the analysis.

    Raises:
        Exception: Errors raised while opening or reading the bullet point
            file are not caught here, because the read happens before the
            ``try`` block.
    """
    # Read the bullet points file. The encoding is given explicitly so that
    # non-ASCII text does not depend on the platform's default encoding.
    with open(bullet_points_path, "r", encoding="utf-8") as file:
        bullet_points_content = file.read()

    # Set up initial summarizer for comprehensive analysis
    chatbot1 = ChatGPT(
        model_name="chatgpt-4o-latest",
        temperature=0,
        max_tokens=4000
    )

    question1 = f"""
Create a comprehensive analytical document that will support in producing an analysis of {topic}. 

The analysis has the following goals: {analysis_target}

Your source input includes a list of bullet points containing key findings from various sources. You should use the source into the relevant section of the analysis.

Structure your analysis to:
1. Begin with an executive overview.

2. Organize the findings into sections, with adequate names.

For each section:
   - Synthesize related findings into coherent narratives
   - Support points with specific examples and evidence from the sources. Cite these sources. The sources are the domains written in square brackets.
   - Highlight quantitative data where available
   - Draw connections between different findings
   - Identify patterns and trends
   - Cite the relevant sources domains which were used for the analysis and insights of the specific section.

This analysis should:
- Be thorough yet focused on information that supports: {analysis_target}
- Maintain objectivity while providing clear insights
- Use clear headings and subheadings for easy navigation
- Include source citations [domain.com] in each key sections, at the end of the relevant sentences but not at the end of your output. 

Here are the findings to analyze:
{bullet_points_content}
"""

    try:
        # Generate initial comprehensive analysis
        response1 = chatbot1.ask(question1)

        # The analysis is stored next to the bullet point file
        output_path = os.path.join(os.path.dirname(bullet_points_path), "TopicalAnalysis.md")

        # Write the initial analysis straight away, so it is already on disk
        # if the second model call fails.
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(response1 + "\n\n")

        # Set up second summarizer for identifying missing information
        chatbot2 = ChatGPT(
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=3000
        )

        question2 = f"""
Review the bullet point list and comprehensive analysis to identify any significant information that wasn't fully incorporated into the analysis.

Focus on finding information from the bullet points that:
1. Relates directly to {topic}
2. Would contribute to {analysis_target}
3. Wasn't adequately covered in the comprehensive analysis
4. Make sure you cite the relevant source [domain.com] for each missing point.

Present any missing information in this format:

## Additional Key Information
- [Missing Point topic]: Brief explanation of its relevance to the analysis.
....
- [Missing Point topic]: Brief explanation of its relevance to the analysis.

Only include information that adds substantial value to the analysis. Avoid repeating information that's already well-covered.
If there are no additional key points to add, mention that no significant information was omitted in the initial analysis.

Here is the original bullet point list:
{bullet_points_content}

Here is the comprehensive analysis:
{response1}

Do not write a conclusion. Only produce the bullet points list with the title: ## Additional Key Information
"""

        # Generate and append missing information
        response2 = chatbot2.ask(question2)
        with open(output_path, "a", encoding="utf-8") as file:
            file.write(response2)

        print(f"Comprehensive analysis has been written to: {output_path}")
        return output_path

    except Exception as e:
        # Best-effort step: report the failure and signal it with None
        print(f"Error during analysis generation: {str(e)}")
        return None

def process_analysis_target(main_folder, topic, analysis_target, filtered_results, query, index):
    """
    Run the whole pipeline for one analysis target and report where its outputs live.

    The target gets its own folder ``<main_folder>/Websearch/Analysis_<index + 1>``
    containing ``IndividualOutputs`` and ``FinalOutputs``. Summaries are
    generated first, then the bullet point list, then the comprehensive
    analysis.

    Returns:
        dict: ``summary_file`` (result of the summary step), ``bullet_points``
        (path of the bullet point list) and ``analysis_file`` (result of the
        comprehensive analysis step).
    """
    # Folder layout for this target; the folder number is 1-based
    target_folder = os.path.join(main_folder, "Websearch", f"Analysis_{index + 1}")
    individual_dir = os.path.join(target_folder, "IndividualOutputs")
    final_dir = os.path.join(target_folder, "FinalOutputs")
    for directory in (individual_dir, final_dir):
        os.makedirs(directory, exist_ok=True)

    # Step 1: per-source summaries
    main_summary_file = generate_summaries(
        filtered_results=filtered_results,
        topic_of_interest=topic,
        output_folder=individual_dir,
        analysis_target=analysis_target,
    )

    # Step 2: bullet point list (its return value is not needed here, only
    # the file it writes at bullet_points_path)
    bullet_points_path = os.path.join(final_dir, "BulletPointList.md")
    extract_bullet_points(
        main_folder=target_folder,
        topic=topic,
        analysis_target=analysis_target,
        query=query,
        bullet_points_path=bullet_points_path,
    )

    # Step 3: comprehensive analysis built from the bullet point list
    analysis_file = generate_comprehensive_analysis(
        main_folder=target_folder,
        topic=topic,
        analysis_target=analysis_target,
        bullet_points_path=bullet_points_path,
    )

    return {
        'summary_file': main_summary_file,
        'bullet_points': bullet_points_path,
        'analysis_file': analysis_file,
    }

def process_multiple_analysis_targets(main_folder, topic, analysis_targets, query):
    """
    Run one shared web search, then the analysis pipeline for every target.

    The directory structure is set up and the search (120 results, Dutch
    language and country, moderate safe search) is executed and filtered
    once; the filtered results are reused for each analysis target. A target
    that raises is reported and skipped so the remaining targets still run.

    Args:
        main_folder (str): Base directory for outputs
        topic (str): Main topic of analysis
        analysis_targets (list): List of analysis target strings
        query (str): Search query used

    Returns:
        list: One ``{'target': ..., 'outputs': ...}`` dictionary per target
        that was processed without raising.
    """
    # One-off preparation shared by all targets
    setup_directory_structure(main_folder)
    raw_results = google_search_scraper(
        query=query,
        num_results=120,
        language="nl",
        country="nl",
        safe_search="moderate",
    )
    filtered_results = filter_search_results(raw_results)

    collected = []
    for position, target in enumerate(analysis_targets, start=1):
        print(f"\nProcessing analysis target {position}/{len(analysis_targets)}:")
        print(f"Target: {target[:100]}...")

        try:
            target_outputs = process_analysis_target(
                main_folder=main_folder,
                topic=topic,
                analysis_target=target,
                filtered_results=filtered_results,
                query=query,
                index=position - 1,  # the callee expects a 0-based index
            )
            collected.append({
                'target': target,
                'outputs': target_outputs,
            })
            print(f"Successfully processed analysis target {position}")
        except Exception as e:
            # Report and move on to the next target
            print(f"Error processing analysis target {position}: {str(e)}")

    return collected

In [7]:
# Run the search once and then the summary / bullet point / analysis pipeline
# for every entry in analysis_targets; `outputs` holds one result dict per
# target that was processed without raising.
outputs = process_multiple_analysis_targets(
    main_folder=main_folder,
    topic=topic,
    analysis_targets=analysis_targets,
    query=query
)

Directory structure created successfully under KnowledgeBase/TopicalAnalysis/ZiggoBroadcast
Error fetching content for https://www.totaaltv.nl/nieuws/ziggo-trapt-nieuw-europees-uefa-voetbalavontuur-officieel-af/: 403 Client Error: Forbidden for url: https://myprivacy.dpgmedia.nl/consent/?siteKey=gepksszd79ws1uhc&callbackUrl=https%3A%2F%2Fwww.totaaltv.nl%2Fnieuws%2Fziggo-trapt-nieuw-europees-uefa-voetbalavontuur-officieel-af%2F
Error fetching content for https://www.totaaltv.nl/nieuws/ziggo-sport-verliest-meer-uitzendrechten/: 403 Client Error: Forbidden for url: https://myprivacy.dpgmedia.nl/consent/?siteKey=gepksszd79ws1uhc&callbackUrl=https%3A%2F%2Fwww.totaaltv.nl%2Fnieuws%2Fziggo-sport-verliest-meer-uitzendrechten%2F
Error fetching content for https://www.nu.nl/entertainment/6235204/ziggo-heeft-vanaf-2024-exclusief-alle-uitzendrechten-van-europees-voetbal.html: 403 Client Error: Forbidden for url: https://www.nu.nl/entertainment/6235204/ziggo-heeft-vanaf-2024-exclusief-alle-uitzendr

KeyboardInterrupt: 

In [6]:
def test_single_analysis(analysis_index, topic, analysis_target, query):
    """
    Re-run bullet point extraction and the comprehensive analysis for one
    analysis folder, starting from its existing compiled summary.

    The folder is looked up under the module-level ``main_folder`` as
    ``Websearch/Analysis_<analysis_index>``.

    Args:
        analysis_index (int): Number of the analysis folder to use
        topic (str): Main topic
        analysis_target (str): Analysis target description
        query (str): Original search query

    Returns:
        bool: True when both steps returned a truthy result; False when the
        compiled summary is missing, a step returned a falsy result, or an
        exception was raised.
    """
    try:
        # Locate the inputs and outputs of this analysis
        analysis_folder = os.path.join(main_folder, "Websearch", f"Analysis_{analysis_index}")
        final_outputs = os.path.join(analysis_folder, "FinalOutputs")
        compiled_summary_path = os.path.join(final_outputs, "compiled_summaries.md")
        bullet_points_path = os.path.join(final_outputs, "BulletPointList.md")

        # Nothing to do without a compiled summary
        if not os.path.exists(compiled_summary_path):
            print(f"Error: Compiled summary not found at {compiled_summary_path}")
            return False

        print(f"\nProcessing Analysis {analysis_index}")
        print(f"Using compiled summary from: {compiled_summary_path}")

        # Step 1: bullet points
        extraction_result = extract_bullet_points(
            main_folder=analysis_folder,
            topic=topic,
            analysis_target=analysis_target,
            query=query,
            bullet_points_path=bullet_points_path,
        )
        if not extraction_result:
            return False
        print(f"Successfully generated bullet points at: {bullet_points_path}")

        # Step 2: comprehensive analysis
        analysis_path = generate_comprehensive_analysis(
            main_folder=analysis_folder,
            topic=topic,
            analysis_target=analysis_target,
            bullet_points_path=bullet_points_path,
        )
        if not analysis_path:
            return False
        print(f"Successfully generated comprehensive analysis at: {analysis_path}")
        return True

    except Exception as e:
        print(f"Error processing analysis {analysis_index}: {str(e)}")
        return False

def run_test():
    """
    Run test_single_analysis for every entry in the module-level
    ``analysis_targets`` (numbered from 1) and print a success/failure summary.
    """
    print("Starting test run from existing compiled summaries...")

    outcomes = []
    for position, target in enumerate(analysis_targets, start=1):
        print(f"\nTesting Analysis {position}")
        outcomes.append(
            test_single_analysis(
                analysis_index=position,
                topic=topic,
                analysis_target=target,
                query=query,
            )
        )

    # Print summary
    print("\nTest Results Summary:")
    for position, succeeded in enumerate(outcomes, start=1):
        status = 'Success' if succeeded else 'Failed'
        print(f"Analysis {position}: {status}")

# Start the test run only when this code is executed as the main module.
if __name__ == "__main__":
    run_test()

Starting test run from existing compiled summaries...

Testing Analysis 1

Processing Analysis 1
Using compiled summary from: KnowledgeBase/TopicalAnalysis/ZiggoBroadcast/Websearch/Analysis_1/FinalOutputs/compiled_summaries.md
Processing chunk: 1/10
Processing chunk: 2/10
Processing chunk: 3/10
Processing chunk: 4/10
Processing chunk: 5/10
Processing chunk: 6/10
Processing chunk: 7/10
Processing chunk: 8/10
Processing chunk: 9/10
Processing chunk: 10/10
- VodafoneZiggo's acquisition of exclusive UEFA broadcasting rights for the Champions League, Europa League, and Conference League (2024–2027) has sparked significant public backlash, particularly from older audiences unfamiliar with streaming technology. [www.ziggo.nl, www.marketingtribune.nl, www.vodafoneziggo.nl]

- The requirement for non-subscribers to create a Ziggo Sport Free account to access UEFA matches has been criticized as overly complex, especially for older users who struggle with digital registration processes. This ha

In [10]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import requests
from bs4 import BeautifulSoup
import time
import random

def _published_time(item):
    """Return the 'article:published_time' metatag of a search item, or ''."""
    pagemap = item.get('pagemap') or {}
    # 'metatags' may be missing or an empty list; indexing [0] blindly would raise
    metatags = pagemap.get('metatags') or [{}]
    first = metatags[0]
    if not isinstance(first, dict):
        return ''
    return first.get('article:published_time', '')


def google_api_search(query, api_key, cx, num_results=100, language='nl', country='nl'):
    """
    Search using Google Custom Search API.

    Results are requested in pages of up to 10. For every result the linked
    page is fetched with fetch_page_content; when content comes back, it is
    stored together with its word count.

    Parameters:
        query (str): Search query
        api_key (str): Google API key
        cx (str): Custom Search Engine ID
        num_results (int): Number of results to return (capped at 100)
        language (str): Search language, sent as ``lr=lang_<language>``
        country (str): Country code, sent as ``gl``

    Returns:
        list: List of dictionaries with the keys 'title', 'link', 'snippet',
        'full_domain' and 'publication_date', plus 'content' and 'word_count'
        when page content could be fetched. If an unexpected error occurs,
        the results collected up to that point are returned (an empty list
        when nothing was collected yet).
    """
    search_results = []
    try:
        # Build Google Custom Search API service
        service = build("customsearch", "v1", developerKey=api_key)

        # API can only return 10 results per request, so we need to make multiple requests
        for offset in range(0, min(num_results, 100), 10):
            try:
                # Execute search request; `start` is 1-based
                result = service.cse().list(
                    q=query,
                    cx=cx,
                    start=offset + 1,
                    num=min(10, num_results - offset),
                    lr=f'lang_{language}',
                    gl=country
                ).execute()
            except HttpError as error:
                print(f"Error during API request: {error}")
                continue

            items = result.get('items')
            if not items:
                # A page without items means there is nothing further to
                # retrieve; stop instead of spending more requests.
                break

            # Process each search result
            for item in items:
                link = item.get('link', '')
                result_dict = {
                    'title': item.get('title', ''),
                    'link': link,
                    'snippet': item.get('snippet', ''),
                    'full_domain': extract_domain(link),
                    'publication_date': _published_time(item)
                }

                # Fetch page content
                content = fetch_page_content(link)
                if content:
                    result_dict['content'] = content
                    result_dict['word_count'] = count_words(content)

                search_results.append(result_dict)

            # Respect API rate limits
            time.sleep(1)

    except Exception as e:
        # Keep whatever was collected so far rather than discarding it
        print(f"Error occurred: {e}")

    return search_results

def extract_domain(url):
    """Return the network-location (netloc) component of *url*."""
    from urllib.parse import urlparse

    parsed_url = urlparse(url)
    return parsed_url.netloc

def fetch_page_content(url):
    """
    Download *url* and return its visible text as one space-separated string.

    The request is sent with a browser-like User-Agent and a 10 second
    timeout. <script> and <style> elements are removed before the text is
    extracted. Any exception (including an HTTP error status) is printed and
    results in an empty string.
    """
    try:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()

        soup = BeautifulSoup(page.text, 'html.parser')

        # Drop elements whose text is code/styling rather than page content
        for element in soup(['script', 'style']):
            element.decompose()

        raw_text = soup.get_text()

        # Strip every line, split it on double spaces, and keep the
        # non-empty pieces
        fragments = []
        for raw_line in raw_text.splitlines():
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)

        return ' '.join(fragments)

    except Exception as e:
        print(f"Error fetching content from {url}: {e}")
        return ""

def count_words(text):
    """Return the number of ``\\w+`` word tokens in the lower-cased text."""
    import re

    word_pattern = re.compile(r'\b\w+\b')
    return sum(1 for _ in word_pattern.finditer(text.lower()))

# Example usage:

# Credentials are read from the environment instead of being hard-coded, so
# the API key is never stored in the notebook or in version control. A missing
# variable raises KeyError immediately rather than sending an invalid request.
# NOTE(review): the key that was previously embedded here should be treated
# as leaked and rotated.
api_key = os.environ["GOOGLE_API_KEY"]
cx = os.environ["GOOGLE_CSE_ID"]

results = google_api_search(
    query="Ziggo UEFA uitzendrechten",
    api_key=api_key,
    cx=cx,
    num_results=100,
    language='nl',
    country='nl'
)

# Filter and process results as before
filtered_results = filter_search_results(results)
results

Error fetching content from https://www.totaaltv.nl/nieuws/ziggo-koopt-wellicht-kat-in-zak-met-dure-voetbalrechten/: 403 Client Error: Forbidden for url: https://myprivacy.dpgmedia.nl/consent/?siteKey=gepksszd79ws1uhc&callbackUrl=https%3A%2F%2Fwww.totaaltv.nl%2Fnieuws%2Fziggo-koopt-wellicht-kat-in-zak-met-dure-voetbalrechten%2F
Error fetching content from https://www.totaaltv.nl/nieuws/ziggo-uitzendrechten-krijgen-betere-bescherming-tegen-illegale-iptv/: 403 Client Error: Forbidden for url: https://myprivacy.dpgmedia.nl/consent/?siteKey=gepksszd79ws1uhc&callbackUrl=https%3A%2F%2Fwww.totaaltv.nl%2Fnieuws%2Fziggo-uitzendrechten-krijgen-betere-bescherming-tegen-illegale-iptv%2F
Error fetching content from https://www.nu.nl/tech/6333555/ziggo-verliest-ondanks-uitzendrechten-champions-league-opnieuw-veel-klanten.html: 403 Client Error: Forbidden for url: https://www.nu.nl/tech/6333555/ziggo-verliest-ondanks-uitzendrechten-champions-league-opnieuw-veel-klanten.html
Error fetching content fro

[{'title': 'Ziggo Sport maakt meer Europees voetbal dan ooit voor heel ...',
  'link': 'https://www.vodafoneziggo.nl/nieuws/ziggo-sport-maakt-meer-europees-voetbal-dan-ooit-voor-heel-nederland-beschikbaar/',
  'snippet': 'Apr 29, 2024 ... Vanaf augustus heeft Ziggo Sport drie jaar lang de exclusieve uitzendrechten voor de UEFA Champions League, de UEFA Europa League en de UEFA\xa0...',
  'full_domain': 'www.vodafoneziggo.nl',
  'publication_date': '',
  'content': 'Ziggo Sport maakt meer Europees voetbal dan ooit voor heel Nederland beschikbaarOverslaan en ga naar inhoudNieuwsZiggo Sport maakt meer Europees voetbal dan ooit voor heel Nederland beschikbaarHomeOver onsNieuwsVerhalenResultatenWerken bijToegankelijkheid Zoeken nlNieuwsZiggo Sport maakt meer Europees voetbal dan ooit voor heel Nederland beschikbaarZiggo Sport maakt meer Europees voetbal dan ooit voor heel Nederland beschikbaar30 april 2024Vanaf augustus heeft Ziggo Sport drie jaar lang de exclusieve uitzendrechten voor de U

In [9]:
# Bare expression: shows the current value of filtered_results as cell output.
filtered_results

[{'title': 'The Shell Trial | 23-24 | Dutch National Opera',
  'link': 'https://www.operaballet.nl/en/dutch-national-opera/2023-2024/shell-trial',
  'snippet': 'The Shell Trial: a brand-new opera with 82 costumes, unique props, and dazzling special effects – but produced as sustainably as possible.',
  'full_domain': 'www.operaballet.nl',
  'publication_date': '',
  'content': "The Shell Trial | 23-24 | Dutch National Opera Skip to main content nl Search Menu Search term Search Welcome Hoofdnavigatie Opera Programme Dutch National Opera Studio Boekman News Opera Forward Festival Chorus Dutch National Opera Studio Board & artistic staff Orchestras History Previously at DNO Become a supernumerary actor Newsletter Ballet Programme Dutch National Ballet Studio Boekman News Dancers Jump Junior Company Dutch Ballet Orchestra Artistic & musical staff Previously at HNB History Hans van Manen Foundation Alexandra Radius Prize Newsletter Your visit Plan your visit Season packages Visitor infor