In [275]:
%config Completer.use_jedi = True

In [276]:
import os
import requests
from bs4 import BeautifulSoup
from typing import List
from dotenv import load_dotenv
from openai import OpenAI
import google.generativeai as gemini_model
import anthropic
import gradio as gr
import json

In [277]:
# Load the API keys (OpenAI, Anthropic, Google) from a local .env file into the
# process environment.
#
# `load_dotenv()` already copies the values into `os.environ`, so re-assigning
# `os.environ[name] = os.getenv(name)` adds nothing -- and it raises a TypeError as
# soon as one key is absent, because `os.getenv` then returns None. Report the
# missing keys instead so the notebook can still be used with the remaining models.

load_dotenv()

REQUIRED_API_KEYS = ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY")
missing_api_keys = [name for name in REQUIRED_API_KEYS if not os.getenv(name)]
if missing_api_keys:
    print(
        f"WARNING: missing API key(s): {', '.join(missing_api_keys)} - "
        "add them to your .env file before using the corresponding model."
    )


In [278]:
# Set up the two model SDKs used by the classes further down.
#   * `gemini_model.configure()` runs the google.generativeai configuration step.
#   * `OpenAI()` builds the OpenAI client object; it is bound to the name `openai`.
# Neither call is given explicit credentials here -- presumably both pick up the
# API keys exported to the environment in the previous cell (confirm against the SDK docs).

gemini_model.configure()
openai = OpenAI()


In [279]:
import configparser

# Read the prompt templates from `settings.config` into a ConfigParser instance.
#
# `ConfigParser.read()` silently skips files it cannot open and returns the list of
# files it actually parsed. An empty list therefore means the configuration is missing,
# and every later `config.get('PROMPT', ...)` would fail with a confusing NoSectionError.
# Capture the result and warn right here, where the cause is obvious.

config = configparser.ConfigParser()
loaded_config_files = config.read('settings.config')
if not loaded_config_files:
    print(
        "WARNING: 'settings.config' was not found or could not be read; "
        "the [PROMPT] section required by the classes below is unavailable."
    )

# Keep the list of parsed files as the cell's displayed value.
loaded_config_files


['settings.config']

In [280]:
from bs4 import BeautifulSoup
import requests

class WebsiteScrap:
    """
    Scrape a single webpage and expose its title, visible text and hyperlinks.

    Attributes:
    - url (str): The URL that was requested.
    - title (str): Text of the <title> element, or "No title" when the page has none.
    - text (str): Text of the <body> with style, script, input and img elements removed;
      an empty string when the document has no <body>.
    - link (list): The non-empty `href` values of every <a> element on the page, as written
      in the markup (relative links are not resolved).
    """

    def __init__(self, url, timeout=30):
        """
        Download and parse `url`.

        Parameters:
        - url (str): Address of the page to fetch.
        - timeout (float): Seconds `requests` waits for the server. Without a timeout
          a stalled server would block the caller indefinitely.

        Raises:
        - Whatever `requests.get` raises for connection problems or an expired timeout.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title of the webpage
        self.title = soup.title.text if soup.title else "No title"

        # Some responses (empty documents, fragments, non-HTML payloads) have no <body>;
        # treat them as pages without text instead of crashing on `None`.
        body = soup.body
        if body is None:
            self.text = ""
        else:
            # Remove unwanted elements (styles, scripts, input fields, and images)
            for nondesired in body(['style', 'script', 'input', 'img']):
                nondesired.decompose()

            # Extract and clean the text content of the page
            self.text = body.get_text(strip=True, separator=" ")

        # Collect every non-empty href on the page
        self.link = [
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if anchor['href']
        ]

    def get_contents(self):
        """Returns a formatted string containing the webpage title and its extracted text content."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


In [281]:
import json

class StructureLinks:
    """
    Ask an AI model (GPT-4o-mini or Gemini-2.0) to structure the links of a scraped page
    and collect the content of the pages it selected.

    Attributes:
    - link_system_prompt (str): System prompt read from the `PROMPT` config section.
    - link_user_prompt (str): User prompt from the config, formatted with the page's
      title and list of links.
    - reponse (dict): Parsed JSON object returned by the model; `{}` until a model has
      been queried or when its answer could not be decoded. (The attribute name is kept
      as-is because it is part of the class's public surface.)
    - web: The scraped main page; must provide `title`, `link` and `get_contents()`.
    """

    def __init__(self, website, config):
        self.link_system_prompt = config.get('PROMPT', 'link_system_prompt')
        self.link_user_prompt = config.get('PROMPT', 'link_user_prompt').format(
            title=website.title, link=website.link
        )

        # Start with an empty dict (not ""), so get_sub_link_content() also works
        # when no model has been queried yet.
        self.reponse = {}
        self.web = website  # Stores the main webpage object

    @staticmethod
    def _parse_json_response(raw_text):
        """Decode model output into a dict; return {} for empty, invalid or non-object JSON."""
        if not raw_text or not raw_text.strip():
            print("The response is empty or contains only spaces.")
            return {}
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError as e:
            print(f"JSON decoding error: {e}")
            return {}
        if not isinstance(parsed, dict):
            print("The response is valid JSON but not an object; ignoring it.")
            return {}
        return parsed

    def get_json_response_gpt_model(self):
        """Query OpenAI's GPT-4o-mini in JSON mode and store the parsed answer in `reponse`."""
        completion = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": self.link_system_prompt},
                {"role": "user", "content": self.link_user_prompt}
            ],
            response_format={"type": "json_object"},
        )

        # Same tolerant decoding as the Gemini path: a bad answer yields {} instead of raising.
        self.reponse = self._parse_json_response(completion.choices[0].message.content)

    def get_json_response_gemini_model(self):
        """Query Google's Gemini-2.0 model for JSON output and store the parsed answer in `reponse`."""
        model = gemini_model.GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            system_instruction=self.link_system_prompt,
            generation_config={"response_mime_type": "application/json"}
        )

        result = model.generate_content(self.link_user_prompt)
        self.reponse = self._parse_json_response(result.text)

    def get_sub_link_content(self):
        """
        Combine the main page's content with the content of every link the model selected.

        Entries of `reponse['links']` without a `url` are skipped; a missing `type`
        is replaced by a generic label.

        Returns:
        - (str): "Landing page" section followed by one section per sub-link.
        """
        sections = ["Landing page \n", self.web.get_contents()]

        selected = self.reponse.get('links') if isinstance(self.reponse, dict) else None
        for rep in selected or []:
            if not isinstance(rep, dict) or not rep.get("url"):
                continue  # nothing to fetch for this entry
            sections.append(f"\n\n {rep.get('type', 'related page')},\n")
            sections.append(WebsiteScrap(rep["url"]).get_contents())

        return "".join(sections)


In [282]:
class BrochureGeneration:
    """
    Generate a company brochure: scrape a website, let an AI model (GPT-4o-mini or Gemini)
    structure its links, and stream a GPT-4o-mini write-up of the collected content.

    Attributes:
    - links: Factory called as `links(page, config)` (e.g. the StructureLinks class).
    - web: Factory called as `web(url)` (e.g. the WebsiteScrap class).
    - config (ConfigParser): Configuration settings containing AI prompts.
    - system_prompt (str): System prompt for the brochure, read from the config.
    - user_prompt (str): User prompt *template* from the config; it is formatted with
      `company_name` on every request and never modified in place.
    """

    def __init__(self, website, link, config):
        self.links = link
        self.web = website
        self.config = config
        self.system_prompt = self.config.get('PROMPT', 'brochure_system_prompt')
        self.user_prompt = self.config.get('PROMPT', 'brochure_user_prompt')

    def get_brochure(self, company_name, url, model_name):
        """
        Generate a brochure for `company_name` from the content found at `url`.

        All per-request objects are kept in local variables. Overwriting `self.web`,
        `self.links` or `self.user_prompt` would break the second request: the factories
        would have been replaced by instances and the prompt would still contain the
        previous company's content.

        Parameters:
        - company_name (str): Name of the company for which the brochure is being generated.
        - url (str): Website URL to extract information from.
        - model_name (str): AI model used to structure the links ('GPT' or 'gemini').

        Yields:
        - (str): The brochure text accumulated so far, once per streamed chunk.
        """
        # Build fresh scraping / link-structuring objects for this request
        page = self.web(url)
        link_structure = self.links(page, self.config)

        # Use the selected AI model to process and structure the webpage links
        if model_name == "GPT":
            link_structure.get_json_response_gpt_model()
        elif model_name == "gemini":
            link_structure.get_json_response_gemini_model()

        # Fill in the template and append the landing page plus sub-link content
        user_prompt = self.user_prompt.format(company_name=company_name)
        user_prompt += "\n\n" + link_structure.get_sub_link_content()

        # Stream AI-generated content for the brochure
        stream = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            stream=True
        )

        response = ""
        for chunk in stream:
            if not chunk.choices:
                continue  # chunk carries no text delta
            response += chunk.choices[0].delta.content or ""
            yield response


In [283]:
import gradio as gr

# Build the brochure generator from the scraper class, the link-structuring class
# and the parsed configuration.
brochure = BrochureGeneration(WebsiteScrap, StructureLinks, config)

# Input widgets, listed in the positional order of `get_brochure`'s parameters:
# company name, landing page URL, model used to structure the links.
company_name_box = gr.Textbox(label="Company name:")
landing_url_box = gr.Textbox(label="Landing page URL including http:// or https://")
model_dropdown = gr.Dropdown(["GPT", "gemini"], label="Select model")

# One Markdown pane that displays the generated brochure.
brochure_pane = gr.Markdown(label="Brochure:")

# Tie the widgets to the generator; flagging is switched off.
view = gr.Interface(
    fn=brochure.get_brochure,
    inputs=[company_name_box, landing_url_box, model_dropdown],
    outputs=[brochure_pane],
    flagging_mode="never",
)

# Start the local Gradio server.
view.launch()


* Running on local URL:  http://127.0.0.1:7878

To create a public link, set `share=True` in `launch()`.




hello
