<img src="https://drive.google.com/uc?export=view&id=1wYSMgJtARFdvTt5g7E20mE4NmwUFUuog" width="200">

[![Gen AI Experiments](https://img.shields.io/badge/Gen%20AI%20Experiments-GenAI%20Bootcamp-blue?style=for-the-badge&logo=artificial-intelligence)](https://github.com/buildfastwithai/gen-ai-experiments)
[![Gen AI Experiments GitHub](https://img.shields.io/github/stars/buildfastwithai/gen-ai-experiments?style=for-the-badge&logo=github&color=gold)](http://github.com/buildfastwithai/gen-ai-experiments)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1cOlv_y_RnIdhgvX9LosMTlzjTcJZgQ9h?usp=sharing)



## Master Generative AI in 8 Weeks
**What You'll Learn:**
- Master cutting-edge AI tools & frameworks
- 6 weeks of hands-on, project-based learning
- Weekly live mentorship sessions
- Join the Innovation Community

Learn by building. Get expert mentorship and work on real AI projects.
[Start Your Journey](https://www.buildfastwithai.com/genai-course)

# Web Scraping with AI


## 1. Traditional Web Scraping using Beautiful Soup


> Beautiful Soup (Python) parses HTML/XML, turning it into a navigable structure. This lets you easily search and extract data from websites, making it useful for web scraping tasks.




In [None]:
!pip install requests beautifulsoup4 tiktoken

In [None]:
import requests
from bs4 import BeautifulSoup

def beautiful_soup_scrape_url(url, timeout=30):
    """
    Fetch a webpage and return its parsed HTML as a string.

    :param url: The URL of the webpage to scrape
    :param timeout: Seconds to wait for the server before giving up
    :return: The document as serialised by BeautifulSoup's 'html.parser'
    :raises requests.exceptions.HTTPError: If the response status is 4xx or 5xx
    :raises requests.exceptions.Timeout: If the server does not answer in time
    """
    # requests has no default timeout, so an unresponsive host would block forever.
    response = requests.get(url, timeout=timeout)
    # Stop on an error status instead of returning the markup of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return str(soup)

In [None]:
# Page fetched by the Beautiful Soup example in the next cell.
url = "https://buildfastwithai.com/courses"

In [None]:
# Download the page and print the whole document as BeautifulSoup serialises it.
data = beautiful_soup_scrape_url(url)
print(data)

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_headings(url, timeout=30):
    """
    Scrape all headings (h1 to h6) from a webpage.

    :param url: The URL of the webpage to scrape
    :param timeout: Seconds to wait for the server before giving up
    :return: A list of dictionaries containing heading tag and text
    :raises requests.exceptions.HTTPError: If the response status is 4xx or 5xx
    :raises requests.exceptions.Timeout: If the server does not answer in time
    """
    # requests has no default timeout, so an unresponsive host would block forever.
    response = requests.get(url, timeout=timeout)
    # Stop on an error status; otherwise the headings of the error page
    # (for example a lone "404" h1) would be returned as if they were real data.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    return [
        {'tag': tag.name, 'text': tag.get_text(strip=True)}
        for tag in soup.find_all(heading_tags)
    ]


# Run the heading scraper against the courses page and print the result.
# NOTE(review): the output saved with this cell is a single h1 whose text is
# '404', which suggests this address served an error page - confirm the URL.
url = "https://buildfastwithai.com/courses"
data = scrape_headings(url)
print(data)

[{'tag': 'h1', 'text': '404'}]




---


## 2. Scraping with ScrapeGraphAI




> Scrapegraph uses AI to simplify web scraping. Instead of writing complex code, you tell it what data you want, and it figures out how to extract it. It works on websites and even local files like HTML.



In [None]:
%%capture
!pip install scrapegraphai --upgrade
!apt install chromium-chromedriver
!pip install nest_asyncio
!pip install playwright
!playwright install

In [None]:
!pip install -qU scrapegraphai[burr]

In [None]:
import nest_asyncio
# Applied once, before any scraping graph is run.
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio event loop and the scraping graphs start their own - TODO confirm.
nest_asyncio.apply()

In [None]:
from google.colab import userdata

# Look up the value stored under 'OPENAI_API_KEY' through google.colab's
# userdata helper, so the key itself never appears in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

### 2.2.1 SmartScraperGraph

A single-page scraper that only needs a user prompt and an input source.



In [None]:
# Configuration passed as `config` to every SmartScraperGraph in this section.
graph_config_openai = {
    "llm": {
        "api_key": OPENAI_API_KEY,
        "model": "gpt-4o-mini",
        "temperature":0,
    },
    # NOTE(review): presumably what produces the "--- Executing ... Node ---"
    # lines in the saved outputs - TODO confirm.
    "verbose":True,
}

In [None]:
from scrapegraphai.graphs import SmartScraperGraph


# Build a scraper for the resources page and ask it for courses and descriptions.
smart_scraper_graph = SmartScraperGraph(
    prompt="List all courses and their description.",
    # also accepts a string with the already downloaded HTML code
    source="https://www.buildfastwithai.com/resources",
    config=graph_config_openai
)

# Execute the graph; the value is inspected in the next cell.
result = smart_scraper_graph.run()

--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.buildfastwithai.com/resources) ---
--- Executing Parse Node ---
--- Executing GenerateAnswer Node ---


In [None]:
# Bare expression: the notebook renders the value returned by run() above.
result

{'courses': [{'title': 'Basics of Python for Gen AI',
   'description': 'This comprehensive course covers essential Python fundamentals, guiding participants from basic syntax and data structures to more advanced concepts like functions, loops, and working with external libraries. Perfect for beginners and tech enthusiasts alike, the course culminates in a practical exploration of using Python to interact with GPT models, providing a solid foundation for future AI endeavors.'},
  {'title': 'Create Your AI Girlfriend',
   'description': 'This comprehensive course explores the cutting-edge technologies behind creating an AI girlfriend. Participants will learn to generate photo-realistic images using AI tools, simulate engaging conversations with Large Language Models, and bring their creation to life with Text-to-Speech technologies. The course covers step-by-step guides for image creation, chat setup, and voice integration, culminating in an interactive Q&A session.'},
  {'title': 'Gen 

In [None]:
import json

# Summarise the course page at https://www.buildfastwithai.com/genai-course

smart_scraper_graph = SmartScraperGraph(
    prompt="Give me a summary of this webpage",
    source="https://www.buildfastwithai.com/genai-course",
    config=graph_config_openai
)

result = smart_scraper_graph.run()
# Pretty-print the result as JSON with a two-space indent.
print(json.dumps(result,indent=2))

In [None]:
# Ask for products, descriptions and prices from the orae.in home page.
smart_scraper_graph = SmartScraperGraph(
    prompt="List of the products and their description with prices",
    # also accepts a string with the already downloaded HTML code
    source="https://www.orae.in/",
    config=graph_config_openai
)

result = smart_scraper_graph.run()

# Pretty-print the result as JSON with a two-space indent.
print(json.dumps(result,indent=2))

--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.orae.in/) ---
--- Executing Parse Node ---
--- Executing GenerateAnswer Node ---


{
  "products": [
    {
      "name": "Sequence Cork Yoga Mat",
      "description": "Regular price Rs. 5,499.00 Regular price ~~Rs. 7,000.00~~ Sale price Rs. 5,499.00",
      "price": "Rs. 5,499.00"
    },
    {
      "name": "Rise Cork Yoga Mat",
      "description": "Regular price Rs. 5,499.00 Regular price ~~Rs. 7,000.00~~ Sale price Rs. 5,499.00",
      "price": "Rs. 5,499.00"
    },
    {
      "name": "Pose Cork Yoga Mat",
      "description": "Regular price Rs. 5,499.00 Regular price ~~Rs. 7,000.00~~ Sale price Rs. 5,499.00",
      "price": "Rs. 5,499.00"
    },
    {
      "name": "Cork Support Block",
      "description": "Regular price Rs. 999.00 Regular price ~~Rs. 1,500.00~~ Sale price Rs. 999.00",
      "price": "Rs. 999.00"
    },
    {
      "name": "Cork Yoga Roller",
      "description": "Regular price Rs. 1,199.00 Regular price ~~Rs. 1,700.00~~ Sale price Rs. 1,199.00",
      "price": "Rs. 1,199.00"
    },
    {
      "name": "Yoga Starter Kit",
      "description": 

In [None]:
# Ask for the top 10 bank stocks and their prices from the moneycontrol home page.
# NOTE(review): the output saved with this cell is {"top_10_bank_stocks": "NA"},
# i.e. no stock data was extracted from this source.
smart_scraper_graph = SmartScraperGraph(
    prompt="List and prices of top 10 bank stocks",
    # also accepts a string with the already downloaded HTML code
    source="https://www.moneycontrol.com/",
    config=graph_config_openai
)

result = smart_scraper_graph.run()

# Pretty-print the result as JSON with a two-space indent.
print(json.dumps(result,indent=2))

--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.moneycontrol.com/) ---
--- Executing Parse Node ---
--- Executing GenerateAnswer Node ---


{
  "top_10_bank_stocks": "NA"
}


### 2.2.2 SpeechGraph
  WebScrape -> Audio

In [None]:
from scrapegraphai.graphs import SpeechGraph

# Compared with graph_config_openai, this config uses a different chat model,
# adds a "tts_model" section and names the audio file to write.
graph_config = {
    "llm": {
        "api_key": OPENAI_API_KEY,
        "model": "gpt-3.5-turbo",
    },
    "tts_model": {
        "api_key": OPENAI_API_KEY,
        "model": "tts-1",
        "voice": "alloy"
    },
    # The same file name is opened by the audio player cell further down.
    "output_path": "website_summary.mp3",
}

# Only constructs the graph; it is run in the next cell.
speech_graph = SpeechGraph(
    prompt="Make an Audio Summary on this blog",
    source="https://www.marktechpost.com/2024/06/18/meet-deepseek-coder-v2-by-deepseek-ai-the-first-open-source-ai-model-to-surpass-gpt4-turbo-in-coding-and-math-supporting-338-languages-and-128k-context-length/",
    config=graph_config
)

In [None]:
# Run the speech graph built in the previous cell.
result = speech_graph.run()
# Falls back to a placeholder string when the result has no "answer" key.
# NOTE(review): `answer` is not read by any later cell visible in this notebook.
answer = result.get("answer", "No answer found")

In [None]:
from IPython.display import Audio
# Embed an auto-playing player for the file named in graph_config["output_path"].
wn = Audio("website_summary.mp3", autoplay=True)
display(wn)



---


## 3. Web Scraping using Jina



> Jina (AI) cleans webpages for AI. It grabs a URL, removes extra elements, and gives you the main content in a format perfect for AI tools.



### 3.1 Intro to Jina

In [None]:
def scrape_jina_ai(url: str, timeout: float = 60) -> str:
    """
    Fetch a page through the r.jina.ai endpoint and return the response text.

    :param url: The URL of the webpage to scrape; appended to the endpoint as-is
    :param timeout: Seconds to wait for the endpoint before giving up
    :return: The body of the endpoint's response as text
    :raises requests.exceptions.HTTPError: If the response status is 4xx or 5xx
    :raises requests.exceptions.Timeout: If the endpoint does not answer in time
    """
    # requests has no default timeout, so an unresponsive host would block forever.
    response = requests.get("https://r.jina.ai/" + url, timeout=timeout)
    # Stop on an error status so an error body is never passed on as page content.
    response.raise_for_status()
    return response.text

In [None]:
# Fetch the course page through Jina and print the returned text.
result = scrape_jina_ai("https://www.buildfastwithai.com/genai-course")
print(result)

### 3.2 Competitor analysis using Jina

In [None]:
# List of competitors: each entry has a display "name" and the pricing page "url" to scrape.

competitor_sites = [
    {
        "name": "Articulate 360 by Adobe",
        "url": "https://www.articulate.com/360/pricing/freelancers"
    },
    {
        "name": "7taps",
        "url": "https://www.7taps.com/pricing"
    },
    {
        "name": "Mindsmith AI",
        "url": "https://www.mindsmith.ai/pricing"
    },
    {
        "name": "Cards-microlearning",
        "url": "https://www.cards-microlearning.com/en/tarifs"
    },
]


In [None]:
pip install prettytable tqdm --quiet

In [None]:
from typing import Any, Callable, Dict, List
from prettytable import PrettyTable, ALL
from tqdm import tqdm

def view_scraped_content(scrape_url_functions: List[Dict[str, Callable[[str], str]]], sites_list: List[Dict[str, str]], characters_to_display: int = 500, table_max_width: int = 50) -> List[Dict[str, str]]:
    """
    Scrape every site with every scraper, print a content preview table and a
    cost table, and return the full scraped text.

    :param scrape_url_functions: Scrapers to run; each entry is read through
        its 'name' key (column label) and 'function' key (called with the URL)
    :param sites_list: Sites to scrape; each entry is read through its 'name'
        and 'url' keys
    :param characters_to_display: Number of leading characters of each page
        shown in the content table
    :param table_max_width: Value assigned to max_width on both tables
    :return: One dictionary per site of the form {"provider": <site name>,
        "sites": [{"name": <scraper name>, "content": <text or error message>}]}
    """
    content_table = PrettyTable()
    content_table.field_names = ["Site Name"] + [f"{func['name']} content" for func in scrape_url_functions]

    cost_table = PrettyTable()
    cost_table.field_names = ["Site Name"] + [f"{func['name']} cost" for func in scrape_url_functions]

    scraped_data = []

    for site in sites_list:
        content_row = [site['name']]
        cost_row = [site['name']]
        site_data = {"provider": site['name'], "sites": []}

        for scrape_function in scrape_url_functions:
            function_name = scrape_function['name']
            # Single-item progress bar: only there to show which site/scraper is running.
            for _ in tqdm([site], desc=f"Processing site {site['name']} using {function_name}"):
                # Scraping and costing are handled separately so that each
                # scraper adds exactly one cell to each row. A row longer than
                # field_names would make add_row fail further down.
                try:
                    content = scrape_function['function'](site['url'])
                except Exception as e:
                    error_message = f"Error: {str(e)}"
                    content_row.append(error_message)
                    cost_row.append("Error")
                    site_data["sites"].append({"name": function_name, "content": error_message})
                    continue

                content_row.append(content[:characters_to_display])
                site_data["sites"].append({"name": function_name, "content": content})

                # NOTE(review): calculate_cost is not defined in the visible
                # part of this notebook; it has to exist in the session before
                # this function is called. A failure here keeps the scraped
                # content and only marks the cost cell.
                try:
                    cost_row.append(f"${calculate_cost(content):.6f}")
                except Exception as e:
                    cost_row.append(f"Error: {str(e)}")

        content_table.add_row(content_row)
        cost_table.add_row(cost_row)
        scraped_data.append(site_data)

    for table in (content_table, cost_table):
        table.max_width = table_max_width
        table.hrules = ALL  # rule after every row

    print("Content Table:")
    print(content_table)

    print("\nCost Table:\nThis is how much it would cost to use gpt-4o to parse this content for extraction.")
    print(cost_table)

    return scraped_data

In [None]:
# Scrapers to compare: "name" labels the table columns, "function" is called with each site URL.
list_of_scraper_functions = [
      {"name": "Jina AI", "function": scrape_jina_ai}
      ]

In [None]:
# Preview the first 700 characters per site with a table max_width of 20;
# all_content keeps the full scraped text for the extraction step.
all_content = view_scraped_content(list_of_scraper_functions, competitor_sites, 700, 20)

In [None]:
pip install openai --quiet

In [None]:
from google.colab import userdata
from openai import OpenAI

# Look up the API key again (the same call appears earlier in the notebook).
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Module-level client used by extract() below.
client = OpenAI(api_key=OPENAI_API_KEY)

def extract(user_input: str):
    """
    Ask gpt-4o for the three pricing tiers described in a page's text.

    :param user_input: Website content, sent as the user message
    :return: The message content of the first choice in the completion; the
        request asks for the JSON-object response format
    """
    system_prompt = (
        "Get me the three pricing tiers from this website's content, and return as a JSON with three keys: "
        "{cheapest: {name: str, price: float}, middle: {name: str, price: float}, most_expensive: {name: str, price: float}}"
    )

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input},
        ],
        stream=False,
        response_format={"type": "json_object"},
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def display_extracted_content(results: List[Dict[str, Any]], num_objects: int):
    """
    Run extract() on scraped content and print the answers as a table.

    :param results: Entries read through their "provider" key and their
        "sites" key, a list of {"name": ..., "content": ...} dictionaries
    :param num_objects: Maximum number of leading entries of results to process
    :return: None; the table is printed
    """
    table = PrettyTable()
    table.field_names = ["Site", "Provider Name", "Extracted Content"]

    # Slicing already stops at the end of the list, so num_objects may exceed len(results).
    for result in tqdm(results[:num_objects], desc="Processing results"):
        provider_name = result["provider"]

        for site in result["sites"]:
            function_name = site["name"]
            content = site["content"]

            # Single-step progress bar: only there to show which extraction is running.
            for _ in tqdm(range(1), desc=f"Extracting content with {provider_name} for {function_name}"):
                # Record a failed extraction in its own row instead of
                # aborting the whole table and losing earlier results.
                try:
                    extracted_content = extract(content)
                except Exception as e:
                    extracted_content = f"Error: {str(e)}"
                table.add_row([provider_name, function_name, extracted_content])

    table.max_width = 50  # Set the maximum width for better display
    table.hrules = ALL

    print("Extracted Content Table:")
    print(table)


In [None]:
# num_objects=9 is larger than the four competitor sites, so every entry is processed.
display_extracted_content(all_content, num_objects=9)