In [12]:
import getpass
import os
from typing import Optional, List

import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field


In [13]:
# Show full cell contents when a DataFrame is displayed (no column-width truncation).
pd.options.display.max_colwidth = None

In [14]:
# Credentials for OpenAI and Serper are read from the environment.
# A key that is already exported is left untouched; a missing (or empty) one is
# requested interactively without echo, so no secret is ever stored in the notebook.
for _key_name in ("OPENAI_API_KEY", "SERPER_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f"Enter {_key_name}: ")

In [15]:
def extract_html_from_url(url, timeout=30):
    """Download a page and return its text content with link targets inlined.

    ``<footer>`` and ``<nav>`` elements are removed from the parsed document.
    Every ``<a>`` tag that carries an ``href`` has its content replaced by
    ``"<link text> (<href>)"`` so the link target survives text extraction.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            it a server that never answers would block the notebook forever.

    Returns:
        The document's stripped text fragments joined by single spaces, or
        ``None`` when the request fails (the error is printed).
    """
    try:
        headers = {"User-Agent": ""}
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses alike.
        print(f"Error fetching data from {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Drop page chrome that is not part of the main content.
    excluded_tagNames = ["footer", "nav"]
    for tag_name in excluded_tagNames:
        for unwanted_tag in soup.find_all(tag_name):
            unwanted_tag.extract()

    # Keep each link's destination next to its text.
    for a_tag in soup.find_all("a"):
        href = a_tag.get("href")
        if href:
            a_tag.string = f"{a_tag.get_text()} ({href})"

    return ' '.join(soup.stripped_strings)


In [16]:
def sites_generator(agent_input: str, no_websites: int):
    """Run a Serper search and return its organic results.

    Args:
        agent_input: The search query.
        no_websites: Value passed to ``GoogleSerperAPIWrapper`` as ``k``
            (the number of results to request).

    Returns:
        The value stored under the ``'organic'`` key of the search response,
        or an empty list when the response has no such key. Downstream code
        in this notebook reads each entry's ``'link'``.
    """
    serper_search = GoogleSerperAPIWrapper(k=no_websites)
    websites = serper_search.results(agent_input)
    # A response without organic hits should yield "no sites", not a KeyError.
    return websites.get('organic', [])

In [17]:
# Example: change these models to match the content structure you want to extract

# One extracted campaign: a name and a free-text description.
# NOTE(review): the Field descriptions are presumably surfaced to the model via the
# output parser's format instructions, so editing them changes the prompt. Comments
# are used here instead of a class docstring for the same reason: a docstring would
# likely be included in the generated schema.
class CampaignItem(BaseModel):
    campaign_name: str = Field(description="The name of sports campaign")
    description: str = Field(description="Detail description of the campaign ")

# Top-level object the output parser is configured with: a list of CampaignItem.
class CampaignList(BaseModel):
    campaigns: List[CampaignItem]


In [18]:
# Chat model with temperature fixed at 0 (presumably to keep extraction output
# as repeatable as possible).
llm = ChatOpenAI(temperature=0)
# Parses the model's reply into a CampaignList instance.
output_parser = PydanticOutputParser(pydantic_object = CampaignList)

# Prompt text; {html_text} is filled per call, {format_instructions} comes from the parser.
# NOTE(review): the prompt mentions a "next page link", but CampaignItem has no
# field for one - confirm whether the schema or the prompt should change.
prompt_template = """
You are an expert making web scrapping and analyzing HTML raw code.
If there is no explicit information don't make any assumption.
Extract all objects that matched the instructions from the following html
{html_text}
Provide them in a list, also if there is a next page link remember to add it to the object.
Please, follow carefulling the following instructions
{format_instructions}
"""

# The bound method itself (not its result) is supplied for format_instructions;
# presumably PromptTemplate calls it when the prompt is formatted - verify against
# the installed LangChain version.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["html_text"],
    partial_variables={"format_instructions": output_parser.get_format_instructions}
)

# Pipeline: fill the prompt -> call the model -> parse the reply into CampaignList.
chain = prompt | llm | output_parser

In [23]:
def count_tokens(text):
    """Return a size estimate for ``text``.

    Despite the name, this is the length of ``text`` (``len``), not a
    tokenizer-based count; swap in a real tokenizer for more accurate results.
    """
    size = len(text)
    return size

def extract_content(campaigns, max_chars=15000, chunk_overlap=500):
    """Scrape every search hit and collect the campaigns the LLM extracts.

    For each entry in ``campaigns`` the page behind ``entry['link']`` is fetched
    with ``extract_html_from_url``. Text shorter than ``max_chars`` (as measured
    by ``count_tokens``) is sent to ``chain`` in one call; longer text is split
    with ``RecursiveCharacterTextSplitter`` and each chunk is sent separately.

    Results from ALL pages are accumulated into a single DataFrame.

    Args:
        campaigns: Iterable of mappings that each contain a ``'link'`` key.
        max_chars: Size threshold for a single call, also used as the splitter's
            ``chunk_size``.
        chunk_overlap: Overlap between consecutive chunks when splitting.

    Returns:
        A DataFrame with columns ``campaign_name`` and ``description``; it is
        empty (but has both columns) when nothing was extracted.
    """
    columns = ['campaign_name', 'description']
    rows = []  # shared across all pages so earlier results are not discarded

    for camp in campaigns:
        url = camp['link']
        html_text_parsed = extract_html_from_url(url)

        # The fetch helper returns None on request failure; an empty page has
        # nothing to extract either. Skip both instead of crashing or paying
        # for a pointless LLM call.
        if not html_text_parsed:
            continue

        if count_tokens(html_text_parsed) < max_chars:
            chunks = [html_text_parsed]
        else:
            # Too large for one call: split and query the model chunk by chunk.
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=max_chars, chunk_overlap=chunk_overlap
            )
            chunks = splitter.split_text(html_text_parsed)

        for chunk in chunks:
            response = chain.invoke(input={"html_text": chunk})
            rows.extend(
                {
                    "campaign_name": campaign_extracted.campaign_name,
                    "description": campaign_extracted.description,
                }
                for campaign_extracted in response.campaigns
            )

    return pd.DataFrame(rows, columns=columns)


In [24]:
# Search query for Serper and the search hits it returns (up to 50 requested).
agent_input = "Scrape all the football campaign"
campaigns = sites_generator(no_websites=50, agent_input=agent_input)

In [None]:
# Fetch each search hit and run the extraction chain on it (network + LLM calls).
new_df = extract_content(campaigns)

In [None]:
# Display the extracted campaigns.
new_df