In [4]:
from pydantic import BaseModel,Field
from typing import List
import json

class User(BaseModel):
    name : str = Field(description="name of the organziation")
    logo : str = Field(description="Logo of the given website")
    detailed_description : str = Field(description='A detailed description of what the organization does ')
    services_offered: List[str] = Field(description="A list of services offered by the organization on the given website")
    color_theme : List[str] = Field(description="The color of the website in the format of hexa code")




from crawl4ai import LLMConfig,AsyncWebCrawler,CacheMode,CrawlerRunConfig,BrowserConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
import os 
from dotenv import load_dotenv
load_dotenv()

llm_strategy = LLMExtractionStrategy(
    llm_config=LLMConfig(
        provider="gemini/gemini-2.0-flash",
        api_token=os.getenv("GOOGLE_API_KEY"),
    ),
    schema=User.model_json_schema(),
    extraction_type="schema",
        instruction="""
You are analyzing a webpage to extract structured information about the organization behind it.

Your goal is to extract the following:

1. **Name**: The name of the organization or company.
2. **Logo**: The URL of the primary logo image (typically found in the header or near the company name).
3. **Detailed Description**: A clear and informative summary of what the organization does. 
   - This should come from the section of the page typically labeled or titled "About", "Who We Are", "Our Story", or similar.
   - If the page does not have a heading, look for paragraphs or text blocks that describe the company's purpose, mission, background, or offerings.
   - Do not include text that is clearly part of blog posts, testimonials, products, or contact details.

Tips:
- Focus on sections that describe the identity, mission, background, or goals of the organization.
- If multiple descriptive sections exist, prioritize the one closest to the top of the page or under an "About"-like heading.
- Avoid generic filler content like navigation menus, service listings, or unrelated calls to action.

Return the data in the format defined by the schema.
"""
,    chunk_token_threshold=1000,
    overlap_rate=0.0,
    apply_chunking=True,
    input_format="markdown",   # or "html", "fit_markdown"
    extra_args={"temperature": 0.0, "max_tokens": 800}
)

crawl_config = CrawlerRunConfig(
    extraction_strategy=llm_strategy,
    cache_mode=CacheMode.BYPASS
)


browser_cfg = BrowserConfig(headless=True)

async def get_data(url:str):

    async with AsyncWebCrawler(config= browser_cfg) as crawler:
        result = await crawler.arun(
            url = url,
            config = crawl_config)
    
    if result.success:
        print(f"Successfully scraped : '\n\n\n {result.extracted_content}")
        return json.loads(result.extracted_content)
    else:
        print(f"The code exited with eroor {result.error_message}")


In [5]:
result = await get_data ('https://www.growthsutra.pro/')

Successfully scraped : '


 [
    {
        "name": "GROWTHSUTRA LLP",
        "logo": "https://static.wixstatic.com/media/708455_927be4579fd04a1289dd85befae03e98~mv2.png/v1/fill/w_49,h_26,al_c,q_85,usm_0.66_1.00_0.01,blur_2,enc_auto/708455_927be4579fd04a1289dd85befae03e98~mv2.png",
        "detailed_description": "GrowthSutra helps companies build winning teams for an AI-First World. They offer services like Go-To-Market Partner, On-Demand CMO, and Revenue Architect.",
        "services_offered": [
            "Go-To-Market Partner",
            "On-Demand CMO",
            "Revenue Architect",
            "XPRT Co-Pilots"
        ],
        "color_theme": [],
        "error": false
    },
    {
        "name": "GrowthSutra",
        "logo": "https://static.wixstatic.com/media/cb6b3d_5c8f2b020ebe48b69bc8c163cc480156~mv2.png/v1/fill/w_60,h_60,al_c,q_85,usm_0.66_1.00_0.01,enc_avif,quality_auto/GrowthSutra%20Logo.png",
        "detailed_description": "At GrowthSutra, we are the go-to exp

In [1]:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_hex_colors(url: str, limit: int = 5) -> list:
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find inline styles
        inline_styles = [tag.get('style', '') for tag in soup.find_all(style=True)]
        css_text = ' '.join(inline_styles)

        # Find linked stylesheets
        css_links = [link['href'] for link in soup.find_all('link', rel='stylesheet') if 'href' in link.attrs]

        for href in css_links:
            full_url = urljoin(url, href)
            try:
                css_response = requests.get(full_url, timeout=5)
                css_text += ' ' + css_response.text
            except:
                continue

        # Extract hex codes
        hex_colors = re.findall(r'#[0-9a-fA-F]{3,6}', css_text)
        hex_colors = list(dict.fromkeys(hex_colors))  # remove duplicates, preserve order
        return hex_colors[:limit]  # return top `limit` hex codes
    except Exception as e:
        print(f"Error extracting hex colors: {e}")
        return []


In [2]:
extract_hex_colors('https://www.growthsutra.pro/ ')

['#FFFFFF', '#4F4F4F', '#101213', '#DBDBDB', '#299E75']

In [3]:
from IPython.display import display, HTML

colors = ['#FFFFFF', '#4F4F4F', '#101213', '#DBDBDB', '#299E75']

html = "<div style='display:flex; gap:10px;'>"
for color in colors:
    html += f"<div style='width:60px; height:60px; background-color:{color}; border:1px solid #000;'></div>"
html += "</div>"

display(HTML(html))


In [None]:
from SearchAndRecommendation.url_recommendation.url_utils import get_urls

urls = await get_urls("Growth suthra")

```json
[
"https://growthsutras.com/",
"https://www.sutramanagement.com/",
"https://www.youtube.com/@growthsutras"
]
```


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
from SearchAndRecommendation.url_recommendation.url_utils import get_urls

urls = await get_urls("Growth suthra")

In [5]:
urls

'[\n"https://growthsutras.com/",\n"https://www.suthra.com/",\n"https://www.growsutra.com/"\n]\n'

In [9]:
import json

json.loads(urls)

['https://growthsutras.com/',
 'https://www.suthra.com/',
 'https://www.growsutra.com/']

In [4]:
s = '''json
[
"https://growthsutras.com/",
"https://www.sutramanagement.com/",
"https://www.youtube.com/@growthsutras"
]
'''
import json
import re

def extract_list_from_string(s):
    # Remove any prefix like 'json' and extract the JSON array part
    match = re.search(r"\[.*\]", s, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            print("Failed to parse list.")
    else:
        print("No list found.")
    return None



In [13]:
from langchain_google_genai import ChatGoogleGenerativeAI
from WebScraper.state import User
from typing import Optional
import json
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


llm = ChatGoogleGenerativeAI(model = 'gemini-2.0-flash')


template = '''
You are a smart autocompletion assistant for writing professional sales proposals.

Based on the given context and company details, generate a list of 3 high-quality autocompletion suggestions. Each suggestion should:
- Start from the last significant word or phrase in the context.
- Be a natural and extended continuation (at least 6-7 sentences).
- Be relevant to the context and company details.
- Maintain a persuasive, business-oriented tone suitable for formal proposals.

Return only a valid Python list of 3 strings. Each string should be a longer paragraph-style continuation. Do not include any explanation, markdown, or other formatting.

Context:
{text}

Company Details:
{company_details}
'''



prompt = ChatPromptTemplate.from_template(template)



chain = prompt | llm | StrOutputParser()


def get_recommendation(text:str , org : Optional[User]):
    result = chain.invoke({"text":text , "company_details":org}) # returns list of 3 items 
    result = result.strip()
    if result.startswith('```python'):
        result = result[len('```python'):].strip()
    elif result.startswith('```'):
        result = result[len('```'):].strip()
    if result.endswith('```'):
        result = result[:-3].strip()

    return json.loads(result)
 


In [14]:
result = get_recommendation("Shareholders",org = None)

In [15]:
result

["Shareholders are a crucial component of any successful enterprise, and we understand the importance of delivering consistent value and transparent communication. Our proposed solution is designed not only to enhance operational efficiency and profitability but also to provide clear, concise reporting that keeps shareholders informed and confident in the company's direction. We believe that by implementing these strategies, we can foster a stronger relationship between the company and its shareholders, leading to increased investment and long-term stability. This approach prioritizes sustainable growth and responsible governance, ensuring that the interests of all stakeholders are aligned. Furthermore, our team is committed to providing ongoing support and guidance to ensure a smooth transition and continued success in achieving these objectives, ultimately maximizing shareholder value.",
 'Shareholders expect a return on their investment, and our proposal directly addresses this expe

In [None]:
from SearchAndRecommendation.url_recommendation.url_utils import get_urls

await get_urls("IIIT KOTA")

In backend ['https://www.iiitkota.ac.in/', 'https://tpcell.iiitkota.ac.in/', 'https://iiitkota.ac.in/department']


['https://www.iiitkota.ac.in/',
 'https://tpcell.iiitkota.ac.in/',
 'https://iiitkota.ac.in/department']

In [3]:
from SearchAndRecommendation.url_recommendation.url_utils import get_urls_from_company_name

result = await get_urls_from_company_name("Growth sutra")

In backend ['https://growthsutras.com/', 'https://www.growsutra.com/', 'https://www.pensutra.com/']


In [6]:
result

['https://growthsutras.com/',
 'https://www.growsutra.com/',
 'https://www.sutra.hr/']

In [1]:
from SearchAndRecommendation.prompt_suggestion.recommend import get_recommendation

get_recommendation("Shareholders are very")

ModuleNotFoundError: No module named 'chains'

In [1]:
from WebScraper.scrape import get_data

result = await get_data("https://growthsutras.com")

Successfully scraped : '


 [
    {
        "name": "GROWTHSUTRA LLP",
        "logo": null,
        "description": "The provided text does not contain a detailed description of GrowthSutra LLP.  The website focuses on services offered (Go-To-Market Partner, On-Demand CMO, Revenue Architect, XPRT Co-Pilots) and events, but lacks a dedicated 'About Us' section providing background information or company mission.",
        "services": [
            "Go-To-Market Partner",
            "On-Demand CMO",
            "Revenue Architect",
            "XPRT Co-Pilots"
        ],
        "error": false
    },
    {
        "name": "GrowthSutra",
        "logo": "https://static.wixstatic.com/media/cb6b3d_5c8f2b020ebe48b69bc8c163cc480156~mv2.png/v1/fill/w_60,h_60,al_c,q_85,usm_0.66_1.00_0.01,enc_avif,quality_auto/GrowthSutra%20Logo.png",
        "description": "At GrowthSutra, we are the go-to experts dedicated to accelerate brand and revenue growth for startups and SMBs. We provide the Fortune 50

In [2]:
result

User(name='GROWTHSUTRA LLP', logo='https://static.wixstatic.com/media/cb6b3d_5c8f2b020ebe48b69bc8c163cc480156~mv2.png/v1/fill/w_60,h_60,al_c,q_85,usm_0.66_1.00_0.01,enc_avif,quality_auto/GrowthSutra%20Logo.png', description="At GrowthSutra, we are the go-to experts dedicated to accelerate brand and revenue growth for startups and SMBs. We provide the Fortune 500-caliber strategic thinking and flawless execution needed to gain market access, customer traction, and investor interest. Our team is comprised of seasoned marketing, communications, sales, and leadership executives, each with over 20+ years of real-world expertise launching and scaling disruptive brands across technology, e-commerce, climate, FMCG, and other major industries. We combine our team's unmatched experience with rigorous project governance and proven data-driven frameworks to get you measurable results fast. Our approach is tailored to each client's unique needs and objectives.", services=['Go-To-Market Partner', 'O

In [1]:
with open("requirements.txt", "r") as f:
    lines = f.readlines()

with open("clean_requirements.txt", "w") as f:
    for line in lines:
        if "@" not in line:
            f.write(line)


In [3]:
from serpapi import GoogleSearch # type: ignore
import os 
from dotenv import load_dotenv
load_dotenv()

SERP = os.getenv('SERPAPI_API_KEY')

def geturls(name):
    params = {
        "engine": "google",
        "q":f"Get the urls like official link of the webiste of the organization {name}", 
        "api_key": SERP,    # your SerpAPI key as a string or env var
        "num": 5           # get top 5 results
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    return results





In [None]:
a = {'a':1,'b':2}
a.

<function dict.keys()>

In [2]:
import timeit
from SearchAndRecommendation.websiterecommendation.url_utils import get_urls
start = timeit.timeit()
result = await get_urls("Growth sutra")
end = timeit.timeit()
print(end*1000-start*1000)

[
"https://www.growthsutras.com/",
"https://sutramgmt.com/",
"https://www.itsutra.com/",
"https://sutra.co/",
"https://www.growthsutras.com/team",
"https://www.growthsutras.com/contact",
"https://sutramgmt.com/contact"
]
5.81877199988412


In [7]:
import timeit 


start = timeit.timeit()
result = geturls("Growth sutra")
end = timeit.timeit()
print(end-start)

-0.005855645000337972
