In [3]:
import requests
from bs4 import BeautifulSoup
import re, json, time
from urllib.parse import urljoin, urlparse
from datetime import datetime
from collections import deque

In [4]:
PRIORITY_KEYWORDS = ["about","company","product","pricing","contact","careers"]
MAX_PAGES = 14 #As prescribes before in the documents as limited pages so it's set upto 10 - 15 pages
TIMEOUT = (5, 10)
HEADERS = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
print(f"Max pages:{MAX_PAGES},\nPriority:{PRIORITY_KEYWORDS[:7]}")

Max pages:14,
Priority:['about', 'company', 'product', 'pricing', 'contact', 'careers']


In [23]:
# Give Your Url at here for the "url"
#url="https://en.wikipedia.org/wiki/Cognizant"  # Enter your URL here 
url=input('enter your url here or paste here')   

enter your url here or paste here https://www.truemeds.in/


In [24]:
def fetch_url(url):
    for attempt in range(3):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            resp.raise_for_status()
            return resp.text
        except:
            if attempt == 2:
                return None
            time.sleep(1)
    return None

# Testing the fetcher
test_html = fetch_url(url)
print(f"   Fetched {len(test_html) if test_html else 0} chars from url")


   Fetched 914914 chars from url


In [25]:
#implementing and verifying the crawler work
def priority_score(url):
    path = urlparse(url).path.lower()
    return sum(3 for kw in PRIORITY_KEYWORDS if kw in path)

def crawl(start_url):
    queue = deque([start_url])
    visited = set()
    pages = {}
    errors = []
    
    while queue and len(visited) < MAX_PAGES:
        # Pick highest priority URL
        url = max(queue, key=priority_score)
        queue.remove(url)
        
        if url in visited: continue
            
        print(f"[{len(visited)+1}/{MAX_PAGES}] {url}")
        html = fetch_url(url)
        
        if html is None:
            errors.append({"url": url, "error": "failed"})
            visited.add(url)
            continue
            
        visited.add(url)
        pages[url] = html
        
        # Finding thed internal links ok the url given
        soup = BeautifulSoup(html, 'html.parser')
        for a in soup.find_all('a', href=True):
            href = urljoin(url, a['href'])
            if (urlparse(href).netloc == urlparse(start_url).netloc and 
                href not in visited and href not in queue):
                queue.append(href)
    
    return pages, list(visited),errors



In [26]:
pages, visited, errors = crawl(url)
print(f"âœ… Crawled {len(pages)} pages, {len(errors)} errors")

[1/14] https://www.truemeds.in/
[2/14] https://www.truemeds.in/blog/category/well-being/pregnancy-reproduction
[3/14] https://www.truemeds.in/about-us
[4/14] https://www.truemeds.in/blog/category/lifestyle/product-reviews
[5/14] https://www.truemeds.in/all-medicine-list
[6/14] https://www.truemeds.in/categories/personal-care-1
[7/14] https://www.truemeds.in/categories/personal-care/skin-care-125
[8/14] https://www.truemeds.in/categories/personal-care/hair-care-127
[9/14] https://www.truemeds.in/categories/personal-care/baby-and-mom-care-122
[10/14] https://www.truemeds.in/categories/personal-care/sexual-wellness-123
[11/14] https://www.truemeds.in/categories/personal-care/oral-care-128
[12/14] https://www.truemeds.in/categories/personal-care/elderly-care-124
[13/14] https://www.truemeds.in/categories/personal-care/skin-care/skin-cream-60
[14/14] https://www.truemeds.in/categories/personal-care/skin-care/sunscreen-61
âœ… Crawled 14 pages, 0 errors


In [29]:
# Making Extracters to work properly and helping them with the initiations 
def extract_contacts(text):
    emails = re.findall(r'[\w\.-]+@[\w\.-]+\w+', text)
    phones = re.findall(r'(\+?\d[\d\s\-\(\)]{7,})', text)
    date_pattern= [r'\d{1,2}-\d{1,2}-\d{4}',  
        r'\d{4}-\d{1,2}-\d{1,2}',  
        r'\d{2}/\d{2}/\d{4}'       
    ]   
    return list(set(emails))[:5], [p.strip() for p in phones if len(p)>9][:3]

def extract_social(soup):
    social = {}
    for a in soup.find_all('a', href=True):
        href = a['href'].lower()
        if 'linkedin' in href: social['linkedin'] = a['href']
        if 'twitter.com' in href or 'x.com' in href: social['twitter'] = a['href']
    return social


In [30]:
def build_json(pages, visited, errors, start_url):
    # Creating the JSON files based on prefered strucure
    record = {
        "identity": {"name": None, "url": start_url, "tagline": None},
        "business": {"description": [], "offerings": []},
        "contact": {"emails": [], "phones": []},
        "pages": {"key_pages": {}, "social": {}},
        "meta": {
            "timestamp": datetime.utcnow().isoformat(),
            "pages_crawled": visited,
            "errors": errors
        }
    }
    
    # Extracting the information from all pages as extracted
    all_emails,all_phones,all_social =[],[],{}
    
    for url, html in pages.items():
        soup = BeautifulSoup(html,'html.parser')
        
        # Extracting Company name from title
        title = soup.title.string.strip() if soup.title else ""
        if not record["identity"]["name"] and title:
            record["identity"]["name"] = title.split(' | ')[0]
        
        # Extracting Contacts info
        text = soup.get_text()
        emails, phones = extract_contacts(text)
        all_emails.extend(emails)
        all_phones.extend(phones)
        
        #Extracting Social info
        all_social.update(extract_social(soup))
        
        # Key_pages
        path = urlparse(url).path.lower()
        for kw in PRIORITY_KEYWORDS:
            if kw in path:
                record["pages"]["key_pages"][kw] = url
    
    # Finalizing the tracker in order to give prior responses
    record["contact"]["emails"] = list(set(all_emails))
    record["contact"]["phones"] = list(set(all_phones))
    record["pages"]["social"] = all_social
    
    return record


In [31]:
def scrape_company(url):
    """Step 7: COMPLETE PIPELINE"""
    print(f"\nðŸŽ¯ Scraping {url}")
    pages, visited, errors = crawl(url)
    record = build_json(pages, visited, errors, url)
    
    # Save JSON file
    filename = f"{urlparse(url).netloc}_profile.json"
    with open(filename, 'w') as f:
        json.dump(record, f, indent=2)
    
    print(f"Saved: {filename}")
    return record


In [32]:
#By here the scraper creates the json files which required for the inspection using the fetcher 
summary=scrape_company(url)
print("Files created:")


ðŸŽ¯ Scraping https://www.truemeds.in/
[1/14] https://www.truemeds.in/
[2/14] https://www.truemeds.in/blog/category/well-being/pregnancy-reproduction
[3/14] https://www.truemeds.in/about-us
[4/14] https://www.truemeds.in/blog/category/lifestyle/product-reviews
[5/14] https://www.truemeds.in/all-medicine-list
[6/14] https://www.truemeds.in/categories/personal-care-1
[7/14] https://www.truemeds.in/categories/personal-care/skin-care-125
[8/14] https://www.truemeds.in/categories/personal-care/hair-care-127
[9/14] https://www.truemeds.in/categories/personal-care/baby-and-mom-care-122
[10/14] https://www.truemeds.in/categories/personal-care/sexual-wellness-123
[11/14] https://www.truemeds.in/categories/personal-care/oral-care-128
[12/14] https://www.truemeds.in/categories/personal-care/elderly-care-124
[13/14] https://www.truemeds.in/categories/personal-care/skin-care/skin-cream-60
[14/14] https://www.truemeds.in/categories/personal-care/skin-care/sunscreen-61


  "timestamp": datetime.utcnow().isoformat(),


Saved: www.truemeds.in_profile.json
Files created:


In [33]:
def show_summary(record):
    print("\n" + "="*50)
    print("Company Summary")
    print("="*50)
    print(f"Name: {record['identity']['name'] or 'Not found'}")
    print(f"Emails: {len(record['contact']['emails'])}")
    print(f"Phones: {len(record['contact']['phones'])}")
    print(f"Pages: {len(record['meta']['pages_crawled'])}")
show_summary(summary)


Company Summary
Name: Order Medicine Online
Emails: 2
Phones: 3
Pages: 14
