In [16]:
import os
import hashlib
import re
import requests as r
from bs4 import BeautifulSoup as bs
from concurrent.futures import ThreadPoolExecutor as te,as_completed 
from concurrent.futures import ThreadPoolExecutor as te
from urllib.parse import urlparse

# Create folder if it doesn't exist in current working directory
CACHE_DIR = "cached_pages"
os.makedirs(CACHE_DIR, exist_ok=True)

def sanitize_filename(url):
    """
    Create a readable filename from a URL (e.g., franchisebazar.com_industry_food.html)
    """
    parsed = urlparse(url)
    domain = parsed.netloc.replace('.', '_')
    path = parsed.path.strip('/').replace('/', '_')
    filename = f"{domain}_{path}.html"
    # Remove illegal characters (Windows safe)
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)
    return filename

def cache_url(url, refresh=False):
    """
    Checks if the URL has been cached. If yes and refresh=False, load from disk.
    If not, fetch and save to disk.
    """
    filename = sanitize_filename(url)
    filepath = os.path.join(CACHE_DIR, filename)

    if os.path.exists(filepath) and not refresh:
        with open(filepath, 'r', encoding='utf-8') as file:
            print(f"[CACHE] Using cached: {filepath}")
            return file.read()

    print(f"[LIVE] Fetching and caching: {url}")
    response = r.get(url, timeout=10)
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(response.text)
    return response.text

In [18]:


url = 'https://www.franchisebazar.com/'
response = cache_url(url)

soup = bs(response,'html.parser')
cards = soup.find("ul", class_="franchise-container open")

industry_links = cards.find_all('li')

industries = {}
for li in industry_links:
    a_tag = li.find('a')
    if a_tag:
        link = a_tag['href']
        industry = a_tag.get_text(strip=True)
        industries[industry]=url+link
print(industries)

[CACHE] Using cached: cached_pages\www_franchisebazar_com_.html
{'Agents, Dealers & Distributors': 'https://www.franchisebazar.com//industry/agents-dealers-distributors', 'Automotive Franchise': 'https://www.franchisebazar.com//industry/automotive-franchise', 'Beauty Franchise': 'https://www.franchisebazar.com//industry/beauty-franchise', 'Beverage Franchise': 'https://www.franchisebazar.com//industry/beverage-franchise', 'Business Services Franchise': 'https://www.franchisebazar.com//industry/business-services-franchise', 'Cleaning Franchise': 'https://www.franchisebazar.com//industry/cleaning-franchise-opportunities', 'Clothing Franchise': 'https://www.franchisebazar.com//industry/clothing-franchise-opportunities', 'Computer & Internet Franchise': 'https://www.franchisebazar.com//industry/computer-internet-franchise', 'Construction Franchise': 'https://www.franchisebazar.com//industry/construction-related-franchise', 'Consultancy Franchise': 'https://www.franchisebazar.com//industry/

In [22]:
def get_industries(name,link):
    try:
        industry_type=name
        url = link
        response = cache_url(url)
        soup = bs(response,'html.parser')
        cards = soup.find('div',class_="row karya")
        #if no output from above code then empty dictionary will be returned
        if not cards:
            return (name,{})
            
        industry_link = cards.find_all('div',class_='investor-card-wrapper')
        
        franchise_details = {}
        for div in industry_link:
            link_tag=div.find('a')
            location_tag=div.find('div',class_='col-lg-8 col-xs-8 text-right')
            name_tag = div.find('div',class_='main-title')
            
            if name_tag and link_tag and location_tag:
                franchise_name = name_tag.get_text(strip=True)
                link = link_tag['href']
                city = location_tag.get_text(strip=True)
                franchise_details[franchise_name]=[url+link,city]
        return industry_type,franchise_details
        
    except Exception as e:
        return (name,{"error":str(e)})
            
industry_franchies = []
with te(max_workers=10) as executor:
    futures=[executor.submit(get_industries,name,link) for name,link in industries.items()]
for future in as_completed(futures):
    industry_franchies.append((future.result()))
    
print(industry_franchies)


[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_agents-dealers-distributors.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_automotive-franchise.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_beauty-franchise.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_beverage-franchise.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_business-services-franchise.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_clothing-franchise-opportunities.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_cleaning-franchise-opportunities.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_construction-related-franchise.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_computer-internet-franchise.html
[CACHE] Using cached: cached_pages\www_franchisebazar_com_industry_consultancy-franchise-opportunities.html
[CACHE