In [None]:
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Landing page of the business directory; the category headings and
# subcategory lists processed further down are read from its <main> element.
url = 'https://www.nepalyp.com/browse-business-directory'
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail right here with a clear HTTP error instead of parsing an error page
# and crashing later on a missing <main> element.
response.raise_for_status()
soup = bs(response.content, 'html.parser')

content = soup.find('main')

def fetch_page(url, timeout=30):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body parsed with the 'html.parser' backend. The HTTP
        status code is not checked, so an error page is parsed like any
        other page.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return bs(response.content, 'html.parser')

def _text_or_nan(tag):
    """Return the stripped text of ``tag``, or 'NaN' when the tag is missing."""
    return tag.text.strip() if tag is not None else 'NaN'


def _joined_lines(tag):
    """Join the non-blank text lines of ``tag`` with ', ' ('NaN' if missing)."""
    if tag is None:
        return 'NaN'
    lines = (line.strip() for line in tag.get_text(separator='\n').split('\n'))
    return ", ".join(line for line in lines if line)


def fetch_company_data(url, timeout=30):
    """Scrape the details of a single company page.

    Args:
        url: Address of the company page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``None`` when the response status is not 200. Otherwise a dict that
        is empty when the page has no ``div#company_item`` element, and that
        otherwise holds the keys 'name', 'location', 'website',
        'contact_numbers', 'map_coordinates' and 'Mobile Numbers'. A field
        whose element is absent from the page is stored as the string 'NaN'.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    page_soup = bs(response.content, 'html.parser')
    company_item_div = page_soup.find('div', {'id': 'company_item'})
    company_data = {}
    if company_item_div is None:
        return company_data

    company_data['name'] = _text_or_nan(
        company_item_div.find('b', id='company_name'))
    company_data['location'] = _text_or_nan(
        company_item_div.find('div', {'class': 'text location'}))
    # Test the same element that is read: checking for one class string and
    # reading from another could dereference None.
    company_data['website'] = _text_or_nan(
        company_item_div.find('div', {'class': 'text weblinks'}))

    # Phone numbers are separate text lines inside a single div.
    company_data['contact_numbers'] = _joined_lines(
        company_item_div.find('div', class_='text phone'))

    # Coordinates are only meaningful when both attributes are present;
    # otherwise the literal text "None,None" would be stored.
    map_canvas_div = company_item_div.find('div', id='map_canvas')
    latitude = longitude = None
    if map_canvas_div is not None:
        latitude = map_canvas_div.get('data-map-ltd')
        longitude = map_canvas_div.get('data-map-lng')
    if latitude is not None and longitude is not None:
        company_data['map_coordinates'] = f'{latitude},{longitude}'
    else:
        company_data['map_coordinates'] = 'NaN'

    # Mobile numbers sit in the 'text' div that follows the 'Mobile phone'
    # label. `string=` is the current name of the deprecated `text=` filter.
    label_div = company_item_div.find('div', class_='label', string='Mobile phone')
    numbers_div = None
    if label_div is not None:
        numbers_div = label_div.find_next_sibling('div', class_='text')
    company_data['Mobile Numbers'] = _joined_lines(numbers_div)

    return company_data

def fetch_data(url, max_workers=10):
    """Collect company data from a listing page and all of its next pages.

    Starting at ``url``, each listing page is fetched, the company links in
    its ``div#listings`` element (advertisement rows excluded) are scraped
    concurrently, and the 'next' pagination link is followed until there is
    none.

    Args:
        url: Address of the first listing page.
        max_workers: Number of threads used to fetch company pages.

    Returns:
        A list with one entry per successfully requested company page, each
        being whatever ``fetch_company_data`` returned (a dict, or ``None``
        for a non-200 response). Entries are appended in completion order,
        not in page order.
    """
    data_list = []
    while url:
        try:
            soup = fetch_page(url)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of losing it all.
            tqdm.write(f"Could not fetch listing page {url}: {exc}")
            break

        listings = soup.find('div', attrs={'id': 'listings'})
        if listings is None:
            # Not a listing page (e.g. an error page): nothing to follow.
            break

        company_links = []
        for row in listings.select('div.company:not(.company_ad)'):
            heading = row.find('h4')
            anchor = heading.a if heading is not None else None
            href = anchor.get('href') if anchor is not None else None
            if href:
                company_links.append('https://www.nepalyp.com{}'.format(href))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {executor.submit(fetch_company_data, link): link for link in company_links}
            for future in tqdm(as_completed(future_to_url), total=len(company_links), desc="Fetching company data"):
                try:
                    data_list.append(future.result())
                except requests.RequestException as exc:
                    # One unreachable company page must not abort the crawl.
                    tqdm.write(f"Could not fetch {future_to_url[future]}: {exc}")

        # Explicit None checks replace the former bare `except:`, which also
        # swallowed unrelated errors such as KeyboardInterrupt.
        pager = soup.find('div', class_='pages_container')
        next_link = None
        if pager is not None:
            next_link = pager.find('a', class_='pages_arrow', rel='next')
        next_href = next_link.get('href') if next_link is not None else None
        url = 'https://www.nepalyp.com{}'.format(next_href) if next_href else None

    return data_list

# One dict per category heading:
# {"category": <name>, "subcategories": [{<subcategory name>: [<company>, ...]}, ...]}
category_data = []

for category in tqdm(content.find_all('h2', class_='cath2'), desc="Fetching categories"):
    category_name = category.text.strip()
    category_dict = {"category": category_name, "subcategories": []}

    # A heading without a following subcategory list yields no subcategories
    # instead of crashing on None.
    subcategory_list = category.find_next_sibling('ul', class_='cat_list')
    list_items = subcategory_list.find_all('li') if subcategory_list is not None else []

    for li in tqdm(list_items, desc=f"Fetching subcategories for {category_name}"):
        # The first non-blank text line of the item is the subcategory name.
        text_lines = [line.strip() for line in li.get_text(separator='\n').split('\n') if line.strip()]
        if not text_lines:
            continue
        subcategory_name = text_lines[0]

        a_href = li.a.get('href') if li.a is not None else None
        category_dict["subcategories"].append({subcategory_name: []})
        if a_href:
            url = f'https://www.nepalyp.com{a_href}'
            category_dict['subcategories'][-1][subcategory_name] = fetch_data(url)
    category_data.append(category_dict)

# Persist the scraped data: one row per entry of category_data.
df = pd.DataFrame(category_data)
df.to_csv('business_data.csv', index=False)

# Write one pretty-printed JSON array. `lines=True` is deliberately not
# combined with `indent`: JSON Lines requires each record on a single
# physical line, which indentation breaks.
df.to_json('business_data.json', orient='records', indent=4)
