In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import csv
from concurrent.futures import ThreadPoolExecutor
import aiohttp
import asyncio

In [13]:
# Indian cities to search; each entry is interpolated into the listing URL
cities = [
    'delhi',
    'mumbai',
    'bangalore',
    'hyderabad',
    'chennai',
]

In [14]:
# Shared accumulator: one dict per scraped company, across all cities
all_companies_data = list()

In [15]:
# Function to scrape data for one city asynchronously
async def scrape_city_data(city, session):
    """Scrape up to eight Justdial listings for ``city`` and record them.

    Requests the city's "IT-Solution-Providers" page through ``session``,
    parses the listing entries with BeautifulSoup and appends one dict per
    company to the module-level ``all_companies_data`` list.

    Args:
        city: City name interpolated into the Justdial URL path
            (e.g. ``'delhi'``); also stored in each record's ``'City'`` key.
        session: An open ``aiohttp.ClientSession`` (any object whose
            ``get(url, headers=...)`` returns an async context manager
            exposing ``status`` and an awaitable ``text()`` works).

    Returns:
        The list of company dicts scraped for this city. An empty list is
        returned when the page could not be retrieved (non-200 status or a
        transport error); the failure is reported on stdout.
    """
    url = f'https://www.justdial.com/{city}/IT-Solution-Providers'  # Adjust the URL based on the city
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

    def _text_or_na(listing, tag, css_class):
        # Stripped text of the first matching child element, or 'N/A' when
        # the element is missing (find() returns None -> AttributeError).
        try:
            return listing.find(tag, class_=css_class).text.strip()
        except AttributeError:
            return 'N/A'

    # Fetch the page. Only the download happens inside the context manager so
    # the response is released before any parsing or sleeping takes place.
    try:
        async with session.get(url, headers=headers) as response:
            if response.status != 200:
                print(f"Failed to retrieve data for {city} (Status Code: {response.status})")
                return []
            print(f"Scraping data for {city}...")
            html = await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # Treat transport failures like a bad status: report and move on, so
        # one unreachable city does not abort the other concurrent scrapes.
        print(f"Failed to retrieve data for {city} ({exc!r})")
        return []

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Find all company listings (at most 8); adjust based on Justdial's HTML structure
    companies = soup.find_all('li', class_='cntanr')[:8]

    city_data = []
    for company in companies:
        name = _text_or_na(company, 'span', 'lng_cont_name')

        # The site may render a "Show Number" placeholder instead of digits
        phone = _text_or_na(company, 'p', 'contact-info')
        if 'Show Number' in phone:
            phone = 'N/A'

        address = _text_or_na(company, 'span', 'cont_fl_addr')

        # NOTE(review): the URL above targets "IT-Solution-Providers" but the
        # records are labelled 'Marketing Agencies' - confirm which is intended.
        category = 'Marketing Agencies'  # Static for this example
        email = 'N/A'  # Emails are usually not listed on Justdial

        city_data.append({
            'Company Name': name,
            'Contact Number': phone,
            'Location/Address': address,
            'Industry/Category': category,
            'Email Address': email,
            'City': city  # Include the city for reference
        })

    # Publish this city's rows to the shared result list
    all_companies_data.extend(city_data)

    # Random delay to avoid overloading the server; awaited only after the
    # response has been released so no connection is held while sleeping.
    await asyncio.sleep(random.uniform(1, 3))
    return city_data

In [16]:
# Main function to run the scraping tasks in parallel using asyncio
async def main():
    """Scrape every entry of ``cities`` concurrently over one shared HTTP session."""
    async with aiohttp.ClientSession() as http_session:
        # One coroutine per city, all sharing the same session
        pending = [scrape_city_data(name, http_session) for name in cities]

        # Schedule them together and wait until every city has finished
        await asyncio.gather(*pending)

In [17]:
# Run the async scraping tasks directly in the existing event loop
await main()

Scraping data for hyderabad...
Scraping data for bangalore...
Scraping data for mumbai...
Scraping data for chennai...
Scraping data for delhi...


In [18]:
# Optional inspection step: dump each scraped record on its own line
for record in all_companies_data:
    print(f"{record}")

{'Company Name': 'Design Blocks', 'Contact Number': 'N/A', 'Location/Address': 'Main Road, Hitech City, Hyderabad - 500081, Near By D Mart', 'Industry/Category': 'Marketing Agencies', 'Email Address': 'N/A', 'City': 'hyderabad'}
{'Company Name': 'Milestone Corporate Service..', 'Contact Number': 'N/A', 'Location/Address': 'Flat No 9-1-127/3 A 43, SD ROAD, Secunderabad City, Secunderabad - 500003, Near By Deccan Cronicle Office', 'Industry/Category': 'Marketing Agencies', 'Email Address': 'N/A', 'City': 'hyderabad'}
{'Company Name': 'Bharat Gps Tracker', 'Contact Number': 'N/A', 'Location/Address': 'House No 23-49/12, Madhapur, Hyderabad - 500081', 'Industry/Category': 'Marketing Agencies', 'Email Address': 'N/A', 'City': 'hyderabad'}
{'Company Name': 'Adonai Graphics & Media Wor..', 'Contact Number': 'N/A', 'Location/Address': 'Door No: 1-1-571, Gandhi Nagar, Golconda X Roads, Hyderabad - 500020, Beside Hebron Church Near Rtc X Roads', 'Industry/Category': 'Marketing Agencies', 'Email 

In [19]:
# Persist the scraped records to a CSV file on disk
csv_file = 'companies_data.csv'

# Column names for the header row, in output order
headers = [
    'Company Name',
    'Contact Number',
    'Location/Address',
    'Industry/Category',
    'Email Address',
    'City',
]

# newline='' leaves line-ending handling to the csv module
with open(csv_file, 'w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()  # header row first
    for record in all_companies_data:
        writer.writerow(record)  # then one row per company

print(f"Data has been saved to {csv_file}")

Data has been saved to companies_data.csv
