In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import time
import random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
}

In [12]:
company_names = []
ratings = []
reviews = []
salaries = []
interviews = []
locations = []
tags = []

In [13]:
# Scraping pages 1 to 5
for page in range(1, 6):
    url = f"https://www.ambitionbox.com/list-of-companies?page={page}"
    print(f"\n🔍 Scraping page {page}...")

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')

        companies = soup.find_all("div", class_="companyCardWrapper__companyCard")

        if not companies:
            print(f"⚠️ No company data found on page {page}.")
            continue

        for company in companies:
            try:
                name = company.find("h2").text.strip()
            except:
                name = np.nan
            try:
                rating = company.find("span", class_="companyCardWrapper__companyRatingValue").text.strip()
            except:
                rating = np.nan
            try:
                review = company.find("a", class_="companyCardWrapper__reviewsText").text.strip()
            except:
                review = np.nan
            try:
                salary = company.find("a", class_="companyCardWrapper__salariesText").text.strip()
            except:
                salary = np.nan
            try:
                interview = company.find("a", class_="companyCardWrapper__interviewsText").text.strip()
            except:
                interview = np.nan
            try:
                location = company.find("div", class_="companyCardWrapper__location").text.strip()
            except:
                location = np.nan
            try:
                tag = company.find("div", class_="companyCardWrapper__companyTags").text.strip()
            except:
                tag = np.nan

            company_names.append(name)
            ratings.append(rating)
            reviews.append(review)
            salaries.append(salary)
            interviews.append(interview)
            locations.append(location)
            tags.append(tag)

        # Pause to avoid rate-limiting
        delay = random.uniform(2, 5)
        print(f"⏸️ Sleeping for {round(delay, 2)} seconds...")
        time.sleep(delay)

    except requests.exceptions.ReadTimeout:
        print("⏰ Timeout error. Skipping this page.")
        continue
    except requests.exceptions.ConnectionError:
        print("📡 Connection error. Retrying after 5 seconds...")
        time.sleep(5)
        continue
    except requests.exceptions.HTTPError as e:
        print(f"🚫 HTTP error: {e}")
        continue
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        continue


🔍 Scraping page 1...
⏰ Timeout error. Skipping this page.

🔍 Scraping page 2...
⏰ Timeout error. Skipping this page.

🔍 Scraping page 3...
⏰ Timeout error. Skipping this page.

🔍 Scraping page 4...
⏰ Timeout error. Skipping this page.

🔍 Scraping page 5...
⏰ Timeout error. Skipping this page.


In [14]:
# Create a DataFrame
df = pd.DataFrame({
    'Company Name': company_names,
    'Rating': ratings,
    'Reviews': reviews,
    'Salaries': salaries,
    'Interviews': interviews,
    'Location': locations,
    'Tags': tags
})

# Save to CSV
df.to_csv('ambitionbox_companies.csv', index=False)
print("\n✅ Scraping completed. Data saved to 'ambitionbox_companies.csv'.")


✅ Scraping completed. Data saved to 'ambitionbox_companies.csv'.


In [15]:
df.head()

Unnamed: 0,Company Name,Rating,Reviews,Salaries,Interviews,Location,Tags
