In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from datetime import timedelta

## Request and Fetch the Webpage

In [4]:
# hit "www.ambitionbox.com/list-of-companies"
requests.get("https://www.ambitionbox.com/list-of-companies?page=1")

<Response [200]>

In [5]:
# webpage's robot.txt doesn't allow bots!
requests.get("https://www.ambitionbox.com/list-of-companies?page=1").text

'<!doctype html>\n<html data-n-head-ssr lang="en" data-n-head="%7B%22lang%22:%7B%22ssr%22:%22en%22%7D%7D">\n  <head >\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"> \n    <title>List of companies - 486.3k companies | AmbitionBox</title><meta data-n-head="ssr" name="copyright" content="2020 AmbitionBox"><meta data-n-head="ssr" name="revisit-after" content="1 day"><meta data-n-head="ssr" name="application-name" content="AmbitionBox"><meta data-n-head="ssr" name="content-language" content="EN"><meta data-n-head="ssr" name="google-signin-client_id" content="462822053404-hphug4pkahqljh2tc96g35at47o4isv2.apps.googleusercontent.com"><meta data-n-head="ssr" property="fb:app_id" content="712617688793459"><meta data-n-head="ssr" name="theme-color" content="#5670fb"><meta data-n-head="ssr" name="msapplication-navbutton-color" content="#5670fb"><meta data-n-head="ssr" name="apple-mobile

In [6]:
# google chrome browser's request header (to make it look like, we are making this request from a browser)
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

In [7]:
# hit using the header
response = requests.get("https://www.ambitionbox.com/list-of-companies?page=1", headers=header)

In [8]:
# see the recieved page source
response.text[0:500]

'<!doctype html>\n<html data-n-head-ssr lang="en" data-n-head="%7B%22lang%22:%7B%22ssr%22:%22en%22%7D%7D">\n  <head >\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"> \n    <title>List of companies - 486.3k companies | AmbitionBox</title><meta data-n-head="ssr" name="copyright" content="2020 AmbitionBox"><meta data-n-head="ssr" name="revisit-after" content="1 day"><meta data-n-head="ssr" name='

## Pass the fetched webpage response to Beautiful Soup

In [9]:
# give the webpage to Beautiful Soup using parsers: "html.parser" or "lxml"
soup = BeautifulSoup(response.text, 'lxml')

In [10]:
# we see the whole webpage is made of cards and each card has the company info
# on inspecting, we see the cards are HTML: "div"s with class-name = "company-content-wrapper"
# Let us extract the first card and see how we can extract data from it...

first_company_card = soup.find("div", class_="company-content-wrapper")

In [11]:
# let's see what we got here...
print(first_company_card.prettify())

<div class="company-content-wrapper">
 <div class="company-content">
  <div class="company-logo">
   <img alt="Tata Consultancy Services logo" class="lazy" data-src="https://static.ambitionbox.com/alpha/company/photos/logos/tcs.jpg" onerror="this.onerror=null;this.src='/static/icons/company-placeholder.svg';" src="https://static.ambitionbox.com/static/icons/company-placeholder.svg"/>
  </div>
  <div class="company-info-wrapper">
   <div class="company-info">
    <div class="left">
     <a href="/overview/tcs-overview">
      <h2 class="company-name bold-title-l" title="TCS">
       TCS
      </h2>
     </a>
     <div class="rating-wrapper">
      <p class="rating badge-large rating-4">
       <i class="icon icon-star">
       </i>
       4.0
      </p>
      <a class="review-count sbold-Labels" href="https://www.ambitionbox.com/reviews/tcs-reviews">
       (17.7k Reviews)
      </a>
     </div>
    </div>
    <button class="ab_btn follow-btn invert round">
     <span class="ctas-btn-me

## Let us try and extract the data from a single company card

##  [1] Company Name

In [12]:
# 1. company name is inside h2 tag: <h2 class="company-name bold-title-l" title="TCS">
first_company_card.find("h2")

<h2 class="company-name bold-title-l" title="TCS">
								TCS
							</h2>

In [13]:
# extract company name
first_company_card.find("h2").text.strip()

'TCS'

##  [2] Company Rating

In [14]:
# 2. company rating lies inside p tag: <p class="rating badge-large rating-35">
first_company_card.find("p", class_="rating")

<p class="rating badge-large rating-4"><i class="icon icon-star"></i>
								4.0
							</p>

In [15]:
# extract company rating
first_company_card.find("p", class_="rating").text.strip()

'4.0'

##  [3] Number of Company Reviews

In [16]:
# 3. number of company reviews lies inside a tag: <a class="review-count sbold-Labels">
first_company_card.find("a", class_="review-count sbold-Labels")

<a class="review-count sbold-Labels" href="https://www.ambitionbox.com/reviews/tcs-reviews">
								(17.7k Reviews)
							</a>

In [17]:
# extract number of company reviews
first_company_card.find("a", class_="review-count sbold-Labels").text.strip()

'(17.7k Reviews)'

In [18]:
first_company_card.find("a", class_="review-count sbold-Labels").text.strip().replace(" Reviews", "")

'(17.7k)'

## [4] Domain,  [5] Location,  [6] Years Old,  [7] Employee Strength

In [19]:
# Now this is tricky!
# extract "infoEntity" containing: 
# 4. 'domain', 
# 5. 'location', 
# 6. 'years old'
# 7. 'employee strength'

In [20]:
# Try 1:
first_company_card.find_all("p", class_="infoEntity sbold-list-header")

[<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
 						Public
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-pin-drop"></i>
 						Mumbai,Maharashtra + 165 more
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-access-time"></i>
 						52 years old
 					</p>]

In [21]:
first_company_card.find_all("p", class_="infoEntity sbold-list-header")[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
						Public
					</p>

In [22]:
inner_company_info_list = first_company_card.find_all("p", class_="infoEntity sbold-list-header")
inner_company_info_list

[<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
 						Public
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-pin-drop"></i>
 						Mumbai,Maharashtra + 165 more
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-access-time"></i>
 						52 years old
 					</p>]

In [23]:
inner_company_info_list[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
						Public
					</p>

In [24]:
inner_company_info_list[3].findChildren("i")[0]["class"][0]

IndexError: list index out of range

In [25]:
inner_company_info_list[3].text.strip()

IndexError: list index out of range

In [26]:
# let's try using it's parent tag: div
inner_company_info_card = first_company_card.find("div", class_="company-basic-info")
print(inner_company_info_card.prettify())

<div class="company-basic-info">
 <p class="infoEntity sbold-list-header">
  <i class="icon-domain">
  </i>
  Public
 </p>
 <p class="infoEntity sbold-list-header">
  <i class="icon-pin-drop">
  </i>
  Mumbai,Maharashtra + 165 more
 </p>
 <p class="infoEntity sbold-list-header">
  <i class="icon-access-time">
  </i>
  52 years old
 </p>
 <!-- -->
</div>



In [27]:
inner_company_info_card.findChildren("i")

[<i class="icon-domain"></i>,
 <i class="icon-pin-drop"></i>,
 <i class="icon-access-time"></i>]

In [28]:
inner_company_info_card.findChildren("i")[0]["class"][0]

'icon-domain'

In [29]:
inner_company_info_card.find_all("p")[0].text.strip()

'Public'

In [30]:
# ignore
first_company_card.find_all("p", class_="infoEntity sbold-list-header")[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
						Public
					</p>

In [31]:
# extract "infoEntity" containing 'domain', 'location', 'years old' & 'employee strength'

info_list = first_company_card.find_all("p", class_="infoEntity sbold-list-header")
dom = None
loc = None
old = None
emp = None

for i in range(4):
    if info_list[i].findChildren("i")[0]["class"][0] == 'icon-domain':
        dom = info_list[i].text.strip()
        
    if info_list[i].findChildren("i")[0]["class"][0] == 'icon-pin-drop':
        loc = info_list[i].text.strip()
        
    if info_list[i].findChildren("i")[0]["class"][0] == 'icon-access-time':
        old = info_list[i].text.strip()
        
    if info_list[i].findChildren("i")[0]["class"][0] == 'icon-supervisor-account':
        emp = info_list[i].text.strip()
    
print("domain:", dom)
print("location:", loc)
print("years old:", old)
print("employee strength:", emp)

IndexError: list index out of range

##  [8] Company Tags

In [32]:
# 8. company tags are inside a tags: <a class="ab_chip">
first_company_card.find_all("a", class_="ab_chip")

[<a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_it-services-and-consulting" href="/it-services-and-consulting-companies-in-india" title="IT Services &amp; Consulting companies in India">
 						IT Services &amp; Consulting
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_fortune500" href="/fortune500-companies-in-india" title="Fortune500 companies in India">
 						Fortune500
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_forbesglobal2000" href="/forbesglobal2000-companies-in-india" title="Forbes Global 2000 companies in India">
 						Forbes Global 2000
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_public" href="/public-companies-in-india" title="Public companies in India">
 						Public
 					</a>]

In [33]:
# extract company tags
tags = []
for tag in first_company_card.find_all("a", class_="ab_chip"):
    tags.append(tag.text.strip())
tags

['IT Services & Consulting', 'Fortune500', 'Forbes Global 2000', 'Public']

In [34]:
tags = ', '.join(tags)
tags

'IT Services & Consulting, Fortune500, Forbes Global 2000, Public'

##  [9] Company Description

In [35]:
# 9. company description is inside p tag: <p class="description">
first_company_card.find("p", class_="description")

<p class="description body-small" itemprop="description">
				Tata Consultancy Services is an IT services, consulting and business solutions organisation that has been partnering with the world’s largest businesses in their transformation journeys for the last 50 years. 

A part of the Tata group, India's largest multinational business group, TCS has over 436,000 of the world’s best-trained consultants in 46 countries. The company is listed on the BSE (formerly Bombay Stock Exchange) and the NSE (National Stock Exchange) in India. 

TCS'​ proactive stance on climate change and award winning work with communities across the world have earned it a place in leading sustainability indices such as the Dow Jones Sustainability Index (DJSI), MSCI Global Sustainability Index and the FTSE4Good Emerging Index. 

			</p>

In [36]:
# extract company description
first_company_card.find("p", class_="description").text.strip()

"Tata Consultancy Services is an IT services, consulting and business solutions organisation that has been partnering with the world’s largest businesses in their transformation journeys for the last 50 years. \n\nA part of the Tata group, India's largest multinational business group, TCS has over 436,000 of the world’s best-trained consultants in 46 countries. The company is listed on the BSE (formerly Bombay Stock Exchange) and the NSE (National Stock Exchange) in India. \n\nTCS'\u200b proactive stance on climate change and award winning work with communities across the world have earned it a place in leading sustainability indices such as the Dow Jones Sustainability Index (DJSI), MSCI Global Sustainability Index and the FTSE4Good Emerging Index."

## Scraping a Single WebPage

In [37]:
# find all the company cards in the webpage (HTML divs that encloses data about each company)
company_cards = soup.find_all("div", class_="company-content-wrapper")
len(company_cards)

30

In [38]:
%%time

name = []
rating = []
reviews = []
domain = []
location = []
years_old = []
employee_strength = []
tags = []
about = []

for card in company_cards:
    # 1. name
    name.append(card.find("h2").text.strip())
    
    # 2. rating
    rating.append(card.find("p", class_="rating").text.strip())
    
    # 3. reviews
    reviews.append(card.find("a", class_="review-count sbold-Labels").text.strip().replace(" Reviews", ""))
    
    # 4. domain, 5. location, 6. years old & 7. employee strength
    info_list = card.find_all("p", class_="infoEntity sbold-list-header")
    dom = None
    loc = None
    old = None
    emp = None
    for i in range(4):
        try:
            if info_list[i].findChildren("i")[0]["class"][0] == 'icon-domain':
                dom = info_list[i].text.strip()

            if info_list[i].findChildren("i")[0]["class"][0] == 'icon-pin-drop':
                loc = info_list[i].text.strip()

            if info_list[i].findChildren("i")[0]["class"][0] == 'icon-access-time':
                old = info_list[i].text.strip()

            if info_list[i].findChildren("i")[0]["class"][0] == 'icon-supervisor-account':
                emp = info_list[i].text.strip()
        except:
            pass
    
    domain.append(dom)
    location.append(loc)
    years_old.append(old)
    employee_strength.append(emp)
    
    # 8. tags
    t = []
    for tag in card.find_all("a", class_="ab_chip"):
        t.append(tag.text.strip())
    t = ', '.join(t)
    tags.append(t)
    
    # 9. about
    about.append(card.find("p", class_="description").text.strip())
    
col_dic = {
    "name": name,
    "rating": rating,
    "reviews": reviews,
    "domain": domain,
    "location": location,
    "years_old": years_old,
    "employee_strength": employee_strength,
     "tags": tags,
    "about": about
}

df = pd.DataFrame(col_dic)

Wall time: 250 ms


In [39]:
df

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
0,TCS,4.0,(17.7k),Public,"Mumbai,Maharashtra + 165 more",52 years old,,"IT Services & Consulting, Fortune500, Forbes G...","Tata Consultancy Services is an IT services, c..."
1,Accenture,4.0,(15.5k),Private,Dublin + 95 more,31 years old,10000+ employees,"Consulting, IT / ITES, MNC, Forbes Global 2000",Accenture is a leading global professional ser...
2,ICICI Bank,4.1,(14.3k),Public,Mumbai + 728 more,26 years old,10000+ employees,"Banking / Insurance / Accounting, Financial Se...",ICICI Bank is India's largest private sector b...
3,Cognizant,3.9,(13.3k),Private,Teaneck + 48 more,26 years old,10000+ employees,"IT / ITES, MNC, Forbes Global 2000",Cognizant (NASDAQ-100: CTSH) is one of the wor...
4,HDFC Bank,4.0,(12.5k),Public,"Mumbai,Maharashtra + 726 more",26 years old,,"Financial Services, Banking, Insurance, Fortun...",HDFC Bank is one of India’s leading private ba...
5,Capgemini,3.4,(10.3k),Private,Paris + 47 more,53 years old,10000+ employees,"Computer Software, Consulting, Internet, IT / ...","Capgemini is a global leader in consulting, di..."
6,Tech Mahindra,3.6,(10.2k),Public,Pune + 161 more,34 years old,10000+ employees,"Consulting, IT / ITES, Analytics / BPM / Resea...","Tech Mahindra represents the connected world, ..."
7,HCL Technologies,3.7,(9.5k),Public,"Noida,Uttar Pradesh + 111 more",14 years old,,"Computer Software, Consulting, Financial Servi...",HCL Technologies is a next-generation global t...
8,L&T,4.1,(9.4k),Public,Mumbai + 339 more,82 years old,10000+ employees,"Industrial Equipment / Machinery, Manufacturin...","Larsen & Toubro is a major technology, enginee..."
9,Infosys,3.9,(8.9k),Public,Bangalore + 67 more,39 years old,10000+ employees,"IT / ITES, MNC, Fortune500, Forbes Global 2000...",Infosys is a global leader in next-generation ...


## Scraping the Whole Website!!

In [40]:
# ALL SET! LET'S PUT EVERYTHING TOGETHER AND SCRAPE THE WHOLE WEBSITE !!!
# Let's not scrape the whole website here... use a python script and run it in the terminal to do so!
# the external python script is attatched with this notebook

In [41]:
start_time = time.time()
dataframe_final = pd.DataFrame()

# 4,86,333 unique companies found / 30 per page = 16,211 pages
# total_number_of_webpages = 16,211
total_number_of_webpages = 10

In [42]:
for page in range(1, total_number_of_webpages+1):
    print("scraping webpage number: {page} of {total}".format(page=page, total=total_number_of_webpages))
    loop_time = time.time()
    
    # set page url and header
    url = "https://www.ambitionbox.com/list-of-companies?page={}".format(page)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
    
    # get page response from the website
    response = requests.get(url, headers=header)
    # time.sleep(0.1)
    
    # pass the page to BeautifulSoup
    soup = BeautifulSoup(response.text, 'lxml')
    
    # find all the company cards from the webpage
    company_cards = soup.find_all("div", class_="company-content-wrapper")
    
    # extract all the required data from each company card and store them in a list
    name = []
    rating = []
    reviews = []
    domain = []
    location = []
    years_old = []
    employee_strength = []
    tags = []
    about = []
    
    # scrap scrap scrap!
    for card in company_cards:
        # 1. name
        try:
            name.append(card.find("h2").text.strip())
        except:
            name.append(None)
            # 2. rating
        try:
            rating.append(card.find("p", class_="rating").text.strip())
        except:
            rating.append(None)

        # 3. reviews
        try:
            reviews.append(card.find("a", class_="review-count sbold-Labels").text.strip().replace(" Reviews", ""))
        except:
            reviews.append(None)

        # 4. domain, 5. location, 6. years old & 7. employee strength
        info_list = card.find_all("p", class_="infoEntity sbold-list-header")
        dom = None
        loc = None
        old = None
        emp = None
        for i in range(4):
            try:
                if info_list[i].findChildren("i")[0]["class"][0] == 'icon-domain':
                    dom = info_list[i].text.strip()

                if info_list[i].findChildren("i")[0]["class"][0] == 'icon-pin-drop':
                    loc = info_list[i].text.strip()

                if info_list[i].findChildren("i")[0]["class"][0] == 'icon-access-time':
                    old = info_list[i].text.strip()

                if info_list[i].findChildren("i")[0]["class"][0] == 'icon-supervisor-account':
                    emp = info_list[i].text.strip()
            except:
                pass
            
        domain.append(dom)
        location.append(loc)
        years_old.append(old)
        employee_strength.append(emp)

        # 8. tags
        t = []
        try:
            for tag in card.find_all("a", class_="ab_chip"):
                t.append(tag.text.strip())
            t = ', '.join(t)
            tags.append(t)
        except:
            tags.append(None)

        # 9. about
        try:
            about.append(card.find("p", class_="description").text.strip())
        except:
            about.append(None)
    
    # make a dictionary containing all the data extracted
    col_dic = {
        "name": name,
        "rating": rating,
        "reviews": reviews,
        "domain": domain,
        "location": location,
        "years_old": years_old,
        "employee_strength": employee_strength,
        "tags": tags,
        "about": about
    }
    
    # pass the dictionary to pandas to create a dataframe (page)
    df = pd.DataFrame(col_dic)
    
    # append the dataframe to the final dataframe (the whole website)
    dataframe_final = dataframe_final.append(df, ignore_index=True)
    
    # success
    print("success!")
    print("time taken:", round((time.time()-loop_time)*1000, 2), "ms")
    print("total time elapsed:", str(timedelta(seconds=(time.time()-start_time))))
    print()

end_time = time.time()
print("full website scraped successfully!")
print("total time taken:", str(timedelta(seconds=(end_time - start_time))))
print()

scraping webpage number: 1 of 10
success!
time taken: 216.93 ms
total time elapsed: 0:03:37.434261

scraping webpage number: 2 of 10
success!
time taken: 294.44 ms
total time elapsed: 0:03:37.728699

scraping webpage number: 3 of 10
success!
time taken: 531.37 ms
total time elapsed: 0:03:38.261064

scraping webpage number: 4 of 10
success!
time taken: 564.94 ms
total time elapsed: 0:03:38.826004

scraping webpage number: 5 of 10
success!
time taken: 478.72 ms
total time elapsed: 0:03:39.304723

scraping webpage number: 6 of 10
success!
time taken: 547.23 ms
total time elapsed: 0:03:39.851951

scraping webpage number: 7 of 10
success!
time taken: 587.68 ms
total time elapsed: 0:03:40.440599

scraping webpage number: 8 of 10
success!
time taken: 201.41 ms
total time elapsed: 0:03:40.642010

scraping webpage number: 9 of 10
success!
time taken: 516.77 ms
total time elapsed: 0:03:41.159779

scraping webpage number: 10 of 10
success!
time taken: 523.43 ms
total time elapsed: 0:03:41.684179


## Let's see what our dataframe looks like

In [43]:
dataframe_final.head()

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
0,TCS,4.0,(17.7k),Public,"Mumbai,Maharashtra + 165 more",52 years old,,"IT Services & Consulting, Fortune500, Forbes G...","Tata Consultancy Services is an IT services, c..."
1,Accenture,4.0,(15.5k),Private,Dublin + 95 more,31 years old,10000+ employees,"Consulting, IT / ITES, MNC, Forbes Global 2000",Accenture is a leading global professional ser...
2,ICICI Bank,4.1,(14.3k),Public,Mumbai + 728 more,26 years old,10000+ employees,"Banking / Insurance / Accounting, Financial Se...",ICICI Bank is India's largest private sector b...
3,Cognizant,3.9,(13.3k),Private,Teaneck + 48 more,26 years old,10000+ employees,"IT / ITES, MNC, Forbes Global 2000",Cognizant (NASDAQ-100: CTSH) is one of the wor...
4,HDFC Bank,4.0,(12.5k),Public,"Mumbai,Maharashtra + 726 more",26 years old,,"Financial Services, Banking, Insurance, Fortun...",HDFC Bank is one of India’s leading private ba...


In [44]:
# Print some statistics about the final dataframe:
print("dataframe shape", dataframe_final.shape)
print()
print("column-wise null count")
print(dataframe_final.isna().sum())
print()

dataframe shape (300, 9)

column-wise null count
name                  0
rating                0
reviews               0
domain               11
location              0
years_old             2
employee_strength    11
tags                  0
about                 2
dtype: int64



In [45]:
dataframe_final.describe()

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
count,300,300.0,300,289,300,298,289,300,298
unique,299,15.0,113,4,288,108,3,270,298
top,Mahindra & Mahindr...,4.2,(1.1k),Private,Mumbai + 122 more,20 years old,10000+ employees,"Banking / Insurance / Accounting, Financial Se...","In 1954, in a newly independent India, the nee..."
freq,2,61.0,29,141,2,15,233,5,1


## Exporting the DataFrame into an external CSV

In [49]:
# export the data to external csv
dataframe_final.to_csv("E:\dataset.csv", encoding="utf-8")