In [1]:
!pip install bs4



You should consider upgrading via the 'c:\python39\python.exe -m pip install --upgrade pip' command.


In [15]:
from bs4 import BeautifulSoup
import requests as req 
import re
import pandas as pd

In [16]:
company_frame = pd.DataFrame(columns=['Name','Address','Website','Revenue','Nb_employees'])
company_frame

Unnamed: 0,Name,Address,Website,Revenue,Nb_employees


# Web Scraping

In [41]:
basis = 'https://en.wikipedia.org/wiki/'
companies = ['Microsoft','Salesforce','Dataiku','HSBC','BNP_Paribas','Bouygues_Construction']

for company in range(len(companies)):
    url = basis + companies[company]
    response = req.request('get',url)
    html = response.text  
    soup = BeautifulSoup(html)

    try:

        #title
        title=f".infobox-title "
        print("title: "+soup.select(title)[0].text)
        company_frame.loc[company,'Name'] = soup.select(title)[0].text

        #other infos
        selector = '.infobox > tbody > tr' #to get each part of the right column
        table_rows    = soup.select(selector)
        rows = (len(table_rows))

        for row in range (rows):
            selector=f".infobox > tbody > tr:nth-child({row}) > .infobox-label " #get the title of the line on the column
            if soup.select(selector) !=[]:
                line_title = soup.select(selector)[0].text

                #If the title correspounds to the info we want, we get all the text in the <td> table element
                if line_title == 'Headquarters':
                    selector=f".infobox > tbody > tr:nth-child({row}) > td "
                    company_frame.loc[company,'Address'] = soup.select(selector)[0].text

                if line_title == 'Revenue':
                    selector=f".infobox > tbody > tr:nth-child({row}) > td "
                    company_frame.loc[company,'Revenue'] = soup.select(selector)[0].text

                if line_title == 'Website':
                    selector=f".infobox > tbody > tr:nth-child({row}) > td "
                    company_frame.loc[company,'Website'] = soup.select(selector)[0].text

                if line_title == 'Number of employees':
                    selector=f".infobox > tbody > tr:nth-child({row}) > td "
                    company_frame.loc[company,'Nb_employees'] = soup.select(selector)[0].text

    except Exception as e:
        print(e)


title: Microsoft Corporation
title: Salesforce, Inc.
title: Dataiku
title: HSBC Holdings plc
title: BNP Paribas S.A.
title: Bouygues S.A.


In [42]:
company_frame

Unnamed: 0,Name,Address,Website,Revenue,Nb_employees,location,city,country
0,Microsoft Corporation,"One Microsoft WayRedmond, Washington, U.S.",microsoft.com,US$198.3 billion (2022),"221,000 (2022)",One Microsoft WayRedmond,Washington,U.S.
1,"Salesforce, Inc.","Salesforce TowerSan Francisco, California, U.S.",salesforce.com,US$26.49 billion (2022),"73,542 (August 2022)",Salesforce TowerSan Francisco,California,U.S.
2,Dataiku,"New York City, United States",,US$150 million (2021)[1],"1,000+ (2022)[1]",New York City,United States,
3,HSBC Holdings plc,"8 Canada SquareLondon, England, UK",,US$49.552 billion (2021)[4],"219,697 (2021)[4]",8 Canada SquareLondon,England,UK
4,BNP Paribas S.A.,"Boulevard des Italiens, Paris, France",,€46.2 billion (2021)[2],"190,000 (2022)[2]",Boulevard des Italiens,Paris,France
5,Bouygues S.A.,"8th arrondissement, Paris, France",,€37.59 billion (2021)[1],"124,600 (Dec 2021)[2]",8th arrondissement,Paris,France


# Data Cleaning

In [43]:
company_frame['Nb_employees'] = [re.sub('\(.*', '', string) for string in company_frame['Nb_employees']]
company_frame['Nb_employees'] = [re.sub('\+', '', string) for string in company_frame['Nb_employees']]
company_frame['Nb_employees'] = [string.replace(',','') for string in company_frame['Nb_employees']]
company_frame['Nb_employees'].astype('int32').dtypes
company_frame

Unnamed: 0,Name,Address,Website,Revenue,Nb_employees,location,city,country
0,Microsoft Corporation,"One Microsoft WayRedmond, Washington, U.S.",microsoft.com,US$198.3 billion (2022),221000,One Microsoft WayRedmond,Washington,U.S.
1,"Salesforce, Inc.","Salesforce TowerSan Francisco, California, U.S.",salesforce.com,US$26.49 billion (2022),73542,Salesforce TowerSan Francisco,California,U.S.
2,Dataiku,"New York City, United States",,US$150 million (2021)[1],1000,New York City,United States,
3,HSBC Holdings plc,"8 Canada SquareLondon, England, UK",,US$49.552 billion (2021)[4],219697,8 Canada SquareLondon,England,UK
4,BNP Paribas S.A.,"Boulevard des Italiens, Paris, France",,€46.2 billion (2021)[2],190000,Boulevard des Italiens,Paris,France
5,Bouygues S.A.,"8th arrondissement, Paris, France",,€37.59 billion (2021)[1],124600,8th arrondissement,Paris,France


In [44]:
company_frame['Revenue'] = [re.sub('\(.*', '', string) for string in company_frame['Revenue']]
company_frame['Revenue'] = [string.replace('€','') for string in company_frame['Revenue']]
company_frame['Revenue'] = [string.replace('US$','') for string in company_frame['Revenue']]
company_frame

Unnamed: 0,Name,Address,Website,Revenue,Nb_employees,location,city,country
0,Microsoft Corporation,"One Microsoft WayRedmond, Washington, U.S.",microsoft.com,198.3 billion,221000,One Microsoft WayRedmond,Washington,U.S.
1,"Salesforce, Inc.","Salesforce TowerSan Francisco, California, U.S.",salesforce.com,26.49 billion,73542,Salesforce TowerSan Francisco,California,U.S.
2,Dataiku,"New York City, United States",,150 million,1000,New York City,United States,
3,HSBC Holdings plc,"8 Canada SquareLondon, England, UK",,49.552 billion,219697,8 Canada SquareLondon,England,UK
4,BNP Paribas S.A.,"Boulevard des Italiens, Paris, France",,46.2 billion,190000,Boulevard des Italiens,Paris,France
5,Bouygues S.A.,"8th arrondissement, Paris, France",,37.59 billion,124600,8th arrondissement,Paris,France


In [45]:
for row,index in company_frame.iterrows():
    if "billion" in company_frame.at[row,"Revenue"]:
        company_frame.at[row,"Revenue"] = str(int(float(company_frame.at[row,"Revenue"].replace('billion',''))*10**9))
    
    if "million" in company_frame.at[row,"Revenue"]:
        company_frame.at[row,"Revenue"] = str(int(float(company_frame.at[row,"Revenue"].replace('million',''))*10**6))

company_frame

Unnamed: 0,Name,Address,Website,Revenue,Nb_employees,location,city,country
0,Microsoft Corporation,"One Microsoft WayRedmond, Washington, U.S.",microsoft.com,198300000000,221000,One Microsoft WayRedmond,Washington,U.S.
1,"Salesforce, Inc.","Salesforce TowerSan Francisco, California, U.S.",salesforce.com,26490000000,73542,Salesforce TowerSan Francisco,California,U.S.
2,Dataiku,"New York City, United States",,150000000,1000,New York City,United States,
3,HSBC Holdings plc,"8 Canada SquareLondon, England, UK",,49552000000,219697,8 Canada SquareLondon,England,UK
4,BNP Paribas S.A.,"Boulevard des Italiens, Paris, France",,46200000000,190000,Boulevard des Italiens,Paris,France
5,Bouygues S.A.,"8th arrondissement, Paris, France",,37590000000,124600,8th arrondissement,Paris,France


In [47]:
company_frame.to_csv('companies.csv',sep=';')