In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Launch Chrome
print('Starting Webdriver...')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.emploidakar.com/les-10-meilleures-entreprises-btp-au-senegal/")

html = driver.page_source
soup = BeautifulSoup(html, "lxml")

print('Scraping Company Info...')
article = soup.find("div", class_="entry-content single-page")
companies = []

for blocks in article.find_all("h3"):
    company_name = blocks.get_text(strip=True).split('.')[1].strip()
    p_blocks = [p.get_text(strip=True) for p in blocks.find_next_siblings("p", limit=3)]

    info_block = p_blocks[0] if len(p_blocks) > 0 else "BTP"
    address_block = p_blocks[1] if len(p_blocks) > 1 else ""
    contact_block = p_blocks[2] if len(p_blocks) > 2 else ""

    email_match = re.search(r"[\w.-]+@[\w.-]+\.\w+", contact_block)
    email = email_match.group(0) if email_match else ""

    phone_match = re.findall(r"\+?\d[\d\s\-]+", contact_block)
    phones = phone_match if phone_match else []

    addr_match = re.search(r"Adresse\s*:?\s*(.+)", address_block)
    address = addr_match.group(1).strip() if addr_match else address_block

    companies.append({
        "name": company_name,
        "info": info_block,
        "address": address,
        "email": email,
        "phones": phones
    })

driver.quit()

Starting Webdriver...
Scraping Company Info...


In [3]:
def comp_type(name):
    info = name.get('info', '').lower()
    if 'mines' in info or 'minier' in info:
        name['info'] = 'BTP/MINIER'
    elif 'ong' in info:
        name['info'] = 'ONG'
    elif 'btp' in info or 'btp' not in info:
        name['info'] = 'BTP'
    else:
        name['info'] = 'OTHERS'

In [4]:
for company in companies:
    company['pays'] = 'SENEGAL'
    comp_type(company)

    # Clean up phones: always join if it's a list
    if isinstance(company['phones'], list):
        company['phones'] = "/ ".join(company['phones'])
        company['phones'] = company['phones'].strip()
    else:
        company['phones'] = str(company['phones']) if company['phones'] else ""

    # Clean up address: if it's a list, take the first element
    if isinstance(company['address'], list):
        company['address'] = company['address'][0]

print('Storing Company Info...')

Storing Company Info...


In [7]:
df = pd.DataFrame(companies)
df.drop_duplicates(inplace=True)
df

Unnamed: 0,name,info,address,email,phones,pays
0,Groupe Soeco-Poncet et Compagnie,BTP,"Km 4,8 Boulevard du Centenaire de la Commune d...",soeco@soeco.sn,221/ 33 859 90 90,SENEGAL
1,ACT Senegal,BTP,"Avenue Félix Eboué – BP: 21852 Dakar Ponty, Da...",contact@act-sn.com,+221 33 823 12 77 / +221 33 821 96 96,SENEGAL
2,Groupe Atepa Technologie,BTP/MINIER,"Bd Martin Luther King-Fann Mermoz – Dakar, Sén...",atepa@atepa.com,+221/ 865 11 11/ 33 865 11 11 / +221/ 824 90 72,SENEGAL
3,Technosol ingénierie,BTP,"Rue 14 prolongée, Lot N°09 Zone Ind. SONEPI, D...",technosol@technosol-ingenierie.com,+221 33 825 40 28,SENEGAL
4,OBKmat Construction,BTP,"02 Ouest Foire | Immeuble Adja Kiné, Dakar, Sé...",contact@obkmat.com,+221 33 820 00 27 / +221 77 637 66 15,SENEGAL
5,Groupe CSE,BTP,"Rocade Fann Bel-Air, BP 609 Dakar, Sénégal",csesn@groupecse.com,+221 33 859 03 00 / +221 33 832 03 95,SENEGAL
6,SIAM Groupe,BTP,"Mamelles BP 24511 Ouakam, Dakar, Sénégal",contact@siam-groupe.com,+221 33 820 54 24,SENEGAL
7,SADIA Travaux BTP,BTP,"Guédiawaye, Cité Gadaye, Lot N1, prolongement ...",contact@sadia-travaux.com,+221 33 820 33 94 / +221 77 395 76 51,SENEGAL
8,Batix,BTP,"Batix Résidence Rokhaya – Dakar, Sénégal",batix@batix.sn,+221 / 33/ 859 55 00 / +221 33 820 47 05,SENEGAL
9,UVEN Corporates,BTP/MINIER,"Dakar, 30 liberté 6 extension, Rue LIB-35 Daka...",contact@uvencorp.com,,SENEGAL


In [6]:
df.to_csv("companies.csv", index=False)
print(f"\nCompanies saved: {len(df)} companies")


Companies saved: 10 companies
