## **Importing necessary packages**

In [1]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests

## **Starting HTMLSession and BeautifulSoup and ensuring both are working**

In [5]:
url = 'https://muqawil.org/en/contractors'
session = HTMLSession()
#response = requests.get(url) #for BeautifulSoup
r = session.get(url) # for HTMLSession
r.html.arender(sleep=1)
soup = BeautifulSoup(r.html.html, 'html.parser')
print(r.status_code)
#print(response.status_code)

200


  r.html.arender(sleep=1)


## **Finding the xpath for the container of all contractors**

In [6]:
companies = r.html.xpath('//*[@id="all_contractor"]', first=True)
print(companies)

<Element 'div' class=('col-lg-8', 'col-xl-9') id='all_contractor'>


## **Ensuring the links of contractors are clear and valid**

In [7]:
print(companies.absolute_links)

{'https://muqawil.org/en/contractors/20006199/143', 'https://muqawil.org/en/contractors/959/143', 'https://muqawil.org/en/contractors/20019229/143', 'https://muqawil.org/en/contractors/20005421/143', 'https://muqawil.org/en/contractors/20010655/143', 'https://muqawil.org/en/contractors/20016964/143', 'https://muqawil.org/en/contractors?page=3', 'https://muqawil.org/en/contractors/20004514/143', 'https://muqawil.org/en/contractors/20001440/143', 'https://muqawil.org/en/contractors/20020449/143', 'https://muqawil.org/en/contractors/869/143', 'https://muqawil.org/en/contractors?page=903', 'https://muqawil.org/en/contractors/20010987/143', 'https://muqawil.org/en/contractors/8050/143', 'https://muqawil.org/en/contractors?page=1', 'https://muqawil.org/en/contractors/20003088/143', 'https://muqawil.org/en/contractors/20002330/143', 'https://muqawil.org/en/contractors/20008800/143', 'https://muqawil.org/en/contractors/8649/143', 'https://muqawil.org/en/contractors/20008518/143', 'https://muqa

In [8]:
for item in companies.absolute_links:
    print(item)

https://muqawil.org/en/contractors/20006199/143
https://muqawil.org/en/contractors/959/143
https://muqawil.org/en/contractors/20019229/143
https://muqawil.org/en/contractors/20005421/143
https://muqawil.org/en/contractors/20010655/143
https://muqawil.org/en/contractors/20016964/143
https://muqawil.org/en/contractors?page=3
https://muqawil.org/en/contractors/20004514/143
https://muqawil.org/en/contractors/20001440/143
https://muqawil.org/en/contractors/20020449/143
https://muqawil.org/en/contractors/869/143
https://muqawil.org/en/contractors?page=903
https://muqawil.org/en/contractors/20010987/143
https://muqawil.org/en/contractors/8050/143
https://muqawil.org/en/contractors?page=1
https://muqawil.org/en/contractors/20003088/143
https://muqawil.org/en/contractors/20002330/143
https://muqawil.org/en/contractors/20008800/143
https://muqawil.org/en/contractors/8649/143
https://muqawil.org/en/contractors/20008518/143
https://muqawil.org/en/contractors/20012033/143
https://muqawil.org/en/con

## **Decoding the emails as they are protected by the website**

In [9]:
def cfDecodeEmail(encodedString):
    '''
    Emails are protected in the website
    this function decodes the emails
    '''
    r = int(encodedString[:2],16)
    email = ''.join([chr(int(encodedString[i:i+2], 16) ^ r) for i in range(2, len(encodedString), 2)])
    return email

## **Creating scrape_company function to scrape the required data and extract it to a pandas dataframe and an excel file**
###### note: running this cell will take +5 minutes to finish

In [12]:
def scrape_company(item, page_number, data):
    '''''
    item represents the href or url for each company
    page_number will be created later
    data is the variables which need to be scraped like name, phone, email..etc
    '''''
    r = session.get(item)
    r.html.arender(sleep=1)
    soup = BeautifulSoup(r.html.html, 'html.parser')
    
    # node_n for company name
    # node_c for city
    # node_e for email
    # node_p for phone
    # activities_nodes for activities
    
    node_n = r.html.xpath('/html/body/main/div/div/div/div/div/div[2]/h3', first=True)
    company_name = node_n.text if node_n is not None else None
    node_c = soup.find('div', attrs={'class': 'info-name'}, string='City')
    city = node_c.find_next('div', class_='info-value').text.strip() if node_c is not None else None
    node_e = r.html.xpath('/html/body/main/div/div/div/div/div/div[2]/div[2]/div/div[7]/div/div[2]/div[2]/a', first=True)
    email = cfDecodeEmail(node_e.attrs['href'].replace('/cdn-cgi/l/email-protection#', '')) if node_e is not None else None
    node_p = soup.find('div', attrs={'class': 'info-name'}, string='phone')
    phone = node_c.find_next('div', class_='info-value').text.strip() if node_c is not None else None
    activities_nodes = soup.find_all('li', class_='list-item')
    activities = ', '.join([node.text.strip() for node in activities_nodes]) if activities_nodes else None
            
    if company_name and (city or email or phone or activities): # to avoid getting None and null values
        data.append({
            'Page': page_number,
            'Company Name': company_name,
            'City': city,
            'Email': email,
            'Phone': phone,
            'Activities': activities
    })

# Initializing an empty list to collect data
data = []

base_url = 'https://muqawil.org/en/contractors'

# Iterating over the first 10 pages
for page in range(1, 11):
    page_url = f"{base_url}?page={page}"
    r = session.get(page_url)
    r.html.arender(sleep=1)
    
    companies = r.html.xpath('//*[@id="all_contractor"]', first=True)
    if not companies:
        continue
    
    for item in companies.absolute_links: # running scrape_company for each company link in companies, takes +5 minutes
        scrape_company(item, page, data)

# Converting the data to a Pandas DataFrame
df = pd.DataFrame(data)

# Converting the DataFrame to an xlsx file and saving to locally
df.to_excel('contractors.xlsx', index=False)

# Displaying the first 40 rows of the DataFrame
df.head(40)

  r.html.arender(sleep=1)
  r.html.arender(sleep=1)


Unnamed: 0,Page,Company Name,City,Email,Phone,Activities
0,1,Sharjah Development Contracting Co,AL KHOBAR,alsharqimna@gmail.com,AL KHOBAR,"Construction of buildings, Construction of bui..."
1,1,Alenjazat Contracting Company,RIYADH,info@alenjazat.sa,RIYADH,"Construction of buildings, Construction of bui..."
2,1,Dome Park Contracting Company,AL MUWAYH AL JADID,vv.com838@icloud.com,AL MUWAYH AL JADID,"Construction of buildings, Construction of bui..."
3,1,acn solutions for contracting,JEDDAH,alwa7ed@hotmail.com,JEDDAH,"Waste collection, treatment & disposal activit..."
4,1,Bunoon Wa Funoon Contracting Co.,RIYADH,info@bfconst.com,RIYADH,"Construction of buildings, Construction of bui..."
5,1,On Al-Arabia Contracting Est.,RIYADH,a1032500371@gmail.com,RIYADH,No Data
6,1,Modern Building Solutions Contracting Company,RIYADH,ttalshammari@gmail.com,RIYADH,"Construction of buildings, Construction of bui..."
7,1,Ratel Al Sharq Contracting Company,RIYADH,adel_77@hotmail.com,RIYADH,"Construction of buildings, Construction of bui..."
8,1,Almegren International Group,RIYADH,info@almegren.sa,RIYADH,"Construction of buildings, Construction of bui..."
9,1,Gulf Pioneers for Construction Company,RIYADH,hh@gpksa.com,RIYADH,"Construction of buildings, Construction of bui..."


### Challenges

1. Time:

    As my first project in web scraping, it took me much time to finish the web crawling process, I couldn't do the semantic search challenge because of lack of time. 
* * *           
2. Dynamic javascript for email scraping:

    I also did not have the time to learn selenium to scrape emails, so instead I used a decoding function.

Thanks for reading
* * *
*by Ahmed Sharabati*