In [2]:
import json
import time
import warnings

from bs4 import BeautifulSoup
from nordvpn_switcher import initialize_VPN,rotate_VPN,terminate_VPN
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [3]:
# Start a Selenium-controlled Chrome session; `driver` is used for every page load.
driver = webdriver.Chrome()

In [4]:
# Site root; prepended to the relative hrefs collected from the search results.
base_url = 'https://datarade.ai'
# First page of the product search with an empty keyword filter (crawl entry point).
entry_url = 'https://datarade.ai/search/products?keywords=&page=1&search_context=products&search_type=navbar'

In [5]:
# Open the search entry page and dismiss the cookie-consent banner.
driver.get(entry_url)
try:
    # Explicit wait instead of a fixed 1 s sleep: find_element() raises
    # NoSuchElementException whenever the button is not in the DOM yet.
    accept_cookies_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept all")]'))
    )
    accept_cookies_button.click()
    # Short pause after the click, kept from the original cell.
    time.sleep(1)
except TimeoutException:
    # Best effort: the banner never showed up, so carry on without clicking.
    print('didn\'t click accept cookies button')

In [8]:
def clean_article_text(text):
    """Return `text` with every newline removed and surrounding whitespace trimmed.

    Newlines are deleted, not replaced by a space, so words separated only by a
    line break end up joined.
    """
    return text.replace('\n', '').strip()

# Extract relevant URLs

In [None]:
datarade_data_to_scrap = []
seen_urls = set()  # a listing that shows up on several result pages is kept once

# Read the number of result pages from the page currently loaded in the driver.
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
# NOTE(review): assumes the last page number is the 4th-from-last 'a.item'
# link of the pagination bar — confirm against the live markup.
page_count = int(soup.find_all('a', class_='item')[-4].text)

# (link text on the result card, value stored under 'type')
link_types = (('View Product', 'product'), ('View Dataset', 'dataset'))

# Result pages are numbered from 1 (see entry_url), so iterate 1..page_count
# instead of 0..page_count.
for page_i in range(1, page_count + 1):
    print(f'Scrapping {page_i} / {page_count} pages', end='\r')
    driver.get(f'{base_url}/search/products?keywords=&page={page_i}&search_context=products&search_type=navbar')
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    for link_text, entry_type in link_types:
        # Only keep tags that actually carry an href: a nested element with the
        # same text would otherwise raise KeyError on tag['href'].
        link_tags = soup.find_all(
            lambda tag, link_text=link_text: tag.has_attr('href') and tag.text == link_text
        )
        for link_tag in link_tags:
            url = base_url + link_tag['href']
            if url in seen_urls:
                continue
            seen_urls.add(url)
            datarade_data_to_scrap.append({
                'url': url,
                'type': entry_type
                })


# about 16 minutes computing time => consider running this asynchronously for better performance

In [None]:
# Mark every collected entry as not scraped yet.
for entry in datarade_data_to_scrap:
    entry.update({'is_scrapped': False})

In [None]:
# Preview only the first entries; displaying the whole list floods the cell output.
datarade_data_to_scrap[:5]

In [None]:

# Write the collected URL list to disk as JSON.
with open('datarade_data_to_scrap.json', 'w') as out_file:
    json.dump(datarade_data_to_scrap, out_file)

# extract data from pages

In [12]:
# Resume a partially finished scrape when its output file exists; on a first
# run that file is not there yet, so fall back to the freshly collected URL list.
try:
  with open('datarade_scrapped_data.json', 'r') as f:
    datarade_data_to_scrap = json.load(f)
except FileNotFoundError:
  with open('datarade_data_to_scrap.json', 'r') as f:
    datarade_data_to_scrap = json.load(f)

In [13]:
# Number of entries in the loaded work list.
len(datarade_data_to_scrap)

4423

In [None]:
# for testing purpose only

# datasets_example_url = 'https://datarade.ai/datasets/properties-for-sale-in-uae'
# product_example_url = 'https://datarade.ai/data-products/gain-dynamics-insurance-consumer-behaviour-data-2m-house-gain-dynamics'
# product_example_url_1 = 'https://datarade.ai/data-products/b2b-marketing-dataset-grepsr-grepsr'
# product_example_url_2 = 'https://datarade.ai/data-products/veraset-movement-europe-gps-mobile-location-data-reli-veraset'

In [None]:
# For testing purpose only
# driver.get(product_example_url_2)
# page_source = driver.page_source
# soup = BeautifulSoup(page_source, 'html.parser')

In [7]:
# Configure nordvpn_switcher with the 'Europe' area and switch to a first server.
# `settings` is reused by every later rotate_VPN() call.
settings = initialize_VPN(area_input=['Europe'])
rotate_VPN(settings) 

[33mYou're using Windows.
Performing system check...
###########################
[0m
NordVPN installation check: [92m✓[0m
NordVPN service check: [92m✓[0m
Opening NordVPN app and disconnecting if necessary...
NordVPN app launched: [92m✓[0m
#####################################

You've entered a list of connection options. Checking list...


Done!


Your current ip-address is: 45.155.42.119

[34mConnecting you to France ...
[0m
your new ip-address is: 86.104.20.41

Done! Enjoy your new server.



In [9]:
#for product data

def _element_after_heading(soup, heading):
    """Return the element following the <h2> whose string is `heading`, or None if there is none."""
    heading_tag = soup.find('h2', string=heading)
    if heading_tag is None:
        return None
    return heading_tag.find_next()


def extract_from_product_page(soup):
    """Extract the fields of a data-product page into a dict.

    Args:
        soup: BeautifulSoup tree of the product page.

    Returns:
        dict with 'title', 'dataset__fact', 'details', 'geographical_coverage'
        and 'product-content__pricing-info' always present. The rating,
        'data_dictionary', 'pricing_plans', 'suitable_company_sizes',
        'quality', 'delivery', 'history', 'use_cases' and 'categories' keys
        are added only when the matching element or section exists.

    Raises:
        AttributeError: if the title, the 'Description' heading or the
            'countries' block is missing from the page.

    Note:
        The <div> children of each quality indicator are removed from `soup`
        while its title is read.
    """
    data_out = {}
    data_out['title'] = clean_article_text(soup.find(class_='product-hero__header-title-name').text)

    rating_score_tag = soup.find('span', class_='provider__rating-summary-score')
    if rating_score_tag is not None:
        data_out['provider__rating-summary-score'] = rating_score_tag.text
    rating_count_tag = soup.find('span', class_='provider__rating-summary-count')
    if rating_count_tag is not None:
        # Drops the first and last character (presumably the surrounding
        # parentheses — TODO confirm against the page markup).
        data_out['provider__rating-summary-count'] = rating_count_tag.text[1:-1]

    data_out['dataset__fact'] = [
        {
            'name': clean_article_text(fact_tag.find('div', class_='dataset__fact-name').text),
            'value': clean_article_text(fact_tag.find('div', class_='dataset__fact-value').text),
            'label': clean_article_text(fact_tag.find('div', class_='dataset__fact-label').text)
        }
        for fact_tag in soup.find_all('div', class_='dataset__fact')
    ]

    data_dictionary_section = _element_after_heading(soup, 'Data Dictionary')
    if data_dictionary_section is not None:
        table_title_tags = data_dictionary_section.find_all('div', class_='title')
        table_tags = data_dictionary_section.find_all('table')
        data_dictionary_list = []
        # zip() pairs each title with its table and stops at the shorter list,
        # so a title without a table no longer raises IndexError.
        for table_title_tag, table_tag in zip(table_title_tags, table_tags):
            data_dictionary = []
            for tablerow_tag in table_tag.find('tbody').find_all('tr'):
                cell_tags = tablerow_tag.find_all('td')
                data_dictionary.append({
                    'attribute': clean_article_text(cell_tags[0].text),
                    'type': clean_article_text(cell_tags[1].text),
                    'example': clean_article_text(cell_tags[2].text),
                    'mapping': clean_article_text(cell_tags[3].text)
                })
            data_dictionary_list.append({
                'title': clean_article_text(table_title_tag.text),
                'data': data_dictionary
            })
        data_out['data_dictionary'] = data_dictionary_list

    # Required section: a page without it raises AttributeError here.
    data_out['details'] = clean_article_text(soup.find('h2', string='Description').find_next().text)

    # Required block as well.
    country_tags = soup.find('div', class_='countries').find_all('div', class_='inline-block')
    data_out['geographical_coverage'] = [clean_article_text(country_tag.text) for country_tag in country_tags]

    data_out['product-content__pricing-info'] = [
        pricing_info_tag.find('span').text
        for pricing_info_tag in soup.find_all('div', class_='product-content__pricing-info')
    ]

    # Check the heading before dereferencing it: the original guard called
    # .find_next() on a possibly missing <h2> and raised AttributeError.
    pricing_heading_tag = soup.find('h2', string='Pricing')
    pricing_table_tag = pricing_heading_tag.find_next('table') if pricing_heading_tag is not None else None
    if pricing_table_tag is not None:
        data_out['pricing_plans'] = [
            {
                'license': plan_row_tag.find('th').text,
                'starts_at': plan_row_tag.find('span').text
            }
            for plan_row_tag in pricing_table_tag.find('tbody').find_all('tr')
        ]

    company_sizes_section = _element_after_heading(soup, 'Suitable Company Sizes')
    if company_sizes_section is not None:
        data_out['suitable_company_sizes'] = [
            tag.text for tag in company_sizes_section.find_all(class_='checked-tag-group__item active')
        ]

    quality_section = _element_after_heading(soup, 'Quality')
    if quality_section is not None:
        quality_list = []
        for quality_tag in quality_section.find_all(class_='product-content__quality-indicator'):
            # Read the value first: the <div> children are removed right after
            # so that only the indicator's own text is left for the title.
            value = quality_tag.find('span').text
            for child in quality_tag.find_all('div'):
                child.decompose()
            quality_list.append({
                'value': value,
                'title': clean_article_text(quality_tag.text)
            })
        # Assigned once after the loop, like the other optional sections.
        data_out['quality'] = quality_list

    delivery_section = _element_after_heading(soup, 'Delivery')
    if delivery_section is not None:
        delivery = []
        for delivery_type_tag in delivery_section.find_all(class_='product-content__delivery-header'):
            value_tags = delivery_type_tag.find_next().find_all(class_='checked-tag-group__item active')
            delivery.append({
                'type': delivery_type_tag.text,
                'values': [value_tag.text for value_tag in value_tags]
            })
        data_out['delivery'] = delivery

    history_section = _element_after_heading(soup, 'History')
    if history_section is not None:
        data_out['history'] = clean_article_text(history_section.text)

    use_cases_section = _element_after_heading(soup, 'Use Cases')
    if use_cases_section is not None:
        data_out['use_cases'] = [use_case_tag.text for use_case_tag in use_cases_section.find_all('span')]

    categories_section = _element_after_heading(soup, 'Categories')
    if categories_section is not None:
        data_out['categories'] = [category_tag.text for category_tag in categories_section.find_all('span')]

    return data_out

In [10]:
#for dataset data

def extract_from_dataset_page(soup):
    """Extract the fields of a dataset page into a dict.

    Args:
        soup: BeautifulSoup tree of the dataset page.

    Returns:
        dict with 'title', 'provider', 'dataset__fact', 'details',
        'dataset__categories' and 'dataset__price' always present. The
        rating, 'data_dictionary', 'geographical_coverage' and 'history'
        keys are added only when the matching element exists.

    Raises:
        AttributeError: if one of the always-present elements is missing.

    Note:
        The <div> children of the price block are removed from `soup` before
        its text is read.
    """
    record = {}
    record['title'] = clean_article_text(soup.find(class_='dataset__header-title-name').text)

    record['provider'] = soup.find('a', class_='provider').text
    score_tag = soup.find('span', class_='provider__rating-summary-score')
    if score_tag:
        record['provider__rating-summary-score'] = score_tag.text
    count_tag = soup.find('span', class_='provider__rating-summary-count')
    if count_tag:
        # The slice drops the first and last character of the count text.
        record['provider__rating-summary-count'] = count_tag.text[1:-1]

    record['dataset__fact'] = [
        {
            'name': clean_article_text(fact.find('div', class_='dataset__fact-name').text),
            'value': clean_article_text(fact.find('div', class_='dataset__fact-value').text),
            'label': clean_article_text(fact.find('div', class_='dataset__fact-label').text)
        }
        for fact in soup.find_all('div', class_='dataset__fact')
    ]

    # The data dictionary is read from the second 'table--dataset' table.
    dataset_tables = soup.find_all('table', class_='table--dataset')
    if len(dataset_tables) > 1:
        dictionary_rows = []
        for row in dataset_tables[1].find('tbody').find_all('tr'):
            cells = row.find_all('td')
            dictionary_rows.append({
                'attribute': clean_article_text(cells[0].text),
                'type': clean_article_text(cells[1].text),
                'example': clean_article_text(cells[2].text),
                'mapping': clean_article_text(cells[3].text)
            })
        record['data_dictionary'] = dictionary_rows

    details_heading = soup.find('h3', string='Details')
    record['details'] = clean_article_text(details_heading.find_next().text)

    coverage_block = soup.find('div', class_='ui list')
    if coverage_block:
        record['geographical_coverage'] = [
            clean_article_text(country.text)
            for country in coverage_block.find_all('div', class_='inline-block')
        ]

    categories_block = soup.find('div', class_='dataset__categories')
    record['dataset__categories'] = [span.text for span in categories_block.find_all('span')]

    # Strip the nested <div>s so that only the price block's own text remains.
    price_block = soup.find('div', class_='dataset__price')
    for nested_div in price_block.find_all('div'):
        nested_div.decompose()
    record['dataset__price'] = price_block.text

    history_heading = soup.find('h2', string='History')
    if history_heading:
        record['history'] = clean_article_text(history_heading.find_next().text)

    return record

In [21]:
# Start from a fresh browser session. quit() is used instead of close():
# close() only closes the window, quit() also ends the driver process.
try:
    driver.quit()
except Exception:
    # No usable previous session (driver undefined or already gone).
    pass
driver = webdriver.Chrome()
# entry_url = 'https://datarade.ai/search/products?keywords=&page=1&search_context=products&search_type=navbar'
# driver.get(entry_url)
# click the accept cookies button
# try:
#     time.sleep(1)
#     accept_cookies_button = driver.find_element(By.XPATH, '//button[contains(text(), "Accept all")]')
#     accept_cookies_button.click()
#     time.sleep(1)
# except:
#     print('didn\'t click accept cookies button')
rotate_VPN(settings)

# Number of page loads between two VPN server changes.
PAGES_PER_VPN_ROTATION = 40

pages_to_scrap_count = len(datarade_data_to_scrap)
vpn_rotate_counter = 0
for page_i, data_to_scrap in enumerate(datarade_data_to_scrap, start=1):
    url = data_to_scrap['url']
    print(f'scrapping page {page_i} / {pages_to_scrap_count} : {url}                                                         ', end='\r')

    # Entries finished in a previous run are skipped, which makes the loop resumable.
    if data_to_scrap['is_scrapped']:
        continue

    vpn_rotate_counter += 1
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        if data_to_scrap['type'] == 'product':
            # Unavailable products get a placeholder title instead of being parsed.
            landing_title_tag = soup.find(class_='landing-page__title')
            if landing_title_tag and clean_article_text(landing_title_tag.text) == 'This product is currently unavailable':
                data_to_scrap.update({'title': 'unavailable'})
            else:
                data_to_scrap.update(extract_from_product_page(soup))
        elif data_to_scrap['type'] == 'dataset':
            data_to_scrap.update(extract_from_dataset_page(soup))
        else:
            warnings.warn(f'{url} has no correct type')
        data_to_scrap['is_scrapped'] = True
    except Exception as exc:
        # `except Exception` lets KeyboardInterrupt through, so the run can be
        # stopped by hand; the failed entry stays unscraped for the next run.
        print(f'Couldn\'t scrap {url}: {exc!r}')

    if vpn_rotate_counter >= PAGES_PER_VPN_ROTATION:
        rotate_VPN(settings)
        # quit() rather than close(), so the old driver process is not left
        # running each time the browser is replaced.
        driver.quit()
        driver = webdriver.Chrome()
        vpn_rotate_counter = 0


Your current ip-address is: 146.19.88.244

[34mConnecting you to Netherlands ...
[0m
your new ip-address is: 193.187.128.240

Done! Enjoy your new server.

Couldn't scrap https://datarade.ai/data-products/legal-data-litigation-data-legal-parties-data-easy-to-i-apiscrapyo-i-apiscrapy                                                                                 
scrapping page 912 / 4423 : https://datarade.ai/data-products/solution-publishing-construction-continuum-programmatic-d-solution-publishing                                                                      
Your current ip-address is: 193.187.128.240

[34mConnecting you to Ukraine ...
[0m
your new ip-address is: 37.19.218.164

Done! Enjoy your new server.

scrapping page 952 / 4423 : https://datarade.ai/data-products/the-data-appeal-global-location-data-insights-real-time-fo-the-data-appeal-company                                                         
Your current ip-address is: 37.19.218.164

[34mConnecting you to 

In [23]:
# Write the work list, including everything scraped so far, to disk as JSON.
with open('datarade_scrapped_data.json', 'w') as out_file:
    json.dump(datarade_data_to_scrap, out_file)

# Scrape use cases

TODO: also scrape these links: https://datarade.ai/use-cases