In [1]:
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

## 1. Request data

**Upwork** uses **Perimeter** anti-bot services. There is no free API for scraping talent data, and the page has to be JavaScript-rendered.

![Perimeter](images/perimeter.png "Perimeter")

For this reason I chose to use the `Requests-HTML` library to scrape the Upwork website.

https://docs.python-requests.org/projects/requests-html/en/latest/

## 2. Step-by-step implementation

Using the `data science` search URL: https://www.upwork.com/search/profiles/?q=data%20science

In [2]:
# Request the search-results page for the "data science" query.

url = 'https://www.upwork.com/search/profiles/?q=data%20science'

# The context manager closes the session on every path, including failures.
with HTMLSession() as session:
    try:
        response = session.get(url)
    except requests.exceptions.RequestException as e:
        print(e)
        # Re-raise: the following cells need `response`, so stopping here is
        # clearer than a NameError further down the notebook.
        raise

In [3]:
# Parse the downloaded page body into a BeautifulSoup tree.

soup = BeautifulSoup(markup=response.content, features='html.parser')

In [4]:
# Collect every freelancer card on the page, then walk through the fields of
# one of them (the second card) as a worked example.
freelancers = soup.find_all('div', {'class': "up-card-section up-card-hover"})

freelancer = freelancers[1]

name = freelancer.find('div', {'class': 'identity-name'}).text.strip()
talent_title = freelancer.find('p', {'class': 'my-0 freelancer-title'}).text.strip()
country = freelancer.find('span', {'itemprop': 'country-name'}).text.strip()
rate = freelancer.find('div', {'data-qa': 'rate'}).text.strip()
description = freelancer.find('div', {'class': 'up-line-clamp-v2-wrapper mb-0'}).text.strip()

# Some freelancers don't provide public earnings: find() then returns None,
# which is checked explicitly instead of hiding every error behind `except:`.
earnings_tag = freelancer.find('div', {'data-qa': 'earnings'})
earnings = earnings_tag.text.strip() if earnings_tag is not None else ""

# Some freelancers haven't completed any jobs yet, so there is no success score.
success_tag = freelancer.find('span', {'class': 'up-job-success-text'})
success = success_tag.text.strip() if success_tag is not None else ''

# A named loop variable avoids shadowing IPython's `_` (last cell output).
badges = ', '.join(
    badge.text.strip()
    for badge in freelancer.find_all('div', {'class': 'up-skill-badge'})
)


print(f'name: {name}')
print(f'\ntitle: {talent_title}')
print(f'\ncountry: {country}')
print(f'\nrate: {rate}')
print(f'\ndescription:\n\n{description}\n')
print(f'\nearnings: {earnings}')
print(f'\nsuccess: {success}')
print(f'\nbadges: {badges}')

name: Callum H.

title: Data Scientist: quantitative and qualitative data analysis experience

country: South Africa

rate: $40.00   /hr

description:

Greetings! I am a data scientist who lives in the R & Shiny ‘Tidyverse’ and it is my mission to extract insight from your data and bring it to life in a clear and meaningful way for you and your customers. 

I am particularly interested in sentiment analysis and geo-spatial data science projects and draw upon a wide range of environmental and business project experience including demographic surveys, customer classifications, e-commerce data analysis, real-time fraud detection, text mining and telecommunications marketing. I have performed various regression, machine learning and clustering techniques in these roles and bring the results to the fore using the Shiny web framework. 

I have a wide range of quantitative and qualitative statistical analysis experience and also survey design, research project management, custom and legacy da

In [5]:
# Generate the search query.
search_key_word = 'data science'

# Percent-encode the keyword: spaces become %20, and characters such as & or #
# that would otherwise break the query string are escaped as well.
encoded_key_word = quote(search_key_word, safe='')
print(encoded_key_word)
# Create the new URL string.
page = 1
url = f'https://www.upwork.com/search/profiles/?page={page}&q={encoded_key_word}'
print(url)

data%20science
https://www.upwork.com/search/profiles/?page=1&q=data%20science


## 3. Functions

In [6]:
def scrape_js_content(url):
    """
    Download a page with a Requests-HTML session and return its raw body.

    NOTE(review): only ``session.get`` is called here; ``response.html.render()``
    is never invoked, so the returned bytes are the HTML as sent by the server.
    Confirm that this is sufficient for the target page.

    :param url: address of the page to download.
    :return: the response body as bytes, or ``b''`` when the request failed
        (the error is printed).
    """
    # The context manager closes the session on every path, including failures.
    with HTMLSession() as session:
        try:
            response = session.get(url)
        except requests.exceptions.RequestException as e:
            print(e)
            # Without this early return, `response` would be unbound below and
            # the function would die with UnboundLocalError.
            return b''

    return response.content

In [7]:
def _optional_text(tag):
    """Return the stripped text of ``tag``, or '' when the element was not found."""
    return tag.text.strip() if tag is not None else ''


def process_page(soup):
    """
    Extract one record per freelancer card found in a parsed search page.

    :param soup: parsed page exposing ``find_all`` (e.g. a BeautifulSoup object).
    :return: list of rows, each
        ``[name, title, country, rate, earnings, success, badges, description]``.
        ``earnings`` and ``success`` are '' when the card has no such element;
        ``badges`` is a comma-separated string.
    :raises AttributeError: if a card lacks one of the mandatory elements
        (name, title, country, rate, description).
    """
    data = list()

    for freelancer in soup.find_all('div', {'class': "up-card-section up-card-hover"}):

        # Mandatory fields: a missing element is treated as an error.
        name = freelancer.find('div', {'class': 'identity-name'}).text.strip()
        talent_title = freelancer.find('p', {'class': 'my-0 freelancer-title'}).text.strip()
        country = freelancer.find('span', {'itemprop': 'country-name'}).text.strip()
        rate = freelancer.find('div', {'data-qa': 'rate'}).text.strip()
        description = freelancer.find('div', {'class': 'up-line-clamp-v2-wrapper mb-0'}).text.strip()

        # Optional fields: not every card carries earnings or a success score.
        # find() returns None for those, which is checked explicitly rather
        # than swallowing every exception with a bare `except:`.
        earnings = _optional_text(freelancer.find('div', {'data-qa': 'earnings'}))
        success = _optional_text(freelancer.find('span', {'class': 'up-job-success-text'}))

        badges = ', '.join(
            badge.text.strip()
            for badge in freelancer.find_all('div', {'class': 'up-skill-badge'})
        )

        data.append([name, talent_title, country, rate, earnings, success, badges, description])

    return data

In [8]:
search_key_word = 'data science'
# Percent-encode the keyword once, outside the loop; unlike a plain
# space -> %20 replacement this also escapes characters such as & or #.
encoded_key_word = quote(search_key_word, safe='')

# Inclusive range of result pages to scrape.
first_page, last_page = 1, 1

data = list()

for page in range(first_page, last_page + 1):
    url = f'https://www.upwork.com/search/profiles/?page={page}&q={encoded_key_word}'
    content = scrape_js_content(url)
    soup = BeautifulSoup(content, 'html.parser')
    data += process_page(soup)
    # Random 7-11 second pause between requests; there is nothing to wait for
    # after the final page, so the sleep is skipped there.
    if page < last_page:
        time.sleep(np.random.randint(7, 12))


df = pd.DataFrame(data, columns=['Name', 'Title', 'Country', 'Rate', 'Earnings', 'Success', 'Badges', 'Description'])
df.head()

Unnamed: 0,Name,Title,Country,Rate,Earnings,Success,Badges,Description
0,Ismail T.,Data Strategy Consultant | Data Solutions Expe...,United States,$250.00 /hr,$100k+ earned\n Close the tooltip \...,99% Job Success,"R, Microsoft Excel, Machine Learning, Data Sci...",⭐⭐⭐⭐⭐ 5-Star 𝗧𝗼𝗽 𝗥𝗮𝘁𝗲𝗱 𝗣𝗹𝘂𝘀 Data Professional\...
1,Callum H.,Data Scientist: quantitative and qualitative d...,South Africa,$40.00 /hr,$10k+ earned\n Close the tooltip \n...,100% Job Success,"Data Science, Data Mining, Data Cleansing, Dat...",Greetings! I am a data scientist who lives in ...
2,Mohammed Z.,Data Scientist,United States,$85.00 /hr,,100% Job Success,"Data Science, Machine Learning, Data Analytics...",Do you need a Data Scientist or a Data Enginee...
3,Luiz T.,Data Scientist and Machine Learning specialist,Brazil,$85.00 /hr,$40k+ earned\n Close the tooltip \n...,100% Job Success,"Python, Machine Learning, Data Science, Data A...",I am a trained computer scientist passionate a...
4,Ahmed A.,Machine Learning | Deep Learning | NLP Engineer,Egypt,$60.00 /hr,,100% Job Success,"Python, Data Science, Scripting, Algorithms, N...","Hello there,\n\nThis is Ahmed, a Machine Learn..."


In [9]:
df.shape

(10, 8)