In [1]:
import requests
from lxml import html
import pandas as pd
import time

In [2]:
def get_page_nb(url: str) -> int:
    '''Return the number of result pages advertised by the listing at ``url``.

    The count is read from the pagination bar of the start page.

    Args:
        url: Address of the first page of the listing.

    Returns:
        The page number shown by the penultimate pagination link.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        IndexError: If the page has fewer than two pagination links.
        ValueError: If the penultimate pagination link is not a number.
    '''
    resp = requests.get(url)
    # Fail loudly on an error response instead of parsing an error page and
    # crashing later with an exception that hides the real cause.
    resp.raise_for_status()
    page = html.fromstring(resp.text)
    links = page.xpath('//a[contains(@class, "pagination__link")]')
    if len(links) < 2:
        # Same exception type as the bare [-2] lookup, but with a message
        # that points at the actual problem.
        raise IndexError(
            'expected at least 2 pagination links at {}, found {}'.format(url, len(links))
        )
    # The previous/next arrows carry the same class as the page numbers, so
    # the last link is the "next" arrow and the penultimate one holds the
    # highest page number.
    page_nb = links[-2].text_content()
    return int(page_nb)

In [3]:
# the function below gets the url of all the profiles of one page (with 24 profiles)
def get_profiles(url: str, df: pd.DataFrame, retry_delay: float = 60) -> pd.DataFrame:
    '''Fetch one listing page and return ``df`` extended with the profile links found on it.

    Args:
        url: Address of the listing page to scrape.
        df: Frame collected so far; it is not modified in place.
        retry_delay: Seconds to wait before requesting the page again after a
            non-200 response.

    Returns:
        A frame holding the rows of ``df`` followed by one ``url`` row per
        profile card, re-indexed from 0. ``df`` itself is returned when the
        page contains no profile card.
    '''
    resp = requests.get(url)
    # NOTE: retries indefinitely until the server answers 200.
    while resp.status_code != 200:
        time.sleep(retry_delay)
        resp = requests.get(url)
    listing = html.fromstring(resp.text)
    cards = listing.xpath('//div[contains(@class, "card⤍FreelancerCard")]')
    # The first link inside a card is the one pointing to the full profile.
    records = [{'url': card.xpath('.//a')[0].get('href')} for card in cards]
    if not records:
        return df
    # DataFrame.append was removed in pandas 2.0; building the new rows once
    # and concatenating also avoids copying the frame for every profile.
    return pd.concat([df, pd.DataFrame(records)], ignore_index=True)

In [4]:
def get_all_profiles(start_page: str, nb_page: int, verbose:int =1) -> pd.DataFrame:
    """
    Walk through the listing pages 1..nb_page and gather every profile link.

    Each page address is built by adding a ``?page=<n>`` query to ``start_page``.
    When ``verbose`` equals 1, a progress message is printed every 50 pages.
    Returns a DataFrame with a single ``url`` column.
    """
    collected = pd.DataFrame(columns=['url'])
    bar = '***********'
    for page_no in range(1, nb_page + 1):
        collected = get_profiles(start_page + '?page={}'.format(page_no), collected)
        # Only report on every 50th page, and only in verbose mode.
        if verbose != 1 or page_no % 50 != 0:
            continue
        print('page {}/{}'.format(page_no, nb_page))
        # The bar gains one extra star once the first 50 pages are done.
        print(bar if page_no <= 50 else bar + '*')
    print('End of scraping...')
    return collected

In [5]:
# Entry point of the scrape: the first listing page of the category to crawl.
start_page = 'https://www.peopleperhour.com/hire-freelancers/writing-translation'
# Read the number of result pages from the pagination bar of the start page.
max_page = get_page_nb(start_page)
# Visit every page from 1 to max_page and gather the profile links in one frame.
df = get_all_profiles(start_page, max_page)

page 50/417
***********
page 100/417
************
page 150/417
************
page 200/417
************
page 250/417
************
page 300/417
************
page 350/417
************
page 400/417
************
End of scraping...


In [6]:
# Date suffix for the output file name, formatted as _DD_MM_YYYY from the local clock.
today = time.strftime("_%d_%m_%Y")

In [9]:
# Save the collected links without the index column. The path is relative to
# the working directory, and the ./data directory has to exist already.
df.to_csv('./data/profiles_url'+today+'.csv', index=False)