In [1]:
import numpy as np
import pandas as pd
import datetime
import re
import ast
from nltk.corpus import stopwords
import utils
from collections import Counter

https://www.peopleperhour.com/freelancer/business/teodora-popescu-certified-translator-digital-zamnnqz?Projects_page=1
https://www.peopleperhour.com/freelancer/business/teodora-popescu-certified-translator-digital-zamnnqz?Projects_page=2


In [2]:
# Suffix identifying the scrape; it is embedded in both the input and output CSV names.
scraping_date = '_26_04_2021'
profiles_path = './data/improved_profiles' + scraping_date + '.csv'
data = pd.read_csv(profiles_path)

In [3]:
# ISO-formatted scrape date (YYYY-MM-DD) passed to the exchange-rate lookup.
# Derived from `scraping_date` ('_DD_MM_YYYY') so the two values cannot drift
# apart when the notebook is pointed at a different scrape.
fscraping_date = datetime.datetime.strptime(scraping_date, '_%d_%m_%Y').strftime('%Y-%m-%d')

# Format the price

**Change all currencies to USD, using the exchange rate of the day when the data was scraped.**

In [4]:
# Maps the symbol that prefixes each scraped price to a currency code.
currency_dict = {'£': 'GBP', '€': 'EUR', '$': 'USD'}
# Only non-USD currencies need a rate; the order follows `currency_dict`.
currencies = [code for code in currency_dict.values() if code != 'USD']
exchange_rates = utils.get_exchange_rates(currencies, fscraping_date)

In [5]:
# Display the fetched rates; the price loop multiplies each non-USD price by these values.
exchange_rates

{'GBP': 1.3894797356, 'EUR': 1.2085}

In [6]:
# Raw price strings (currency symbol followed by the amount) and the list
# that will receive the USD-converted values.
prices = np.array(data['price'])
formated_prices = list()

In [7]:
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every price (which would break the later `assign`).
formated_prices = []
for price in prices:
    # The first character is the currency symbol, the rest is the amount.
    symbol, amount = price[0], float(price[1:])
    currency_code = currency_dict[symbol]
    # USD prices are kept as-is; other currencies are multiplied by their rate.
    rate = 1.0 if currency_code == 'USD' else exchange_rates[currency_code]
    formated_prices.append(round(amount * rate, 2))

In [8]:
# Overwrite the raw price strings with the converted values and make the
# unit explicit in the column name.
data = (
    data
    .assign(price=formated_prices)
    .rename(columns={'price': 'Price/hour in USD'})
)

# Split location in city and country

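**Location is normally in the format `city, country`. Some locations have three comma-separated parts ending in `United States`; for those, the first part is kept as the city and the last as the country.**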

In [9]:
# Raw location strings, one per profile.
location = np.array(data['location'])

In [10]:
cities, countries = [], []
for city_country in location:
    # Strip each part so a space after the comma does not leak into the values
    # (and so comparisons against country names behave as expected).
    parts = [part.strip() for part in city_country.split(',')]
    # The first part is the city and the last part is the country. This covers
    # both "city, country" and the three-part United States form, and it
    # prevents any other multi-part location from silently inheriting the
    # previous row's city/country.
    city = parts[0]
    # NOTE(review): a location without a comma is kept as the city with a
    # missing country -- confirm this is the desired interpretation.
    country = parts[-1] if len(parts) > 1 else np.nan
    cities.append(city)
    countries.append(country)

In [11]:
# Append the two new columns and drop the combined one they replace.
data = (
    data
    .assign(city=cities, country=countries)
    .drop(columns='location')
)

# Format gender

**Assign** ``unknown`` **to gender when the gender cannot be determined (e.g. when the profile belongs to a company or a group of people).**

In [12]:
# Missing gender values are labelled 'unknown'.
data = data.assign(gender=data['gender'].fillna('unknown'))

# Format the last active date

In [13]:
# Raw "last active" strings and the list that will hold the reformatted dates.
last_active = np.array(data['last_active'])
dates = list()

In [14]:
# Build the list in one expression so re-running this cell always yields
# exactly one entry per profile instead of appending to a previous run.
# Input looks like 'Apr 26 2021' ('%b %d %Y'); output is 'DD-MM-YYYY'.
dates = [
    datetime.datetime.strptime(date, '%b %d %Y').strftime('%d-%m-%Y')
    for date in last_active
]

In [15]:
# Replace the raw last_active strings with the DD-MM-YYYY formatted ones.
data = data.assign(last_active=dates)

# Format languages spoken

**Remove filler words and keep only the words that refer to a language spoken by the person.**

In [16]:
# Raw language descriptions and the list that will hold one token list per profile.
languages = np.array(data['languages'])
formated_languages = list()

In [17]:
def remove_punc_stop_words(sentence: str) -> str:
    """Strip punctuation and English stop words from ``sentence``.

    Every character that is neither a word character nor whitespace is
    removed first. Then each whole-word occurrence of an NLTK English stop
    word is removed together with the whitespace that follows it. Matching is
    case-sensitive, so upper-case tokens such as ``IT`` are kept.

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned text.
    """
    # The stop-word list never changes, so the (large) alternation is compiled
    # once and cached on the function instead of being rebuilt on every call.
    stop_words_pattern = getattr(remove_punc_stop_words, '_stop_words_pattern', None)
    if stop_words_pattern is None:
        alternatives = r'|'.join(re.escape(word) for word in stopwords.words('english'))
        stop_words_pattern = re.compile(r'\b(' + alternatives + r')\b\s*')
        remove_punc_stop_words._stop_words_pattern = stop_words_pattern

    # Raw string: "\w" / "\s" in a normal string literal are invalid escapes.
    sentence = re.sub(pattern=r"[^\w\s]", repl="", string=sentence)
    return stop_words_pattern.sub('', sentence)

In [18]:
# remove punctuation, stop words and the words speaking and macrolanguage
# Compiled once, outside the loop, since the pattern does not depend on the row.
filler_words = re.compile(r'\bspeaking\b\s*' + r'|' + r'\bmacrolanguage\b\s*')

# Start from an empty list so re-running this cell does not append duplicates.
formated_languages = []
for spoken_languages in languages:
    # NaN is the only value that is not equal to itself: profile without languages.
    if spoken_languages != spoken_languages:
        formated_languages.append([])
        continue
    spoken_languages = remove_punc_stop_words(spoken_languages)
    spoken_languages = filler_words.sub('', spoken_languages)
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so no empty-string "languages" end up in the list.
    formated_languages.append(spoken_languages.split())

In [19]:
# Replace the raw language strings with the per-profile lists of language tokens.
data = data.assign(languages=formated_languages)

In [20]:
# Display the frame to check the formatted price, location, date and language columns.
data

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,expertise,nb_projects,nb_buyers,last_active,gender,city,country
0,Maria H.,[English],We are a small UK based company speciaIising i...,4.9,4694,34.74,"['animation', 'graphics design', 'brand design...","Industry expertise: IT, Internet, Marketing, M...",7672,4168,26-04-2021,female,Sheffield,United Kingdom
1,Cormac Reynolds,[English],Looking for a link building solution or some g...,5.0,1190,138.95,"['online marketing', 'link building', 'marketi...",,1085,533,25-04-2021,unknown,City of London,United Kingdom
2,Denise Toepel,"[English, Spanish, French]",I am a Translation Specialist. I work in publ...,0.0,0,27.79,"['English translation', 'French <=> English', ...",Industry expertise: Education and Telecommunic...,0,0,25-04-2021,female,Cuenca,Ecuador
3,Translate Guru .,[],Hello! Hola! Ciao! Bonjour! 你好!\r\n\r\nI do hi...,4.9,598,20.00,"['German <=> English translation', 'Arabic <=>...",,721,376,26-04-2021,unknown,Cannanore,India
4,Logical Translation & Localisation,"[English, Spanish, French, Italian]",OFFERING A WIDE RANGE OF TRANSLATION SERVICES\...,4.9,135,27.79,"['editing', 'proofreading', 'German translatio...","Industry expertise: Advertising, Aerospace, Au...",140,100,24-04-2021,unknown,City of London,United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Goggle Software Ltd,[English],Goggle Software build CRMs and bespoke portals...,5.0,31,69.47,"['ajax', 'database development', 'hosting deve...","Industry expertise: Electronics, IT and Techno...",26,22,15-04-2021,male,Leeds,United Kingdom
9999,Roshan Rupasinghe,[English],I am an IIBA certified business analyst with 3...,0.0,0,12.00,"['business analysis', 'business writing', 'bus...",Industry expertise: IT,0,0,02-12-2020,unknown,Colombo,Sri Lanka
10000,Sally Lavinia,"[English, Swahili]",,0.0,0,30.00,"['microsoft excel', 'proofreading', 'academic ...","Industry expertise: Financial Services, Intern...",0,0,30-12-2020,female,Nairobi,Kenya
10001,Mohammed Younes,"[Arabic, English]","With a couple of years of experience, I was ab...",0.0,0,15.00,"['graphics design', 'social media management',...","Industry expertise: Advertising, Food, Media a...",0,0,25-04-2021,male,Zgharta,Lebanon


# Format the expertise

In [21]:
# Raw expertise strings and the list that will hold one token list per profile.
expertise = np.array(data['expertise'])
formated_expertise = list()

In [22]:
# Start from an empty list so re-running this cell does not append duplicates.
formated_expertise = []
for profile_expertise in expertise:
    # NaN is the only value that is not equal to itself: profile without expertise.
    if profile_expertise != profile_expertise:
        formated_expertise.append([])
        continue
    # split() without an argument never produces empty tokens, unlike split(' ').
    words = remove_punc_stop_words(profile_expertise).split()
    # The first two words are the "Industry expertise" label, not an industry.
    # NOTE(review): multi-word industries (e.g. "Financial Services") are still
    # split into separate tokens here -- confirm whether that is intended.
    formated_expertise.append(words[2:])

In [23]:
# Store the token lists and give the column a more descriptive name.
data = (
    data
    .assign(expertise=formated_expertise)
    .rename(columns={'expertise': 'industry_expertise'})
)

In [24]:
# Display the frame to check the new industry_expertise column.
data

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,industry_expertise,nb_projects,nb_buyers,last_active,gender,city,country
0,Maria H.,[English],We are a small UK based company speciaIising i...,4.9,4694,34.74,"['animation', 'graphics design', 'brand design...","[IT, Internet, Marketing, Media, Technology]",7672,4168,26-04-2021,female,Sheffield,United Kingdom
1,Cormac Reynolds,[English],Looking for a link building solution or some g...,5.0,1190,138.95,"['online marketing', 'link building', 'marketi...",[],1085,533,25-04-2021,unknown,City of London,United Kingdom
2,Denise Toepel,"[English, Spanish, French]",I am a Translation Specialist. I work in publ...,0.0,0,27.79,"['English translation', 'French <=> English', ...","[Education, Telecommunications]",0,0,25-04-2021,female,Cuenca,Ecuador
3,Translate Guru .,[],Hello! Hola! Ciao! Bonjour! 你好!\r\n\r\nI do hi...,4.9,598,20.00,"['German <=> English translation', 'Arabic <=>...",[],721,376,26-04-2021,unknown,Cannanore,India
4,Logical Translation & Localisation,"[English, Spanish, French, Italian]",OFFERING A WIDE RANGE OF TRANSLATION SERVICES\...,4.9,135,27.79,"['editing', 'proofreading', 'German translatio...","[Advertising, Aerospace, Automotive, Construct...",140,100,24-04-2021,unknown,City of London,United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Goggle Software Ltd,[English],Goggle Software build CRMs and bespoke portals...,5.0,31,69.47,"['ajax', 'database development', 'hosting deve...","[Electronics, IT, Technology]",26,22,15-04-2021,male,Leeds,United Kingdom
9999,Roshan Rupasinghe,[English],I am an IIBA certified business analyst with 3...,0.0,0,12.00,"['business analysis', 'business writing', 'bus...",[IT],0,0,02-12-2020,unknown,Colombo,Sri Lanka
10000,Sally Lavinia,"[English, Swahili]",,0.0,0,30.00,"['microsoft excel', 'proofreading', 'academic ...","[Financial, Services, Internet, Legal, News, P...",0,0,30-12-2020,female,Nairobi,Kenya
10001,Mohammed Younes,"[Arabic, English]","With a couple of years of experience, I was ab...",0.0,0,15.00,"['graphics design', 'social media management',...","[Advertising, Food, Media, Sport]",0,0,25-04-2021,male,Zgharta,Lebanon


# Format the skills

In [25]:
def clean(skills: list) -> list:
    """Normalise a list of skill names.

    Each skill is lower-cased, parenthesised fragments are dropped, the
    punctuation characters ``. ; : ! ' ? , " ( ) [ ]`` and the ``<=>`` marker
    are removed, and whitespace is trimmed and collapsed to single spaces.

    Args:
        skills: Skill names as strings.

    Returns:
        A new list with the cleaned skill names, in the same order.
    """
    # Compiled once per call rather than once per skill. Raw strings avoid the
    # invalid "\[" escape that a normal string literal would contain.
    parenthesised = re.compile(r"\([^()]*\)")
    punctuation = re.compile(r'[.;:!\'?,"()\[\]]')

    cleaned_skills = []
    for skill in skills:
        skill = skill.lower()
        # Drop "(...)" fragments first, while the parentheses are still there.
        skill = parenthesised.sub("", skill)
        skill = punctuation.sub("", skill)
        skill = skill.replace("<=>", "")
        # Removing "<=>" leaves a double space ("french  english"); collapse
        # runs of whitespace and trim so equal skills compare equal.
        skill = " ".join(skill.split())
        cleaned_skills.append(skill)
    return cleaned_skills

In [26]:
# Raw skills (each a string holding a Python list literal) and the list that
# will hold the cleaned skill lists.
skills = np.array(data['skills'])
formated_skills = list()

In [27]:
# Start from an empty list so re-running this cell does not append duplicates.
formated_skills = []
for profile_skills in skills:
    # NaN is the only value that is not equal to itself: profile without
    # skills. Handled like the languages and expertise loops so a missing
    # value does not make literal_eval fail.
    if profile_skills != profile_skills:
        formated_skills.append([])
        continue
    # The CSV stores each skill list as its Python literal; parse it back safely.
    formated_skills.append(clean(ast.literal_eval(profile_skills)))

In [28]:
# Replace the stringified skill lists with the cleaned lists.
data = data.assign(skills=formated_skills)

In [29]:
# Display the frame to check the cleaned skills column.
data

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,industry_expertise,nb_projects,nb_buyers,last_active,gender,city,country
0,Maria H.,[English],We are a small UK based company speciaIising i...,4.9,4694,34.74,"[animation, graphics design, brand design, log...","[IT, Internet, Marketing, Media, Technology]",7672,4168,26-04-2021,female,Sheffield,United Kingdom
1,Cormac Reynolds,[English],Looking for a link building solution or some g...,5.0,1190,138.95,"[online marketing, link building, marketing wr...",[],1085,533,25-04-2021,unknown,City of London,United Kingdom
2,Denise Toepel,"[English, Spanish, French]",I am a Translation Specialist. I work in publ...,0.0,0,27.79,"[english translation, french english, online ...","[Education, Telecommunications]",0,0,25-04-2021,female,Cuenca,Ecuador
3,Translate Guru .,[],Hello! Hola! Ciao! Bonjour! 你好!\r\n\r\nI do hi...,4.9,598,20.00,"[german english translation, arabic english ...",[],721,376,26-04-2021,unknown,Cannanore,India
4,Logical Translation & Localisation,"[English, Spanish, French, Italian]",OFFERING A WIDE RANGE OF TRANSLATION SERVICES\...,4.9,135,27.79,"[editing, proofreading, german translation, en...","[Advertising, Aerospace, Automotive, Construct...",140,100,24-04-2021,unknown,City of London,United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Goggle Software Ltd,[English],Goggle Software build CRMs and bespoke portals...,5.0,31,69.47,"[ajax, database development, hosting developme...","[Electronics, IT, Technology]",26,22,15-04-2021,male,Leeds,United Kingdom
9999,Roshan Rupasinghe,[English],I am an IIBA certified business analyst with 3...,0.0,0,12.00,"[business analysis, business writing, business...",[IT],0,0,02-12-2020,unknown,Colombo,Sri Lanka
10000,Sally Lavinia,"[English, Swahili]",,0.0,0,30.00,"[microsoft excel, proofreading, academic writi...","[Financial, Services, Internet, Legal, News, P...",0,0,30-12-2020,female,Nairobi,Kenya
10001,Mohammed Younes,"[Arabic, English]","With a couple of years of experience, I was ab...",0.0,0,15.00,"[graphics design, social media management, slo...","[Advertising, Food, Media, Sport]",0,0,25-04-2021,male,Zgharta,Lebanon


# Define a score for a profile

**The score is based on the rating of the profile as well as the number of reviews. As the values are quite high, we use a** ``log`` **function to decrease the values but keep the growing trend.**\
We use the following formula to avoid having issues with 0 values.
$$score=\log(1+rating\times nb\_reviews)$$

In [49]:
# score = log(1 + rating * nb_reviews); the +1 keeps profiles with no reviews at 0.
weighted_rating = data['rating'] * data['nb_reviews']
score = np.log(1 + weighted_rating)

In [55]:
# Keep three decimals of the score, then write the fully formatted frame
# next to the input file, without the index column.
data['score'] = score.round(3)

output_path = './data/full_profiles' + scraping_date + '.csv'
data.to_csv(output_path, index=False)