In [1]:
import numpy as np
import pandas as pd
import datetime
import re
import ast
from nltk.corpus import stopwords
import utils
from collections import Counter

In [2]:
# Suffix identifying the scraping run; it is part of every data file name below.
scraping_date = '_17_05_2021'
# Alternative input for the same run: f'./data/improved_profiles{scraping_date}.csv'
data = pd.read_csv(f'./data/profiles_info{scraping_date}.csv')

In [3]:
# Scraping date in ISO format (YYYY-MM-DD), which is what is handed to
# utils.get_exchange_rates. It is derived from `scraping_date` ('_DD_MM_YYYY')
# so the two values can never drift apart when a new scraping run is processed.
fscraping_date = datetime.datetime.strptime(scraping_date, '_%d_%m_%Y').strftime('%Y-%m-%d')

# Format the price

**Convert all prices to USD, using the exchange rates of the day the data was scraped.**

In [4]:
# Currency symbol found at the start of each scraped price -> currency code.
currency_dict = {
    '£': 'GBP',
    '€': 'EUR',
    '$': 'USD',
}
# Only the non-USD currencies need a rate.
currencies = [code for code in currency_dict.values() if code != 'USD']
exchange_rates = utils.get_exchange_rates(currencies, fscraping_date)

In [5]:
# Show the rates returned for the requested currency codes.
exchange_rates

{'GBP': 1.4084556052, 'EUR': 1.2143}

In [6]:
# Raw price values from the frame, and the list that receives the USD amounts.
prices = np.array(data['price'])
formated_prices = list()

In [7]:
def _price_to_usd(price: str) -> float:
    """Convert one scraped price (currency symbol followed by the amount) to USD.

    The first character selects the currency through ``currency_dict``; the
    rest is parsed as the amount. Thousands separators and surrounding
    whitespace are tolerated. An unknown currency symbol raises ``KeyError``.
    The result is rounded to 2 decimals.
    """
    price = price.strip()
    symbol, amount = price[0], float(price[1:].replace(',', ''))
    code = currency_dict[symbol]
    # USD amounts are kept as they are; every other currency is multiplied by its rate.
    rate = 1.0 if code == 'USD' else exchange_rates[code]
    return round(amount * rate, 2)


# Rebuilt from scratch on every run so that re-executing this cell never
# appends a second copy of the prices to the list.
formated_prices = [_price_to_usd(price) for price in prices]

In [8]:
# Replace the raw prices by the converted amounts, then rename the column so
# that its name states the unit and the currency.
data = (
    data
    .assign(price=formated_prices)
    .rename(columns={'price': 'Price/hour in USD'})
)

# Split location in city and country

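**Location is normally in the format `city, country`. Longer forms such as `city, state, United States` are also handled: the first part is kept as the city and the last one as the country.**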

In [9]:
# Location strings of every profile, as a NumPy array.
location = np.array(data['location'])

In [10]:
def _split_location(raw_location):
    """Return ``(city, country)`` parsed from a comma-separated location.

    The first component is the city and the last one the country, whatever
    sits in between (e.g. a state). Components are stripped of surrounding
    whitespace. Missing or empty locations give ``(None, None)``.
    """
    if not isinstance(raw_location, str):
        return None, None
    parts = [part.strip() for part in raw_location.split(',')]
    parts = [part for part in parts if part]
    if not parts:
        return None, None
    if len(parts) == 1:
        # NOTE(review): a lone component is assumed to be the country - confirm against the data.
        return None, parts[0]
    return parts[0], parts[-1]


# Every row gets its own freshly parsed values: nothing is carried over from
# the previous row when a location has an unexpected number of components.
cities, countries = [], []
for city_country in location:
    city, country = _split_location(city_country)
    cities.append(city)
    countries.append(country)

In [11]:
# Append the two parsed columns and drop the combined one they were built from.
data = (
    data
    .assign(city=cities, country=countries)
    .drop(columns='location')
)

# Format gender

**Assign** ``unknown`` **to gender when the gender cannot be determined (for example when the profile is a company or a group of people).**

In [12]:
# Missing genders are labelled 'unknown'. fillna targets missing values
# explicitly instead of relying on `replace` matching them through a None key.
data = data.fillna({'gender': 'unknown'})

# Format the last active date

In [13]:
# Raw "last active" strings, and the list that receives the reformatted dates.
last_active = np.array(data['last_active'])
dates = list()

In [14]:
# Parse the scraped '%b %d %Y' strings (abbreviated month name, day, year) and
# re-emit them as DD-MM-YYYY. The list is rebuilt on every run, so re-executing
# this cell cannot leave twice as many dates as there are rows.
dates = [
    datetime.datetime.strptime(date, '%b %d %Y').strftime('%d-%m-%Y')
    for date in last_active
]

In [15]:
# Work on a copy and overwrite the column with the reformatted dates.
data = data.copy()
data['last_active'] = dates

# Format languages spoken

**Remove filler words and keep only the words that name a language spoken by the person.**

In [16]:
# Raw language descriptions, and the list that receives the cleaned language names.
languages = np.array(data['languages'])
formated_languages = list()

In [17]:
# Everything that is neither a word character nor whitespace. A raw string
# keeps \w and \s from being read as (invalid) string escape sequences.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
# Stop-word pattern, built on first use and then reused by every call.
_STOPWORDS_RE = None


def remove_punc_stop_words(sentence: str) -> str:
    """Remove punctuation and English stop words from ``sentence``.

    Punctuation is deleted first, then every stop word from
    ``stopwords.words('english')`` is deleted together with the whitespace
    that follows it. Matching is case-sensitive and limited to whole words.

    Args:
        sentence: the text to clean.

    Returns:
        The cleaned text.
    """
    global _STOPWORDS_RE
    if _STOPWORDS_RE is None:
        # The word list and the compiled pattern are identical for every call,
        # so they are computed once instead of once per sentence.
        alternatives = '|'.join(re.escape(word) for word in stopwords.words('english'))
        _STOPWORDS_RE = re.compile(r'\b(' + alternatives + r')\b\s*')
    sentence = _PUNCTUATION_RE.sub('', sentence)
    return _STOPWORDS_RE.sub('', sentence)

In [18]:
# Words that describe the entry rather than name a language. Compiled once
# instead of once per profile.
filler_words = re.compile(r'\b(?:speaking|macrolanguage)\b\s*')


def _extract_languages(spoken_languages) -> list:
    """Return the language names found in one raw `languages` value.

    Punctuation, stop words and the words "speaking" / "macrolanguage" are
    removed; what remains is split on whitespace. A missing value (anything
    that is not a string, e.g. NaN) gives an empty list.
    """
    if not isinstance(spoken_languages, str):
        return []
    cleaned = filler_words.sub('', remove_punc_stop_words(spoken_languages))
    # split() without argument ignores leading/trailing/repeated whitespace,
    # so no empty-string "language" can end up in the result.
    return cleaned.split()


# Rebuilt on every run so that re-executing this cell does not duplicate entries.
formated_languages = [_extract_languages(entry) for entry in languages]

In [19]:
# Work on a copy and overwrite the column with the per-profile language lists.
data = data.copy()
data['languages'] = formated_languages

In [20]:
# Display the frame to check the reformatted `languages` column.
data

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,expertise,nb_projects,nb_buyers,last_active,gender,city,country
0,Maria H.,[English],We are a small UK based company speciaIising i...,4.9,4745,35.21,"['animation', 'graphics design', 'brand design...","Industry expertise: IT, Internet, Marketing, M...",7727,4204,17-05-2021,unknown,Sheffield,United Kingdom
1,Jessica O'Neill,[English],Your blog content and website copy need to be ...,5.0,1220,77.47,"['editing', 'proofreading', 'transcription', '...","Industry expertise: PR, Sales and Travel",1546,880,17-05-2021,unknown,City of London,United Kingdom
2,Scott Anfield,[English],An experienced and professional writer based i...,4.9,1755,21.13,"['article', 'product description', 'content wr...","Industry expertise: Advertising, Education, Ma...",1784,1094,17-05-2021,unknown,Doncaster,United Kingdom
3,Translate Guru .,[],Hello! Hola! Ciao! Bonjour! 你好!\r\n\r\nI do hi...,4.9,608,20.00,"['German <=> English translation', 'Arabic <=>...",,730,378,17-05-2021,unknown,Cannanore,India
4,Logical Translation & Localisation,"[English, Spanish, French, Italian]",OFFERING A WIDE RANGE OF TRANSLATION SERVICES\...,4.9,135,28.17,"['editing', 'proofreading', 'German translatio...","Industry expertise: Advertising, Aerospace, Au...",140,100,17-05-2021,unknown,City of London,United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Kendra Aquino,[English],I am an ambitious professional with working ba...,0.0,0,20.00,"['editing', 'microsoft excel', 'proofreading',...","Industry expertise: Advertising, Marketing and...",0,0,14-02-2021,unknown,Viareggio,Italy
9999,Maria Jussel,"[German, English]",Founding my business in 2020 has been a real s...,5.0,3,145.72,"['financial accounting', 'financial management...","Industry expertise: Financial Services, Human ...",2,1,24-04-2021,unknown,Vienna,Austria
10000,Alex Halfacree,[English],Hi! My name is Alex and I am a 19 year old bus...,0.0,0,8.45,"['administration support', 'data entry', 'busi...",,0,0,18-04-2021,unknown,Crawley,United Kingdom
10001,Allison Gerena,[English],I am a professional copywriter with a BA in En...,0.0,0,21.13,"['proofreading', 'email marketing', 'product d...","Industry expertise: Gaming, Internet, Leisure,...",0,0,14-04-2021,unknown,Crawley,United Kingdom


# Format the expertise

In [21]:
# Raw expertise descriptions, and the list that receives the extracted industry words.
expertise = np.array(data['expertise'])
formated_expertise = list()

In [22]:
def _extract_expertise(profile_expertise) -> list:
    """Return the industry words found in one raw `expertise` value.

    Punctuation and stop words are removed, the text is split on whitespace
    and the first two tokens (the "Industry expertise" label that starts the
    text) are dropped. A missing value (anything that is not a string, e.g.
    NaN) gives an empty list.
    """
    if not isinstance(profile_expertise, str):
        return []
    # split() without argument never yields empty tokens, so the label is
    # always exactly the first two entries.
    tokens = remove_punc_stop_words(profile_expertise).split()
    # NOTE(review): industries made of several words (e.g. "Financial Services")
    # come out as separate tokens - confirm this is what later steps expect.
    return tokens[2:]


# Rebuilt on every run so that re-executing this cell does not duplicate entries.
formated_expertise = [_extract_expertise(entry) for entry in expertise]

In [23]:
# Store the extracted lists, then rename the column to say what it now holds.
data = (
    data
    .assign(expertise=formated_expertise)
    .rename(columns={'expertise': 'industry_expertise'})
)

In [24]:
# Display the frame to check the new `industry_expertise` column.
data

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,industry_expertise,nb_projects,nb_buyers,last_active,gender,city,country
0,Maria H.,[English],We are a small UK based company speciaIising i...,4.9,4745,35.21,"['animation', 'graphics design', 'brand design...","[IT, Internet, Marketing, Media, Technology]",7727,4204,17-05-2021,unknown,Sheffield,United Kingdom
1,Jessica O'Neill,[English],Your blog content and website copy need to be ...,5.0,1220,77.47,"['editing', 'proofreading', 'transcription', '...","[PR, Sales, Travel]",1546,880,17-05-2021,unknown,City of London,United Kingdom
2,Scott Anfield,[English],An experienced and professional writer based i...,4.9,1755,21.13,"['article', 'product description', 'content wr...","[Advertising, Education, Marketing, Sales, Tra...",1784,1094,17-05-2021,unknown,Doncaster,United Kingdom
3,Translate Guru .,[],Hello! Hola! Ciao! Bonjour! 你好!\r\n\r\nI do hi...,4.9,608,20.00,"['German <=> English translation', 'Arabic <=>...",[],730,378,17-05-2021,unknown,Cannanore,India
4,Logical Translation & Localisation,"[English, Spanish, French, Italian]",OFFERING A WIDE RANGE OF TRANSLATION SERVICES\...,4.9,135,28.17,"['editing', 'proofreading', 'German translatio...","[Advertising, Aerospace, Automotive, Construct...",140,100,17-05-2021,unknown,City of London,United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Kendra Aquino,[English],I am an ambitious professional with working ba...,0.0,0,20.00,"['editing', 'microsoft excel', 'proofreading',...","[Advertising, Marketing, Publishing]",0,0,14-02-2021,unknown,Viareggio,Italy
9999,Maria Jussel,"[German, English]",Founding my business in 2020 has been a real s...,5.0,3,145.72,"['financial accounting', 'financial management...","[Financial, Services, Human, Resources, IT, Ma...",2,1,24-04-2021,unknown,Vienna,Austria
10000,Alex Halfacree,[English],Hi! My name is Alex and I am a 19 year old bus...,0.0,0,8.45,"['administration support', 'data entry', 'busi...",[],0,0,18-04-2021,unknown,Crawley,United Kingdom
10001,Allison Gerena,[English],I am a professional copywriter with a BA in En...,0.0,0,21.13,"['proofreading', 'email marketing', 'product d...","[Gaming, Internet, Leisure, Marketing, Telecom...",0,0,14-04-2021,unknown,Crawley,United Kingdom


# Format the skills

In [25]:
# Patterns are compiled once, and written as raw strings so that \[ and \]
# are not read as (invalid) string escape sequences.
_PARENTHESISED_RE = re.compile(r"\([^()]*\)")
_SKILL_PUNCT_RE = re.compile(r"[.;:!'?,\"()\[\]]")
_ARROW_RE = re.compile(r"<=>")


def clean(skills: list) -> list:
    """Normalise a list of skill names.

    Each skill is lower-cased, parenthesised remarks are dropped, the
    remaining punctuation (``. ; : ! ' ? , " ( ) [ ]``) is removed and the
    ``<=>`` arrow is treated as a word separator. Runs of whitespace are
    collapsed to a single space and the ends are trimmed.

    Args:
        skills: the raw skill names of one profile.

    Returns:
        A new list with one cleaned name per input skill, in the same order.
    """
    cleaned_skills = []
    for skill in skills:
        skill = skill.lower()
        # Parenthesised text goes first, while its brackets are still there to delimit it.
        skill = _PARENTHESISED_RE.sub("", skill)
        skill = _SKILL_PUNCT_RE.sub("", skill)
        # A space (not nothing) replaces the arrow so "a<=>b" does not become "ab".
        skill = _ARROW_RE.sub(" ", skill)
        # Collapses the double spaces left behind by the removals above.
        cleaned_skills.append(" ".join(skill.split()))
    return cleaned_skills

In [26]:
# Raw skills values (each one a list written out as text), and the list that
# receives the cleaned skill lists.
skills = np.array(data['skills'])
formated_skills = list()

In [27]:
# Each value is the text of a Python list literal: ast.literal_eval turns it
# back into a list without executing code, then `clean` normalises every name.
# The list is rebuilt on every run, so re-executing this cell does not
# duplicate entries.
formated_skills = [clean(ast.literal_eval(raw_skills)) for raw_skills in skills]

In [28]:
# Work on a copy and overwrite the column with the cleaned skill lists.
data = data.copy()
data['skills'] = formated_skills

In [29]:
# Display the frame to check the cleaned `skills` column.
data

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,industry_expertise,nb_projects,nb_buyers,last_active,gender,city,country
0,Maria H.,[English],We are a small UK based company speciaIising i...,4.9,4745,35.21,"[animation, graphics design, brand design, log...","[IT, Internet, Marketing, Media, Technology]",7727,4204,17-05-2021,unknown,Sheffield,United Kingdom
1,Jessica O'Neill,[English],Your blog content and website copy need to be ...,5.0,1220,77.47,"[editing, proofreading, transcription, academi...","[PR, Sales, Travel]",1546,880,17-05-2021,unknown,City of London,United Kingdom
2,Scott Anfield,[English],An experienced and professional writer based i...,4.9,1755,21.13,"[article, product description, content writing...","[Advertising, Education, Marketing, Sales, Tra...",1784,1094,17-05-2021,unknown,Doncaster,United Kingdom
3,Translate Guru .,[],Hello! Hola! Ciao! Bonjour! 你好!\r\n\r\nI do hi...,4.9,608,20.00,"[german english translation, arabic english ...",[],730,378,17-05-2021,unknown,Cannanore,India
4,Logical Translation & Localisation,"[English, Spanish, French, Italian]",OFFERING A WIDE RANGE OF TRANSLATION SERVICES\...,4.9,135,28.17,"[editing, proofreading, german translation, en...","[Advertising, Aerospace, Automotive, Construct...",140,100,17-05-2021,unknown,City of London,United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Kendra Aquino,[English],I am an ambitious professional with working ba...,0.0,0,20.00,"[editing, microsoft excel, proofreading, socia...","[Advertising, Marketing, Publishing]",0,0,14-02-2021,unknown,Viareggio,Italy
9999,Maria Jussel,"[German, English]",Founding my business in 2020 has been a real s...,5.0,3,145.72,"[financial accounting, financial management, b...","[Financial, Services, Human, Resources, IT, Ma...",2,1,24-04-2021,unknown,Vienna,Austria
10000,Alex Halfacree,[English],Hi! My name is Alex and I am a 19 year old bus...,0.0,0,8.45,"[administration support, data entry, business ...",[],0,0,18-04-2021,unknown,Crawley,United Kingdom
10001,Allison Gerena,[English],I am a professional copywriter with a BA in En...,0.0,0,21.13,"[proofreading, email marketing, product descri...","[Gaming, Internet, Leisure, Marketing, Telecom...",0,0,14-04-2021,unknown,Crawley,United Kingdom


# Define a score for a profile

**The score is based on the rating of the profile as well as on its number of reviews. As the product of these two values can get quite large, we apply a** ``log`` **function to compress it while keeping it increasing.**\
We add 1 inside the logarithm so that profiles without reviews (product equal to 0) get a score of 0 instead of $\log(0)$.
$$score=\log(1+rating\times nb\_reviews)$$

In [30]:
# log(1 + rating * nb_reviews), computed for all profiles at once.
score = np.log(data['rating'] * data['nb_reviews'] + 1)

In [31]:
# Keep three decimals of the score in the frame.
data['score'] = score.round(3)

In [32]:
# Display the frame to check the new `score` column.
data

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,industry_expertise,nb_projects,nb_buyers,last_active,gender,city,country,score
0,Maria H.,[English],We are a small UK based company speciaIising i...,4.9,4745,35.21,"[animation, graphics design, brand design, log...","[IT, Internet, Marketing, Media, Technology]",7727,4204,17-05-2021,unknown,Sheffield,United Kingdom,10.054
1,Jessica O'Neill,[English],Your blog content and website copy need to be ...,5.0,1220,77.47,"[editing, proofreading, transcription, academi...","[PR, Sales, Travel]",1546,880,17-05-2021,unknown,City of London,United Kingdom,8.716
2,Scott Anfield,[English],An experienced and professional writer based i...,4.9,1755,21.13,"[article, product description, content writing...","[Advertising, Education, Marketing, Sales, Tra...",1784,1094,17-05-2021,unknown,Doncaster,United Kingdom,9.060
3,Translate Guru .,[],Hello! Hola! Ciao! Bonjour! 你好!\r\n\r\nI do hi...,4.9,608,20.00,"[german english translation, arabic english ...",[],730,378,17-05-2021,unknown,Cannanore,India,8.000
4,Logical Translation & Localisation,"[English, Spanish, French, Italian]",OFFERING A WIDE RANGE OF TRANSLATION SERVICES\...,4.9,135,28.17,"[editing, proofreading, german translation, en...","[Advertising, Aerospace, Automotive, Construct...",140,100,17-05-2021,unknown,City of London,United Kingdom,6.496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Kendra Aquino,[English],I am an ambitious professional with working ba...,0.0,0,20.00,"[editing, microsoft excel, proofreading, socia...","[Advertising, Marketing, Publishing]",0,0,14-02-2021,unknown,Viareggio,Italy,0.000
9999,Maria Jussel,"[German, English]",Founding my business in 2020 has been a real s...,5.0,3,145.72,"[financial accounting, financial management, b...","[Financial, Services, Human, Resources, IT, Ma...",2,1,24-04-2021,unknown,Vienna,Austria,2.773
10000,Alex Halfacree,[English],Hi! My name is Alex and I am a 19 year old bus...,0.0,0,8.45,"[administration support, data entry, business ...",[],0,0,18-04-2021,unknown,Crawley,United Kingdom,0.000
10001,Allison Gerena,[English],I am a professional copywriter with a BA in En...,0.0,0,21.13,"[proofreading, email marketing, product descri...","[Gaming, Internet, Leisure, Marketing, Telecom...",0,0,14-04-2021,unknown,Crawley,United Kingdom,0.000


In [33]:
# Write the fully formatted profiles for this scraping run; the row index is not written.
data.to_csv(f'./data/full_profiles{scraping_date}.csv', index=False)