In [97]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
from collections import Counter
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [98]:
scraping_date = '_26_04_2021'
data = pd.read_csv('./data/full_profiles'+scraping_date+'.csv')

In [99]:
data.head(3)

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,industry_expertise,nb_projects,nb_buyers,last_active,gender,city,country,score
0,Maria H.,['English'],We are a small UK based company speciaIising i...,4.9,4694,34.74,"['animation', 'graphics design', 'brand design...","['IT', 'Internet', 'Marketing', 'Media', 'Tech...",7672,4168,26-04-2021,female,Sheffield,United Kingdom,10.043
1,Cormac Reynolds,['English'],Looking for a link building solution or some g...,5.0,1190,138.95,"['online marketing', 'link building', 'marketi...",[],1085,533,25-04-2021,unknown,City of London,United Kingdom,8.691
2,Denise Toepel,"['English', 'Spanish', 'French']",I am a Translation Specialist. I work in publ...,0.0,0,27.79,"['english translation', 'french english', 'on...","['Education', 'Telecommunications']",0,0,25-04-2021,female,Cuenca,Ecuador,0.0


# Remove inactive profiles (no projects) and drop unnucessary features

In [100]:
print('Number of workers with inactive profiles:',  len(data.loc[(data['nb_projects'] == 0)]))

Number of workers with inactive profiles: 2930


In [101]:
data.drop(data.loc[data['nb_projects']==0].index, inplace=True)

Almost 30% of all the profiles are thus deleted of our regression analysis. This represents a big proportion of the profiles that we collected but there is still a significant amount of data that makes our study relevant.

In [102]:
del data['name'] 
del data['description'] 
del data['city'] 

# Spoken languages

In [103]:
available_languages = {}
languages = np.array(data.languages)
for profile_languages in languages:
    profile_languages = ast.literal_eval(profile_languages)
    if len(profile_languages)>0:
        for language in profile_languages:
            if language not in available_languages:
                available_languages[language] = 1
            else:
                available_languages[language] += 1

In [104]:
len(available_languages)

102

In [105]:
lang = np.array(list(available_languages.keys()))
count_lang = list(available_languages.values())
sorted_count_lang = np.argsort(count_lang)[-15:]
most_spoken_languages = lang[sorted_count_lang]

In [106]:
most_spoken_languages

array(['Swahili', 'Bengali', 'Greek', 'Dutch', 'Chinese', 'Portuguese',
       'Russian', 'Arabic', 'Urdu', 'Italian', 'Hindi', 'German',
       'Spanish', 'French', 'English'], dtype='<U13')

In [107]:
n = len(data)
english, french, spanish, arabic, german, italian, hindi, urdu, russian, portuguese, swahili, bengali, greek, dutch, chinese = np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n),

for i, lang in enumerate(languages):
    lang=ast.literal_eval(lang)
    if 'English' in lang:
        english[i]=1
    if 'French' in lang:
        french[i]=1
    if 'Spanish' in lang:
        spanish[i]=1
    if 'Arabic' in lang:
        arabic[i]=1
    if 'German' in lang:
        german[i]=1
    if 'Italian' in lang:
        italian[i]=1
    if 'Hindi' in lang:
        hindi[i]=1
    if 'Urdu' in lang:
        urdu[i]=1
    if 'Russian' in lang:
        russian[i]=1
    if 'Portuguese' in lang:
        portuguese[i]=1
    if 'Swahili' in lang:
        swahili[i]=1
    if 'Bengali' in lang:
        bengali[i]=1
    if 'Greek' in lang:
        greek[i]=1
    if 'Dutch' in lang:
        dutch[i]=1
    if 'Chinese' in lang:
        chinese[i]=1 

In [108]:
del data['languages']

data['lang_english'] = english
data['lang_french'] = french
data['lang_spanish'] = spanish
data['lang_arabic'] = arabic
data['lang_german'] = german
data['lang_italian'] = italian
data['lang_hindi'] = hindi
data['lang_urdu'] = urdu
data['lang_russian'] = russian
data['lang_portuguese'] = portuguese
data['lang_swahili'] = swahili
data['lang_bengali'] = bengali
data['lang_greek'] = greek
data['lang_dutch'] = dutch
data['lang_chinese'] = chinese

# Industry expertises

In [109]:
available_expertise = {}
industry_expertise = np.array(data.industry_expertise)
for profile_expertise in industry_expertise:
    profile_expertise = ast.literal_eval(profile_expertise)
    if len(profile_expertise)>0:
        for expertise in profile_expertise:
            if expertise not in available_expertise:
                available_expertise[expertise] = 1
            else:
                available_expertise[expertise] += 1

In [110]:
len(available_expertise)

65

In [111]:
exper = np.array(list(available_expertise.keys()))
count_exper = list(available_expertise.values())
sorted_count_exper = np.argsort(count_exper)[-10:]
most_imp_exper = exper[sorted_count_exper]

In [112]:
most_imp_exper

array(['Publishing', 'Healthcare', 'Travel', 'Internet', 'Technology',
       'IT', 'Media', 'Advertising', 'Education', 'Marketing'],
      dtype='<U18')

In [113]:
n = len(data)
marketing, education, advertising, media, it, internet, technology, entertainment, travel, art = np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n)

for i, exper in enumerate(industry_expertise):
    exper=ast.literal_eval(exper)
    if 'Marketing' in exper:
        marketing[i]=1
    if 'Education' in exper:
        education[i]=1
    if 'Advertising' in exper:
        advertising[i]=1
    if 'Media' in exper:
        media[i]=1
    if 'IT' in exper:
        it[i]=1
    if 'Internet' in exper:
        internet[i]=1
    if 'Technology' in exper:
        technology[i]=1
    if 'Entertainment' in exper:
        entertainment[i]=1
    if 'Travel' in exper:
        travel[i]=1
    if 'Art' in exper:
        art[i]=1

In [114]:
del data['industry_expertise']

data['exper_education'] = education
data['exper_advertising'] = advertising
data['exper_marketing'] = marketing
data['exper_media'] = media
data['exper_it'] = it
data['exper_internet'] = internet
data['exper_technology'] = technology
data['exper_entertainment'] = entertainment
data['exper_travel'] = travel
data['exper_art'] = art

# Recently active workers

In [115]:
data.last_active = pd.to_datetime(data.last_active)
threshold = datetime.datetime.strptime('20-03-2021', '%d-%m-%Y') 
# Choose date to qualify if worker was recently active

data.last_active = threshold < data.last_active
data.last_active = data.last_active*1

# Countries of work

In [116]:
location = data['country']

country = [ l.split(',')[-1].replace(" ", "") for l in location ]
count = Counter(country).most_common(10)

print(count)

[('UnitedKingdom', 3178), ('India', 511), ('Pakistan', 397), ('Kenya', 185), ('France', 150), ('Germany', 136), ('Spain', 136), ('Bangladesh', 132), ('Italy', 126), ('Nigeria', 118)]


In [117]:
values = data['country']
counts = pd.value_counts(values)
mask = values.isin(counts.index)
coun = pd.get_dummies(values[mask])

del data['country']
data['country_UnitedKingdom'] = coun[' United Kingdom']
data['country_India'] = coun[' India']
data['country_Pakistan'] = coun[' Pakistan']
data['country_Kenya'] = coun[' Kenya']
data['country_France'] = coun[' France']
data['country_Germany'] = coun[' Germany']
data['country_Spain'] = coun[' Spain']
data['country_Bangladesh'] = coun[' Bangladesh']
data['country_Italy'] = coun[' Italy']
data['country_Nigeria'] = coun[' Nigeria']

# Skills

In [118]:
available_skill = {}
skills = np.array(data.skills)
for profile_skill in skills:
    profile_skill = ast.literal_eval(profile_skill)
    if len(profile_skill)>0:
        for skill in profile_skill:
            if skill not in available_skill:
                available_skill[skill] = 1
            else:
                available_skill[skill] += 1

In [119]:
len(available_skill)

2148

In [120]:
sk = np.array(list(available_skill.keys()))
count_sk = list(available_skill.values())
sorted_count_sk = np.argsort(count_sk)[-10:]
most_imp_sk = sk[sorted_count_sk]

In [121]:
most_imp_sk

array(['web writing', 'social media marketing', 'article',
       'search engine optimization', 'editing', 'creative writing',
       'copywriting', 'proofreading', 'blog writing', 'content writing'],
      dtype='<U51')

In [122]:
n = len(data)
creative_writing, copywriting, proofreading, blog_writing, content_writing, web_writing, social_media_marketing, article, search_engine_optimization, editing = np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n)

for i, sk in enumerate(skills):
    sk=ast.literal_eval(sk)
    if 'creative writing' in sk:
        creative_writing[i]=1
    if 'copywriting' in sk:
        copywriting[i]=1
    if 'proofreading' in sk:
        proofreading[i]=1
    if 'blog writing' in sk:
        blog_writing[i]=1
    if 'content writing' in sk:
        content_writing[i]=1
    if 'web writing' in sk:
        web_writing[i]=1
    if 'social media marketing' in sk:
        social_media_marketing[i]=1
    if 'article' in sk:
        article[i]=1
    if 'search engine optimization' in sk:
        search_engine_optimization[i]=1
    if 'editing' in sk:
        editing[i]=1

In [123]:
del data['skills']

data['skill_creative_writing'] = creative_writing
data['skill_copywriting'] = copywriting
data['skill_proofreading'] = proofreading
data['skill_blog_writing'] = blog_writing
data['skill_content_writing'] = content_writing
data['skill_web_writing'] = web_writing
data['skill_social_media_marketing'] = social_media_marketing
data['skill_article'] = article
data['skill_search_engine_optimization'] = search_engine_optimization
data['skill_editing'] = editing

# Gender

In [124]:
values = data['gender']
gen = pd.get_dummies(values)

del data['gender']
data['gender_male'] = gen['male']
data['gender_female'] = gen['female']
data['gender_undefined'] = gen['unknown']

# Final data

In [125]:
data.head(5)

Unnamed: 0,rating,nb_reviews,Price/hour in USD,nb_projects,nb_buyers,last_active,score,lang_english,lang_french,lang_spanish,...,skill_blog_writing,skill_content_writing,skill_web_writing,skill_social_media_marketing,skill_article,skill_search_engine_optimization,skill_editing,gender_male,gender_female,gender_undefined
0,4.9,4694,34.74,7672,4168,1,10.043,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0
1,5.0,1190,138.95,1085,533,1,8.691,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,1
3,4.9,598,20.0,721,376,1,7.983,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
4,4.9,135,27.79,140,100,1,6.496,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,1
5,4.9,554,22.23,404,293,1,7.907,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0


In [126]:
print(data.isna().sum())

rating                              0
nb_reviews                          0
Price/hour in USD                   0
nb_projects                         0
nb_buyers                           0
last_active                         0
score                               0
lang_english                        0
lang_french                         0
lang_spanish                        0
lang_arabic                         0
lang_german                         0
lang_italian                        0
lang_hindi                          0
lang_urdu                           0
lang_russian                        0
lang_portuguese                     0
lang_swahili                        0
lang_bengali                        0
lang_greek                          0
lang_dutch                          0
lang_chinese                        0
exper_education                     0
exper_advertising                   0
exper_marketing                     0
exper_media                         0
exper_it    

In [127]:
data.describe()

Unnamed: 0,rating,nb_reviews,Price/hour in USD,nb_projects,nb_buyers,last_active,score,lang_english,lang_french,lang_spanish,...,skill_blog_writing,skill_content_writing,skill_web_writing,skill_social_media_marketing,skill_article,skill_search_engine_optimization,skill_editing,gender_male,gender_female,gender_undefined
count,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,...,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0
mean,4.834398,43.217447,30.884676,46.545879,30.039446,0.640888,3.881086,0.792309,0.103634,0.092464,...,0.336067,0.438993,0.138555,0.144352,0.170932,0.171921,0.182525,0.353174,0.481832,0.164994
std,0.667073,185.34974,34.608079,274.213685,166.041998,0.479774,1.568235,0.405683,0.304806,0.289701,...,0.472395,0.496299,0.345506,0.351471,0.376476,0.377339,0.386304,0.47799,0.499705,0.371201
min,0.0,0.0,8.34,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.9,3.0,14.5,3.0,2.0,0.0,2.773,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,8.0,24.17,8.0,6.0,1.0,3.714,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,27.0,35.0,26.0,19.0,1.0,4.913,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,5.0,8647.0,1354.74,16306.0,10343.0,1.0,10.634,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [128]:
X = data.drop(['Price/hour in USD'], axis=1)
y = data['Price/hour in USD']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

model = sm.OLS(y_train.astype(float), X_train.astype(float))
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:      Price/hour in USD   R-squared:                       0.080
Model:                            OLS   Adj. R-squared:                  0.070
Method:                 Least Squares   F-statistic:                     8.560
Date:                Mon, 24 May 2021   Prob (F-statistic):           7.57e-62
Time:                        10:37:30   Log-Likelihood:                -26328.
No. Observations:                5304   AIC:                         5.276e+04
Df Residuals:                    5250   BIC:                         5.312e+04
Df Model:                          53                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
rating  