In [178]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime

from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [179]:
scraping_date = '_26_04_2021'
data = pd.read_csv('./data/full_profiles'+scraping_date+'.csv')

In [180]:
data.head(5)

Unnamed: 0,name,languages,description,rating,nb_reviews,Price/hour in USD,skills,industry_expertise,nb_projects,nb_buyers,last_active,gender,city,country
0,Maria H.,['English'],We are a small UK based company speciaIising i...,4.9,4694,34.74,"['animation', 'graphics design', 'brand design...","['IT', 'Internet', 'Marketing', 'Media', 'Tech...",7672,4168,26-04-2021,female,Sheffield,United Kingdom
1,Cormac Reynolds,['English'],Looking for a link building solution or some g...,5.0,1190,138.95,"['online marketing', 'link building', 'marketi...",[],1085,533,25-04-2021,unknown,City of London,United Kingdom
2,Denise Toepel,"['English', 'Spanish', 'French']",I am a Translation Specialist. I work in publ...,0.0,0,27.79,"['English translation', 'French <=> English', ...","['Education', 'Telecommunications']",0,0,25-04-2021,female,Cuenca,Ecuador
3,Translate Guru .,[],Hello! Hola! Ciao! Bonjour! 你好!\r\n\r\nI do hi...,4.9,598,20.0,"['German <=> English translation', 'Arabic <=>...",[],721,376,26-04-2021,unknown,Cannanore,India
4,Logical Translation & Localisation,"['English', 'Spanish', 'French', 'Italian']",OFFERING A WIDE RANGE OF TRANSLATION SERVICES\...,4.9,135,27.79,"['editing', 'proofreading', 'German translatio...","['Advertising', 'Aerospace', 'Automotive', 'Co...",140,100,24-04-2021,unknown,City of London,United Kingdom


# Remove inactive profiles (no projects)

In [181]:
print('Number of workers with inactive profiles:',  len(data.loc[(data['nb_projects'] == 0)]))

Number of workers with inactive profiles: 2930


In [182]:
data.drop(data.loc[data['nb_projects']==0].index, inplace=True)

Almost 30% of all the profiles are thus deleted of our regression analysis. This represents a big proportion of the profiles that we collected but there is still a significant amount of data that makes our study relevant.

# Drop unnecessary features

In [183]:
del data['name'] 
del data['description'] 
del data['city'] 

# Number of occurrences of all spoken languages

In [184]:
available_languages = {}
languages = np.array(data.languages)
for profile_languages in languages:
    profile_languages = ast.literal_eval(profile_languages)
    if len(profile_languages)>0:
        for language in profile_languages:
            if language not in available_languages:
                available_languages[language] = 1
            else:
                available_languages[language] += 1

In [185]:
len(available_languages)

102

In [186]:
lang = np.array(list(available_languages.keys()))
count_lang = list(available_languages.values())
sorted_count_lang = np.argsort(count_lang)[-10:]
most_spoken_languages = lang[sorted_count_lang]

In [187]:
most_spoken_languages

array(['Portuguese', 'Russian', 'Arabic', 'Urdu', 'Italian', 'Hindi',
       'German', 'Spanish', 'French', 'English'], dtype='<U13')

## Create dummy variables for the 10 most spoken languages

In [188]:
n = len(data)
english, french, spanish, arabic, german, italian, hindi, urdu, russian, portuguese = np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n)

for i, lang in enumerate(languages):
    lang=ast.literal_eval(lang)
    if 'English' in lang:
        english[i]=1
    if 'French' in lang:
        french[i]=1
    if 'Spanish' in lang:
        spanish[i]=1
    """
    if 'Arabic' in lang:
        arabic[i]=1
    if 'German' in lang:
        german[i]=1
    if 'Italian' in lang:
        italian[i]=1
    if 'Hindi' in lang:
        hindi[i]=1
    if 'Urdu' in lang:
        urdu[i]=1
    if 'Russian' in lang:
        russian[i]=1
    if 'Portuguese' in lang:
        portuguese[i]=1
    """

In [189]:
del data['languages']

data['lang_english'] = english
data['lang_french'] = french
data['lang_spanish'] = spanish

"""
data['arabic'] = arabic
data['german'] = german
data['italian'] = italian
data['hindi'] = hindi
data['urdu'] = urdu
data['russian'] = russian
data['portuguese'] = portuguese
"""

"\ndata['arabic'] = arabic\ndata['german'] = german\ndata['italian'] = italian\ndata['hindi'] = hindi\ndata['urdu'] = urdu\ndata['russian'] = russian\ndata['portuguese'] = portuguese\n"

# Number of occurences of all industry expertises

In [190]:
available_expertise = {}
industry_expertise = np.array(data.industry_expertise)
for profile_expertise in industry_expertise:
    profile_expertise = ast.literal_eval(profile_expertise)
    if len(profile_expertise)>0:
        for expertise in profile_expertise:
            if expertise not in available_expertise:
                available_expertise[expertise] = 1
            else:
                available_expertise[expertise] += 1

In [191]:
len(available_expertise)

65

In [192]:
exper = np.array(list(available_expertise.keys()))
count_exper = list(available_expertise.values())
sorted_count_exper = np.argsort(count_exper)[-10:]
most_imp_exper = exper[sorted_count_exper]

In [193]:
most_imp_exper

array(['Publishing', 'Healthcare', 'Travel', 'Internet', 'Technology',
       'IT', 'Media', 'Advertising', 'Education', 'Marketing'],
      dtype='<U18')

## Create dummy variables for the 10 most important industry expertise

In [194]:
n = len(data)
marketing, education, advertising, media, it, internet, technology, entertainment, travel, art = np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n)

for i, exper in enumerate(industry_expertise):
    exper=ast.literal_eval(exper)
    if 'Marketing' in exper:
        marketing[i]=1
    if 'Education' in exper:
        education[i]=1
    if 'Advertising' in exper:
        advertising[i]=1
    if 'Media' in exper:
        media[i]=1
    if 'IT' in exper:
        it[i]=1
    if 'Internet' in exper:
        internet[i]=1
    if 'Technology' in exper:
        technology[i]=1
    if 'Entertainment' in exper:
        entertainment[i]=1
    if 'Travel' in exper:
        travel[i]=1
    if 'Art' in exper:
        art[i]=1

In [195]:
del data['industry_expertise']

data['exper_education'] = education
data['exper_advertising'] = advertising
data['exper_marketing'] = marketing

In [196]:
rating = data.rating
nb_reviews = data.nb_reviews
price = data['Price/hour in USD']

## Create dummy variables for recently active workers

In [197]:
data.last_active = pd.to_datetime(data.last_active)
threshold = datetime.datetime.strptime('20-03-2021', '%d-%m-%Y') 
# Choose date to qualify if worker was recently active

data.last_active = threshold < data.last_active
data.last_active = data.last_active*1

## Create dummy variables for most important countries

In [198]:
data['from_UK'] = (data['country'] == 'United Kingdom')
data['from_UK'] = data['from_UK']*1 # all zeroes ????
del data['country']

# Number of occurences of all industry expertises

In [199]:
available_skill = {}
skills = np.array(data.skills)
for profile_skill in skills:
    profile_skill = ast.literal_eval(profile_skill)
    if len(profile_skill)>0:
        for skill in profile_skill:
            if skill not in available_skill:
                available_skill[skill] = 1
            else:
                available_skill[skill] += 1

In [200]:
len(available_skill)

2148

In [201]:
sk = np.array(list(available_skill.keys()))
count_sk = list(available_skill.values())
sorted_count_sk = np.argsort(count_sk)[-10:]
most_imp_sk = sk[sorted_count_sk]

In [202]:
most_imp_sk

array(['web writing', 'social media marketing', 'article',
       'search engine optimization (seo)', 'editing', 'creative writing',
       'Copywriting', 'proofreading', 'blog writing', 'content writing'],
      dtype='<U73')

## Create dummy variables for most important skills

In [203]:
n = len(data)
creative_writing, Copywriting, proofreading, blog_writing, content_writing = np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n)

for i, sk in enumerate(skills):
    sk=ast.literal_eval(sk)
    if 'creative writing' in sk:
        creative_writing[i]=1
    if 'Copywriting' in sk:
        Copywriting[i]=1
    if 'proofreading' in sk:
        proofreading[i]=1
    if 'blog writing' in sk:
        blog_writing[i]=1
    if 'content writing' in sk:
        content_writing[i]=1

## Create dummy variables gender

In [207]:
del data['skills']

data['skill_creative_writing'] = creative_writing
data['skill_Copywriting'] = Copywriting
data['skill_proofreading'] = proofreading
data['skill_blog_writing'] = blog_writing
data['skill_content_writing'] = content_writing

## Final data

In [208]:
data.head(5)

Unnamed: 0,rating,nb_reviews,Price/hour in USD,nb_projects,nb_buyers,last_active,gender,lang_english,lang_french,lang_spanish,exper_education,exper_advertising,exper_marketing,from_UK,skill_creative_writing,skill_Copywriting,skill_proofreading,skill_blog_writing,skill_content_writing
0,4.9,4694,34.74,7672,4168,1,female,1.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,1.0
1,5.0,1190,138.95,1085,533,1,unknown,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0
3,4.9,598,20.0,721,376,1,unknown,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
4,4.9,135,27.79,140,100,1,unknown,1.0,1.0,1.0,1.0,1.0,0.0,0,0.0,0.0,1.0,0.0,0.0
5,4.9,554,22.23,404,293,1,male,1.0,1.0,1.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [209]:
data.describe()

Unnamed: 0,rating,nb_reviews,Price/hour in USD,nb_projects,nb_buyers,last_active,lang_english,lang_french,lang_spanish,exper_education,exper_advertising,exper_marketing,from_UK,skill_creative_writing,skill_Copywriting,skill_proofreading,skill_blog_writing,skill_content_writing
count,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0,7073.0
mean,4.834398,43.217447,30.884676,46.545879,30.039446,0.640888,0.792309,0.103634,0.092464,0.149442,0.129365,0.240209,0.0,0.21674,0.287431,0.334653,0.336067,0.438993
std,0.667073,185.34974,34.608079,274.213685,166.041998,0.479774,0.405683,0.304806,0.289701,0.356548,0.335627,0.427241,0.0,0.412053,0.452596,0.471902,0.472395,0.496299
min,0.0,0.0,8.34,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.9,3.0,14.5,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,8.0,24.17,8.0,6.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,27.0,35.0,26.0,19.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
max,5.0,8647.0,1354.74,16306.0,10343.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [210]:
X = data.drop(['Price/hour in USD', 'gender', 'from_UK'], axis=1)
y = data['Price/hour in USD']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

model = sm.OLS(y_train.astype(float), X_train.astype(float))
result = model.fit()
print(result.summary())

                                 OLS Regression Results                                
Dep. Variable:      Price/hour in USD   R-squared (uncentered):                   0.432
Model:                            OLS   Adj. R-squared (uncentered):              0.431
Method:                 Least Squares   F-statistic:                              251.7
Date:                Tue, 11 May 2021   Prob (F-statistic):                        0.00
Time:                        13:25:24   Log-Likelihood:                         -26510.
No. Observations:                5304   AIC:                                  5.305e+04
Df Residuals:                    5288   BIC:                                  5.316e+04
Df Model:                          16                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------