In [122]:
import numpy as np
import pandas as pd
import ast
import datetime

In [79]:
# Suffix identifying which scraping run to load.
scraping_date = '_26_04_2021'
profiles_path = 'data/full_profiles' + scraping_date + '.csv'
df = pd.read_csv(profiles_path)

# Prepare variables for the analysis

## Format the last active date

In [80]:
# dayfirst=True: ambiguous dates are interpreted as day/month, not month/day.
raw_last_active = df['last_active']
df['last_active'] = pd.to_datetime(raw_last_active, dayfirst=True)

## Remove useless features

In [81]:
# These two columns are not used anywhere in the rest of the preparation.
df = df.drop(columns=['description', 'city'])

## Format the gender

In [82]:
# Numeric encoding of the gender labels; code 2 ('unknown') is filtered out later.
gender_codes = {'male': 0, 'female': 1, 'unknown': 2}
df = df.replace(to_replace={'gender': gender_codes})

## Create dummy variables for the country

Start by stripping the surrounding whitespace from the country string (the raw values begin with a space), then create the dummy variables.

In [83]:
# Strip leading/trailing whitespace from the country names.  The vectorised
# .str accessor leaves missing values as NaN instead of raising
# AttributeError the way calling x.strip() on a float NaN would.
df['country'] = df['country'].str.strip()

In [84]:
# One indicator column per country.  The dtype is pinned to int so the result
# is 0/1 like the other dummy frames in this notebook (which use astype(int)),
# regardless of the pandas version's default (recent versions return bool).
country_dummies = pd.get_dummies(df['country'], dtype=int)

## Create dummy variables for the skills

In [85]:
# Tally how many times each skill is listed across all profiles.
available_skill = {}
skills = np.array(df['skills'])
for raw_skill_list in skills:
    # Each entry is a stringified Python list; parse it back before counting.
    for skill in ast.literal_eval(raw_skill_list):
        available_skill[skill] = available_skill.get(skill, 0) + 1

In [86]:
# Rank the skills by how often they occur and keep the 100 most frequent.
sk = np.array(list(available_skill))
count_sk = np.array(list(available_skill.values()))
# Sorting the negated counts yields indices in descending order of frequency.
sorted_count_sk = np.argsort(np.negative(count_sk))
most_imp_sk = sk[sorted_count_sk]
top_100_sk = most_imp_sk[:100]

In [87]:
# Build one 0/1 indicator column per top skill.
#
# df['skills'] holds stringified lists, so `skill in raw_string` would be a
# substring test: a short skill name would also match any profile listing a
# longer skill that merely contains it (e.g. 'Java' inside 'JavaScript').
# Parse each list once and test membership on whole skill names instead.
parsed_skills = df['skills'].apply(ast.literal_eval)
# Assemble all columns at once rather than inserting them one by one.
skills_dummies = pd.DataFrame(
    {
        top_skill: parsed_skills.apply(lambda owned: top_skill in owned).astype(int)
        for top_skill in top_100_sk
    },
    index=df.index,
)

## Create dummy variables for the industry expertise

In [90]:
# Tally how many times each industry expertise is listed across all profiles.
available_expertise = {}
industry_expertise = np.array(df['industry_expertise'])
for raw_expertise_list in industry_expertise:
    # Each entry is a stringified Python list; parse it back before counting.
    for expertise in ast.literal_eval(raw_expertise_list):
        available_expertise[expertise] = available_expertise.get(expertise, 0) + 1

In [97]:
# Every distinct expertise seen in the data, in first-seen order.
expertises = np.array(list(available_expertise))

In [98]:
# Build one 0/1 indicator column per industry expertise.
#
# df['industry_expertise'] holds stringified lists, so `exper in raw_string`
# would be a substring test: an expertise whose name is contained in another,
# longer expertise name would be flagged for profiles that only list the
# longer one.  Parse each list once and test membership on whole names.
parsed_expertise = df['industry_expertise'].apply(ast.literal_eval)
# Assemble all columns at once rather than inserting them one by one.
exper_dummies = pd.DataFrame(
    {
        exper: parsed_expertise.apply(lambda owned: exper in owned).astype(int)
        for exper in expertises
    },
    index=df.index,
)

## Create dummy variables for the languages spoken

In [106]:
# Tally how many times each language is listed across all profiles.
available_languages = {}
languages = np.array(df['languages'])
for raw_language_list in languages:
    # Each entry is a stringified Python list; parse it back before counting.
    for language in ast.literal_eval(raw_language_list):
        available_languages[language] = available_languages.get(language, 0) + 1

In [109]:
# Every distinct language seen in the data, in first-seen order.
languages = np.array(list(available_languages))

In [110]:
# Build one 0/1 indicator column per language.
#
# df['languages'] holds stringified lists, so `lang in raw_string` would be a
# substring test: a language whose name is contained in a longer language
# name would be flagged for profiles that only list the longer one.  Parse
# each list once and test membership on whole language names.
parsed_languages = df['languages'].apply(ast.literal_eval)
# Assemble all columns at once rather than inserting them one by one.
languages_dummies = pd.DataFrame(
    {
        lang: parsed_languages.apply(lambda spoken: lang in spoken).astype(int)
        for lang in languages
    },
    index=df.index,
)

# Merge all the dummy variables in one dataframe

In [135]:
# Put the original profile columns and every dummy frame side by side;
# rows are aligned on the shared index.
feature_frames = [df, country_dummies, skills_dummies, exper_dummies, languages_dummies]
data = pd.concat(feature_frames, axis='columns')

# Prepare the data for the regression

## Remove the 'unknown' gender

In [136]:
# Gender code 2 stands for 'unknown'; keep only codes 0 (male) and 1 (female).
has_known_gender = data['gender'] < 2
data = data.loc[has_known_gender]

## Keep only the active profiles

### Remove profiles with no projects

In [137]:
# Index labels of the profiles that have completed no project at all.
no_project_rows = data.index[data['nb_projects'] == 0]
data = data.drop(index=no_project_rows)

### Flag recently active profiles

In [139]:
# Cut-off date for "recently active" (presumably one month before the scrape
# date used in this notebook -- confirm).
threshold = datetime.datetime.strptime('2021-03-26', '%Y-%m-%d')
# Replace the timestamp by a 0/1 flag: 1 when the profile was active after the cut-off.
recently_active = data['last_active'] > threshold
data['last_active'] = recently_active.astype(int)

In [140]:
# The row filters above left gaps in the index; renumber the rows from 0
# and discard the old labels (drop=True).
data = data.reset_index(drop=True)

## Remove string features

In [141]:
# The list-like columns and the country are already encoded as dummy
# variables; drop them together with the profile name.
string_features = ['name', 'languages', 'skills', 'industry_expertise', 'country']
data = data.drop(columns=string_features)

## Remove columns with only 0s

In [153]:
# A column is uninformative only if every one of its values is exactly 0.
# Testing the column sum instead would also flag columns whose positive and
# negative values cancel out, or columns that contain nothing but NaN.
all_zero = (data == 0).all(axis=0)
unwanted = [col for col, is_all_zero in zip(data.columns, all_zero) if is_all_zero]

In [156]:
# Discard the columns identified as carrying no information.
data = data.drop(columns=unwanted)

In [160]:
# Show the final feature table before exporting it.
data

Unnamed: 0,rating,nb_reviews,Price/hour in USD,nb_projects,nb_buyers,last_active,gender,score,Albania,Algeria,...,Assamese,Southern,Sotho,Zulu,Kikuyu,Nynorsk,Twi,Nyanja,Akan,Khmer
0,4.9,4694,34.74,7672,4168,1,1,10.043,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.9,554,22.23,404,293,1,0,7.907,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.0,198,48.34,136,92,1,1,6.899,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,273,18.13,205,136,1,1,7.220,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.0,94,15.00,75,58,1,1,6.155,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5901,5.0,5,20.84,5,3,1,1,3.258,0,0,...,0,0,0,0,0,0,0,0,0,0
5902,4.9,56,69.47,81,68,0,0,5.618,0,0,...,0,0,0,0,0,0,0,0,0,0
5903,4.5,2,13.29,2,2,1,0,2.303,0,0,...,0,0,0,0,0,0,0,0,0,1
5904,4.7,18,41.68,25,18,0,0,4.450,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
# Persist the prepared features; the positional index is not written.
output_path = 'data/dummies.csv'
data.to_csv(output_path, index=False)