# Transforming the raw data into .csv format

Information that could become relevant but is still missing:

* totals hours (Hourly projects)
* Total value of hourly projects

## Required libraries and functions

In [2]:
import pandas as pd
import numpy as np
import os
import ast
import glob

In [3]:
# open data line by line

def open_file(filename):
    with open(os.getcwd() + filename, "r") as fp:
        lines = fp.readlines()
    return lines

In [4]:
# extract data from json and save as dict

def extract_data(rawdata):
    
    dict_ = {"skills":[],"skills2":[], "rate":[], "amount":[], "job_type":[], "date":[], "description":[],
         "title":[], "worker_ID":[], "op_engagement":[], "engagement_weeks":[],
         "op_pref_english_skill":[],"candidates":[], "buyer":[],"category_1":[],"category_2":[],
        "assignments":[],"assignment_info":[]}
    
    for i in range(0,len(rawdata)):
        job = ast.literal_eval(rawdata[i])

        # multiple items

        ## skills 1

        try:
            skills = list(job["op_required_skills"]["op_required_skill"].values())[0]
        except:
            skills = float('nan')
        dict_["skills"].append(skills)

        ## skills multiple
        try:
            skills2 = job["op_required_skills"]["op_required_skill"]
            skill_list = []
            for skill in skills2:
                try:
                    skill_item = list(skill.values())[0]
                except:
                    skill_item = float('nan')
                skill_list.append(skill_item)
        except:
            skill_list = float('nan')

        dict_["skills2"].append(skill_list)

        ## candidates
        try:
            candidates = list(job["candidates"].values())[0]

            candidate_list = []
            for candidate in candidates:
                try:
                    candidate_item = candidate
                except:
                    candidate_item = float('nan')
                candidate_list.append(candidate_item)
        except:
            candidate_list = float('nan')

        dict_["candidates"].append(candidate_list)

        # Simple items

        try:
            assignments = job["assignments"]
        except:
            assignments = float('nan')
        dict_["assignments"].append(assignments)    

        try:
            assignment_info = job["assignment_info"]
        except:
            assignment_info = float('nan')
        dict_["assignment_info"].append(assignment_info)    

        # project category sub-group
        try:
            category_2 = list(job["op_job_category_v2"].values())[0]["name"]
        except:
            category_2 = float('nan')
        dict_["category_2"].append(category_2) 

        # project category main group
        try:
            category_1 = list(list(job["op_job_category_v2"].values())[0]["groups"].values())[0]['name']
        except:
            category_1 = float('nan')
        dict_["category_1"].append(category_1) 

        try:
            buyer = job["buyer"]
        except:
            buyer = float('nan')
        dict_["buyer"].append(buyer) 

        try:
            title = job["op_title"]
        except:
            title = float('nan')
        dict_["title"].append(title)    

        try:
            description = job["op_description"]
        except:
            description = float('nan')
        dict_["description"].append(description)

        try:
            worker_ID = job["ciphertext"]
        except:
            worker_ID = float('nan')
        dict_["worker_ID"].append(worker_ID)

        try:
            op_engagement = job["op_engagement"]
        except:
            op_engagement = float('nan')
        dict_["op_engagement"].append(op_engagement)

        try:
            engagement_weeks = job["engagement_weeks"]
        except:
            engagement_weeks = float('nan')
        dict_["engagement_weeks"].append(engagement_weeks)   

        try:
            op_pref_english_skill = job["op_pref_english_skill"]
        except:
            op_pref_english_skill = float('nan')
        dict_["op_pref_english_skill"].append(op_pref_english_skill)    

        try:
            date = job["op_ctime"]
        except:
            date = float('nan')
        dict_["date"].append(date)

        try:    
            job_type =job["op_job_category_v2"]["op_job_category_v"]["name"]
        except:
            job_type = float('nan')
        dict_["job_type"].append(job_type)

        if job["job_type"] == "Hourly":

            try:
                rate = (int(job["op_high_hourly_rate_all"]) + int(job["op_low_hourly_rate_all"])) *0.5
                amount = float('nan')
            except:
                rate = float('nan')
        else:
            try:
                amount = job["amount"]
            except:
                amount = float('nan')
            rate = float('nan')
        dict_["rate"].append(rate)
        dict_["amount"].append(amount)
    
    return(dict_)

In [5]:
# Transform to dataframe and clean

def to_clean_df(dict_):
    
    df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in dict_.items()]))
    
    # clean skills
    df.skills.fillna(df.skills2, inplace=True)
    del df['skills2']
    
    # Clean assignments and assignment_info columns
    df = df.replace(r'^\s*$', np.nan, regex=True)
    
    return df

In [5]:
# Write to csv

# df.to_csv("project_data.csv", index= False)

NameError: name 'df' is not defined

## Application to raw data

### a) Single file

In [None]:
df = to_clean_df(extract_data(open_file("/results_day_1_to_3.txt")))

df.to_csv("project_data_1_to_3.csv", index= False)

### b) Folder structure

**Note**: For this to work the notebook has to be placed where the results folders are!

Open results.txt in each folder, load and transform the data. Then save the data as .csv in a new folder (reason: better overview while keeping smaller file sizes).

In [5]:
def substract(a, b):                              
    return "".join(a.rsplit(b))

In [18]:
# Get Folder and file names
files = glob.glob(os.getcwd()+"/*/results.txt")
folders = glob.glob(os.getcwd()+"/*/")

# Create new folder for results
try:
    os.mkdir(os.getcwd()+"/csv_data")
except FileExistsError:
    pass

# Transform data
for num, file in enumerate(files):
    
    # extract data
    with open(file, "r") as fp:
        lines = fp.readlines()
    
    dict_ = extract_data(lines)
    
    df = to_clean_df(dict_)
    
    # get new file name
    name = substract(substract(folders[num],os.getcwd()),"/")
    
    # save as .csv
    df.to_csv("csv_data/" + name + ".csv", index= False)
    
    print("Succesfully transformed", name)

Succesfully transformed jobids_40
Succesfully transformed jobids_41
Succesfully transformed csv_data
