In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import pickle
from pathlib import Path

In [2]:
# Read the cleaned dataset that the encoding steps below operate on.
cleaned_path = "data/cleaned.parquet"
df = pd.read_parquet(cleaned_path)

# Categorical Features

In [3]:
# Category orderings for the ordinal features. Each list runs from the
# lowest level to the highest; the position in the list is the integer
# code the ordinal encoder assigns to that label.
english_levels = [
    "Elementary",
    "Limited working proficiency",
    "Professional working proficiency",
    "Full professional proficiency",
    "Native or bilingual proficiency",
]

working_experience_levels = [
    "No working experience",
    "Less than 1 year",
    "Between 1 - 3 years",
    "Between 3 - 6 years",
    "Between 6 - 9 years",
    "More than 9 years",
]

employer_size_levels = [
    "Less than 10 employees",
    "10 - 19 employees",
    "20 - 99 employees",
    "100 - 499 employees",
    "500 - 999 employees",
    "1000 - 4.999 employees",
    "More than 5.000 employees",
]

In [4]:
# Pair every ordinal column with its ordered list of levels. Dict insertion
# order keeps the column list and the category list in the same order, which
# is what OrdinalEncoder's `categories` argument requires.
ordinal_categories = {
    "English_Level": english_levels,
    "Working_Experience": working_experience_levels,
    "Employer_Size": employer_size_levels,
}
ordinal_features = list(ordinal_categories)

ordinal_encoder = OrdinalEncoder(categories=list(ordinal_categories.values()))

# Fit on the ordinal columns and overwrite them in place with their codes.
ordinal_codes = ordinal_encoder.fit_transform(df[ordinal_features])
df[ordinal_features] = ordinal_codes

In [5]:
# Persist the fitted ordinal encoder so the same mapping can be reloaded later.
file_path = Path("data/ordinal_encoder.p")
file_path.write_bytes(pickle.dumps(ordinal_encoder))

# One-hot Encoding

In [6]:
# Categorical columns that are expanded into one indicator column per category.
onehot_features = [
    "Employment_Status",
    "Residence_District_Aggregated",
    "Work_Company_Country",
    "Job_Role_Original",
    "Employer_Industry",
    "Employer_Org_Type",
    "Education_Level",
]

# NOTE(review): `sparse=` and `get_feature_names()` are the older scikit-learn
# spellings; newer releases rename them to `sparse_output=` and
# `get_feature_names_out()`. The latter also prefixes columns with the input
# column name instead of `x0_`, `x1_`, ..., so migrating changes the schema of
# the exported frame. Kept as-is to stay compatible with the version this
# notebook was written for.
onehot_encoder = OneHotEncoder(sparse=False)
transformed = onehot_encoder.fit_transform(df[onehot_features])

# Carry over df's index. The encoded frame is later joined to df with a
# column-wise pd.concat, which aligns rows by index label; with the default
# RangeIndex any non-default index on df (e.g. after row filtering) would
# misalign the rows and introduce NaN-filled rows.
df_transformed = pd.DataFrame(
    transformed,
    columns=onehot_encoder.get_feature_names(),
    index=df.index,
)

In [7]:
# Swap the raw categorical columns for their one-hot encoded counterparts:
# drop the originals, then append the indicator columns side by side.
df = pd.concat(
    [df.drop(columns=onehot_features), df_transformed],
    axis=1,
)

In [8]:
# Persist the fitted one-hot encoder so the same mapping can be reloaded later.
file_path = Path("data/onehot_encoder.p")
file_path.write_bytes(pickle.dumps(onehot_encoder))

# Export

In [9]:
# Write the fully encoded frame to disk.
processed_path = "data/processed.parquet"
df.to_parquet(processed_path)