In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw survey export. `sheet_name=1` selects the second sheet of the
# workbook (integer sheet names are zero-indexed positions).
df = pd.read_excel("../data/raw.xlsx", sheet_name=1)

  """Entry point for launching an IPython kernel.


# Cleaning

In [3]:
# Label fixes, dtype fixes and removal of a constant column, as one pipeline.
df = (
    df
    # Typos: harmonise the spelling of one value and of one column name.
    .replace("iceland", "Iceland")
    .rename(columns={"Framework_OutSystems": "Framework_Outsystems"})
    # Data types: store the salary as an integer.
    .assign(Avg_Salary=lambda frame: frame["Avg_Salary"].astype(int))
    # Residence country is always Portugal, so it carries no information.
    .drop(columns="Residence_Country")
)

# It is not clear how many hours part-time workers work, so they are left
# out of the scope of the project.
is_part_time = df["Employment_Status"] == "Employed part-time"
df = df[~is_part_time]

# Discard datapoints whose salary is 0.
has_salary = df["Avg_Salary"] != 0
df = df[has_salary]

# Feature Selection

In [4]:
# Columns carried forward into the cleaned dataset; the order given here is
# the column order of the resulting frame.
cols = [
    "Employment_Status",
    "Residence_District_Aggregated",
    "Work_Company_Country",
    "Job_Role_Original",
    "Employer_Industry",
    "Employer_Org_Type",
    "Employer_Size",
    "Avg_Salary",
    "English_Level",
    "Education_Level",
    "Working_Experience",
]

In [5]:
# Restrict the frame to the columns listed in `cols`, in that order; every
# other column is dropped from here on.
df = df.loc[:,cols]

# Aggregate Work Company Country

In [6]:
# Fold every work company country with fewer than 19 points into "Other",
# together with the "No specific country" answer.
# We assume that fewer than 19 points is not representative enough for the
# model to learn.
MIN_COUNTRY_POINTS = 19
country_counts = df["Work_Company_Country"].value_counts()
rare_countries = country_counts[country_counts < MIN_COUNTRY_POINTS].index
values = rare_countries.append(pd.Index(["No specific country"]))
df.loc[:, "Work_Company_Country"] = df["Work_Company_Country"].replace(values, "Other")

# Frameworks and Languages Cleaning

In [7]:
# Disabled: 0/1 encoding of the Framework_*/Language_* columns, kept from experiments that tested their use.

#tool_cols = df.columns[df.columns.str.contains("Framework") | df.columns.str.contains("Language")]
#for col in tool_cols:
#    df[col] = df[col].replace(col.split("_")[-1],1)
#    df[col] = df[col].fillna(0)
#    df[col] = df[col].astype("category")

# Categoricals

In [8]:
# Every column still stored with the generic `object` dtype is converted to
# pandas' `category` dtype.
is_object_column = (df.dtypes == object).to_numpy()
categoricals = df.columns[is_object_column]
df = df.astype(dict.fromkeys(categoricals, "category"))

# Drop NaNs

In [9]:
# Keep only complete rows: a datapoint with at least one missing feature is
# discarded. The shape is printed before and after to show the row loss.

print(df.shape)
complete_rows = df.dropna(axis="index", how="any")
df = complete_rows.reset_index(drop=True)
print(df.shape)

(3334, 11)
(3081, 11)


# Export

In [10]:
# Write the cleaned frame to disk in Parquet format.
cleaned_path = "../data/cleaned.parquet"
df.to_parquet(cleaned_path)