In [1]:
# Constants
# Input: raw survey export.  Output: pickled, preprocessed frame.
DATA_PATH = "../data/raw/survey_results_public.csv"
# NOTE(review): extension is ".pk1" (digit one) - possibly meant ".pkl";
# left as-is because later stages may load this exact file name.
EXPORT_PATH = "../data/processed/1_preprocessed_df.pk1"

# Per-column mapping of open-ended text answers to numeric stand-ins, so the
# columns can later be cast to a numeric dtype.
REPLACE_DICT = {
    "YearsCodePro": {
        "Less than 1 year": 0,
        "More than 50 years": 51,
    },
    "YearsCode": {
        "Less than 1 year": 0,
        "More than 50 years": 51,
    },
    "Age1stCode": {
        "Younger than 5 years": 4,
        "Older than 85": 86,
    },
}
      

In [2]:
# Load packages
import pandas as pd
import numpy  as np
import logging
import pickle

_______

# Functions

In [3]:
def split_answers(data_series, delimiter=";"):
    """
    Split multi-answer strings into lists of single answers.

    Args:
    ----
    * data_series (pd.Series): String series with answers
    * delimiter (String): Literal separator between answers; it is never
                          interpreted as a regular expression.
                          Default to ";"

    Returns:
    -------
    (pd.Series): `data_series` itself, unchanged, when no value contains
                 `delimiter`; otherwise a new series in which every string
                 is replaced by the list of its answers and every missing
                 or non-string value is replaced by an empty list.
    """

    def to_answer_list(value):
        """ Split one cell; missing / non-string cells become [] """
        # A fresh list per cell so rows never share one mutable object.
        return value.split(delimiter) if isinstance(value, str) else []

    # Literal (non-regex) search: delimiters such as "|" or "." must only
    # match themselves. Missing values count as "nothing to split".
    has_delimiter = data_series.str.contains(delimiter, regex=False, na=False)

    # No multi-answer value anywhere: hand back the original series untouched.
    if not has_delimiter.any():
        return data_series

    # Otherwise convert every cell to a list in a single pass.
    return data_series.map(to_answer_list)
    

____

# Processing

In [4]:
# Load the raw survey CSV; `raw_df` stays untouched, all processing happens
# on the independent working copy `df`.
raw_df = pd.read_csv(DATA_PATH)
df = raw_df.copy(deep=True)

#### 1. Split multiple answers

In [5]:
# Gather the names of all object-dtype columns, then pass each one through
# split_answers (columns without the delimiter come back unchanged).
object_cols = list(df.select_dtypes(include='object').columns)
for col in object_cols:
    df[col] = split_answers(df[col])

#### 2. Replace Values

In [6]:
# Swap the text answers listed in REPLACE_DICT for their numeric stand-ins,
# then cast the whole column to 32-bit floats.
for col, replacement in REPLACE_DICT.items():
    df[col] = (
        df[col]
        .replace(replacement)
        .astype(np.float32)
    )

___

# Export Data

In [7]:
# Persist the preprocessed frame as a pickle at EXPORT_PATH.
df.to_pickle(path=EXPORT_PATH)

____