In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import pickle
import os

In [20]:
# Where the preprocessed DataFrame is pickled to
EXPORT_PATH = r"C:\Users\fathy\StackOverFlowSurvay\data\processed\1_preprocessed_df.pkl"

# Location of the raw survey results (CSV)
SURVEY_RESULT_PUBLIC = "https://raw.githubusercontent.com/Deena-Gergis/e2e_ds_project/batch_2/data/raw/survey_results_public.csv"

# Per-column mapping of textual answers to numeric values.
# Both experience columns share the same two textual boundary answers.
REPLACE_DICT = {
    column: {'Less than 1 year': 0, 'More than 50 years': 51}
    for column in ('YearsCodePro', 'YearsCode')
}

In [21]:
def split_answers(data_series, delimiter=";"):
    """ 
    Split multiple answers stored in a single string 
    into a list of strings, each representing a single answer 

    Parameters:
    * data_series (pd.Series): String series with answers 
                               (missing values are allowed)
    * delimiter (string): Literal separator between answers; it is NOT 
                          interpreted as a regular expression 
                          Defaults to ";"

    Returns: (pd.Series): If no value contains the delimiter, the original 
    series object is returned unchanged. Otherwise a new series is returned 
    in which every value is a list of answers; entries for which splitting 
    yields a missing value (e.g. NaN in the input) become empty lists 
    """

    # Match the delimiter literally (regex=False) so characters such as "."
    # or "|" are not treated as regex patterns, which would flag every value
    # as splittable. na=False counts missing values as "not splittable" and
    # keeps the mask strictly boolean.
    has_delimiter = data_series.str.contains(delimiter, regex=False, na=False)

    # Check if multiple answers exist - if none: return original 
    if not has_delimiter.any():
        return data_series

    # Else, split each value to a list (again on the literal delimiter)
    split_series = data_series.str.split(delimiter, regex=False)

    # Replace NAs with empty lists so every entry is a list
    missing = split_series.isna()
    split_series.loc[missing] = split_series.loc[missing].apply(lambda _: [])

    return split_series

In [22]:
# Load the raw survey once and keep it untouched for later comparison;
# every transformation below is applied to the independent copy `df`.
raw_df = pd.read_csv(SURVEY_RESULT_PUBLIC)
df = raw_df.copy(deep=True)

In [23]:
# For each configured column: swap the textual answers for their numeric
# stand-ins, then cast the whole column to 32-bit floats.
for col, replacement in REPLACE_DICT.items():
    converted = df[col].replace(replacement)
    df[col] = converted.astype(np.float32)

In [24]:
# Run split_answers over every object-dtype column; columns without the
# delimiter come back unchanged.
object_cols = list(df.select_dtypes(include='object').columns)
for col in object_cols:
    df[col] = df[col].pipe(split_answers)

In [25]:
# Spot-check one random row: raw answer string vs. its processed value.
# `i` is an index *label* taken from df.sample(), so it is looked up with
# .loc (label-based) rather than .iloc (position-based).
i = df.sample(1).index[0]
print(raw_df.loc[i, 'LanguageHaveWorkedWith'])
print(df.loc[i, 'LanguageHaveWorkedWith'])

C;C++;HTML/CSS;JavaScript;TypeScript
['C', 'C++', 'HTML/CSS', 'JavaScript', 'TypeScript']


In [26]:
# Spot-check one random row of DevType: raw value vs. processed value.
# `i` is an index *label* taken from df.sample(), so it is looked up with
# .loc (label-based) rather than .iloc (position-based).
i = df.sample(1).index[0]
print(raw_df.loc[i, 'DevType'])
print(df.loc[i, 'DevType'])

Developer, desktop or enterprise applications
['Developer, desktop or enterprise applications']


In [27]:
# Spot-check one random row of YearsCodePro: raw value vs. processed value.
# `i` is an index *label* taken from df.sample(), so it is looked up with
# .loc (label-based) rather than .iloc (position-based).
i = df.sample(1).index[0]
print(raw_df.loc[i, 'YearsCodePro'])
print(df.loc[i, 'YearsCodePro'])

4
4.0


In [28]:
# Persist the preprocessed frame. The target folder is created first because
# writing the pickle fails when the parent directory does not exist.
export_dir = os.path.dirname(EXPORT_PATH)
if export_dir:  # empty when EXPORT_PATH has no directory component
    os.makedirs(export_dir, exist_ok=True)
df.to_pickle(EXPORT_PATH)

In [30]:
# Fallback export: write the pickle only when no file exists at EXPORT_PATH.
if not os.path.isfile(EXPORT_PATH):
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(EXPORT_PATH, 'wb') as file:
        pickle.dump(df, file)