# Data wrangling:

In [1]:
import pandas as pd
import re

### Raw data retrieval:

In [2]:
# Load every raw parquet extract in one pass; the tuple order of the paths
# matches the order of the target names on the left-hand side.
(
    df_personal_info,
    df_country_info,
    df_norm_job_codes,
    df_career_info,
    df_country_codes,
    df_poll_info,
) = (
    pd.read_parquet(raw_path)
    for raw_path in (
        '../data/raw/personal_info.parquet',
        '../data/raw/country_info.parquet',
        '../data/raw/norm_job_codes-names.parquet',
        '../data/raw/career_info.parquet',
        '../data/raw/webscraping_country_code-name.parquet',
        '../data/raw/poll_info.parquet',
    )
)

### Personal Info:

In [3]:
# Peek at the first rows of the personal-info table before any cleaning.
df_personal_info.head()

Unnamed: 0,uuid,age,gender,dem_has_children,age_group
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,61 years old,male,NO,40_65
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,57 years old,male,yES,40_65
2,83127080-da3d-0133-c74f-0a81e8b09a82,32 years old,male,nO,26_39
3,15626d40-db13-0133-ea5c-0a81e8b09a82,45 years old,Male,YES,40_65
4,24954a70-db98-0133-4a64-0a81e8b09a82,41 years old,Fem,yES,40_65


In [4]:
# Keep only the digits of values such as '61 years old' and store them as ints.
# Casting to str first keeps the cell re-runnable: on a second run the column
# already holds ints, which the .str accessor would otherwise reject.
df_personal_info['age'] = (
    df_personal_info['age']
    .astype(str)
    .str.replace(r'\D', '', regex=True)
    .astype(int)
)

In [5]:
# List the distinct gender spellings to see which variants need normalising.
df_personal_info['gender'].unique()

array(['male', 'Male', 'Fem', 'FeMale', 'female'], dtype=object)

In [6]:
# Normalise casing ('male', 'FeMale', ...) to 'Male' / 'Female' / 'Fem'.
# str.capitalize already lower-cases everything after the first character,
# so no separate lower() step is needed; missing values pass through unchanged.
df_personal_info['gender'] = df_personal_info['gender'].str.capitalize()


In [7]:
# Expand the abbreviated label. Only exact 'Fem' values are replaced.
# The result is assigned back instead of using inplace=True on a column
# selection, which is not guaranteed to update the parent frame under
# pandas Copy-on-Write.
df_personal_info['gender'] = df_personal_info['gender'].replace('Fem', 'Female')

In [8]:
# Collapse the mixed-case answers ('NO', 'yES', ...) to lower case.
# The vectorised .str accessor leaves missing values untouched instead of raising.
df_personal_info['dem_has_children'] = df_personal_info['dem_has_children'].str.lower()

In [9]:
# Check the cleaned personal-info columns (age, gender, dem_has_children).
df_personal_info.head()

Unnamed: 0,uuid,age,gender,dem_has_children,age_group
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,61,Male,no,40_65
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,57,Male,yes,40_65
2,83127080-da3d-0133-c74f-0a81e8b09a82,32,Male,no,26_39
3,15626d40-db13-0133-ea5c-0a81e8b09a82,45,Male,yes,40_65
4,24954a70-db98-0133-4a64-0a81e8b09a82,41,Female,yes,40_65


### Country info:

In [10]:
# Peek at the first rows of the country-info table before any cleaning.
df_country_info.head()

Unnamed: 0,uuid,country_code,rural
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,AT,countryside
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,AT,urban
2,83127080-da3d-0133-c74f-0a81e8b09a82,AT,city
3,15626d40-db13-0133-ea5c-0a81e8b09a82,AT,Country
4,24954a70-db98-0133-4a64-0a81e8b09a82,AT,city


In [11]:
# Normalise GB -> UK and GR -> EL.
# The replacement is limited to the country_code column so that an identical
# string appearing in any other column is never rewritten by accident.
df_country_info['country_code'] = df_country_info['country_code'].replace(
    {'GB': 'UK', 'GR': 'EL'}
)

In [12]:
# Index the lookup table by country code.
# The explicit column check makes the cell safe to re-run (on a second run
# 'code' is already the index) without a bare except that would hide
# unrelated errors.
if 'code' in df_country_codes.columns:
    df_country_codes = df_country_codes.set_index('code')
elif df_country_codes.index.name != 'code':
    raise KeyError("df_country_codes has neither a 'code' column nor a 'code' index")

In [13]:
# Translate each country code into its name with one vectorised lookup
# instead of a per-row .loc call.
# NOTE(review): assumes df_country_codes is indexed by code and that its first
# remaining column holds the country name — confirm against the parquet schema.
country_names = df_country_codes.iloc[:, 0]
df_country_info['country'] = df_country_info['country_code'].map(country_names)

# Series.map yields NaN for unknown keys; fail loudly, as the .loc lookup did.
unmapped_codes = df_country_info.loc[df_country_info['country'].isna(), 'country_code'].unique()
if len(unmapped_codes):
    raise KeyError(f"country codes missing from df_country_codes: {list(unmapped_codes)}")

In [14]:
# Lower-case the area labels so that variants such as 'Country' and 'country' agree.
# The vectorised .str accessor leaves missing values untouched instead of raising.
df_country_info['rural'] = df_country_info['rural'].str.lower()

In [15]:
# Column layout for the cleaned country table: the country name sits right
# after its code.
col_order = [
    'uuid',
    'country_code',
    'country',
    'rural',
]

In [16]:
# Keep exactly the columns named in col_order, in that order.
df_country_info = df_country_info.loc[:, col_order]

In [17]:
# Check the cleaned country table: new country column, lower-cased rural labels.
df_country_info.head()

Unnamed: 0,uuid,country_code,country,rural
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,AT,Austria,countryside
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,AT,Austria,urban
2,83127080-da3d-0133-c74f-0a81e8b09a82,AT,Austria,city
3,15626d40-db13-0133-ea5c-0a81e8b09a82,AT,Austria,country
4,24954a70-db98-0133-4a64-0a81e8b09a82,AT,Austria,city


### Career info:

In [18]:
# Peek at the first rows of the career-info table before any cleaning.
df_career_info.head()

Unnamed: 0,uuid,dem_education_level,dem_full_time_job,normalized_job_code
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,no,no,
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,high,yes,861a9b9151e11362eb3c77ca914172d0
2,83127080-da3d-0133-c74f-0a81e8b09a82,,no,
3,15626d40-db13-0133-ea5c-0a81e8b09a82,high,yes,049a3f3a2b5f85cb2971ba77ad66e10c
4,24954a70-db98-0133-4a64-0a81e8b09a82,high,yes,f4b2fb1aa40f661488e2782b6d57ad2f


In [19]:
# Peek at the job-code -> title lookup table.
df_norm_job_codes.head()

Unnamed: 0,normalized_job_code,title
0,861a9b9151e11362eb3c77ca914172d0,Automatic Data Processing Planner
1,049a3f3a2b5f85cb2971ba77ad66e10c,Data Coordinator
2,f4b2fb1aa40f661488e2782b6d57ad2f,Database Developer
3,27af8700f5577cec835acee2cb90a2ff,Data Entry Specialist
4,c1b670eba9ccb65e7c99f7da116d5b9c,Database Architect


In [20]:
# Fill missing education levels with 'no' and align the job-code placeholder
# ('None' -> 'none') with the value used to fill gaps in the lookup table, so
# the later merge on normalized_job_code can match those rows.
# Results are assigned back rather than using inplace=True on a column
# selection, which is not guaranteed to update the parent frame under
# pandas Copy-on-Write.
df_career_info['dem_education_level'] = df_career_info['dem_education_level'].fillna('no')
df_career_info['normalized_job_code'] = df_career_info['normalized_job_code'].replace('None', 'none')
df_norm_job_codes = df_norm_job_codes.fillna('none')

In [21]:
# try:
#     df_norm_job_codes.set_index('title', inplace=True)
# except:
#     pass

In [22]:
# Check that the education and job-code gaps now carry their placeholder values.
df_career_info.head()

Unnamed: 0,uuid,dem_education_level,dem_full_time_job,normalized_job_code
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,no,no,none
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,high,yes,861a9b9151e11362eb3c77ca914172d0
2,83127080-da3d-0133-c74f-0a81e8b09a82,no,no,none
3,15626d40-db13-0133-ea5c-0a81e8b09a82,high,yes,049a3f3a2b5f85cb2971ba77ad66e10c
4,24954a70-db98-0133-4a64-0a81e8b09a82,high,yes,f4b2fb1aa40f661488e2782b6d57ad2f


In [23]:
# Re-check the job-code lookup table after filling its missing values.
df_norm_job_codes.head()

Unnamed: 0,normalized_job_code,title
0,861a9b9151e11362eb3c77ca914172d0,Automatic Data Processing Planner
1,049a3f3a2b5f85cb2971ba77ad66e10c,Data Coordinator
2,f4b2fb1aa40f661488e2782b6d57ad2f,Database Developer
3,27af8700f5577cec835acee2cb90a2ff,Data Entry Specialist
4,c1b670eba9ccb65e7c99f7da116d5b9c,Database Architect


In [24]:
# Attach the job title to each career row via its normalized job code.
# A left join keeps every career row, including codes absent from the lookup.
df_career_info = pd.merge(
    df_career_info,
    df_norm_job_codes,
    how='left',
    on='normalized_job_code',
)

In [25]:
# Display the merged frame (pandas shows only the first and last rows) to
# confirm that the title column was attached.
df_career_info

Unnamed: 0,uuid,dem_education_level,dem_full_time_job,normalized_job_code,title
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,no,no,none,none
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,high,yes,861a9b9151e11362eb3c77ca914172d0,Automatic Data Processing Planner
2,83127080-da3d-0133-c74f-0a81e8b09a82,no,no,none,none
3,15626d40-db13-0133-ea5c-0a81e8b09a82,high,yes,049a3f3a2b5f85cb2971ba77ad66e10c,Data Coordinator
4,24954a70-db98-0133-4a64-0a81e8b09a82,high,yes,f4b2fb1aa40f661488e2782b6d57ad2f,Database Developer
...,...,...,...,...,...
9644,7d1ac020-dcb4-0133-817a-0a81e8b09a82,high,yes,847165cfda6b1dc82ae22b967da8af2f,Data Warehouse Developer
9645,39f989f0-db52-0133-8482-0a81e8b09a82,high,yes,a4d5b8b38f9513825d0d94a981ebe962,Database Manager
9646,70ce4a90-d965-0133-f5e4-0a81e8b09a82,low,no,none,none
9647,2896e440-db3c-0133-5b67-0a81e8b09a82,low,yes,775190277a849cba701b306a7b374c0a,Data Officer


In [26]:
# Drop the 'dem_' prefixes and give the merged title column a more specific name.
career_column_names = {
    'dem_education_level': 'education_level',
    'dem_full_time_job': 'full_time_job',
    'title': 'job_title',
}
df_career_info = df_career_info.rename(columns=career_column_names)

In [27]:
# Check the renamed career columns.
df_career_info.head()

Unnamed: 0,uuid,education_level,full_time_job,normalized_job_code,job_title
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,no,no,none,none
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,high,yes,861a9b9151e11362eb3c77ca914172d0,Automatic Data Processing Planner
2,83127080-da3d-0133-c74f-0a81e8b09a82,no,no,none,none
3,15626d40-db13-0133-ea5c-0a81e8b09a82,high,yes,049a3f3a2b5f85cb2971ba77ad66e10c,Data Coordinator
4,24954a70-db98-0133-4a64-0a81e8b09a82,high,yes,f4b2fb1aa40f661488e2782b6d57ad2f,Database Developer


### Poll info:

In [28]:
# Peek at the first rows of the poll answers before any cleaning.
df_poll_info.head()

Unnamed: 0,uuid,question_bbi_2016wave4_basicincome_awareness,question_bbi_2016wave4_basicincome_vote,question_bbi_2016wave4_basicincome_effect,question_bbi_2016wave4_basicincome_argumentsfor,question_bbi_2016wave4_basicincome_argumentsagainst
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,I know something about it,I would not vote,None of the above,None of the above,None of the above
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,I understand it fully,I would probably vote for it,A basic income would not affect my work choices,It increases appreciation for household work a...,It might encourage people to stop working
2,83127080-da3d-0133-c74f-0a81e8b09a82,I have heard just a little about it,I would not vote,‰Û_ gain additional skills,It creates more equality of opportunity,Foreigners might come to my country and take a...
3,15626d40-db13-0133-ea5c-0a81e8b09a82,I have heard just a little about it,I would probably vote for it,‰Û_ work less,It reduces anxiety about financing basic needs,None of the above
4,24954a70-db98-0133-4a64-0a81e8b09a82,I have heard just a little about it,I would probably vote for it,None of the above,It reduces anxiety about financing basic needs,It is impossible to finance | It might encoura...


In [29]:
# List the original poll column names before renaming them.
df_poll_info.columns

Index(['uuid', 'question_bbi_2016wave4_basicincome_awareness',
       'question_bbi_2016wave4_basicincome_vote',
       'question_bbi_2016wave4_basicincome_effect',
       'question_bbi_2016wave4_basicincome_argumentsfor',
       'question_bbi_2016wave4_basicincome_argumentsagainst'],
      dtype='object')

In [30]:
# Replace the long survey-export column names with short snake_case labels.
poll_column_names = {
    'question_bbi_2016wave4_basicincome_awareness': 'basic_income_awareness',
    'question_bbi_2016wave4_basicincome_vote': 'basic_income_vote',
    'question_bbi_2016wave4_basicincome_effect': 'basic_income_effect',
    'question_bbi_2016wave4_basicincome_argumentsfor': 'basic_income_arguments_for',
    'question_bbi_2016wave4_basicincome_argumentsagainst': 'basic_income_arguments_against',
}
df_poll_info = df_poll_info.rename(columns=poll_column_names)

In [31]:
# Replace the mis-encoded prefix seen in answers such as '‰Û_ work less' with
# "I would". The pattern is a plain literal, so a non-regex vectorised replace
# is enough; missing answers pass through unchanged instead of raising.
df_poll_info['basic_income_effect'] = df_poll_info['basic_income_effect'].str.replace(
    '‰Û_', 'I would', regex=False
)

In [32]:
# Check the renamed poll columns and the repaired basic_income_effect answers.
df_poll_info.head()

Unnamed: 0,uuid,basic_income_awareness,basic_income_vote,basic_income_effect,basic_income_arguments_for,basic_income_arguments_against
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,I know something about it,I would not vote,None of the above,None of the above,None of the above
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,I understand it fully,I would probably vote for it,A basic income would not affect my work choices,It increases appreciation for household work a...,It might encourage people to stop working
2,83127080-da3d-0133-c74f-0a81e8b09a82,I have heard just a little about it,I would not vote,I would gain additional skills,It creates more equality of opportunity,Foreigners might come to my country and take a...
3,15626d40-db13-0133-ea5c-0a81e8b09a82,I have heard just a little about it,I would probably vote for it,I would work less,It reduces anxiety about financing basic needs,None of the above
4,24954a70-db98-0133-4a64-0a81e8b09a82,I have heard just a little about it,I would probably vote for it,None of the above,It reduces anxiety about financing basic needs,It is impossible to finance | It might encoura...


### Merging...

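**Note:** the merge below uses the cleaned frames built in the sections above. The raw reloads listed here must stay un-executed; running them would overwrite those frames and discard all of the cleaning:

df_personal_info =  pd.read_parquet('../data/raw/personal_info.parquet')
df_country_info =   pd.read_parquet('../data/raw/country_info.parquet')
df_career_info =    pd.read_parquet('../data/raw/career_info.parquet')
df_poll_info =      pd.read_parquet('../data/raw/poll_info.parquet')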

In [33]:
# Left-join the country, career and poll tables, in that order, onto the
# personal-info rows, matching on `uuid`.
df_processed = df_personal_info
for other_frame in (df_country_info, df_career_info, df_poll_info):
    df_processed = df_processed.merge(other_frame, on='uuid', how='left')

In [34]:
# Check the merged table: one row per uuid with columns from all four sources.
df_processed.head()

Unnamed: 0,uuid,age,gender,dem_has_children,age_group,country_code,country,rural,education_level,full_time_job,normalized_job_code,job_title,basic_income_awareness,basic_income_vote,basic_income_effect,basic_income_arguments_for,basic_income_arguments_against
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,61,Male,no,40_65,AT,Austria,countryside,no,no,none,none,I know something about it,I would not vote,None of the above,None of the above,None of the above
1,54f0f1c0-dda1-0133-a559-0a81e8b09a82,57,Male,yes,40_65,AT,Austria,urban,high,yes,861a9b9151e11362eb3c77ca914172d0,Automatic Data Processing Planner,I understand it fully,I would probably vote for it,A basic income would not affect my work choices,It increases appreciation for household work a...,It might encourage people to stop working
2,83127080-da3d-0133-c74f-0a81e8b09a82,32,Male,no,26_39,AT,Austria,city,no,no,none,none,I have heard just a little about it,I would not vote,I would gain additional skills,It creates more equality of opportunity,Foreigners might come to my country and take a...
3,15626d40-db13-0133-ea5c-0a81e8b09a82,45,Male,yes,40_65,AT,Austria,country,high,yes,049a3f3a2b5f85cb2971ba77ad66e10c,Data Coordinator,I have heard just a little about it,I would probably vote for it,I would work less,It reduces anxiety about financing basic needs,None of the above
4,24954a70-db98-0133-4a64-0a81e8b09a82,41,Female,yes,40_65,AT,Austria,city,high,yes,f4b2fb1aa40f661488e2782b6d57ad2f,Database Developer,I have heard just a little about it,I would probably vote for it,None of the above,It reduces anxiety about financing basic needs,It is impossible to finance | It might encoura...


In [35]:
# Write the merged table to disk as JSON using to_json's default layout.
processed_json_path = '../data/processed/df_processed.json'
df_processed.to_json(processed_json_path)