# Census Variables

You have decided to volunteer for your local community by offering to clean their recently collected census data.

### Assessing Variable Types 

In [267]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [268]:
# Read the raw census export into a DataFrame and preview the first five rows.
# NOTE(review): the dtypes listing includes an 'Unnamed: 0' column — this looks
# like a saved row index; confirm whether it should be loaded with index_col=0.
census = pd.read_csv(filepath_or_buffer='census_data.csv')
census.head(n=5)

Unnamed: 0.1,Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,0,Denise,Ratke,2005,False,0,92129.41,disagree,single
1,1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced
2,2,Salomon,Orn,1992,True,2,166313.45,agree,single
3,3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married
4,4,Gust,Abernathy,1945,False,2,143316.08,agree,married


In [269]:
# Inspect the dtype pandas inferred for each column.
census.dtypes

Unnamed: 0          int64
first_name         object
last_name          object
birth_year         object
voted                bool
num_children        int64
income_year       float64
higher_tax         object
marital_status     object
dtype: object

- The manager of the census would like to know the average birth year of the respondents.

### Inspecting Datatypes 

In [270]:
# List the distinct birth_year values to see why the column was read as object.
census['birth_year'].unique()

array(['2005', '1987', '1992', '1965', '1945', '1951', '1963', '1949',
       '1950', '1971', '2007', '1944', '1995', '1973', '1946', '1954',
       '1994', '1989', '1947', '1993', '1976', '1984', 'missing', '1966',
       '1941', '2000', '1953', '1956', '1960', '2001', '1980', '1955',
       '1985', '1996', '1968', '1979', '2006', '1962', '1981', '1959',
       '1977', '1978', '1983', '1957', '1961', '1982', '2002', '1998',
       '1999', '1952', '1940', '1986', '1958'], dtype=object)

### Altering Data

In [271]:
# Swap the 'missing' placeholder for 1967 so every entry can be cast to an integer.
# NOTE(review): 1967 is a hard-coded fill value — confirm where it comes from.
census['birth_year'] = census['birth_year'].replace({'missing': 1967})

In [272]:
# Re-check the distinct values after the replacement.
census['birth_year'].unique()

array(['2005', '1987', '1992', '1965', '1945', '1951', '1963', '1949',
       '1950', '1971', '2007', '1944', '1995', '1973', '1946', '1954',
       '1994', '1989', '1947', '1993', '1976', '1984', 1967, '1966',
       '1941', '2000', '1953', '1956', '1960', '2001', '1980', '1955',
       '1985', '1996', '1968', '1979', '2006', '1962', '1981', '1959',
       '1977', '1978', '1983', '1957', '1961', '1982', '2002', '1998',
       '1999', '1952', '1940', '1986', '1958'], dtype=object)

In [273]:
# Cast birth_year to the platform's default integer type so it can be averaged.
birth_year_as_int = census['birth_year'].astype(int)
census['birth_year'] = birth_year_as_int

In [274]:
# Confirm the column's dtype after the cast.
census['birth_year'].dtypes

dtype('int32')

In [275]:
# Average birth year of the respondents.
census['birth_year'].mean()

1973.4

- Your manager would like to impose an order on the higher_tax variable, from 'strongly disagree' up to 'strongly agree'.

In [276]:
# List the distinct responses recorded in higher_tax.
census['higher_tax'].unique()

array(['disagree', 'neutral', 'agree', 'strongly agree',
       'strongly disagree'], dtype=object)

In [277]:
# Responses listed from least to most agreement; this order defines the
# ordering of the categorical built from it.
higher_tax_categories = [
    'strongly disagree',
    'disagree',
    'neutral',
    'agree',
    'strongly agree',
]

In [278]:
# Convert higher_tax to an ordered categorical that follows higher_tax_categories,
# then display the distinct values together with the category order.
higher_tax_dtype = pd.CategoricalDtype(categories=higher_tax_categories, ordered=True)
census['higher_tax'] = census['higher_tax'].astype(higher_tax_dtype)
census['higher_tax'].unique()

['disagree', 'neutral', 'agree', 'strongly agree', 'strongly disagree']
Categories (5, object): ['strongly disagree' < 'disagree' < 'neutral' < 'agree' < 'strongly agree']

In [279]:
# Store each response's integer category code (its position in the category
# order) in a new column, then preview the frame.
tax_codes = census['higher_tax'].cat.codes
census['higher_tax_codes'] = tax_codes
census.head()

Unnamed: 0.1,Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status,higher_tax_codes
0,0,Denise,Ratke,2005,False,0,92129.41,disagree,single,1
1,1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced,2
2,2,Salomon,Orn,1992,True,2,166313.45,agree,single,3
3,3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married,4
4,4,Gust,Abernathy,1945,False,2,143316.08,agree,married,3


- Your manager would also like to know the median sentiment of the respondents on the issue of higher taxes for the wealthy

In [280]:
# Median of the integer category codes for higher_tax.
tax_code_values = census['higher_tax_codes'].to_numpy()
median_index = np.median(tax_code_values)
median_index

2.0

In [281]:
# Translate the median code back into its category label. The codes come from
# .cat.codes on the ordered categorical built with higher_tax_categories, so a
# code is an index into that list. (The previous lookup used the undefined name
# `census_categories`, which raises NameError on a fresh kernel.)
# int() truncates: a median that falls between two codes (x.5) maps to the
# lower of the two labels.
median_index_status = higher_tax_categories[int(median_index)]
median_index_status

'neutral'

- Create a new variable called marital_codes by Label Encoding the marital_status variable. This could help the Census team use machine learning to predict if a respondent thinks the wealthy should pay higher taxes based on their marital status.

In [282]:
# List the distinct values recorded in marital_status.
census['marital_status'].unique()

array(['single', 'divorced', 'married', 'widowed'], dtype=object)

In [283]:
# Fix the category order used for label encoding marital_status, convert the
# column to a categorical with that order, and display the distinct values.
# NOTE(review): marital status has no natural ranking, so the order chosen here
# (and ordered=True) is arbitrary — confirm that an ordered categorical is intended.
marital_status_categories = ['single', 'married', 'divorced', 'widowed']
marital_status_dtype = pd.CategoricalDtype(categories=marital_status_categories, ordered=True)
census['marital_status'] = census['marital_status'].astype(marital_status_dtype)
census['marital_status'].unique()

['single', 'divorced', 'married', 'widowed']
Categories (4, object): ['single' < 'married' < 'divorced' < 'widowed']

In [284]:
# Label-encode marital_status: store each row's integer category code in
# marital_codes, then preview the frame.
marital_code_values = census['marital_status'].cat.codes
census['marital_codes'] = marital_code_values
census.head()

Unnamed: 0.1,Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status,higher_tax_codes,marital_codes
0,0,Denise,Ratke,2005,False,0,92129.41,disagree,single,1,0
1,1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced,2,2
2,2,Salomon,Orn,1992,True,2,166313.45,agree,single,3,0
3,3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married,4,1
4,4,Gust,Abernathy,1945,False,2,143316.08,agree,married,3,1


In [285]:
# Median of the marital_status label codes.
# NOTE(review): the codes follow the hand-picked order in marital_status_categories,
# so this median depends on that order — confirm it is meaningful.
marital_code_array = census['marital_codes'].to_numpy()
marital_codes_median = np.median(marital_code_array)
marital_codes_median

1.0

In [286]:
# Map the median code back to its label in marital_status_categories.
# int() truncates, so a median of x.5 maps to the lower-indexed label.
median_position = int(marital_codes_median)
marital_codes_median_status = marital_status_categories[median_position]
marital_codes_median_status

'married'

- Your manager is interested in using machine learning models on the census data in the future. To help, let’s One-Hot Encode marital_status to create binary variables of each category.

In [287]:
# One-hot encode marital_status into one indicator column per category; the
# original marital_status column is replaced by the indicator columns.
# dtype=int keeps the indicators as 0/1 integers. Without it, pandas >= 2.0
# produces boolean True/False columns instead of the binary 0/1 values wanted here.
# NOTE: this cell cannot be re-run on its own, because marital_status no longer
# exists in `census` after the first run — re-run from the load cell instead.
census = pd.get_dummies(data=census, columns=['marital_status'], dtype=int)
census.head()

Unnamed: 0.1,Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,higher_tax_codes,marital_codes,marital_status_single,marital_status_married,marital_status_divorced,marital_status_widowed
0,0,Denise,Ratke,2005,False,0,92129.41,disagree,1,0,1,0,0,0
1,1,Hali,Cummerata,1987,False,0,75649.17,neutral,2,2,0,0,1,0
2,2,Salomon,Orn,1992,True,2,166313.45,agree,3,0,1,0,0,0
3,3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,4,1,0,1,0,0
4,4,Gust,Abernathy,1945,False,2,143316.08,agree,3,1,0,1,0,0


- Create a new variable called age_group, which groups respondents based on their birth year. The groups should be in five-year increments, e.g., 26-30, 31-35, etc.

In [288]:
# Age is computed relative to a fixed reference year (2021), not the current date.
reference_year = 2021
census['age'] = reference_year - census['birth_year']

In [289]:
# Bin edges in steps of 5. Starting 4 below the youngest age keeps that age
# inside the first interval (pd.cut's intervals exclude their left edge by default).
# NOTE(review): edges stop below 100, so ages above the last edge would get no
# group — confirm this upper bound is sufficient for the data.
youngest_age = int(census['age'].min())
age_bins = np.arange(youngest_age - 4, 100, 5)

In [290]:
# Assign each respondent to the five-year interval that contains their age.
age_intervals = pd.cut(x=census['age'], bins=age_bins)
census['age_group'] = age_intervals

In [291]:
# Preview the frame with the new age and age_group columns.
census.head()

Unnamed: 0.1,Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,higher_tax_codes,marital_codes,marital_status_single,marital_status_married,marital_status_divorced,marital_status_widowed,age,age_group
0,0,Denise,Ratke,2005,False,0,92129.41,disagree,1,0,1,0,0,0,16,"(15, 20]"
1,1,Hali,Cummerata,1987,False,0,75649.17,neutral,2,2,0,0,1,0,34,"(30, 35]"
2,2,Salomon,Orn,1992,True,2,166313.45,agree,3,0,1,0,0,0,29,"(25, 30]"
3,3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,4,1,0,1,0,0,56,"(55, 60]"
4,4,Gust,Abernathy,1945,False,2,143316.08,agree,3,1,0,1,0,0,76,"(75, 80]"
