##### You have decided to volunteer for your local community by offering to clean their recently collected census data.

#### **Assessing Variable Types**

In [54]:
import pandas as pd

In [55]:
# Read in the census dataframe
census = pd.read_csv('/content/census_data.csv', index_col=0)

##### The census dataframe is composed of simulated census data to represent demographics of a small community in the U.S.

In [56]:
census.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,Denise,Ratke,2005,False,0,92129.41,disagree,single
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced
2,Salomon,Orn,1992,True,2,166313.45,agree,single
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married
4,Gust,Abernathy,1945,False,2,143316.08,agree,married


In [57]:
census.dtypes

first_name         object
last_name          object
birth_year         object
voted                bool
num_children        int64
income_year       float64
higher_tax         object
marital_status     object
dtype: object

###**Inspecting Datatypes**

##### The manager of the census would like to know the average birth year of the respondents.

In [58]:
census.birth_year.unique()

array(['2005', '1987', '1992', '1965', '1945', '1951', '1963', '1949',
       '1950', '1971', '2007', '1944', '1995', '1973', '1946', '1954',
       '1994', '1989', '1947', '1993', '1976', '1984', 'missing', '1966',
       '1941', '2000', '1953', '1956', '1960', '2001', '1980', '1955',
       '1985', '1996', '1968', '1979', '2006', '1962', '1981', '1959',
       '1977', '1978', '1983', '1957', '1961', '1982', '2002', '1998',
       '1999', '1952', '1940', '1986', '1958'], dtype=object)

### **Altering Data**

##### There appears to be a missing value in the birth_year column. With some research you find that the respondent’s birth year is 1967.

In [59]:
census.birth_year = census.birth_year.replace('missing', '1967')
census.birth_year.unique()

array(['2005', '1987', '1992', '1965', '1945', '1951', '1963', '1949',
       '1950', '1971', '2007', '1944', '1995', '1973', '1946', '1954',
       '1994', '1989', '1947', '1993', '1976', '1984', '1967', '1966',
       '1941', '2000', '1953', '1956', '1960', '2001', '1980', '1955',
       '1985', '1996', '1968', '1979', '2006', '1962', '1981', '1959',
       '1977', '1978', '1983', '1957', '1961', '1982', '2002', '1998',
       '1999', '1952', '1940', '1986', '1958'], dtype=object)

In [60]:
# Change the datatype to integer
census.birth_year = census.birth_year.astype('int')
census.dtypes

first_name         object
last_name          object
birth_year          int64
voted                bool
num_children        int64
income_year       float64
higher_tax         object
marital_status     object
dtype: object

In [61]:
# Average birth year
census.birth_year.mean()

1973.4

##### Your manager would like to set an order to the higher_tax variable so that: strongly disagree < disagree < neutral < agree < strongly agree.

In [62]:
census.higher_tax = pd.Categorical(census.higher_tax, categories=['strongly disagree', 'disagree', 'neutral', 'agree', 'strongly agree'], ordered = True)
census.higher_tax.unique()

['disagree', 'neutral', 'agree', 'strongly agree', 'strongly disagree']
Categories (5, object): ['strongly disagree' < 'disagree' < 'neutral' < 'agree' < 'strongly agree']

##### Your manager would also like to know the median sentiment of the respondents on the issue of higher taxes for the wealthy.

In [63]:
# Label encode
census.higher_tax = census.higher_tax.cat.codes
# The median of the higher_tax value
census.higher_tax.median()

2.0

##### Your manager is interested in using machine learning models on the census data in the future.

In [70]:
# To help, let’s One-Hot Encode marital_status to create binary variables of each category.
# Use the pandas get_dummies() method to One-Hot Encode the marital_status variable.
#census = pd.get_dummies(data=census, columns=['marital_status'])
#census.head(5)


#####  Help the Census team use machine learning to predict if a respondent thinks the wealthy should pay higher taxes based on their marital status.

In [71]:
census.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status_divorced,marital_status_married,marital_status_single,marital_status_widowed
0,Denise,Ratke,2005,False,0,92129.41,1,0,0,1,0
1,Hali,Cummerata,1987,False,0,75649.17,2,1,0,0,0
2,Salomon,Orn,1992,True,2,166313.45,3,0,0,1,0
3,Sarina,Schiller,1965,False,2,71704.81,4,0,1,0,0
4,Gust,Abernathy,1945,False,2,143316.08,3,0,1,0,0


In [68]:
# Create a new variable called marital_codes by Label Encoding the marital_status variable.
census['marital_status'] = pd.Categorical(census['marital_status'], ['single', 'married', 'divorced', 'widowed'], ordered=True)
census['marital_codes'] = census['marital_status'].cat.codes

KeyError: 'marital_status'