# Dummy Variables

In [1]:
import pandas as pd

# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
data = pd.read_csv("adult.csv", header=None, index_col=False,
                       names=['age', 'workclass', 'fnlwgt', 'education',  'education-num','marital-status', 'occupation', 'relationship', 'race', 'gender','capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income'])

# For illustration purpose, we only select some of the columns
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week','occupation', 'income']]

# IPython.display allows nice output formatting within the Jupyter notebook
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,age,workclass,education,sex,hours.per.week,occupation,income
1,90,?,HS-grad,Female,40,?,<=50K
2,82,Private,HS-grad,Female,18,Exec-managerial,<=50K
3,66,?,Some-college,Female,40,?,<=50K
4,54,Private,7th-8th,Female,40,Machine-op-inspct,<=50K


In [3]:
# Checking string-encoded categorical data
print(data.gender.value_counts())

Male      21790
Female    10771
sex           1
Name: gender, dtype: int64


In [4]:
# changing the sex data point
data.loc[:, 'gender'] = data.gender.replace('sex', 'Female')

In [5]:
print(data.gender.value_counts())

Male      21790
Female    10772
Name: gender, dtype: int64


In [10]:
print(data.age.value_counts())

36     898
31     888
34     886
23     877
35     876
      ... 
88       3
85       3
86       1
age      1
87       1
Name: age, Length: 74, dtype: int64


In [6]:
# before using get_dummies
data

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,age,workclass,education,Female,hours.per.week,occupation,income
1,90,?,HS-grad,Female,40,?,<=50K
2,82,Private,HS-grad,Female,18,Exec-managerial,<=50K
3,66,?,Some-college,Female,40,?,<=50K
4,54,Private,7th-8th,Female,40,Machine-op-inspct,<=50K
...,...,...,...,...,...,...,...
32557,22,Private,Some-college,Male,40,Protective-serv,<=50K
32558,27,Private,Assoc-acdm,Female,38,Tech-support,<=50K
32559,40,Private,HS-grad,Male,40,Machine-op-inspct,>50K
32560,58,Private,HS-grad,Female,40,Adm-clerical,<=50K


In [8]:
# using get_dummies
data_dummies = pd.get_dummies(data)
print("Features after get_dummies:\n", list(data_dummies.columns))

Features after get_dummies:
 ['age_17', 'age_18', 'age_19', 'age_20', 'age_21', 'age_22', 'age_23', 'age_24', 'age_25', 'age_26', 'age_27', 'age_28', 'age_29', 'age_30', 'age_31', 'age_32', 'age_33', 'age_34', 'age_35', 'age_36', 'age_37', 'age_38', 'age_39', 'age_40', 'age_41', 'age_42', 'age_43', 'age_44', 'age_45', 'age_46', 'age_47', 'age_48', 'age_49', 'age_50', 'age_51', 'age_52', 'age_53', 'age_54', 'age_55', 'age_56', 'age_57', 'age_58', 'age_59', 'age_60', 'age_61', 'age_62', 'age_63', 'age_64', 'age_65', 'age_66', 'age_67', 'age_68', 'age_69', 'age_70', 'age_71', 'age_72', 'age_73', 'age_74', 'age_75', 'age_76', 'age_77', 'age_78', 'age_79', 'age_80', 'age_81', 'age_82', 'age_83', 'age_84', 'age_85', 'age_86', 'age_87', 'age_88', 'age_90', 'age_age', 'workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Never-worked', 'workclass_Private', 'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc', 'workclass_State-gov', 'workclass_Without-pay', 'workclass_work