In [1]:
# "How do I create dummy variables in pandas?"

In [2]:
import pandas as pd

In [3]:
# Read the CSV behind the short link into the `train` DataFrame.
train = pd.read_csv(filepath_or_buffer='http://bit.ly/kaggletrain')

In [4]:
# Peek at the first five rows to see which columns hold text (e.g. Sex, Embarked).
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Manual approach: build a single dummy column by hand.
# Series.map accepts a dictionary and swaps each value for the number it maps to,
# so 'female' becomes 0 and 'male' becomes 1 (see the pandas Series.map docs).
# Any value that is not a key in the dictionary would come out as NaN.
train['Sex_male'] = train['Sex'].map({'female': 0, 'male': 1})

In [6]:
# Re-display the head: the new Sex_male column now sits at the far right.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [8]:
# get_dummies is a top-level pandas function (called as pd.___, not on a DataFrame).
# It returns one indicator column per distinct value and one row per row of the input.
# Sex has two possible values (female, male), so a single dummy variable is enough.
# General rule: a categorical variable with k possible values needs k - 1 dummy variables.
pd.get_dummies(train['Sex'])


Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [9]:
# The first column (female) is redundant, so it is dropped and treated as the baseline.
# .iloc[:, 1:] keeps all rows and every column from position 1 onward -- here only 'male'.
# A bare column called 'male' does not say which feature it came from, which matters
# once it is added back into the DataFrame.
pd.get_dummies(train['Sex']).iloc[:, 1:]

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [11]:
# Tally how many rows carry each Embarked value (three distinct values: S, C, Q).
# value_counts leaves out missing values by default, so NaN rows are not counted here.
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [14]:
# A single call yields a prefixed indicator column for every Embarked value.
# With the map approach, each value would have needed its own hand-written column,
# so get_dummies is less work as soon as there are more than two values.
pd.get_dummies(train['Embarked'], prefix='Embarked')

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [15]:
# k - 1 dummy variables are needed: 3 values (C, Q, S) -> only 2 columns.
# A row with 0 in both Embarked_Q and Embarked_S stands for the baseline, C.
# The prefix records which original column each dummy came from once it is put back.
# NOTE(review): get_dummies skips NaN by default, so a row with a missing Embarked
# would also be all zeros and look like C -- confirm whether train has any such rows.
pd.get_dummies(train['Embarked'], prefix='Embarked').iloc[:, 1:]

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


In [16]:
# Store the k - 1 = 2 Embarked dummy columns (Embarked_Q, Embarked_S) as their own
# DataFrame, `embarked_dummies`; C is the baseline and gets no column.
# drop_first=True asks get_dummies to drop the first level itself, which gives the
# same result as slicing by position with .iloc[:, 1:] but states the intent directly.
embarked_dummies = pd.get_dummies(train['Embarked'], prefix='Embarked', drop_first=True)

In [19]:
# Attach the Embarked dummy columns to the original DataFrame.
# axis=1 concatenates along the column axis (side by side) instead of stacking rows.
# Because this cell rebinds `train` from itself, running it twice would append a
# second copy of the dummy columns; dropping any existing copies first
# (errors='ignore' makes that a no-op on the first run) keeps the cell safe to re-run.
train = pd.concat(
    [train.drop(columns=embarked_dummies.columns, errors='ignore'), embarked_dummies],
    axis=1,
)

In [20]:
# train now carries the dummy columns Sex_male, Embarked_Q and Embarked_S
# alongside the original Sex and Embarked columns (the originals are kept, not replaced).
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,0,1


In [None]:
# With prefix='Sex' the remaining column is named Sex_male, reproducing the column
# built by hand with map earlier. For a two-value feature get_dummies looks like
# more work at first, but the same call handles features with many values.
pd.get_dummies(train['Sex'], prefix='Sex').iloc[:, 1:]