## pd.get_dummies

https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

Convert categorical variable into dummy/indicator variables.

In [1]:
import pandas as pd
import numpy as np

#### Example-0

In [2]:
# Toy data: a single categorical column whose last entry is missing.
dic = {
    'Sex': ['Male', 'Male', 'Female', 'Female', 'Female', np.nan],
}
df = pd.DataFrame(data=dic)
df.head(n=7)

Unnamed: 0,Sex
0,Male
1,Male
2,Female
3,Female
4,Female
5,


In [3]:
### Convert Categorical to Numeric - using own function

def function(val):
    """Encode a 'Sex' value as a 0/1 flag.

    Returns 1 when ``val`` equals 'Male' and 0 for every other value,
    so 'Female' and missing entries (NaN) are both encoded as 0.
    """
    return 1 if val == 'Male' else 0

# Overwrite the column with its 0/1 encoding. The NaN in row 5 is not
# 'Male', so it is encoded as 0 too (indistinguishable from 'Female').
df['Sex'] = df['Sex'].apply(function)
df.head(7)

Unnamed: 0,Sex
0,1
1,1
2,0
3,0
4,0
5,0


#### Example-1

In [4]:
# Rebuild the one-column frame (the previous example overwrote 'Sex').
dic = {
    'Sex': ['Male', 'Male', 'Female', 'Female', 'Female', np.nan],
}
df = pd.DataFrame(data=dic)
df.head(n=7)

Unnamed: 0,Sex
0,Male
1,Male
2,Female
3,Female
4,Female
5,


In [5]:
# Convert categorical columns to numerical columns using get_dummies.
# By default the prefix is the column name itself.
# By default the separator is an underscore.
# The NaN row (index 5) gets 0 in every dummy column.

temp = pd.get_dummies(df)
temp

Unnamed: 0,Sex_Female,Sex_Male
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0
5,0,0


In [6]:
# We can pass our own prefix as well; it replaces the column name
# in the generated column labels.

pd.get_dummies(df, prefix='Test')

Unnamed: 0,Test_Female,Test_Male
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0
5,0,0


In [7]:
# We can pass our own prefix and prefix separator (prefix_sep) as well.

pd.get_dummies(df, prefix='Test', prefix_sep='+')

Unnamed: 0,Test+Female,Test+Male
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0
5,0,0


In [8]:
# dummy_na=True adds an extra indicator column that flags NaN values, if any.

pd.get_dummies(df, prefix='Test', prefix_sep='+', dummy_na=True)

Unnamed: 0,Test+Female,Test+Male,Test+nan
0,0,1,0
1,0,1,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1


In [9]:
# drop_first=True removes the first dummy column (here Test+Female),
# leaving only Test+Male and Test+nan.

pd.get_dummies(df, prefix='Test', prefix_sep='+', dummy_na=True, drop_first=True)

Unnamed: 0,Test+Male,Test+nan
0,1,0
1,1,0
2,0,0
3,0,0
4,0,0
5,0,1


#### Example-2

In [10]:
# One categorical column plus one numeric column; the last row is missing in both.
dic = {
    'Sex': ['Male', 'Male', 'Female', 'Female', 'Female', np.nan],
    'Age': [10, 20, 30, 40, 50, np.nan],
}
df = pd.DataFrame(data=dic)
# Shows the first five rows only, so the all-NaN sixth row is not displayed.
df.head(n=5)

Unnamed: 0,Sex,Age
0,Male,10.0
1,Male,20.0
2,Female,30.0
3,Female,40.0
4,Female,50.0


In [11]:
# We can pass a Series as well; the resulting columns are then named
# after the category values alone, with no prefix.

pd.get_dummies(df.Sex)

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0
5,0,0


In [12]:
# On the whole frame, only the categorical column (Sex) is expanded into
# dummy columns; the numeric Age column is kept as it is.
pd.get_dummies(df)

Unnamed: 0,Age,Sex_Female,Sex_Male
0,10.0,0,1
1,20.0,0,1
2,30.0,1,0
3,40.0,1,0
4,50.0,1,0
5,,0,0


#### Example-3

In [13]:
# Two categorical columns (Sex, Title) and one numeric column (Age).
dic = {
    'Sex': ['Male', 'Male', 'Female', 'Female', 'Female', np.nan],
    'Age': [10, 20, 30, 40, 50, np.nan],
    'Title': ['Mr', 'Mr', 'Miss', 'Miss', 'Miss', np.nan],
}
df = pd.DataFrame(data=dic)
# Shows the first five rows only, so the all-NaN sixth row is not displayed.
df.head(n=5)

Unnamed: 0,Sex,Age,Title
0,Male,10.0,Mr
1,Male,20.0,Mr
2,Female,30.0,Miss
3,Female,40.0,Miss
4,Female,50.0,Miss


In [14]:
# If we have multiple categorical columns, all of them are converted
# to dummy columns.

pd.get_dummies(df)

Unnamed: 0,Age,Sex_Female,Sex_Male,Title_Miss,Title_Mr
0,10.0,0,1,0,1
1,20.0,0,1,0,1
2,30.0,1,0,1,0
3,40.0,1,0,1,0
4,50.0,1,0,1,0
5,,0,0,0,0


#### Example-4

In [15]:
# Three categorical columns (Sex, Title, Embarked) and one numeric column (Age).
dic = {
    'Sex': ['Male', 'Male', 'Female', 'Female', 'Female', np.nan],
    'Age': [10, 20, 30, 40, 50, np.nan],
    'Title': ['Mr', 'Mr', 'Miss', 'Miss', 'Miss', np.nan],
    'Embarked': ['S', 'Q', 'C', 'Q', 'S', np.nan],
}

df = pd.DataFrame(data=dic)
# Shows the first five rows only, so the all-NaN sixth row is not displayed.
df.head(n=5)

Unnamed: 0,Sex,Age,Title,Embarked
0,Male,10.0,Mr,S
1,Male,20.0,Mr,Q
2,Female,30.0,Miss,C
3,Female,40.0,Miss,Q
4,Female,50.0,Miss,S


In [23]:
# Now we have 3 categorical columns (Sex, Title and Embarked).
# We want to convert only 2 of them (Sex, Embarked) to numerical,
# so we name them in `columns`; Title is left unchanged.

temp = pd.get_dummies(df, columns=['Sex', 'Embarked'])
temp.head(7)

Unnamed: 0,Age,Title,Sex_Female,Sex_Male,Embarked_C,Embarked_Q,Embarked_S
0,10.0,Mr,0,1,0,0,1
1,20.0,Mr,0,1,0,1,0
2,30.0,Miss,1,0,1,0,0
3,40.0,Miss,1,0,0,1,0
4,50.0,Miss,1,0,0,0,1
5,,,0,0,0,0,0
