In [1]:
import pandas as pd
import numpy as np

Let's create a Series that holds a categorical variable and see how to tell pandas to treat it as one.

In [2]:
# dtype="category" asks pandas to store the fabric names as a categorical
# Series instead of plain Python strings.
categorical_series = pd.Series(
    data=['Cotton', 'Polyester', 'Wool'],
    dtype="category",
)

In [3]:
categorical_series

0       Cotton
1    Polyester
2         Wool
dtype: category
Categories (3, object): [Cotton, Polyester, Wool]

#### How does Pandas handle duplicate categories?
The 3 unique categories are noted, but the data remains unaffected

In [4]:
# 'Cotton' appears twice: the Series keeps all four values, while the
# category list only records each distinct label once.
categorical_series = pd.Series(
    data=['Cotton', 'Polyester', 'Wool', 'Cotton'],
    dtype="category",
)

In [5]:
categorical_series

0       Cotton
1    Polyester
2         Wool
3       Cotton
dtype: category
Categories (3, object): [Cotton, Polyester, Wool]

#### Playing around with what is defined
Converting a column in a dataframe to a category

In [6]:
# Plain frame built without a dtype, so no column is categorical yet.
df = pd.DataFrame(
    dict(
        Material=['Cotton', 'Polyester', 'Wool', 'Silk'],
        Garment=['T-shirt', 'Jacket', 'Socks', 'Scarf'],
        Size=['Medium', 'Large', 'Large', 'Small'],
    )
)


In [7]:
df

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,Large
2,Wool,Socks,Large
3,Silk,Scarf,Small


In [8]:
df.dtypes

Material    object
Garment     object
Size        object
dtype: object

In [9]:
# astype('category') produces a categorical version of the column; storing it
# under a new name keeps the original 'Material' column next to it for comparison.
df['Material_cat'] = df["Material"].astype('category')

In [10]:
df

Unnamed: 0,Material,Garment,Size,Material_cat
0,Cotton,T-shirt,Medium,Cotton
1,Polyester,Jacket,Large,Polyester
2,Wool,Socks,Large,Wool
3,Silk,Scarf,Small,Silk


In [11]:
df.dtypes

Material          object
Garment           object
Size              object
Material_cat    category
dtype: object

Creating a dataframe with all columns as categories. Passing `dtype="category"` converts every column, and each column's categories are inferred from the unique values found in that column.

In [12]:
# A single dtype="category" on the constructor converts every column at once.
df = pd.DataFrame(
    dict(
        Material=['Cotton', 'Polyester', 'Wool', 'Silk'],
        Garment=['T-shirt', 'Jacket', 'Socks', 'Scarf'],
        Size=['Medium', 'Large', 'Large', 'Small'],
    ),
    dtype="category",
)

In [13]:
df

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,Large
2,Wool,Socks,Large
3,Silk,Scarf,Small


#### All columns are cast as category types

In [14]:
df.dtypes

Material    category
Garment     category
Size        category
dtype: object

Observing data under each category

In [15]:
df['Garment']

0    T-shirt
1     Jacket
2      Socks
3      Scarf
Name: Garment, dtype: category
Categories (4, object): [Jacket, Scarf, Socks, T-shirt]

#### 4 values, 3 unique categories

In [16]:
df['Size']

0    Medium
1     Large
2     Large
3     Small
Name: Size, dtype: category
Categories (3, object): [Large, Medium, Small]

#### Ordering categorical values
In some instances there will be an ordering of the categorical values in a series. For instance, for clothing sizes, the values 'Small', 'Medium' and 'Large' have a natural order.

In [17]:
from pandas.api.types import CategoricalDtype

In [18]:
# Fabrics have no natural ranking, so the dtype is declared as unordered.
materials = CategoricalDtype(
    categories=['Cotton', 'Polyester', 'Wool', 'Silk'],
    ordered=False,
)

In [19]:
materials

CategoricalDtype(categories=['Cotton', 'Polyester', 'Wool', 'Silk'], ordered=False)

In [20]:
# The list order is the ranking: Small < Medium < Large.
sizes = CategoricalDtype(
    categories=['Small', 'Medium', 'Large'],
    ordered=True,
)

In [21]:
sizes

CategoricalDtype(categories=['Small', 'Medium', 'Large'], ordered=True)

#### Assign the elements of a Series to one of the defined categories
If there is no match, the corresponding value in the Series is NaN

In [22]:
# 'X-Large' is not one of the categories declared in `sizes`, so the cast
# replaces it with NaN rather than raising an error.
list_series = pd.Series(['Medium', 'Small', 'X-Large', 'Small'])

list_series.astype(sizes)

0    Medium
1     Small
2       NaN
3     Small
dtype: category
Categories (3, object): [Small < Medium < Large]

#### Different behaviours for ordered and unordered categories

In [23]:
# Start again from a plain frame so each column can be cast individually below.
df = pd.DataFrame(
    data=dict(
        Material=['Cotton', 'Polyester', 'Wool', 'Silk'],
        Garment=['T-shirt', 'Jacket', 'Socks', 'Scarf'],
        Size=['Medium', 'Large', 'Large', 'Small'],
    )
)

In [24]:
# `materials` was declared with ordered=False, so this column supports
# equality tests but not <, > comparisons.
df['Material'] = df['Material'].astype(materials)

In [25]:
# `sizes` was declared with ordered=True (Small < Medium < Large), which is
# what makes the comparisons and sorting further down possible.
df['Size'] = df['Size'].astype(sizes)

In [26]:
df

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,Large
2,Wool,Socks,Large
3,Silk,Scarf,Small


#### Unordered category

In [27]:
df['Material']

0       Cotton
1    Polyester
2         Wool
3         Silk
Name: Material, dtype: category
Categories (4, object): [Cotton, Polyester, Wool, Silk]

#### Ordered category

In [28]:
df['Size']

0    Medium
1     Large
2     Large
3     Small
Name: Size, dtype: category
Categories (3, object): [Small < Medium < Large]

In [29]:
df['Garment']

0    T-shirt
1     Jacket
2      Socks
3      Scarf
Name: Garment, dtype: object

#### Equality operator works on both

In [30]:
df[df['Material'] == 'Cotton']

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium


In [31]:
df[df['Size'] == 'Large']

Unnamed: 0,Material,Garment,Size
1,Polyester,Jacket,Large
2,Wool,Socks,Large


#### Comparisons require ordered categories

In [32]:
df[df['Size'] > 'Small']

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,Large
2,Wool,Socks,Large


In [33]:
# Ordering comparisons are not defined for an unordered categorical, so pandas
# raises TypeError. The error is the point of this cell: catch it and print the
# message so the notebook still runs top to bottom (Restart & Run All).
try:
    df[df['Material'] > 'Cotton']
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Unordered Categoricals can only compare equality or not

#### Operating on Object types
Comparison operators on object (string) columns compare the values alphabetically (lexicographically)

In [34]:
df[df['Garment'] > 'Scarf']

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
2,Wool,Socks,Large


#### Categorizing with CategoricalDtype vs casting as "category"
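Casting with `astype('category')` is equivalent to using an unordered `CategoricalDtype` whose categories are inferred from the data: equality checks work, but ordering comparisons raise a `TypeError`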

In [35]:
# Casting with the string 'category' infers the categories from the column's
# values and leaves them unordered.
df['Garment'] = df['Garment'].astype('category')

df['Garment']

0    T-shirt
1     Jacket
2      Socks
3      Scarf
Name: Garment, dtype: category
Categories (4, object): [Jacket, Scarf, Socks, T-shirt]

In [36]:
df[df['Garment'] == 'Scarf']

Unnamed: 0,Material,Garment,Size
3,Silk,Scarf,Small


In [37]:
# 'Garment' is now an unordered categorical, so the same comparison that worked
# on the object column raises TypeError. Catch it and print the message so the
# notebook still runs top to bottom (Restart & Run All).
try:
    df[df['Garment'] > 'Scarf']
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Unordered Categoricals can only compare equality or not

#### Ordered Categories enable sorting

In [38]:
# Rows follow the declared category order (Small < Medium < Large),
# not the alphabetical order of the labels.
df.sort_values(by='Size')

Unnamed: 0,Material,Garment,Size
3,Silk,Scarf,Small
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,Large
2,Wool,Socks,Large


### Editing categories

In [39]:
df['Size'].cat.categories

Index(['Small', 'Medium', 'Large'], dtype='object')

#### Append new categories
For ordered categories, the new categories are appended to the end of the existing order, so they rank above all existing categories

In [40]:
# The result of add_categories is assigned back to the column to keep the change.
# NOTE: this cell is not idempotent -- pandas rejects categories that already
# exist, so re-running it raises ValueError unless `df` is rebuilt first.
df['Size'] = df['Size'].cat.add_categories(['X-Large', 'XX-Large'])

df['Size']

0    Medium
1     Large
2     Large
3     Small
Name: Size, dtype: category
Categories (5, object): [Small < Medium < Large < X-Large < XX-Large]

#### Adding categories to an unordered categorical leaves it unordered

In [41]:
# 'Bamboo' becomes an allowed value; no row uses it yet and the column
# remains unordered.
df['Material'] = df['Material'].cat.add_categories(['Bamboo'])

df['Material']

0       Cotton
1    Polyester
2         Wool
3         Silk
Name: Material, dtype: category
Categories (5, object): [Cotton, Polyester, Wool, Silk, Bamboo]

In [42]:
# No row holds 'XX-Large', so removing the category leaves every value intact.
df['Size'] = df['Size'].cat.remove_categories(['XX-Large'])

df['Size']

0    Medium
1     Large
2     Large
3     Small
Name: Size, dtype: category
Categories (4, object): [Small < Medium < Large < X-Large]

In [43]:
# Drops every category that no row uses -- here that is the remaining 'X-Large'.
df['Size'] = df['Size'].cat.remove_unused_categories()

df['Size']

0    Medium
1     Large
2     Large
3     Small
Name: Size, dtype: category
Categories (3, object): [Small < Medium < Large]

#### Set categories

The full list of categories can be defined explicitly with the `set_categories()` method

In [44]:
# set_categories replaces the whole category list in one call; the sizes are
# listed from smallest to largest, which becomes the new order.
df['Size'] = df['Size'].cat.set_categories(
    ['X-Small', 'Small', 'Medium', 'Large', 'X-Large']
)

In [45]:
df['Size']

0    Medium
1     Large
2     Large
3     Small
Name: Size, dtype: category
Categories (5, object): [X-Small < Small < Medium < Large < X-Large]

#### Editing categories need not affect the data
As long as the categories already in use are kept. If a category that is in use is dropped, the affected values become NaN, as the next cells show.

In [46]:
df

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,Large
2,Wool,Socks,Large
3,Silk,Scarf,Small


In [47]:
# 'Large' is left out of the new list, so the rows that held it lose their
# value and show up as NaN.
df['Size'] = df['Size'].cat.set_categories(['X-Small', 'Small', 'Medium'])

In [48]:
df['Size']

0    Medium
1       NaN
2       NaN
3     Small
Name: Size, dtype: category
Categories (3, object): [X-Small < Small < Medium]

In [49]:
df

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,
2,Wool,Socks,
3,Silk,Scarf,Small
