# Categorical data

* Categoricals are a pandas data type corresponding to categorical variables in statistics.
* Data type: category
* Cat accessor

## Exercise 1 - category data type

In [1]:
import pandas as pd

brands = pd.Series(["Ford", "Toyota", "BMW"], dtype="category")

brands

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (3, object): ['BMW', 'Ford', 'Toyota']

In [2]:
brands = pd.Series(pd.Categorical(["Ford", "Toyota", "BMW"]))

brands

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (3, object): ['BMW', 'Ford', 'Toyota']

## Exercise 2 - categories

In [3]:
brands = pd.Series(pd.Categorical(["Ford", "Toyota", "BMW"], categories=["Ford", "Toyota", "BMW", "Nissan"]))

brands

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (4, object): ['Ford', 'Toyota', 'BMW', 'Nissan']

## Exercise 3 - astype

In [4]:
sizes = pd.Series(["M", "L", "M", "XL", "S", "M", "M", "S", "L", "L"])

sizes

0     M
1     L
2     M
3    XL
4     S
5     M
6     M
7     S
8     L
9     L
dtype: object

In [5]:
sizes = sizes.astype("category")

sizes

0     M
1     L
2     M
3    XL
4     S
5     M
6     M
7     S
8     L
9     L
dtype: category
Categories (4, object): ['L', 'M', 'S', 'XL']

## Exercise 4 - add categories

In [6]:
brands.cat.add_categories("Tesla")

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (5, object): ['Ford', 'Toyota', 'BMW', 'Nissan', 'Tesla']

In [7]:
brands

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (4, object): ['Ford', 'Toyota', 'BMW', 'Nissan']

In [8]:
brands = brands.cat.add_categories("Tesla")

brands

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (5, object): ['Ford', 'Toyota', 'BMW', 'Nissan', 'Tesla']

## Exercise 5 - remove categories

In [9]:
brands

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (5, object): ['Ford', 'Toyota', 'BMW', 'Nissan', 'Tesla']

In [10]:
brands = brands.cat.remove_categories(["Nissan", "Tesla"])

In [11]:
brands

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (3, object): ['BMW', 'Ford', 'Toyota']

## Exercise 6 - categories

In [12]:
sizes

0     M
1     L
2     M
3    XL
4     S
5     M
6     M
7     S
8     L
9     L
dtype: category
Categories (4, object): ['L', 'M', 'S', 'XL']

In [13]:
sizes = sizes.cat.add_categories(["XS", "XXL"])

sizes

0     M
1     L
2     M
3    XL
4     S
5     M
6     M
7     S
8     L
9     L
dtype: category
Categories (6, object): ['L', 'M', 'S', 'XL', 'XS', 'XXL']

In [14]:
sizes.cat.categories

Index(['L', 'M', 'S', 'XL', 'XS', 'XXL'], dtype='object')

## Exercise 7 - unique vs value_counts

In [15]:
sizes.unique()

['M', 'L', 'XL', 'S']
Categories (6, object): ['L', 'M', 'S', 'XL', 'XS', 'XXL']

In [16]:
sizes.value_counts()

M      4
L      3
S      2
XL     1
XS     0
XXL    0
Name: count, dtype: int64

## Exercise 8 - as_ordered

In [17]:
sizes = sizes.cat.as_ordered()

sizes

0     M
1     L
2     M
3    XL
4     S
5     M
6     M
7     S
8     L
9     L
dtype: category
Categories (6, object): ['L' < 'M' < 'S' < 'XL' < 'XS' < 'XXL']

## Exercise 9 - reorder

In [18]:
sizes = sizes.cat.reorder_categories(["XS", "S", "M", "L", "XL", "XXL"])

sizes

0     M
1     L
2     M
3    XL
4     S
5     M
6     M
7     S
8     L
9     L
dtype: category
Categories (6, object): ['XS' < 'S' < 'M' < 'L' < 'XL' < 'XXL']

## Exercise 10 - ordered parameter

In [19]:
divisions = pd.Series(pd.Categorical(
    
    values=["C", "C", "A", "B", "A", "C", "A"], 
    categories=["C", "B", "A"], 
    ordered=True
    
))

divisions

0    C
1    C
2    A
3    B
4    A
5    C
6    A
dtype: category
Categories (3, object): ['C' < 'B' < 'A']

## Exercise 11 - rename categories

In [20]:
divisions = divisions.cat.rename_categories(["group C", "group B", "group A"])

divisions

0    group C
1    group C
2    group A
3    group B
4    group A
5    group C
6    group A
dtype: category
Categories (3, object): ['group C' < 'group B' < 'group A']

## Exercise 12 - in a DataFrame

In [21]:
import numpy as np

cars = pd.DataFrame({
    
    "id": np.arange(1, 100001),
    "brand": ["Ford", "Toyota", "BMW", "Tesla"] * 25000,
    "price": np.random.randint(10000, 20000, size=100000)
    
})

cars.head()

Unnamed: 0,id,brand,price
0,1,Ford,10433
1,2,Toyota,17735
2,3,BMW,18637
3,4,Tesla,17007
4,5,Ford,15849


In [22]:
cars.dtypes

id        int64
brand    object
price     int64
dtype: object

In [23]:
cars["brand_categorical"] = cars["brand"].astype("category")

cars.head()

Unnamed: 0,id,brand,price,brand_categorical
0,1,Ford,10433,Ford
1,2,Toyota,17735,Toyota
2,3,BMW,18637,BMW
3,4,Tesla,17007,Tesla
4,5,Ford,15849,Ford


In [24]:
cars.dtypes

id                      int64
brand                  object
price                   int64
brand_categorical    category
dtype: object

## Exercise 13 - CategoricalDtype

In [25]:
cars["brand_categorical"] = cars["brand"].astype(pd.CategoricalDtype())

cars.head()

Unnamed: 0,id,brand,price,brand_categorical
0,1,Ford,10433,Ford
1,2,Toyota,17735,Toyota
2,3,BMW,18637,BMW
3,4,Tesla,17007,Tesla
4,5,Ford,15849,Ford


In [26]:
cars["brand_categorical"]

0          Ford
1        Toyota
2           BMW
3         Tesla
4          Ford
          ...  
99995     Tesla
99996      Ford
99997    Toyota
99998       BMW
99999     Tesla
Name: brand_categorical, Length: 100000, dtype: category
Categories (4, object): ['BMW', 'Ford', 'Tesla', 'Toyota']

## Exercise 14 - ordered parameter

In [27]:
cars["brand_categorical"] = cars["brand"].astype(pd.CategoricalDtype(ordered=True))

cars.head()

Unnamed: 0,id,brand,price,brand_categorical
0,1,Ford,10433,Ford
1,2,Toyota,17735,Toyota
2,3,BMW,18637,BMW
3,4,Tesla,17007,Tesla
4,5,Ford,15849,Ford


In [28]:
cars["brand_categorical"]

0          Ford
1        Toyota
2           BMW
3         Tesla
4          Ford
          ...  
99995     Tesla
99996      Ford
99997    Toyota
99998       BMW
99999     Tesla
Name: brand_categorical, Length: 100000, dtype: category
Categories (4, object): ['BMW' < 'Ford' < 'Tesla' < 'Toyota']

## Exercise 15 - as_unordered

In [29]:
cars["brand_categorical"] = cars["brand_categorical"].cat.as_unordered()

cars["brand_categorical"]

0          Ford
1        Toyota
2           BMW
3         Tesla
4          Ford
          ...  
99995     Tesla
99996      Ford
99997    Toyota
99998       BMW
99999     Tesla
Name: brand_categorical, Length: 100000, dtype: category
Categories (4, object): ['BMW', 'Ford', 'Tesla', 'Toyota']

## Exercise 16 - memory usage

* Memory usage in bytes for object vs category

In [30]:
cars.memory_usage()

Index                   128
id                   800000
brand                800000
price                800000
brand_categorical    100204
dtype: int64