In [1]:
import pandas as pd
import numpy as np

# Nominal Categorical Values and Computational Efficiency

In [2]:
df = pd.read_csv('http://bit.ly/drinksbycountry')
df.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [8]:
# note the memory usage is 9.1+ kb currently with the use of object columns, which means at least this much
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 9.1+ KB


To get the actual memory usage you can specify deep on memory usage .Now, pandas actually looks at the object columns and figured out that it takes 30.4 kb of memory for this dataframe

In [10]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 30.4 KB


You can also get the memory usage columns without deeper inspection as such

In [15]:
# This is the estimate without looking at the object columns themselves
df.memory_usage()

Index                             80
country                         1544
beer_servings                   1544
spirit_servings                 1544
wine_servings                   1544
total_litres_of_pure_alcohol    1544
continent                       1544
dtype: int64

In [14]:
# this is the memory usage after having actually looked at the object columns
df.memory_usage(deep='True')

Index                              80
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [17]:
print("Total memory usage for this dataframe is: {}kb".format(df.memory_usage(deep=True).sum()))

Total memory usage for this dataframe is: 31176kb


The bottom line is that object columns take a lot more space. This could be a big deal for larger datasets with larger categories using strings. But what if we stored those strings as integers in memory, because as we know, integers are much more space efficient than strings in memory.

In [20]:
print(df.continent.unique())
print('\n')
print(df.country.unique())

['Asia' 'Europe' 'Africa' 'North America' 'South America' 'Oceania']


['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Antigua & Barbuda'
 'Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bhutan' 'Bolivia' 'Bosnia-Herzegovina' 'Botswana' 'Brazil' 'Brunei'
 'Bulgaria' 'Burkina Faso' 'Burundi' "Cote d'Ivoire" 'Cabo Verde'
 'Cambodia' 'Cameroon' 'Canada' 'Central African Republic' 'Chad' 'Chile'
 'China' 'Colombia' 'Comoros' 'Congo' 'Cook Islands' 'Costa Rica'
 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' 'North Korea' 'DR Congo'
 'Denmark' 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji'
 'Finland' 'France' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece'
 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 

From this bit, we can see that we have stored these strings as integers, which takes less memory than strings, and those integers are used to point to a lookup table to decode the values afterwards.

## Continent Feature
We will see that if we cast this column to category, we see that the memory usage is smaller than before

In [25]:
df.continent = df.continent.astype('category')

In [26]:
df.memory_usage(deep=True)

Index                              80
country                         18094
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [27]:
df.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [29]:
# this is the numeric encoding scheme for the category scheme in which we store these values as integers in memory
# at the bottom of this series view, you can see that the stored datatype now is int8
df.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [31]:
# This is the categorical accessor object
df.continent.cat

<pandas.core.categorical.CategoricalAccessor object at 0x7fdf9edbcfd0>

In [32]:
# This is the entire continent series using the actual numerical encoding from the categorizer
df.continent.cat.codes

0      1
1      2
2      0
3      2
4      0
5      3
6      5
7      2
8      4
9      2
10     2
11     3
12     1
13     1
14     3
15     2
16     2
17     3
18     0
19     1
20     5
21     2
22     0
23     5
24     1
25     2
26     0
27     0
28     0
29     0
      ..
163    5
164    0
165    2
166    2
167    1
168    1
169    1
170    2
171    1
172    0
173    4
174    3
175    0
176    1
177    1
178    4
179    0
180    2
181    1
182    2
183    0
184    3
185    5
186    1
187    4
188    5
189    1
190    1
191    0
192    0
Length: 193, dtype: int8

## Country Feature
Let's take a look at the country feature column after we have casted it to a categorical datatype

In [23]:
df.country = df.country.astype('category')
df.memory_usage(deep='True')

Index                              80
country                         18094
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

You see that the memory usage has actually increased for the country column. Why is this? It's because we have created categories for 193 different categories

In [38]:
df.country.head()

0    Afghanistan
1        Albania
2        Algeria
3        Andorra
4         Angola
Name: country, dtype: category
Categories (193, object): [Afghanistan, Albania, Algeria, Andorra, ..., Vietnam, Yemen, Zambia, Zimbabwe]

In [39]:
df.country.cat.categories

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', length=193)

In [35]:
df.country.cat.codes.head()

0    0
1    1
2    2
3    3
4    4
dtype: int16

The bottom line here: Is that you should only use the 'category' datatype on categorical columns when there are ONLY A FEW categories, but lots of data points. This also has the two-fold effect of speeding up computation

Otherwise, it would just be storing a lot of integers to point to a long of strings in memory.

# Ordinal Categorical Values

In [52]:
data = pd.DataFrame(
    {'ID': [100, 101, 102, 103], 
     'quality': ['good', 'bad', 'good', 'excellent']}
                   )
data.head()

Unnamed: 0,ID,quality
0,100,good
1,101,bad
2,102,good
3,103,excellent


In [60]:
# here we state that excellent is better than good which is better than bad, ad we do this from lowest to highest
data.quality = pd.Categorical(data.quality, categories=['bad', 'good', 'excellent'], ordered=True)

In [61]:
data.quality

0         good
1          bad
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [bad < good < excellent]

In [65]:
# Keep in mind we tried to use sort_values on a dataframe object here. 
# There is no keyword argument called 'by' in the dataframe version of this method
print(data.quality.sort_values(by='quality', ascending=False, axis=1))

TypeError: sort_values() got an unexpected keyword argument 'by'

In [66]:
# keep in mind we are using sort_values on a series here
data.quality.sort_values(ascending=False)

3    excellent
2         good
0         good
1          bad
Name: quality, dtype: category
Categories (3, object): [bad < good < excellent]