In [23]:
import pandas as pd
from pandas.api.types import CategoricalDtype

In [24]:
# load the drinks-by-country dataset from the shortened URL into a DataFrame
drinks = pd.read_csv(filepath_or_buffer='http://bit.ly/drinksbycountry')

In [25]:
# preview the first five rows to see the columns and some sample values
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [26]:
drinks.info()
# info() summarises the frame: index range, column names, non-null counts,
# dtypes and a basic memory figure
# the '+' after the memory figure marks it as a lower bound: for object
# columns only the references are counted, not the strings they point to

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


In [27]:
drinks.info(memory_usage='deep')
# memory_usage='deep' inspects the contents of the object (string) columns,
# so the figure reported is the actual memory usage rather than a lower bound

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [28]:
drinks.memory_usage(deep=True)
# memory usage in bytes for the index and for each column (Series)
# the two object columns, country and continent, account for most of it

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [29]:
drinks.memory_usage(deep=True).sum()
# total memory usage in bytes: the index plus every column
# this is the same quantity that info(memory_usage='deep') reports in KB

31224

In [30]:
sorted(drinks['continent'].unique())
# the continent column holds only a handful of distinct values,
# each repeated many times across the rows

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [31]:
drinks['continent'].head()
# baseline view of the column (dtype: object) to compare against after the conversion

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

In [32]:
drinks['continent'] = drinks['continent'].astype('category')
# category dtype stores each row as a small integer code that points into the
# list of distinct values, which saves space and speeds up computations

In [33]:
drinks.dtypes
# confirm the conversion: continent now has the category dtype,
# while the other columns keep their original dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [34]:
drinks['continent'].head()
# the displayed values look the same, but the dtype is now category
# and the list of categories is shown below the values

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [35]:
drinks['continent'].cat.codes.head()
# the integer codes stored for each row; each code is the position of the
# row's value in the list of categories

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [36]:
drinks.memory_usage(deep=True)
# continent is now stored as 193 small integer codes instead of 193 strings
# its usage dropped from 12332 to 744 bytes, a saving of more than 11000 bytes (about 11 KB)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [37]:
drinks['country'] = drinks['country'].astype('category')
# apply the same conversion to the country column to see whether it helps there too

In [38]:
drinks.memory_usage(deep=True)
# check whether the category dtype actually reduced memory for country
# it did not: usage grew from 12588 to 18094 bytes, because every country is
# a unique value, so all 193 strings are still stored, plus the integer codes

Index                             128
country                         18094
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [39]:
drinks['country'].cat.categories
# 193 categories for 193 rows: a column with no repeated values
# is a poor fit for the category dtype

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',
       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', length=193)

In [40]:
# small example frame used to demonstrate ordered categories
df = pd.DataFrame(
    {
        'ID': [100, 101, 102, 103],
        'quality': ['good', 'very good', 'good', 'excellent'],
    }
)

In [41]:
# display the whole frame; it only has four rows
df

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [42]:
df.sort_values(by='quality')
# quality is still a plain string column here, so the sort is alphabetical:
# 'excellent' comes first, which is not the logical order of the ratings

Unnamed: 0,ID,quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


In [43]:
# CategoricalDtype is imported with the other imports at the top of the notebook.
# The categories are listed from lowest to highest; ordered=True makes that
# order meaningful for sorting and comparisons.
quality_cat = CategoricalDtype(categories=['good', 'very good', 'excellent'], ordered=True)
df['quality'] = df['quality'].astype(quality_cat)
df['quality']
# to get an ordered categorical column: build a CategoricalDtype with the
# categories in the desired order and ordered=True, then pass it to astype()
# the repr shows the resulting order explicitly: good < very good < excellent

0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

In [44]:
df.sort_values(by='quality')
# the sort now follows the order we defined for the categorical data
# (good, then very good, then excellent) instead of alphabetical order

Unnamed: 0,ID,quality
0,100,good
2,102,good
1,101,very good
3,103,excellent


In [45]:
df.loc[df['quality'] > 'good']
# select every row whose quality ranks above 'good'; the comparison works
# because the category is ordered, so pandas compares by rank rather than by text

Unnamed: 0,ID,quality
1,101,very good
3,103,excellent
