### Memory Usage

In [4]:
import pandas as pd

moma = pd.read_csv("moma.csv")

moma.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34558 entries, 0 to 34557
Data columns (total 27 columns):
ExhibitionID              34129 non-null float64
ExhibitionNumber          34558 non-null object
ExhibitionTitle           34558 non-null object
ExhibitionCitationDate    34557 non-null object
ExhibitionBeginDate       34558 non-null object
ExhibitionEndDate         33354 non-null object
ExhibitionSortOrder       34558 non-null float64
ExhibitionURL             34125 non-null object
ExhibitionRole            34424 non-null object
ConstituentID             34044 non-null float64
ConstituentType           34424 non-null object
DisplayName               34424 non-null object
AlphaSort                 34424 non-null object
FirstName                 31499 non-null object
MiddleName                3804 non-null object
LastName                  31998 non-null object
Suffix                    157 non-null object
Institution               2458 non-null object
Nationality               26

* __7.1+ MB__ estimated Memory usage

### How Pandas Represents Values in a Dataframe
* `Datafame` is stored in seperate blocks of each unique datatype
* `BlockManager` class mainains mapping and relations between row, column indexes and the actual blocks, which are `ObjectBlock` class objects

### Retrieve the DataFrame's Underlying BlockManager

In [5]:
moma._data

BlockManager
Items: Index(['ExhibitionID', 'ExhibitionNumber', 'ExhibitionTitle',
       'ExhibitionCitationDate', 'ExhibitionBeginDate', 'ExhibitionEndDate',
       'ExhibitionSortOrder', 'ExhibitionURL', 'ExhibitionRole',
       'ConstituentID', 'ConstituentType', 'DisplayName', 'AlphaSort',
       'FirstName', 'MiddleName', 'LastName', 'Suffix', 'Institution',
       'Nationality', 'ConstituentBeginDate', 'ConstituentEndDate',
       'ArtistBio', 'Gender', 'VIAFID', 'WikidataID', 'ULANID',
       'ConstituentURL'],
      dtype='object')
Axis 1: RangeIndex(start=0, stop=34558, step=1)
FloatBlock: [0, 6, 9, 19, 20, 23, 25], 7 x 34558, dtype: float64
ObjectBlock: [1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 24, 26], 20 x 34558, dtype: object

* The `moma` DataFrame consists of two blocks:
    * `FloatBlock`: Containing 7 float d-type columns ( Numpy Array, Low Memory Usage )
    * `ObjectBlock`: Containing 20 object d-type columns ( Python List, High Memory Usage )
    * 34558 rows each

### Different Memory Footprint of Different Types

In [8]:
# Total Number of Values in a DataFrame
total_size = moma.size

# Total Memory Footprint
total_bytes = moma.size * 8

# 2^20 = 1024*1024 = 1048576 bytes in a megabyte
total_megabytes = total_bytes/1048576

print(total_bytes)
print(total_megabytes)

7464528
7.1187286376953125


### Calculating True Memory Footprint

In [13]:
moma.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34558 entries, 0 to 34557
Data columns (total 27 columns):
ExhibitionID              34129 non-null float64
ExhibitionNumber          34558 non-null object
ExhibitionTitle           34558 non-null object
ExhibitionCitationDate    34557 non-null object
ExhibitionBeginDate       34558 non-null object
ExhibitionEndDate         33354 non-null object
ExhibitionSortOrder       34558 non-null float64
ExhibitionURL             34125 non-null object
ExhibitionRole            34424 non-null object
ConstituentID             34044 non-null float64
ConstituentType           34424 non-null object
DisplayName               34424 non-null object
AlphaSort                 34424 non-null object
FirstName                 31499 non-null object
MiddleName                3804 non-null object
LastName                  31998 non-null object
Suffix                    157 non-null object
Institution               2458 non-null object
Nationality               26

* Actual memory usage is higher __(45.6MB)__ because it includes the contents of the memory addresses that are being pointed to 

### Calculating Deep Memory Footprint of Each Column

In [12]:
moma.memory_usage(deep=True)

Index                          80
ExhibitionID               276464
ExhibitionNumber          2085250
ExhibitionTitle           3333695
ExhibitionCitationDate    3577728
ExhibitionBeginDate       2281851
ExhibitionEndDate         2234872
ExhibitionSortOrder        276464
ExhibitionURL             3494606
ExhibitionRole            2179383
ConstituentID              276464
ConstituentType           2313112
DisplayName               2548428
AlphaSort                 2534329
FirstName                 2104909
MiddleName                1218917
LastName                  2162937
Suffix                    1110333
Institution               1221368
Nationality               1949664
ConstituentBeginDate       276464
ConstituentEndDate         276464
ArtistBio                 3183300
Gender                    1858994
VIAFID                     276464
WikidataID                1821293
ULANID                     276464
ConstituentURL            2677922
dtype: int64

#### Object Columns

In [22]:
obj_cols = moma.select_dtypes(include=['object'])
obj_cols_mem = obj_cols.memory_usage(deep=True)
print("obj_cols_mem:\n\n",obj_cols_mem,"\n")
obj_cols_sum = obj_cols_mem.sum()/1048576
print("obj_cols_sum:\n\n",obj_cols_sum)

obj_cols_mem:

 Index                          80
ExhibitionNumber          2085250
ExhibitionTitle           3333695
ExhibitionCitationDate    3577728
ExhibitionBeginDate       2281851
ExhibitionEndDate         2234872
ExhibitionURL             3494606
ExhibitionRole            2179383
ConstituentType           2313112
DisplayName               2548428
AlphaSort                 2534329
FirstName                 2104909
MiddleName                1218917
LastName                  2162937
Suffix                    1110333
Institution               1221368
Nationality               1949664
ArtistBio                 3183300
Gender                    1858994
WikidataID                1821293
ConstituentURL            2677922
dtype: int64 

obj_cols_sum:

 43.766947746276855


* Object columns take up a total of 43.7MB of memory

#### Float Columns

In [23]:
float_cols = moma.select_dtypes(include=['float'])
float_cols_mem = float_cols.memory_usage(deep=True)
print("float_cols_mem:\n\n",float_cols_mem,"\n")
float_cols_sum = float_cols_mem.sum()/1048576
print("float_cols_sum:\n\n",float_cols_sum)

float_cols_mem:

 Index                       80
ExhibitionID            276464
ExhibitionSortOrder     276464
ConstituentID           276464
ConstituentBeginDate    276464
ConstituentEndDate      276464
VIAFID                  276464
ULANID                  276464
dtype: int64 

float_cols_sum:

 1.845672607421875


* Float columns take up a total of 1.8MB of memory

### Optmizing Integer Columns with Numeric Subtypes

#### int

In [24]:
import numpy as np
int_types = ["int8", "int16", "int32", "int64"]
for it in int_types:
     print(np.iinfo(it))

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------



#### float

In [32]:
import numpy as np
float_types = ["float16", "float32", "float64"]
for flt in float_types:
     print(np.finfo(flt))

Machine parameters for float16
---------------------------------------------------------------
precision =   3   resolution = 1.00040e-03
machep =    -10   eps =        9.76562e-04
negep =     -11   epsneg =     4.88281e-04
minexp =    -14   tiny =       6.10352e-05
maxexp =     16   max =        6.55040e+04
nexp =        5   min =        -max
---------------------------------------------------------------

Machine parameters for float32
---------------------------------------------------------------
precision =   6   resolution = 1.0000000e-06
machep =    -23   eps =        1.1920929e-07
negep =     -24   epsneg =     5.9604645e-08
minexp =   -126   tiny =       1.1754944e-38
maxexp =    128   max =        3.4028235e+38
nexp =        8   min =        -max
---------------------------------------------------------------

Machine parameters for float64
---------------------------------------------------------------
precision =  15   resolution = 1.0000000000000001e-15
machep =    -52   e

### Converting Numeric Subtypes
* Must represent missing values in numeric columns using a float subtype
* NumPy int type doesn't have a missing value object (like NaN for float values).
* Trying to convert a float column that contains missing values to an int column will generate an error.

In [33]:
print(moma.select_dtypes(include=['float']).isnull().sum())

ExhibitionID              429
ExhibitionSortOrder         0
ConstituentID             514
ConstituentBeginDate     9268
ConstituentEndDate      14739
VIAFID                   7562
ULANID                  12870
dtype: int64


### Optimizing Integer Columns with Subtypes

In [35]:
import numpy as np
col_max = moma['ExhibitionSortOrder'].max()
col_min = moma['ExhibitionSortOrder'].min()
print("col_max:",col_max)
print("col_min:",col_min)


if col_max <  np.iinfo("int8").max and col_min > np.iinfo("int8").min:
    moma['ExhibitionSortOrder'] = moma['ExhibitionSortOrder'].astype("int8")
elif col_max <  np.iinfo("int16").max and col_min > np.iinfo("int16").min:
    moma['ExhibitionSortOrder'] = moma['ExhibitionSortOrder'].astype("int16")
elif col_max <  np.iinfo("int32").max and col_min > np.iinfo("int32").min:
    moma['ExhibitionSortOrder'] = moma['ExhibitionSortOrder'].astype("int32")
elif col_max <  np.iinfo("int64").max and col_min > np.iinfo("int64").min:
    moma['ExhibitionSortOrder'] = moma['ExhibitionSortOrder'].astype("int64")
print("Chosen dtype:",moma['ExhibitionSortOrder'].dtype)
print("Column Memory Usage:", moma['ExhibitionSortOrder'].memory_usage(deep=True))

col_max: 1768
col_min: 1
Chosen dtype: int16
Column Memory Usage: 69196


### Automated Downcasting

In [42]:
moma = pd.read_csv("moma.csv")

# Convert column to Numeric, downcasting can only be done on numeric columns
moma['ExhibitionSortOrder'] = moma['ExhibitionSortOrder'].astype('int')

# Print original defaul dtype
print(moma['ExhibitionSortOrder'].dtype)

# Downcast
moma['ExhibitionSortOrder'] = pd.to_numeric(moma['ExhibitionSortOrder'],
                                            downcast='integer'
                                           )

# Print new downcasted dtype
print(moma['ExhibitionSortOrder'].dtype)

int32
int16


In [39]:
moma = pd.read_csv("moma.csv")

float_cols = moma.select_dtypes(include=['float'])
print(float_cols.dtypes)
print()

moma['ExhibitionSortOrder'] = moma['ExhibitionSortOrder'].astype("int16")

float_cols = moma.select_dtypes(include=['float'])
print(float_cols.dtypes)
print()

for col in float_cols.columns:
    moma[col] = pd.to_numeric(moma[col], downcast='float')
print(moma.select_dtypes(include=['float']).dtypes)

ExhibitionID            float64
ExhibitionSortOrder     float64
ConstituentID           float64
ConstituentBeginDate    float64
ConstituentEndDate      float64
VIAFID                  float64
ULANID                  float64
dtype: object

ExhibitionID            float64
ConstituentID           float64
ConstituentBeginDate    float64
ConstituentEndDate      float64
VIAFID                  float64
ULANID                  float64
dtype: object

Series([], dtype: object)
