In [1]:
# Ch02-3 Pandas Memory

In [2]:
# Libraries
import numpy as np 
import pandas as pd 

In [4]:
# Load data
vdata = pd.read_csv("data/2021VAERSDATA.csv.gz", encoding="iso-8859-1", low_memory=False) 
vdata.info(memory_usage="deep") 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753040 entries, 0 to 753039
Data columns (total 35 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   VAERS_ID      753040 non-null  int64  
 1   RECVDATE      753040 non-null  object 
 2   STATE         637152 non-null  object 
 3   AGE_YRS       671951 non-null  float64
 4   CAGE_YR       604330 non-null  float64
 5   CAGE_MO       4304 non-null    float64
 6   SEX           753040 non-null  object 
 7   RPT_DATE      928 non-null     object 
 8   SYMPTOM_TEXT  752463 non-null  object 
 9   DIED          10534 non-null   object 
 10  DATEDIED      9366 non-null    object 
 11  L_THREAT      11113 non-null   object 
 12  ER_VISIT      127 non-null     object 
 13  HOSPITAL      47457 non-null   object 
 14  HOSPDAYS      31192 non-null   float64
 15  X_STAY        378 non-null     object 
 16  DISABLE       11970 non-null   object 
 17  RECOVD        679613 non-null  object 
 18  VAX_

In [5]:
# Inspect the size of each column
for name in vdata.columns:
    col_bytes = vdata[name].memory_usage(index=False, deep=True) 
    col_type = vdata[name].dtype
    print(name, col_type, col_bytes // (1024 ** 2)) 

VAERS_ID int64 5
RECVDATE object 48
STATE object 39
AGE_YRS float64 5
CAGE_YR float64 5
CAGE_MO float64 5
SEX object 41
RPT_DATE object 23
SYMPTOM_TEXT object 496
DIED object 23
DATEDIED object 23
L_THREAT object 23
ER_VISIT object 22
HOSPITAL object 24
HOSPDAYS float64 5
X_STAY object 22
DISABLE object 23
RECOVD object 39
VAX_DATE object 46
ONSET_DATE object 45
NUMDAYS float64 5
LAB_DATA object 53
V_ADMINBY object 43
V_FUNDBY object 23
OTHER_MEDS object 50
CUR_ILL object 33
HISTORY object 50
PRIOR_VAX object 25
SPLTTYPE object 32
FORM_VERS int64 5
TODAYS_DATE object 47
BIRTH_DEFECT object 22
OFC_VISIT object 26
ER_ED_VISIT object 25
ALLERGIES object 36


In [6]:
# Review the Died column
vdata.DIED.memory_usage(index=False, deep=True) 
vdata.DIED.fillna(False).astype(bool).memory_usage(index=False, deep=True) 

753040

In [7]:
# State column
vdata["STATE"] = vdata.STATE.str.upper() 
states = list(vdata["STATE"].unique()) 
vdata["encoded_state"] = vdata.STATE.apply(lambda state: states.index(state)) 
vdata["encoded_state"] = vdata["encoded_state"].astype(np.uint8) 
vdata["STATE"].memory_usage(index=False, deep=True) 
vdata["encoded_state"].memory_usage(index=False, deep=True) 

753040

In [8]:
# Apply optimizations while loading the data
states = list(pd.read_csv("vdata_sample.csv.gz",
    converters={"STATE": lambda state: state.upper()}, 
    usecols=["STATE"] 
)["STATE"].unique()) 

In [9]:
# Skip the symptom_text column
vdata = pd.read_csv("vdata_sample.csv.gz", index_col="VAERS_ID",
    converters={
        "DIED": lambda died: died == "Y", "STATE": lambda state: states.index(state.upper())
    }, usecols=lambda name: name != "SYMPTOM_TEXT")
vdata["STATE"] = vdata["STATE"].astype(np.uint8)
vdata.info(memory_usage="deep") 

  vdata = pd.read_csv("vdata_sample.csv.gz", index_col="VAERS_ID",


<class 'pandas.core.frame.DataFrame'>
Index: 677736 entries, 1184807 to 1034236
Data columns (total 33 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   RECVDATE      677736 non-null  object 
 1   STATE         677736 non-null  uint8  
 2   AGE_YRS       604833 non-null  float64
 3   CAGE_YR       543768 non-null  float64
 4   CAGE_MO       3875 non-null    float64
 5   SEX           677736 non-null  object 
 6   RPT_DATE      824 non-null     object 
 7   DIED          677736 non-null  bool   
 8   DATEDIED      8404 non-null    object 
 9   L_THREAT      10032 non-null   object 
 10  ER_VISIT      114 non-null     object 
 11  HOSPITAL      42625 non-null   object 
 12  HOSPDAYS      28022 non-null   float64
 13  X_STAY        339 non-null     object 
 14  DISABLE       10786 non-null   object 
 15  RECOVD        611676 non-null  object 
 16  VAX_DATE      627531 non-null  object 
 17  ONSET_DATE    618403 non-null  object 
 18  NU

In [None]:
# End of Notebook #