dropping columns

In [9]:
import pandas as pd
# Load the dataset and record its deep memory footprint before dropping anything.
df = pd.read_csv("adult_income.csv")
original = df.memory_usage(deep=True).sum()

# Drop a single column and measure again to see how much memory it accounted for.
df = df.drop('native-country', axis='columns')
current = df.memory_usage(deep=True).sum()

print(original, current)

31272917 27887832


downcasting data loss example

In [12]:
import pandas as pd
import numpy as np

# Ten counts running from 1000 down to 100 in steps of 100, using NumPy's default integer dtype.
df = pd.DataFrame({'counts': np.arange(1000, 0, -100)})
df['counts'].tolist()


[1000, 900, 800, 700, 600, 500, 400, 300, 200, 100]

In [14]:
# Deliberately cast to a type too narrow for the data to demonstrate the silent data loss.
df['counts'] = df.counts.astype(np.int8)

The values are changed (the cast to int8 overflows and wraps around) without any warning.

In [17]:
# Show the column after the int8 cast.
df['counts'].tolist()

[-24, -124, 32, -68, 88, -12, -112, 44, -56, 100]

A proper way to set value types

In [21]:
# Build the values with the target dtype up front; int16 is wide enough for 1000.
df['counts'] = np.arange(start=1000, stop=0, step=-100, dtype=np.int16)
df['counts'].tolist()

[1000, 900, 800, 700, 600, 500, 400, 300, 200, 100]

Setting the dtype when the array is created will result in a warning if the data type is too small to accommodate the values.

In [23]:
# Same construction as the int16 cell, but int8 is too small for 1000; the recorded
# output below shows NumPy emitting a warning about the overflowing cast.
df['counts'] = np.arange(1000, 0, -100, dtype=np.int8)


For the old behavior, usually:
    np.array(value).astype(dtype)
will give the desired result (the cast overflows).
  df['counts'] = np.arange(1000, 0, -100, dtype=np.int8)
For the old behavior, usually:
    np.array(value).astype(dtype)
will give the desired result (the cast overflows).
  df['counts'] = np.arange(1000, 0, -100, dtype=np.int8)


automatic downcasting function

In [94]:
# Reload the dataset from disk and list each column's dtype and non-null count.
df = pd.read_csv(filepath_or_buffer="adult_income.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


get total memory usage

In [97]:
# Deep usage also counts the Python string objects held by the object columns.
df.memory_usage(index=True, deep=True).sum()

31272917

Get the total memory usage of the integer columns that the downcast below will affect.

In [100]:
# Memory used by the six integer columns (and the index) before downcasting.
df[
    ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
].memory_usage().sum()

2344548

downcast_numeric function definition

In [106]:
def downcast_numeric(df, use_unsigned=True):
    """Downcast the integer and float columns of ``df`` in place.

    Each numeric column is replaced by the result of
    ``pd.to_numeric(column, downcast=...)``, which picks the smallest dtype
    able to hold the column's values. Non-numeric columns are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place; nothing is returned.
    use_unsigned : bool, default True
        When True, an integer column that contains no negative value is
        downcast to an unsigned type, and a column that contains at least
        one negative value is downcast to a signed type. When False, every
        integer column is downcast to a signed type.

    Returns
    -------
    None
    """
    int_cols = [col for col in df.columns if pd.api.types.is_integer_dtype(df[col])]
    float_cols = [col for col in df.columns if pd.api.types.is_float_dtype(df[col])]

    for col in int_cols:
        # Each column goes into exactly one bucket: signed as soon as a single
        # negative value is present, unsigned otherwise. (Testing ``>= 0`` with
        # ``any()`` would also catch mixed-sign columns and process them twice.)
        has_negative = (df[col] < 0).any()
        target = 'unsigned' if use_unsigned and not has_negative else 'integer'
        # Assigning column by column replaces the column outright, so the new
        # dtype is kept, and no work is attempted when there are no such columns.
        df[col] = pd.to_numeric(df[col], downcast=target)

    for col in float_cols:
        df[col] = pd.to_numeric(df[col], downcast='float')

In [108]:
# Shrink the numeric columns in place using signed integer types only, then
# list the resulting dtypes.
downcast_numeric(df, False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int8  
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int32 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int8  
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int32 
 11  capital-loss     48842 non-null  int16 
 12  hours-per-week   48842 non-null  int8  
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int16(1), int32(2), int8(3), object(9)
memory usage: 4.0+ MB
None


Total usage

In [111]:
# Deep memory usage of the whole frame after downcasting.
print(df.memory_usage(index=True, deep=True).sum())

29563447


get total memory usage for affected columns only

In [118]:
# Memory used by the same six integer columns (and the index) after downcasting.
print(
    df[['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']]
    .memory_usage()
    .sum()
)

635078
