In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage

In [2]:
# Register the rpy2 IPython extension (no cell visible in this section uses it).
%load_ext rpy2.ipython
# Register memory_profiler's IPython extension; the %%memit cell magic used
# further down is provided by it.
%load_ext memory_profiler

# Changing `dtype` of your data

In [3]:
%%time
%%memit
# Baseline: a plain pandas read with no extra options, which loads the entire
# CSV into memory at once. %%time and %%memit report what that costs.
df = pd.read_csv("../data/processed/combined_data.csv")

peak memory: 2584.82 MiB, increment: 2441.97 MiB
CPU times: user 51.5 s, sys: 17.2 s, total: 1min 8s
Wall time: 1min 13s


In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13
1,1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0
2,2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0
3,3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0
4,4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658


In [5]:
# Show the dtype pandas assigned to each column.
df.dtypes

Unnamed: 0         int64
time              object
lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
dtype: object

In [6]:
# Compare the footprint of the five numeric columns as they are now and after
# casting to float32. The cast uses the default errors='raise' on purpose:
# with errors='ignore' a failed conversion returns the original data, and the
# "float32" line would then silently report the un-cast figure.
float_cols = ['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']
print(f"Memory usage with float64: {df[float_cols].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with float32: {df[float_cols].astype('float32').memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64: 2498.71 MB
Memory usage with float32: 1249.36 MB


In [7]:
# Downcast `lat_min`, `lat_max`, `lon_min`, `lon_max` and `rain (mm/day)` to float32.
# The default errors='raise' is kept on purpose so that a failed cast is
# reported instead of silently leaving the columns unchanged.
float_cols = ['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']
df[float_cols] = df[float_cols].astype('float32')
# Display the dtypes to confirm the cast took effect.
df.dtypes

Unnamed: 0         int64
time              object
lat_min          float32
lat_max          float32
lon_min          float32
lon_max          float32
rain (mm/day)    float32
dtype: object

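**The comments:** Casting the five numeric columns from float64 to float32 halves their memory usage (2498.71 MB → 1249.36 MB); the trade-off is lower floating-point precision.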

# Load just the columns we want

In [8]:
# Read the per-column byte counts once from the existing frame instead of
# selecting columns with df[[...]] first, which can materialise a copy of a
# multi-GB frame just to measure it. The Series returned by memory_usage()
# includes an 'Index' entry, so both totals still count the index.
# NOTE(review): assumes `df` holds exactly the seven columns listed by
# df.dtypes above, so the overall sum equals the "all columns" figure.
bytes_per_column = df.memory_usage()
print(f"Memory usage with all columns: {bytes_per_column.sum() / 1e6:.2f} MB")
print(f"Memory usage without dropped columns: {bytes_per_column.drop('Unnamed: 0').sum() / 1e6:.2f} MB")

Memory usage with all columns: 2248.84 MB
Memory usage without dropped columns: 1749.10 MB


In [12]:
# Remove the `Unnamed: 0` column from the frame.
df = df.drop(columns=['Unnamed: 0'])

In [13]:
# Preview the first five rows of the frame again.
df.head(n=5)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658


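**The comments:** Dropping the `Unnamed: 0` column lowers the frame's memory usage from 2248.84 MB to 1749.10 MB. Passing `usecols` to `pd.read_csv` would avoid loading the unwanted column in the first place.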