In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage

In [2]:
# Load the IPython extensions used by this notebook.
# memory_profiler provides the %%memit cell magic used below to report peak memory;
# rpy2.ipython provides the R cell/line magics.
%load_ext rpy2.ipython
%load_ext memory_profiler

# Changing `dtype` of your data

In [3]:
%%time
%%memit
#simple pandas - This is how we do normally, which means we are loading the entire data to the memory
df = pd.read_csv("../data/processed/combined_data.csv")

peak memory: 2893.57 MiB, increment: 2750.25 MiB
CPU times: user 55.2 s, sys: 20.7 s, total: 1min 15s
Wall time: 1min 26s


In [4]:
# Preview the first five rows to check which columns and values were loaded.
df.head(n=5)

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [5]:
# Inspect the dtype pandas inferred for each column; these drive the memory footprint.
df.dtypes

time              object
lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
model             object
dtype: object

In [6]:
# Compare the memory footprint of the five numeric columns as loaded (float64)
# with the footprint of a float32 copy. memory_usage() reports bytes; / 1e6 gives MB.
numeric_cols = ['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']

# Each selection is a temporary that is released as soon as its size has been measured,
# so no extra copy of the data is kept alive in the kernel.
float64_mb = df[numeric_cols].memory_usage().sum() / 1e6

# No errors='ignore' on the cast: with it, a failed conversion would silently return the
# original float64 data and the "float32" figure would be wrong without any warning.
float32_mb = df[numeric_cols].astype('float32').memory_usage().sum() / 1e6

print(f"Memory usage with float64: {float64_mb:.2f} MB")
print(f"Memory usage with float32: {float32_mb:.2f} MB")

Memory usage with float64: 2498.71 MB
Memory usage with float32: 1249.36 MB


In [7]:
# Downcast `lat_min`, `lat_max`, `lon_min`, `lon_max`, and `rain (mm/day)` to float32.
# This modifies `df` itself; the cells below rely on the downcast frame.
float_cols = ['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']

# No errors='ignore': a failed cast should raise here rather than silently leave the
# columns as float64 and invalidate the memory figures computed afterwards.
df[float_cols] = df[float_cols].astype('float32')

# Check the dtypes via df.dtypes (metadata only) so verifying does not copy the data.
assert (df.dtypes[float_cols] == 'float32').all(), "float32 downcast was not applied"

df.dtypes

time              object
lat_min          float32
lat_max          float32
lon_min          float32
lon_max          float32
rain (mm/day)    float32
model             object
dtype: object

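**The comments:** Downcasting the five numeric columns from `float64` to `float32` halves their memory footprint (2498.71 MB → 1249.36 MB), at the cost of reduced floating-point precision (roughly 7 significant decimal digits instead of roughly 16).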

# Load just columns we want

In [8]:
# Compare the reported footprint of every column with that of the columns we plan to keep.
# NOTE(review): memory_usage() is shallow by default, so the object columns ('time', 'model')
# are counted as pointers only; pass deep=True to include the size of the strings themselves.
all_cols = ['time', 'lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)', 'model']
kept_cols = ['time', 'lat_max', 'lon_max', 'rain (mm/day)', 'model']

all_cols_mb = df[all_cols].memory_usage().sum() / 1e6
kept_cols_mb = df[kept_cols].memory_usage().sum() / 1e6

print(f"Memory usage with all columns: {all_cols_mb:.2f} MB")
print(f"Memory usage without some dropped columns: {kept_cols_mb:.2f} MB")

Memory usage with all columns: 2248.84 MB
Memory usage without some dropped columns: 1749.10 MB


In [9]:
# Build a narrower frame without the lower-bound coordinates; `df` itself is left unchanged.
df_reduced = df.drop(columns=['lat_min', 'lon_min'])

In [10]:
# Preview the reduced frame to confirm that only the intended columns remain.
df_reduced.head(n=5)

Unnamed: 0,time,lat_max,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-33.574619,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-33.574619,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-33.574619,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-33.574619,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-33.574619,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


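**The comments:** Dropping `lat_min` and `lon_min` lowers the reported memory usage from 2248.84 MB to 1749.10 MB (about 22% less). Here the columns are dropped after the full file has already been loaded; to avoid loading them at all, pass the wanted columns to `pd.read_csv` via `usecols`.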