In [1]:
import re
import os
import glob
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
from memory_profiler import memory_usage
import numpy as np

In [2]:
%load_ext rpy2.ipython
%load_ext memory_profiler

# Changing `dtype` of the data

In [3]:
# Load the combined CSV into memory. No `dtype`, `usecols` or `parse_dates`
# arguments are passed, so pandas infers the column types itself (the next
# cell shows float64 for the numeric columns and object for `time`/`model`).
df = pd.read_csv("../data/processed/combined_data.csv")
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [4]:
# Inspect the dtype pandas inferred for each column; these are the baseline
# types that the memory comparisons below start from.
df.dtypes

time              object
lat_min          float64
lat_max          float64
lon_min          float64
lon_max          float64
rain (mm/day)    float64
model             object
dtype: object

In [5]:
def compatible_float64_cols(df):
    """
    Identify the float64 columns of a dataframe whose values fit within
    the float32 range.

    Only the value *range* is checked. float32 keeps roughly 7 significant
    decimal digits, so a column reported here can still lose precision
    when it is cast to float32.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect. It is not modified.

    Returns
    -------
    list
        Labels of the float64 columns, in dataframe order, whose smallest
        and largest non-missing values both lie within
        [np.finfo(np.float32).min, np.finfo(np.float32).max].
        Columns without any non-missing value (empty or all-NaN) are
        included, because NaN is representable in float32. Columns that
        contain +inf or -inf are excluded.

    Examples
    --------
    >>> compatible_float64_cols(pd.DataFrame({"a": [1.0], "b": [1e300]}))
    ['a']
    """
    # Look the float32 limits up once instead of on every iteration.
    float32_limits = np.finfo(np.float32)

    column_list = []
    # select_dtypes already restricts the loop to float64 columns, so no
    # further per-column dtype check is needed.
    for col in df.select_dtypes(include=["float64"]).columns:
        # min()/max() skip NaN by default and return NaN only when the
        # column has no non-missing value at all.
        col_min = df[col].min()
        col_max = df[col].max()

        if np.isnan(col_min) and np.isnan(col_max):
            # Nothing but NaN (or nothing at all): the cast cannot overflow.
            column_list.append(col)
        elif float32_limits.min <= col_min and col_max <= float32_limits.max:
            column_list.append(col)

    return column_list

In [6]:
# Collect the float64 columns whose value range fits in float32; `col_list`
# is reused by the memory comparison and by the cast in the following cells.
col_list = compatible_float64_cols(df)
print("The list of columns that can be changed into the float32 type:", col_list)

The list of columns that can be changed into the float32 type: ['lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)']


In [7]:
# Compare the footprint of the float32-compatible columns before and after the
# cast. The cast is done without `errors='ignore'` so that a failed conversion
# raises instead of silently reporting the float64 size under the float32 label.
print(f"Memory usage with float64 (default): {df[col_list].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with float32: {df[col_list].astype('float32').memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float64 (default): 2498.71 MB
Memory usage with float32: 1249.36 MB


In [8]:
# Compare the reported footprint of the `time` column as object vs datetime64[ns].
# NOTE(review): `memory_usage()` is called without `deep=True`, so for an object
# column it reports only the per-row references, not the Python strings they
# point to; the "object" figure is therefore an underestimate.
# NOTE(review): with `errors='ignore'` a failed conversion returns the original
# object column unchanged, so the second figure does not confirm that the cast
# to datetime64[ns] actually happened. TODO: verify the resulting dtype.
print(f"Memory usage with object (default) for the time column: {df[['time']].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with datetime64 for the time column: {df[['time']].astype('datetime64[ns]', errors='ignore').memory_usage().sum() / 1e6:.2f} MB")

Memory usage with object (default) for the time column: 499.74 MB
Memory usage with datetime64 for the time column: 499.74 MB


In [9]:
# List the distinct values of the `model` column to see how many different
# strings the column actually holds.
df.model.unique()

array(['MPI-ESM-1-2-HAM', 'AWI-ESM-1-1-LR', 'NorESM2-LM', 'ACCESS-CM2',
       'FGOALS-f3-L', 'CMCC-CM2-HR4', 'MRI-ESM2-0', 'GFDL-CM4',
       'BCC-CSM2-MR', 'EC-Earth3-Veg-LR', 'CMCC-ESM2', 'NESM3',
       'MPI-ESM1-2-LR', 'ACCESS-ESM1-5', 'FGOALS-g3', 'INM-CM4-8',
       'MPI-ESM1-2-HR', 'TaiESM1', 'NorESM2-MM', 'CMCC-CM2-SR5',
       'KIOST-ESM', 'INM-CM5-0', 'MIROC6', 'BCC-ESM1', 'GFDL-ESM4',
       'CanESM5', 'SAM0-UNICON'], dtype=object)

In [10]:
# Compare the reported footprint of the `model` column as object vs string.
# NOTE(review): `memory_usage()` is called without `deep=True`, so the object
# figure counts only the per-row references, not the Python strings themselves;
# the two figures are not a measure of the real memory held by the text.
# NOTE(review): `errors='ignore'` would hide a failed conversion by returning
# the original object column. TODO: verify the resulting dtype.
print(f"Memory usage with object (default) for the model column: {df[['model']].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with string for the model column: {df[['model']].astype('string', errors='ignore').memory_usage().sum() / 1e6:.2f} MB")

Memory usage with object (default) for the model column: 499.74 MB
Memory usage with string for the model column: 499.74 MB


In [11]:
# Cast the `lat_min`, `lat_max`, `lon_min`, `lon_max`, and `rain (mm/day)` columns
# (the contents of `col_list`) to float32. `errors='ignore'` is deliberately not
# used: if the cast fails it should raise here rather than leave float64 columns
# in place for the cells below.
# Note: this overwrites the columns of `df`, so every later cell sees float32.
df[col_list] = df[col_list].astype('float32')
df.dtypes

time              object
lat_min          float32
lat_max          float32
lon_min          float32
lon_max          float32
rain (mm/day)    float32
model             object
dtype: object

In [12]:
print(f"Memory usage with float32 (default): {df[['lat_min','lat_max','lon_min', 'lon_max', 'rain (mm/day)']].memory_usage().sum() / 1e6:.2f} MB")

Memory usage with float32 (default): 1249.36 MB


# Load just columns we want

In [13]:
# Footprint of the frame with every column versus the frame without
# `lat_min` and `lon_min`.
all_columns = ['time', 'lat_min', 'lat_max', 'lon_min', 'lon_max', 'rain (mm/day)', 'model']
kept_columns = ['time', 'lat_max', 'lon_max', 'rain (mm/day)', 'model']
print(f"Memory usage with all columns: {df[all_columns].memory_usage().sum() / 1e6:.2f} MB")
print(f"Memory usage with some of the columns: {df[kept_columns].memory_usage().sum() / 1e6:.2f} MB")

Memory usage with all columns: 2248.84 MB
Memory usage with some of the columns: 1749.10 MB


In [14]:
# Report what the `lat_min` and `lon_min` columns cost, then build a new frame
# without them (the original `df` keeps all of its columns).
dropped_columns = ['lat_min', 'lon_min']
print(f"Memory usage of the lat_min and lon_min columns: {df[dropped_columns].memory_usage().sum() / 1e6:.2f} MB")
df_reduced = df.drop(columns=dropped_columns)
df_reduced.head()

Memory usage of the lat_min and lon_min columns: 499.74 MB


Unnamed: 0,time,lat_max,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-33.574619,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-33.574619,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-33.574619,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-33.574619,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-33.574619,143.4375,4.270161e-13,MPI-ESM-1-2-HAM


In [15]:
# Total reported footprint of the reduced frame (all remaining columns plus the index).
print(f"Memory usage of the reduced dataframe: {df_reduced.memory_usage().sum() / 1e6:.2f} MB")

Memory usage of the reduced dataframe: 1749.10 MB


## Our observations

Our team decided to try two approaches to reduce memory usage when performing the EDA using Python:

**1. Change the data types of the columns in the dataframe.**
<br/> There are 2 columns of the `object` type (`time` and `model`) and 5 columns of the `float64` type (`lat_min`, `lat_max`, `lon_min`, `lon_max`, and `rain (mm/day)`).

The `compatible_float64_cols()` function compared the minimum and maximum values of the `lat_min`, `lat_max`, `lon_min`, `lon_max`, and `rain (mm/day)` columns with the minimum and maximum values representable by `float32`, and showed that all five columns fit within the `float32` range, so their type can be changed from `float64` to `float32`. Changing the data type of these columns to `float32` gave a two-fold decrease in memory usage, i.e. a reduction from 2498.71 MB to 1249.36 MB. This is because `float64` stores each value in 8 bytes, whereas `float32` uses 4 bytes; the trade-off is that `float32` covers a smaller range and keeps fewer significant digits (about 7 decimal digits instead of about 15–16).

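Regarding the `time` and `model` columns, `memory_usage()` reported 499.74 MB for each of the columns. Interestingly, changing the data type of the `time` and `model` columns to `datetime64[ns]` and `string` respectively did not change the reported amount of memory. Note, however, that `memory_usage()` was called without `deep=True`: for an `object` column it then counts only the 8-byte reference stored per row, not the Python string objects those references point to. A `datetime64[ns]` value also occupies 8 bytes per row, which is why the reported figures coincide. In addition, the conversions were done with `errors='ignore'`, so a failed conversion would have silently left the column as `object` and produced exactly the same figure.

Based on the above, these measurements do not show that the `datetime64`, `string`, and `object` data types need the same amount of memory for storing data: the real footprint of the `object` columns is underreported, and a comparison using `memory_usage(deep=True)` is needed before drawing conclusions about the text columns. (For `model`, which holds only 27 distinct values, the `category` dtype would be a natural candidate to test, although it was not tried here.) With the measurements we have, the confirmed reduction in memory usage comes from changing the data type of the numeric columns.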

**2. Load only the specific columns we are interested in.**

For the sake of the task, we suppose that now we are not interested in the `lat_min` and `lon_min` columns, i.e. these columns do not play a big role in our prediction problem. Dropping the specified columns reduced the memory usage from 2248.84 MB to 1749.10 MB.

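This shows that keeping only the columns that are necessary for a specific task is an effective way to reduce memory usage when working with big data. Note that here the columns were dropped after the full file had already been loaded; passing `usecols` to `pd.read_csv` would avoid loading them into memory in the first place.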