<a href="https://colab.research.google.com/github/ShaswataJash/LargeDatasetHandling/blob/master/Demonstration_of_river_ML_to_handle_large_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading Kaggle competition files

In [None]:
!pip install kaggle==1.5.12

In [None]:
%%python

import sys
import logging
import os
import subprocess

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s:%(levelname)s:%(message)s')
logger = logging.getLogger('my_logger')

# Download the competition archive through the kaggle command-line client.
# kaggle.json file should be uploaded to /home location before executing this cell
os.environ["KAGGLE_CONFIG_DIR"] = '/home'
kaggle_write_cmd = "kaggle competitions download -c open-problems-multimodal"

# Only the launch of the subprocess is guarded. The return-code check below
# lives outside the try block so that its own sys.exit() (a SystemExit) is not
# swallowed by the handler and re-reported as an unrelated exception.
try:
    kaggle_write_call = subprocess.run(
        kaggle_write_cmd.split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so a single log entry shows everything
        text=True,
    )
except Exception:
    logger.error("kaggle download related error", exc_info=True)
    sys.stdout.flush()
    sys.exit("Forceful exit as exception encountered while kaggle download")

logger.info(kaggle_write_call.stdout)
if kaggle_write_call.returncode != 0:
    logger.error("Error in kaggle download, errorcode=%s", kaggle_write_call.returncode)
    sys.stdout.flush()
    sys.exit("Forceful exit as kaggle download returned error")

In [None]:
# -p creates missing parent directories and does not fail when the cell is re-run
!mkdir -p /content/drive/MyDrive/colab_exp_result/kaggle_data
!unzip /content/open-problems-multimodal.zip -d /content/drive/MyDrive/colab_exp_result/kaggle_data

We can mount Google Drive in Colab and copy the Kaggle competition files there. This avoids re-running the Kaggle download code every time the notebook starts, which can save a lot of time. Instead, each time we can copy the contents directly from Drive into the local filesystem of the underlying VM hosting the notebook.

In [None]:
!nohup cp /content/drive/MyDrive/colab_exp_result/kaggle_data/* /mnt &

In [None]:
!ls -l /mnt

# Installation of required software packages

In [None]:
!pip install river

In [None]:
!pip install h5py==3.7.0

In [None]:
!pip install hdf5plugin~=2.0

# HDF5 handling common code

In [None]:
def get_hdf5_dataset_value_key(hdf5_file, debug=0):
    """Return the name of the first 2-D dataset found in ``hdf5_file``.

    Every node reachable from ``hdf5_file`` is visited; the first one that is
    a dataset with exactly two dimensions wins.

    Args:
        hdf5_file: An open ``h5py.File`` (or group) to search.
        debug: When greater than 0, print the visited names and, for each
            node, its type and shape.

    Returns:
        The path of the matching dataset relative to ``hdf5_file``, or
        ``None`` when no 2-D dataset exists.
    """
    # Local import keeps the helper usable regardless of cell execution order.
    import h5py

    names = []
    hdf5_file.visit(names.append)  # append returns None, so the walk never stops early
    if debug > 0:
        print(hdf5_file, names)

    for name in names:
        node = hdf5_file[name]  # look the object up once instead of per use
        # Use the public h5py.Dataset alias rather than the private h5py._hl path.
        shape = node.shape if isinstance(node, h5py.Dataset) else None
        if debug > 0:
            print(name, type(node), shape)
        if shape is not None and len(shape) == 2:
            return name

    return None

def get_hdf5_dataset_with_specific_shape(hdf5_file, size, debug=0):
    """Return the name of the first 1-D dataset of length ``size``.

    Every node reachable from ``hdf5_file`` is visited; the first one that is
    a one-dimensional dataset with ``shape[0] == size`` wins.

    Args:
        hdf5_file: An open ``h5py.File`` (or group) to search.
        size: Required length of the 1-D dataset.
        debug: When greater than 0, print the visited names and, for each
            node, its type and shape.

    Returns:
        The path of the matching dataset relative to ``hdf5_file``, or
        ``None`` when nothing matches.
    """
    # Local import keeps the helper usable regardless of cell execution order.
    import h5py

    names = []
    hdf5_file.visit(names.append)  # append returns None, so the walk never stops early
    if debug > 0:
        print(hdf5_file, names)

    for name in names:
        node = hdf5_file[name]  # look the object up once instead of per use
        # Use the public h5py.Dataset alias rather than the private h5py._hl path.
        shape = node.shape if isinstance(node, h5py.Dataset) else None
        if debug > 0:
            print(name, type(node), shape)
        if shape is not None and len(shape) == 1 and shape[0] == size:
            return name

    return None

def get_hdf5_info(hdf5_file):
    """Print an indented outline of the groups and datasets in ``hdf5_file``.

    Groups are printed and then descended into; datasets are printed together
    with their size, byte count, maximum shape and chunk layout. Nodes that
    are neither a group nor a dataset are skipped.

    Args:
        hdf5_file: An open ``h5py.File`` (or group) to describe.
    """
    # Local import keeps the helper usable regardless of cell execution order.
    import h5py

    print('root-group file-object name:', hdf5_file.name)

    def print_keys(gr, level):
        """Print the children of ``gr``, prefixed by ``level`` arrows."""
        prefix = '->' * level
        for k in list(gr.keys()):
            node = gr[k]  # resolve the child once rather than on every attribute access
            # Public h5py.Group / h5py.Dataset aliases replace the private h5py._hl paths.
            if isinstance(node, h5py.Group):
                print(prefix, k, node)
                print_keys(node, level + 1)
            elif isinstance(node, h5py.Dataset):
                print(prefix, k, node, 'size=', node.size, 'nbytes=', node.nbytes,
                      'maxshape=', node.maxshape, 'chunks=', node.chunks)

    print_keys(hdf5_file, 1)



In [None]:
import h5py
import hdf5plugin #without importing this, decompression will not happen by h5py
# Open read-only just to print the file layout; the context manager closes the
# file even when get_hdf5_info raises.
with h5py.File('/mnt/train_multi_inputs.h5', 'r') as hdf5_input_file: # HDF5 file
    get_hdf5_info(hdf5_input_file)

In [None]:
import h5py
import hdf5plugin #without importing this, decompression will not happen by h5py
# The file is intentionally left open: `d` refers to the on-disk array and is
# read lazily by later cells.
hdf5_input_file = h5py.File('/mnt/train_multi_inputs.h5', 'r') # HDF5 file
hdf5_input_key = get_hdf5_dataset_value_key(hdf5_input_file, debug=1)
if hdf5_input_key is None:
    # The helper returns None when no 2-D dataset exists; fail with a clear message.
    raise KeyError('no 2-D dataset found in /mnt/train_multi_inputs.h5')
d = hdf5_input_file[hdf5_input_key]          # Pointer on on-disk array
print('shape:', d.shape, 'dtype:', d.dtype)  # d can be very large

In [None]:
# The column names live in the 1-D dataset whose length equals the number of
# columns of `d`; derive that length from `d` instead of hard-coding it.
n_input_cols = d.shape[1]
hdf5_col_name_key = get_hdf5_dataset_with_specific_shape(hdf5_input_file, n_input_cols, debug=1)
if hdf5_col_name_key is None:
    raise KeyError('no 1-D dataset of length %d found for the input column names' % n_input_cols)
cols = hdf5_input_file[hdf5_col_name_key]
print(cols.shape)
from tqdm import tqdm
# cols[:] loads all names with one bulk read; indexing the dataset element by
# element would issue one HDF5 read per name.
col_name = [str(raw_name, 'UTF-8') for raw_name in tqdm(cols[:])]

In [None]:
import h5py
import hdf5plugin #without importing this, decompression will not happen by h5py
# The file is intentionally left open: `d_target` refers to the on-disk array
# and is read lazily by later cells.
hdf5_target_file = h5py.File('/mnt/train_multi_targets.h5', 'r') # HDF5 file
hdf5_target_key = get_hdf5_dataset_value_key(hdf5_target_file, debug=1)
if hdf5_target_key is None:
    # The helper returns None when no 2-D dataset exists; fail with a clear message.
    raise KeyError('no 2-D dataset found in /mnt/train_multi_targets.h5')
d_target = hdf5_target_file[hdf5_target_key]          # Pointer on on-disk array
print('shape:', d_target.shape, 'dtype:', d_target.dtype)  # d_target can be very large

In [None]:
# The target names live in the 1-D dataset whose length equals the number of
# columns of `d_target`; derive that length instead of hard-coding it.
n_target_cols = d_target.shape[1]
hdf5_target_col_name_key = get_hdf5_dataset_with_specific_shape(hdf5_target_file, n_target_cols, debug=1)
if hdf5_target_col_name_key is None:
    raise KeyError('no 1-D dataset of length %d found for the target column names' % n_target_cols)
target_cols = hdf5_target_file[hdf5_target_col_name_key]
print(target_cols.shape)
from tqdm import tqdm
# target_cols[:] loads all names with one bulk read; indexing the dataset
# element by element would issue one HDF5 read per name.
target_col_name = [str(raw_name, 'UTF-8') for raw_name in tqdm(target_cols[:])]

# Using river-ML to demonstrate online ML

(River-ML can take a significantly long time to train on a large dataset, as it needs to loop frequently over Python dictionaries - note that river-ML expects, as well as internally maintains, dictionaries for records. However, it is remarkably conservative in terms of required RAM.)

In [None]:
from river import stream
# Stream of (features, targets) pairs built from the on-disk arrays `d` and
# `d_target`, with entries named by `col_name` / `target_col_name`.
# NOTE(review): stream.iter_array appears to return a one-shot iterator - re-run
# this cell before looping over `dataset` a second time; confirm against river docs.
dataset = stream.iter_array(X=d, y=d_target, feature_names=col_name, target_names=target_col_name)

In [None]:
from tqdm import tqdm
from river import preprocessing
import pandas as pd
import pickle

SCALER_PATH = '/content/drive/MyDrive/colab_exp_result/standard_scaler.pkl'
SCALER_BATCH_SIZE = 128


def _learn_batch_and_checkpoint(scaler, batch):
    """Update ``scaler`` with the rows in ``batch``, pickle it, then empty ``batch``."""
    df_input = pd.DataFrame(batch)
    scaler.learn_many(df_input)
    # Checkpoint after every batch so a disconnected session loses little work.
    with open(SCALER_PATH, 'wb') as f:
        pickle.dump(scaler, f)
    batch.clear()


mini_batch_input = []
standard_scaler = preprocessing.StandardScaler()
for X, _ in tqdm(dataset):
    mini_batch_input.append(X)
    # Flush on the batch length so every batch (including the first) has
    # exactly SCALER_BATCH_SIZE rows.
    if len(mini_batch_input) >= SCALER_BATCH_SIZE:
        _learn_batch_and_checkpoint(standard_scaler, mini_batch_input)

# Rows left over after the last full batch.
if mini_batch_input:
    _learn_batch_and_checkpoint(standard_scaler, mini_batch_input)

# The fitted scaler is reloaded from SCALER_PATH by the next cell.
del standard_scaler
del mini_batch_input

86271it [7:12:56, 13.69it/s]

In [None]:
import pickle

from tqdm import tqdm  # imported here so the cell does not depend on an earlier cell's import

MODEL_PATH = '/content/drive/MyDrive/colab_exp_result/iSOUPTreeRegressor_model.pkl'
CHECKPOINT_EVERY = 20  # number of learned samples between two model checkpoints

# load
# NOTE: this pickle is written by this notebook itself; never unpickle files
# coming from an untrusted source.
with open('/content/drive/MyDrive/colab_exp_result/standard_scaler.pkl', 'rb') as f:
    standard_scaler = pickle.load(f)

#Ref:https://riverml.xyz/0.14.0/recipes/on-hoeffding-trees/

from river import stream
from river import tree
from river import metrics
from river import evaluate

model = tree.iSOUPTreeRegressor(
    max_size=2048,
    memory_estimate_period=10,
    stop_mem_management=True,
    remove_poor_attrs=True
)

# Rebuild the stream: the scaler-fitting cell has already iterated over the
# earlier `dataset`, and iter_array yields a one-shot iterator (per the river
# docs), so reusing it here would train on no samples at all.
dataset = stream.iter_array(X=d, y=d_target, feature_names=col_name, target_names=target_col_name)

for iter_count, (X, y) in tqdm(enumerate(dataset)):
    X_t = standard_scaler.transform_one(X)
    model.learn_one(X_t, y)
    # Checkpoint once every CHECKPOINT_EVERY samples. The previous truthiness
    # test (`iter_count % 20`) fired on 19 out of every 20 iterations instead.
    if (iter_count + 1) % CHECKPOINT_EVERY == 0:
        with open(MODEL_PATH, 'wb') as f:
            pickle.dump(model, f)

# Final save so the samples learned after the last checkpoint are not lost.
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model, f)