In [1]:
# Import Libraries
import os
import psutil
import time
import cudf
import pandas as pd
import numpy as np
from datetime import datetime

import xgboost as xgb
import lightgbm as lgb

In [2]:
# Load Metadata and Climate Data from GHCND
metadata = pd.read_fwf('https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt', header= None)
metadata.columns = ['station_id', 'latitude', 'longitude', 'elevation', 'name', 'hcn_crn_flag', 'gsn_flag', 'wmo_id']
metadata = metadata[['station_id', 'latitude', 'longitude', 'elevation', 'name']]
metadata = cudf.DataFrame(metadata)

temp_data = cudf.read_csv('/datasets/weather_decomp/2000.csv',header=None) #Downloaded from https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/
temp_data.columns = ['station_id', 'date', 'type', 'value', 'mflag', 'qflag', 'sflag', 'unique_id']

In [3]:
# Pre-processing
req_cols = ['station_id', 'latitude', 'longitude', 'elevation', 'date', 'name', 'type', 'sflag', 'value']
data = cudf.merge(temp_data, metadata, on='station_id', how='left')
data = data[req_cols]
data.dropna(inplace=True)

# Feature Extraction
dates = cudf.to_datetime(data['date'].astype(str), format='%Y%m%d')
dates = cudf.DatetimeIndex(dates)
data['month'] = dates.month
data['day'] = dates.day

# Extract country code
data['country_code'] = data['station_id'].astype(str).str[:2]

# Delete metadata and climate data
del temp_data
del metadata

# Feature Selection
features = ['country_code', 'sflag', 'month', 'type', 'day', 'latitude', 'longitude', 'elevation']
target = 'value'

In [4]:
# COnverting dtypes to categorical
data['country_code'] = data['country_code'].astype('category')
data['month'] = data['month'].astype('category')
data['day'] = data['day'].astype('category')
data['type'] = data['type'].astype('category')
data['sflag'] = data['sflag'].astype('category')

In [5]:
# Train the model
model_params = {'objective': 'reg:linear', 
                'learning_rate': 0.1, 
                'colsample_bytree' : 0.3, 
                'max_depth': 5, 
                'n_estimators':10, 
                'alpha' : 10, 
                'silent': True, 
                'verbose_eval': True, 
                'tree_method':'gpu_hist'}

# Create train and test dmatrix
dtrain = xgb.DMatrix(data[features], data[target], enable_categorical=True)

boost_rounds = [50, 100, 200, 350, 500]

latency_dict ={key: list() for key in ['time', 'iterations', 'memory']}

for k in boost_rounds:
    model_params['n_estimators'] = k
    t = time.time()

    trained_model = xgb.train(model_params, dtrain, num_boost_round=k )   #, evals=[(dtrain, 'train')]
    latency_dict['time'].append(time.time()-t)
    latency_dict['iterations'].append(k)
    latency_dict['memory'].append(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)

    del trained_model

Parameters: { "n_estimators", "silent", "verbose_eval" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators", "silent", "verbose_eval" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators", "silent", "verbose_eval" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an i

In [17]:
latency_dict['time']

[6.13974928855896,
 9.741299867630005,
 19.455682277679443,
 33.842817306518555,
 48.2730872631073]

In [16]:
print("Average memory usage on 3.2 million records: {}MB".format(np.mean(latency_dict['memory'])))

Average memory usage on 3.2 million records: 1830.99375MB


## Light Gradient Boost

In [18]:
# Training
model_params = {'objective':'regression',
                 'learning_rate': 0.1, 
                 'feature_fraction': 0.3, 
                 'max_depth': 5, 
                 'verbose': -1, 
                 'n_estimators':10, 
                 'device': 'gpu', 
                 'gpu_platform_id':'3', 
                 'gpu_device_id':'3'
                 }

lgb_train = lgb.Dataset(data[features].to_pandas(), data[target].to_pandas()) # LightGBM don't support cudf dataframes

lgb_latency_dict ={key: list() for key in ['time','iterations','memory']}

for k in boost_rounds:
    model_params['n_estimators'] = k
    t = time.time()

    trained_model = lgb.train(model_params, lgb_train, num_boost_round = k)   #, evals=[(dtrain, 'train')]
    lgb_latency_dict['time'].append(time.time()-t)
    lgb_latency_dict['iterations'].append(k)
    lgb_latency_dict['memory'].append(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)

    del trained_model



In [20]:
lgb_latency_dict['time']

[19.37099003791809,
 29.78584051132202,
 54.16922187805176,
 98.2325119972229,
 130.54339241981506]

In [22]:
print("Average memory usage on 3.2 million records: {}MB".format(np.mean(lgb_latency_dict['memory'])))

Average memory usage on 3.2 million records: 2364.42578125MB
