### Installing

In [1]:
# use GPU
!nvidia-smi

Wed Mar 25 21:57:11 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P8     6W /  75W |      0MiB /  7611MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null  # install jdk
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz # install spark package from apache website

In [0]:
!tar xf spark-2.4.5-bin-hadoop2.7.tgz # unzip the downloaded spark package

In [0]:
!pip install -q findspark # findspark will find the spark and initialize spark environment

In [3]:
!pip install pyspark



In [18]:
# download XGBoost components
!wget https://repo1.maven.org/maven2/ai/rapids/cudf/0.9.2/cudf-0.9.2-cuda10-1.jar

--2020-03-25 21:03:30--  https://repo1.maven.org/maven2/ai/rapids/cudf/0.9.2/cudf-0.9.2-cuda10-1.jar
Resolving repo1.maven.org (repo1.maven.org)... 151.101.196.209
Connecting to repo1.maven.org (repo1.maven.org)|151.101.196.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64759406 (62M) [application/java-archive]
Saving to: ‘cudf-0.9.2-cuda10-1.jar.1’


2020-03-25 21:03:30 (254 MB/s) - ‘cudf-0.9.2-cuda10-1.jar.1’ saved [64759406/64759406]



In [19]:
!wget https://repo1.maven.org/maven2/ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar

--2020-03-25 21:03:33--  https://repo1.maven.org/maven2/ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar
Resolving repo1.maven.org (repo1.maven.org)... 151.101.196.209
Connecting to repo1.maven.org (repo1.maven.org)|151.101.196.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 349225621 (333M) [application/java-archive]
Saving to: ‘xgboost4j_2.x-1.0.0-Beta5.jar.1’


2020-03-25 21:03:34 (262 MB/s) - ‘xgboost4j_2.x-1.0.0-Beta5.jar.1’ saved [349225621/349225621]



In [20]:
!wget https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar

--2020-03-25 21:03:36--  https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar
Resolving repo1.maven.org (repo1.maven.org)... 151.101.196.209
Connecting to repo1.maven.org (repo1.maven.org)|151.101.196.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4829218 (4.6M) [application/java-archive]
Saving to: ‘xgboost4j-spark_2.x-1.0.0-Beta5.jar.1’


2020-03-25 21:03:36 (188 MB/s) - ‘xgboost4j-spark_2.x-1.0.0-Beta5.jar.1’ saved [4829218/4829218]



In [4]:
!ls

covtype_test.parquet	 spark-2.4.5-bin-hadoop2.7
covtype_train.parquet	 spark-2.4.5-bin-hadoop2.7.tgz
cudf-0.9.2-cuda10-1.jar  xgboost4j_2.x-1.0.0-Beta5.jar
sample_data		 xgboost4j-spark_2.x-1.0.0-Beta5.jar


In [5]:
!pwd

/content


### Setting up

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

In [0]:
# There is a change in python/pyspark/java_gateway.py , which requires PYSPARK_SUBMIT_ARGS includes pyspark-shell if a PYSPARK_SUBMIT_ARGS variable is set by a user.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--master local[2] pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /content/cudf-0.9.2-cuda10-1.jar,/content/xgboost4j_2.x-1.0.0-Beta5.jar,/content/xgboost4j-spark_2.x-1.0.0-Beta5.jar, pyspark-shell'

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.executor.memory", "4g")
spark.conf.set("spark.driver.memory", "4g")

In [0]:
spark.sparkContext.addPyFile('/content/xgboost4j-spark_2.x-1.0.0-Beta5.jar')

In [0]:
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import numpy as np
import pandas as pd

### Load the dataset

In [0]:
from sklearn.datasets import fetch_openml
covtype = fetch_openml(name='covertype', version=4)

In [12]:
covtype.data.shape

(581012, 54)

In [13]:
np.unique(covtype.target) # unique target data

array(['1', '2', '3', '4', '5', '6', '7'], dtype=object)

### Data Transformation

In [14]:
# converting data into pandas dataframe
column_names = covtype['feature_names'] + ['target']
data = np.c_[covtype['data'], covtype['target']]

covtype_df = pd.DataFrame(data=data, columns=column_names)
covtype_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,target
0,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [15]:
covtype_df.memory_usage(index=True).sum()

255645408

In [16]:
covtype_df.shape

(581012, 55)

In [17]:
covtype_df.target.value_counts()

2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: target, dtype: int64

In [18]:
covtype_df.dtypes

Elevation                             object
Aspect                                object
Slope                                 object
Horizontal_Distance_To_Hydrology      object
Vertical_Distance_To_Hydrology        object
Horizontal_Distance_To_Roadways       object
Hillshade_9am                         object
Hillshade_Noon                        object
Hillshade_3pm                         object
Horizontal_Distance_To_Fire_Points    object
Wilderness_Area1                      object
Wilderness_Area2                      object
Wilderness_Area3                      object
Wilderness_Area4                      object
Soil_Type1                            object
Soil_Type2                            object
Soil_Type3                            object
Soil_Type4                            object
Soil_Type5                            object
Soil_Type6                            object
Soil_Type7                            object
Soil_Type8                            object
Soil_Type9

In [0]:
# converting datatypes from object to the numeric 
for col in covtype_df.columns:
  covtype_df[col] = pd.to_numeric(covtype_df[col])

In [0]:
# re construct the target values
covtype_df['target'] = covtype_df['target'] - 1

In [21]:
covtype_df.dtypes

Elevation                             float64
Aspect                                float64
Slope                                 float64
Horizontal_Distance_To_Hydrology      float64
Vertical_Distance_To_Hydrology        float64
Horizontal_Distance_To_Roadways       float64
Hillshade_9am                         float64
Hillshade_Noon                        float64
Hillshade_3pm                         float64
Horizontal_Distance_To_Fire_Points    float64
Wilderness_Area1                      float64
Wilderness_Area2                      float64
Wilderness_Area3                      float64
Wilderness_Area4                      float64
Soil_Type1                            float64
Soil_Type2                            float64
Soil_Type3                            float64
Soil_Type4                            float64
Soil_Type5                            float64
Soil_Type6                            float64
Soil_Type7                            float64
Soil_Type8                        

### Splitting the dataset

In [0]:
train_df = covtype_df.sample(frac=0.8, random_state=10)
test_df = covtype_df.drop(train_df.index)

In [23]:
(train_df.shape, test_df.shape)

((464810, 55), (116202, 55))

### Save to file

In [0]:
train_df.to_parquet(fname='covtype_train.parquet', compression='snappy', index=False)
test_df.to_parquet(fname='covtype_test.parquet', compression='snappy', index=False)

In [25]:
!ls

covtype_test.parquet	 spark-2.4.5-bin-hadoop2.7
covtype_train.parquet	 spark-2.4.5-bin-hadoop2.7.tgz
cudf-0.9.2-cuda10-1.jar  xgboost4j_2.x-1.0.0-Beta5.jar
sample_data		 xgboost4j-spark_2.x-1.0.0-Beta5.jar


### Load the saved file

In [0]:
import pyarrow.parquet as pq

In [0]:
pq_table = pq.read_table('covtype_train.parquet')

In [0]:
# Load the data in GPU Memory from Local Memory
train_data = GpuDataReader(spark).format('parquet').load('/content/covtype_train.parquet')
test_data = GpuDataReader(spark).format('parquet').load('covtype_test.parquet')

In [33]:
train_data.schema

<bound method GpuDataset.schema of <ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset object at 0x7fe201ab5828>>

In [34]:
# seperate features and lables in the train dataset
label = "target"
features = [col for col in pq_table.column_names if col != label ]
features

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'Wilderness_Area1',
 'Wilderness_Area2',
 'Wilderness_Area3',
 'Wilderness_Area4',
 'Soil_Type1',
 'Soil_Type2',
 'Soil_Type3',
 'Soil_Type4',
 'Soil_Type5',
 'Soil_Type6',
 'Soil_Type7',
 'Soil_Type8',
 'Soil_Type9',
 'Soil_Type10',
 'Soil_Type11',
 'Soil_Type12',
 'Soil_Type13',
 'Soil_Type14',
 'Soil_Type15',
 'Soil_Type16',
 'Soil_Type17',
 'Soil_Type18',
 'Soil_Type19',
 'Soil_Type20',
 'Soil_Type21',
 'Soil_Type22',
 'Soil_Type23',
 'Soil_Type24',
 'Soil_Type25',
 'Soil_Type26',
 'Soil_Type27',
 'Soil_Type28',
 'Soil_Type29',
 'Soil_Type30',
 'Soil_Type31',
 'Soil_Type32',
 'Soil_Type33',
 'Soil_Type34',
 'Soil_Type35',
 'Soil_Type36',
 'Soil_Type37',
 'Soil_Type38',
 'Soil_Type39',
 'Soil_Type40']

## Modelling

In [35]:
import time
params = {
    'eta': 0.1,
    'gamma': 0.1,
    'missing': 0.0,
    'treeMethod': 'gpu_hist',
    'maxDepth': 8,
    'growPolicy': 'depthwise',
    'lambda_': 1.0,
    'subsample': 1.0,
    'numRound': 1000, # create 1000 trees
    'numWorkers': 1,
    'verbosity': 1
  }

classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)
classifier

XGBoostClassifier_9f3303baa896

In [37]:
start_time = time.time()
model = classifier.fit(train_data)
print(f"Time Taken: {str(time.time() - start_time)}")

Time Taken: 28.228018760681152


In [38]:
!nvidia-smi

Wed Mar 25 22:11:11 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    23W /  75W |   1005MiB /  7611MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

### Save the model

In [0]:
model.write().overwrite().save('/content/model/')

In [40]:
!ls

covtype_test.parquet	 spark-2.4.5-bin-hadoop2.7
covtype_train.parquet	 spark-2.4.5-bin-hadoop2.7.tgz
cudf-0.9.2-cuda10-1.jar  xgboost4j_2.x-1.0.0-Beta5.jar
model			 xgboost4j-spark_2.x-1.0.0-Beta5.jar
sample_data


### Load the saved model

In [0]:
loaded_model = XGBoostClassificationModel().load('/content/model/')

In [0]:
result = loaded_model.transform(test_data)

In [43]:
result.show()

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+----------------+----------------+----------------+----------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------+--------------------+--------------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm|Horizontal_Distance_To_Fire_Points|Wild

### Evaluation

In [44]:
MulticlassClassificationEvaluator().setLabelCol(label).evaluate(result)

0.7143879104341959