In [1]:
import imp
import sys
# sys.path.append("")
if "/home/ecbm4040/project/random-forests-cuda/" not in sys.path:
    sys.path.insert(0, "/home/ecbm4040/project/random-forests-cuda/")
from src.python.cuda_utils import CudaUtils

Number of CUDA devices available:  1
Device Name: Tesla T4
Compute Capability: 7.5
Total Device Memory: 15109 megabytes


ImportError: cannot import name 'CudaUtils'

In [2]:
import pandas as pd
import numpy as np
import time
import pycuda
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuArray
from pycuda.compiler import SourceModule
from numpy import unravel_index

cuda.init()
print("Number of CUDA devices available: ", cuda.Device.count())
my_device = cuda.Device(0)
# cc: compute capability
cc = float('%d.%d' % my_device.compute_capability())
print('Device Name: {}'.format(my_device.name()))
print('Compute Capability: {}'.format(cc))
print('Total Device Memory: {} megabytes'.format(my_device.total_memory()//1024**2))



Number of CUDA devices available:  1
Device Name: Tesla T4
Compute Capability: 7.5
Total Device Memory: 15109 megabytes


In [16]:

class DecisionTreeCudaUtils():

    def __init__(self):
        """
        Attributes for instance of EncoderDecoder module
        """
        return
    
    def get_source_module(self):
        # kernel code wrapper
        kernelwrapper = """
        // Helps in the calculation of the gina scores
        __global__ void calculate_gina_scores(float* impurity_scores,float* X_train,int* y_train,const int unique_classes,const int l,const int w){
            int Dim = threadIdx.y+blockIdx.y*blockDim.y;
            int Row = threadIdx.x+blockIdx.x*blockDim.x;

            if(Dim < w && Row < l){
                float split_value =X_train[Row * w+ Dim];

                float group1_counts[20] = {0};//Max of 20 dimensions which can be increased
                float group2_counts[20] = {0};
                float length1=0;
                float length2=0;
                float sum1=0;
                float sum2=0;

                for(int i=0;i<l;i++){
                    if(X_train[i* w+ Dim]>=split_value){
                        //Belongs to group 1
                        group1_counts[y_train[i]]++;
                        length1++;
                    }
                    else{
                        //Belongs to group 2
                        group2_counts[y_train[i]]++;
                        length2++;
                    }
                }
                float p1 = length1/(length1+length2);
                float p2 = length2/(length1+length2);

                if(length1 > 0){
                    for(int i=0;i<unique_classes;i++){
                        sum1+=(group1_counts[i]*group1_counts[i])/(length1*length1);
                    }
                }
                if(length2 > 0){
                    for(int i=0;i<unique_classes;i++){
                        sum2+=(group2_counts[i]*group2_counts[i])/(length2*length2);
                    }
                }

                float impurity = p1*sum1+p2*sum2;
                // Write our new pixel value out
                impurity_scores[Row * w + Dim] =impurity;

            }
        }

        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        //Finds the max value of all the gina impurity scores that we have calculated
        #define BLOCKSIZE 1024
        __global__ void find_best_gina_score(int* index, float* all_gina_scores, const int len){
            //loading segment of data in local memory
            __shared__ float scan_array[2*BLOCKSIZE];
            __shared__ float ii_array[2*BLOCKSIZE];
            unsigned int t =threadIdx.x;
            unsigned int start=2*blockIdx.x*blockDim.x;

            if(start+t <len){
                scan_array[t]=all_gina_scores[start+t];
                ii_array[t]=index[start+t];
            }
            if(start+blockDim.x+t <len){
                scan_array[blockDim.x+t]=all_gina_scores[start+blockDim.x+t];
                ii_array[blockDim.x+t]=index[start+blockDim.x+t];
            }

            for (unsigned int stride = blockDim.x;stride > 0; stride /= 2){
                __syncthreads();
                if (t < stride){
                  
                  if(scan_array[t] < scan_array[t+stride]){
                      scan_array[t]=scan_array[t+stride];
                      ii_array[t]= ii_array[t+stride];
                  }
                  if (scan_array[t] == 123){
                      printf("found max scan value %f and index is  %d\\n", scan_array[t], ii_array[t]);
                  }
                }            
            }
            
            if(start + t <len){
                index[t] = ii_array[t];
            }
            
            
            //This returns max value and index at the 1st index i.e. 0 in all_gina_scores and index matrices respectively
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        //takes the input matrix X and returns the labels for l or r
        __global__ void split_data(float* label,float* X, const int bound,const int dim, const int l,const int w){
            int Row = threadIdx.x+blockIdx.x*blockDim.x;
            if(Row < l){
                if(X[Row*w+dim] <= bound){
                    label[Row]=1;
                }
                else{
                    label[Row]=0;
                }
            }           
        }
        """
        return SourceModule(kernelwrapper)
         
    def calculate_score(self, X_train_b, y_train_b, dim, row):
        # Implement CUDA function
        
        self.mod = self.get_source_module()

        unique_classes = np.unique(y_train_b)
        #Making categorical data into integers
        for j,label in enumerate(unique_classes):
            y_train_b[y_train_b == label]=j
        
        #Fetch the kernel
        start =cuda.Event()
        end = cuda.Event()

        calculate_scores=self.mod.get_function("calculate_gina_scores")

        #Converting to 32 bit
        X_train_b = X_train_b.astype(np.float32)
        y_train_b =y_train_b.astype(np.int32)

        unique_classes=np.array(len(unique_classes)).astype(np.int32)
        row = np.array(X_train_b.shape[0]).astype(np.int32)
        dim = np.array(X_train_b.shape[1]).astype(np.int32)

        #Grid and block dimensions
        blocksize=64
        blockDim=(blocksize,X_train_b.shape[1],1)
        gridDim =(X_train_b.shape[0]//blocksize+1,X_train_b.shape[1]//blocksize+1,1)


        #Memory allocation
        X_train_b_gpu = gpuArray.to_gpu(X_train_b)
        y_train_b_gpu = gpuArray.to_gpu(y_train_b)  
        impurity_scores_gpu = gpuArray.zeros_like(X_train_b_gpu)
        
        #run and time the kernel
        start.record()
        calculate_scores(impurity_scores_gpu,X_train_b_gpu,y_train_b_gpu,unique_classes,row,dim,block=blockDim,grid=gridDim)

        # Wait for the event to complete
        end.record()
        end.synchronize()
        time = start.time_till(end)

        #Fetch the impurity scores
        impurity_scores = impurity_scores_gpu.get()
        return impurity_scores,time
    
    def choose_best_score(self, all_gina_scores: np.array):
        #Unravel the matrix
        all_gina_scores_flatten =all_gina_scores.flatten()

        self.mod = self.get_source_module()
        
        #Fetch the kernel 
        start =cuda.Event()
        end = cuda.Event()

        find_best_gina_score=self.mod.get_function("find_best_gina_score")
        #setting up the eindex matrix
        index=[i for i in range(all_gina_scores_flatten.shape[0])]
        index=np.array(index)
        
        print([(i, val) for i, val in enumerate(all_gina_scores_flatten) if val==122])

        # 
#         num_threads = all_gina_scores_flatten.shape[0]
#         print(f"num_threads is {num_threads}")
#         blockDim=(num_threads,1,1)
#         gridDim =(1,1,1)
        
        #Grid and block dimensions
        blockDim=(1024,1,1)
        num_blocks = all_gina_scores_flatten.shape[0]//1024+1
        gridDim =(num_blocks,1,1)
        grid_dim = all_gina_scores_flatten.shape[0]//1024+1
        print(f"num_blocks is", num_blocks )

        #Converting to 32 bit
        row =np.float32(all_gina_scores_flatten.shape[0])
        all_gina_scores_flatten=all_gina_scores_flatten.astype(np.float32)

        #memory allocation
        all_gina_scores_gpu=gpuArray.to_gpu(all_gina_scores_flatten)
        index=index.astype(np.int32)
        index_gpu=gpuArray.to_gpu(index)

        #run and time the kernel
        start.record()
        find_best_gina_score(index_gpu,all_gina_scores_gpu,row,block=blockDim,grid=gridDim)

        # Wait for the event to complete
        end.record()
        end.synchronize()
        time = start.time_till(end)

        #Fetch the impurity scores
        index=index_gpu.get()
        gina_scores=all_gina_scores_gpu.get()
        max_index=unravel_index(int(index[0]),all_gina_scores.shape)

        return(max_index,time, index)

    def split_data(self, X: np.array, y: np.array, bound: float, dim: float):

        # Implement CUDA function
        #Fetch the kernel
        start =cuda.Event()
        end = cuda.Event()

        split_data=self.mod.get_function("split_data")
        #Grid and block dimensions
        blockDim=(1024,1,1)
        gridDim =(X.shape[0]//1024+1,1,1)
        #Converting to 32 bit
        labels = np.zeros(y.shape).astype(np.float32)
        X_32 = X.astype(np.float32)
        row = np.array([X.shape[0]], dtype=np.int32)
        col = np.array([X.shape[1]], dtype=np.int32)
        bound = np.array([bound], dtype=np.int32)
        dim =np.array([dim], dtype=np.int32)
        #Memory allocation
        X_gpu = gpuArray.to_gpu(X_32) 
        labels_gpu = gpuArray.to_gpu(labels)

        #run and time the kernel
        start.record()
        split_data(labels_gpu,X_gpu,bound,dim,row,col,block=blockDim,grid=gridDim)

        # Wait for the event to complete
        end.record()
        end.synchronize()
        time = start.time_till(end)

        #Fetch the impurity scores
        labels = labels_gpu.get()

        #code for splitting the child
        print(labels==0)
        y_l = y[labels==0]
        y_r = y[labels==1]
        print(y_l,y_r)
        print(X[0][:])
        X_l = X[(labels==0),:]
        X_r = X[(labels==1),:]
        return (X_l, y_l, X_r, y_r)

In [17]:
dtu = DecisionTreeCudaUtils()
data = np.ones((200, 200))
data[190,190] = 123
max_index, time, all_index = dtu.choose_best_score(data)
print("Calculated max index is:" , max_index)
print("Reference max index is:" , np.unravel_index(np.argmax(data), data.shape))

[]
num_blocks is 40
Calculated max index is: (190, 190)
Reference max index is: (190, 190)






In [13]:
np.unravel_index(2047, data.shape)

(10, 47)

In [None]:

for block_num in [0,1,2]:
    print(np.unravel_index(all_index[block_num*1024], data.shape))

In [None]:
max_index = 5*63+10
all_index[all_index==max_index]

In [19]:
[np.unravel_index(x, data.shape) for x in all_index.tolist()]

[(190, 190),
 (184, 65),
 (190, 190),
 (184, 67),
 (184, 68),
 (184, 69),
 (190, 190),
 (184, 71),
 (184, 72),
 (184, 73),
 (184, 74),
 (184, 75),
 (184, 76),
 (184, 77),
 (190, 190),
 (184, 79),
 (184, 80),
 (184, 81),
 (184, 82),
 (184, 83),
 (184, 84),
 (184, 85),
 (184, 86),
 (184, 87),
 (184, 88),
 (184, 89),
 (184, 90),
 (184, 91),
 (184, 92),
 (184, 93),
 (184, 94),
 (184, 95),
 (184, 96),
 (184, 97),
 (184, 98),
 (184, 99),
 (184, 100),
 (184, 101),
 (184, 102),
 (184, 103),
 (184, 104),
 (184, 105),
 (184, 106),
 (184, 107),
 (184, 108),
 (184, 109),
 (190, 190),
 (184, 111),
 (184, 112),
 (184, 113),
 (184, 114),
 (184, 115),
 (184, 116),
 (184, 117),
 (184, 118),
 (184, 119),
 (184, 120),
 (184, 121),
 (184, 122),
 (184, 123),
 (184, 124),
 (184, 125),
 (184, 126),
 (184, 127),
 (184, 128),
 (184, 129),
 (184, 130),
 (184, 131),
 (184, 132),
 (184, 133),
 (184, 134),
 (184, 135),
 (184, 136),
 (184, 137),
 (184, 138),
 (184, 139),
 (184, 140),
 (184, 141),
 (184, 142),
 (184

In [None]:
 def choose_best_score(self, all_gina_scores: np.array):
        
        current_scores = all_gina_scores
        current_index = [i for i in range(all_gina_scores_flatten.shape[0])]
        
        #setting up the eindex matrix
        
        
        # In loop
            # check if all the score can fit in one block
            
            # if not, call recursive function
            #
    
    def choose_best_score_recursive(self, all_gina_scores: np.array, index):
        # Change this to take in indexes and scores
        #Unravel the matrix
        all_gina_scores_flatten =all_gina_scores.flatten()

        self.mod = self.get_source_module()
        
        #Fetch the kernel 
        start =cuda.Event()
        end = cuda.Event()

        find_best_gina_score=self.mod.get_function("find_best_gina_score")
        #setting up the eindex matrix
        
        index=np.array(index)

        # 
#         num_threads = all_gina_scores_flatten.shape[0]
#         print(f"num_threads is {num_threads}")
#         blockDim=(num_threads,1,1)
#         gridDim =(1,1,1)
        
        #Grid and block dimensions
        blockDim=(1024,1,1)
        num_blocks = all_gina_scores_flatten.shape[0]//1024+1
        gridDim =(num_blocks,1,1)
        grid_dim = all_gina_scores_flatten.shape[0]//1024+1
        print(f"num_blocks is", num_blocks )

        #Converting to 32 bit
        row =np.float32(all_gina_scores_flatten.shape[0])
        all_gina_scores_flatten=all_gina_scores_flatten.astype(np.float32)

        #memory allocation
        all_gina_scores_gpu=gpuArray.to_gpu(all_gina_scores_flatten)
        index=index.astype(np.int32)
        index_gpu=gpuArray.to_gpu(index)

        #run and time the kernel
        start.record()
        find_best_gina_score(index_gpu,all_gina_scores_gpu,row,block=blockDim,grid=gridDim)

        # Wait for the event to complete
        end.record()
        end.synchronize()
        time = start.time_till(end)

        #Fetch the impurity scores
        index=index_gpu.get()
        gina_scores=all_gina_scores_gpu.get()
        max_index=unravel_index(int(index[0]),all_gina_scores.shape)
        # Change this to return scores and max index for each block
        return(max_index,time, index)