In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Numba And CUDA

# **CUDA**

# CUDA is a parallel computing platform and programming model that enables dramatic increases in computing performance by harnessing the power of the GPU, i.e., the graphics processing unit.

- Before going forward with CUDA , let's first understand the difference between CPU and GPU

# CPU:
A CPU is well suited to running simple serial code,

but if you want to do parallel programming it will

not be as efficient or fast, since a CPU has a limited number

of cores.

A CPU relies mostly on two things:

1. A large cache

2. Advanced control logic

# GPU

# GPU stands for graphics processing unit
- A GPU is designed to maximize throughput
- The majority of its silicon area is dedicated to:
- A massive number of cores.

# A massive number of arithmetic logic units, which NVIDIA calls CUDA cores.
- host: the CPU

- device: the GPU

- host memory: the system main memory

- device memory: onboard memory on a GPU card

- kernels: a GPU function launched by the host and executed on the device

- device function: a GPU function executed on the device which can only be called from the device (i.e. from a kernel or another device function)

# install the CUDA toolkit with:

In [1]:
conda install cudatoolkit

# CUDA Bindings

# Numba supports interacting with the CUDA Driver
- API via the NVIDIA CUDA Python bindings
- and its own ctypes-based bindings.
- Functionality is equivalent between the two bindings. 
- The ctypes-based bindings are presently the default,
- but the NVIDIA bindings will be used by 
- default (if they are available in the environment) in a future Numba release.

- You can install the NVIDIA bindings with:

In [6]:
# or conda install nvidia::cuda-python
        
!pip install cuda-python        

In [7]:
# NUMBA INSTALLATION
!pip install numba

In [8]:
# Search the whole filesystem (case-insensitive name match) for entries named
# "libdevice" and "libnvvm.so".
# NOTE(review): presumably run to confirm the CUDA libraries Numba needs are
# present on this machine — confirm.
!find / -iname 'libdevice'
!find / -iname 'libnvvm.so'

In [22]:
# import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import math
from  numba import jit,njit,vectorize,cuda,uint8,f8,uint32, cuda
from pylab import imshow,show
from timeit import default_timer as timer

# Mandelbrot

# Non-GPU Mandelbrot (i.e., CPU)

In [23]:
def mandel(x,y,max_iters):
    """
    Return the escape iteration count for the complex point ``x + y*1j``.

    Starting from ``z = 0``, repeatedly applies ``z = z*z + c`` and returns the
    index of the first iteration at which ``|z|**2 >= 4``. Points that never
    reach that bound within ``max_iters`` iterations are treated as candidate
    members of the Mandelbrot set and yield ``max_iters``.

    Parameters
    ----------
    x, y : real and imaginary parts of the point ``c``.
    max_iters : maximum number of iterations to try.

    Returns
    -------
    int : the escape iteration index, or ``max_iters`` if no escape occurred.
    """
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z*z + c
        # Escape test on |z|^2 = re^2 + im^2; the imaginary part must be
        # squared as well, otherwise points with large |imag| are misjudged.
        if (z.real*z.real + z.imag*z.imag) >= 4:
            return i
    return max_iters

In [26]:
def create_fractal(min_x,max_x,min_y,max_y,image,iters):
    """
    Fill ``image`` in place with Mandelbrot iteration counts.

    The rectangle [min_x, max_x] x [min_y, max_y] of the complex plane is
    sampled on a grid matching the image: columns map to the real axis and
    rows to the imaginary axis. Each pixel receives ``mandel(re, im, iters)``.
    """
    n_rows = image.shape[0]
    n_cols = image.shape[1]

    # Size of one pixel along each axis of the complex plane.
    step_x = (max_x - min_x)/n_cols
    step_y = (max_y - min_y)/n_rows

    for col in range(n_cols):
        re = min_x + col * step_x
        for row in range(n_rows):
            im = min_y + row * step_y
            image[row, col] = mandel(re, im, iters)

In [27]:
# Render a 1024x1536 uint8 image of the region x in [-2, 1], y in [-1, 1]
# with 20 iterations per pixel, timing only the create_fractal call.
image = np.zeros((1024,1536),dtype=np.uint8)
start = timer()
create_fractal(-2.0,1.0, -1.0 ,1.0,image,20)
dt = timer() - start

# Report the elapsed wall-clock time, then display the image.
print("Mandlebrot created in %f s " % dt)
imshow(image)
show()

# Now let's use a Numba Version

# Numba Mandelbrot

# Numba is an LLVM-based JIT compiler for Python

# Run Numba Mandelbrot

In [29]:
@jit
def mandel(x,y,max_iters):
    """
    Return the escape iteration count for the complex point ``x + y*1j``
    (Numba-compiled version).

    Starting from ``z = 0``, repeatedly applies ``z = z*z + c`` and returns the
    index of the first iteration at which ``|z|**2 >= 4``. Points that never
    reach that bound within ``max_iters`` iterations are treated as candidate
    members of the Mandelbrot set and yield ``max_iters``.
    """
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z*z + c
        # Escape test on |z|^2 = re^2 + im^2; the imaginary part must be
        # squared as well, otherwise points with large |imag| are misjudged.
        if (z.real*z.real + z.imag*z.imag) >= 4:
            return i
    return max_iters

@jit
def create_fractal(min_x,max_x,min_y,max_y,image,iters):
    """
    Fill ``image`` in place with Mandelbrot iteration counts (Numba-compiled).

    Columns of the image map to the real axis over [min_x, max_x] and rows to
    the imaginary axis over [min_y, max_y]; each pixel receives
    ``mandel(re, im, iters)``.
    """
    n_rows = image.shape[0]
    n_cols = image.shape[1]

    # Size of one pixel along each axis of the complex plane.
    step_x = (max_x - min_x)/n_cols
    step_y = (max_y - min_y)/n_rows

    for col in range(n_cols):
        re = min_x + col * step_x
        for row in range(n_rows):
            im = min_y + row * step_y
            image[row, col] = mandel(re, im, iters)

In [30]:
# Same render as the CPU cell, now calling the @jit-decorated create_fractal.
# NOTE(review): create_fractal is decorated without a signature, so the first
# call presumably includes Numba's compilation time in the measurement —
# re-run the cell for a steady-state timing; confirm.
image = np.zeros((1024,1536),dtype=np.uint8)
start = timer()
create_fractal(-2.0,1.0, -1.0 ,1.0,image,20)
dt = timer() - start

# Report the elapsed wall-clock time, then display the image.
print("Mandlebrot created in %f s " % dt)
imshow(image)
show()

# Let's see the CUDA GPU Mandelbrot

In [39]:
#cuda.jit(retype=uint32 , argtypes=[ uint32, uint8 , f8], device = True)(mandel)
mandel_gpu = cuda.jit(retype=uint32(f8, f8, uint32),device = True)

In [45]:
@cuda.jit(retype=uint32(f8,f8,f8,f8,uint8[:,:]))
def mandal_kernel(min_x,max_x,min_y,max_y,image,iters):
    height=image.shape[0]
    width=image.shape[1]
    
    pixel_size_x=(max_x - min_x)/width
    pixel_size_y=(max_y - min_y)/height
    
    startX,startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x;
    gridY = cuda.gridDim.y * cuda.blockDim.y
    
    
    for x in range(startX, width , gridX):
        real=min_x + x * pixel_size_x
        for y in range(startY, height, gridY):
            imag=min_y + y * pixel_size_y
            image[y,x] = mandel_gpu(real , imag , iters)

In [46]:
image = np.zeros((1024,1536),dtype=np.uint8)
start = timer()
create_fractal(-2.0,1.0, -1.0 ,1.0,image,20)
dt = timer() - start

print("Mandlebrot created in %f s " % dt)
imshow(image)
show()

# CUDA Vectorize

In [65]:
from numba import(cuda , vectorize)
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans 

from functools import wraps
from time import time

def real_estate_df(csv_path='../input/region-ready/Region.csv'):
    """30 years of housing prices.

    Reads the regional housing CSV, renames ``RegionName`` to ``ZipCode`` and
    converts ``ZipCode`` and ``RegionID`` to strings with no decimal part.

    Parameters
    ----------
    csv_path : path or file-like object accepted by ``pd.read_csv``.
        Defaults to the Kaggle input location used by this notebook.

    Returns
    -------
    pandas.DataFrame with string ``ZipCode`` and ``RegionID`` columns.
    """
    # Non-inplace rename keeps the cell idempotent on re-run.
    df = pd.read_csv(csv_path).rename(columns={'RegionName': 'ZipCode'})
    # NOTE(review): "{:.0f}" drops leading zeros if the codes were parsed as
    # numbers — confirm whether zero-padding is needed.
    for column in ('ZipCode', 'RegionID'):
        df[column] = df[column].map(lambda x: "{:.0f}".format(x))
    return df

def numerical_real_estate_array(df):
    """Converts df to numpy numerical array.

    Rows containing any missing value are removed first, then the identifier
    and location columns are dropped and the remaining values are returned.

    Parameters
    ----------
    df : pandas.DataFrame as produced by ``real_estate_df``.

    Returns
    -------
    numpy.ndarray holding the values of the remaining columns.
    """
    # 'ZipCode' matches the name produced by real_estate_df's rename (the
    # lower-case 'zipCode' is kept for frames that use that spelling).
    # NOTE(review): 'CountryName' looks like a typo for 'CountyName'; both are
    # listed so whichever exists is dropped — confirm against the CSV header.
    columns_to_drop = ['RegionID', 'ZipCode', 'zipCode', 'City', 'State',
                       'Metro', 'CountryName', 'CountyName']
    df_numerical = df.dropna()
    # errors='ignore' skips listed labels that are not present in the frame.
    df_numerical = df_numerical.drop(columns=columns_to_drop, errors='ignore')
    return df_numerical.values

def real_estate_array():
    """Load the housing data and return its numerical part as float32."""

    # Read the frame, strip non-numerical columns, then cast to float32.
    return np.float32(numerical_real_estate_array(real_estate_df()))

@vectorize(['float32(float32,float32)'],target='cuda')
def add_ufunc(x,y):
    """Element-wise float32 addition, compiled by Numba for the CUDA target."""
    total = x + y
    return total

def cuda_operation():
    """Performs Vectorized Operations on GPU.

    Loads the float32 real-estate array, copies it to the device as both
    operands, adds them element-wise with ``add_ufunc`` into a preallocated
    device array, then copies the result back to the host and prints it.
    """

    # Both operands are the same data, so read the CSV only once.
    values = real_estate_array()

    print("Moving calculations to GPU memory")
    x_device = cuda.to_device(values)
    y_device = cuda.to_device(values)
    # Output buffer allocated on the device with the operands' shape.
    out_device = cuda.device_array(shape=x_device.shape, dtype=np.float32)
    print(x_device)
    print(x_device.shape)
    print(x_device.dtype)

    print('Calculating on GPU')
    # The ufunc defined in this notebook is `add_ufunc`.
    add_ufunc(x_device, y_device, out=out_device)

    out_host = out_device.copy_to_host()
    print(f"Calculations from GPU {out_host}")

# Run the GPU vectorized-addition demo defined above.
cuda_operation()