# First: Numba tutorial

In [1]:
from numba import cuda
import numpy as np

From the CUDA guide: "They guide the programmer to partition the problem into coarse **sub-problems** that can be **solved independently in parallel by blocks of threads**, and **each sub-problem** into finer pieces that can be **solved cooperatively in parallel by all threads within the block.**"

Total Threads = threadsperblock * blockspergrid 

In [77]:
@cuda.jit
def increment_by_one(an_array):
    """CUDA kernel: add 1 in place to the element owned by the calling thread.

    Each thread uses its absolute 1D grid index as the element index.
    Threads whose index is past the end of ``an_array`` do nothing.
    """
    idx = cuda.grid(1)
    # Surplus threads of the last block have no element to work on.
    if idx >= an_array.size:
        return
    an_array[idx] += 1

In [78]:
an_array = np.arange(100)
an_array

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [79]:
threadsperblock = 32

In [80]:
# Number of blocks needed so that every element gets its own thread, rounded
# up. Derived from threadsperblock (instead of repeating the literal 32) so
# the grid size cannot drift out of sync with the block size.
blockspergrid = np.ceil(an_array.size / threadsperblock).astype('int')
blockspergrid

4

In [81]:
increment_by_one[blockspergrid, threadsperblock](an_array)



In [82]:
an_array

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

Do it in 2D:

In [83]:
# Two identical rows, each holding 0..99, i.e. an array of shape (2, 100).
an_array2D = np.tile(np.arange(100), (2, 1))
an_array2D

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
        64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
        80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
        96, 97, 98, 99],
       [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
        64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
        80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
        96, 97, 98, 99]])

In [84]:
@cuda.jit
def increment_a_2D_array(an_array):
    """CUDA kernel: add 1 to every element of row 0, subtract 1 from all others.

    Each thread uses its absolute 2D grid position as ``(row, col)`` and
    updates that single element in place. Threads positioned outside the
    array do nothing.
    """
    row, col = cuda.grid(2)
    # Skip threads that fall outside the array along either axis.
    if row >= an_array.shape[0] or col >= an_array.shape[1]:
        return
    if row >= 1:
        an_array[row, col] -= 1
    else:
        an_array[row, col] += 1

In [85]:
# Block shape: 1 thread along axis 0 and 32 threads along axis 1.
threadsperblock = (1, 32)
# Per-axis block counts, rounded up so the grid covers the whole array.
blockspergrid_x = np.ceil(an_array2D.shape[0] / threadsperblock[0]).astype('int')
blockspergrid_y = np.ceil(an_array2D.shape[1] / threadsperblock[1]).astype('int')
blockspergrid = (blockspergrid_x, blockspergrid_y)
blockspergrid

(2, 4)

In [86]:
increment_a_2D_array[blockspergrid, threadsperblock](an_array2D)



In [87]:
an_array2D

array([[  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100],
       [ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
         12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
         25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
         38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
         51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
         64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  7

# Solution a)

In [2]:
# Encode every line of the file as a list of character codes.
# Only the line terminator is stripped: slicing with [:-1] would also cut off
# the last real character when the file does not end with a newline.
# NOTE: the name `input` shadows the builtin; it is kept because the
# following cells refer to it.
with open('test_input.txt', 'r') as f:
    input = [[ord(char) for char in line.rstrip('\n')] for line in f]
input

[[49, 97, 98, 99, 50],
 [112, 113, 114, 51, 115, 116, 117, 56, 118, 119, 120],
 [97, 49, 98, 50, 99, 51, 100, 52, 101, 53, 102],
 [116, 114, 101, 98, 55, 117, 99, 104, 101, 116]]

In [3]:
# Character count of every encoded line; the longest one sets the padded width.
lengths = list(map(len, input))
max_len = max(lengths)
max_len

11

In [4]:
# Pack the ragged rows into one rectangular integer array; rows shorter than
# max_len are right-padded with 0.
input_padded = np.zeros((len(input), max_len), dtype=int)
for row_idx, codes in enumerate(input):
    input_padded[row_idx, :len(codes)] = codes
input_padded

array([[ 49,  97,  98,  99,  50,   0,   0,   0,   0,   0,   0],
       [112, 113, 114,  51, 115, 116, 117,  56, 118, 119, 120],
       [ 97,  49,  98,  50,  99,  51, 100,  52, 101,  53, 102],
       [116, 114, 101,  98,  55, 117,  99, 104, 101, 116,   0]])

In [5]:
output = np.zeros(len(input))
output

array([0., 0., 0., 0.])

**Every thread gets one string**

In [6]:
chr(48) + chr(50)

'02'

In [7]:
ord('0')

48

In [8]:
@cuda.jit
def find_numbers(input_array, output_array):
    """CUDA kernel: combine the first and last digit of each row into a number.

    Thread ``pos`` scans row ``pos`` of ``input_array`` (one string per row,
    stored as character codes) and writes ``10 * first_digit + last_digit``
    to ``output_array[pos]``. Codes outside ``'0'..'9'`` (including the 0
    used as padding) are ignored.

    Parameters
    ----------
    input_array : 2D integer array, one padded string per row.
    output_array : 1D array with one slot per row; written in place.

    Notes
    -----
    * Threads beyond the last row return immediately. The grid is rounded up
      to whole blocks, so without this check the surplus threads would read
      and write past the end of both arrays.
    * A row that contains no digit yields 0 (instead of the -11 produced by
      the ``-1`` sentinels), so it does not distort a later sum.
    """
    pos = cuda.grid(1)
    if pos >= input_array.shape[0]:
        return

    first = -1  # sentinel: no digit seen yet
    last = -1
    for char in input_array[pos]:
        if char >= ord('0') and char <= ord('9'):
            digit = char - ord('0')
            if first == -1:
                first = digit
            last = digit

    if first == -1:
        output_array[pos] = 0
    else:
        output_array[pos] = first * 10 + last

In [9]:
# One thread per input line, rounded up to whole blocks of 32 threads.
threadsperblock = 32
blockspergrid = np.ceil(len(input) / threadsperblock).astype('int')
blockspergrid

1

In [10]:
find_numbers[blockspergrid, threadsperblock](input_padded, output)



In [11]:
np.sum(output)

142.0

# Solution b)

The input can now **also** contain digits spelled out as words: one, two, three, four, five, six, seven, eight, and nine. Each spelled-out word counts as the corresponding digit.

In [33]:
# Encode every line of the full puzzle input as a list of character codes.
# rstrip('\n') removes only the line terminator, so the last line keeps its
# final character even when the file does not end with a newline (the former
# line[:-1] slice would have dropped it).
# NOTE: the name `input` shadows the builtin; it is kept because the
# following cells refer to it.
with open('input.txt', 'r') as f:
    input = [[ord(char) for char in line.rstrip('\n')] for line in f]
input

[[114,
  104,
  113,
  114,
  112,
  100,
  120,
  115,
  113,
  104,
  103,
  120,
  122,
  107,
  110,
  114,
  50,
  102,
  111,
  117,
  114,
  115,
  110,
  114,
  99,
  102,
  116,
  104,
  114,
  101,
  101],
 [50, 98, 109, 99, 107, 108],
 [102,
  111,
  117,
  114,
  57,
  53,
  113,
  118,
  107,
  118,
  118,
  101,
  105,
  103,
  104,
  116,
  53],
 [50,
  116,
  113,
  98,
  120,
  103,
  114,
  114,
  112,
  109,
  120,
  113,
  102,
  103,
  108,
  115,
  113,
  106,
  107,
  113,
  116,
  104,
  114,
  101,
  101,
  54,
  110,
  104,
  106,
  118,
  98,
  120,
  112,
  102,
  108,
  104,
  114,
  49,
  101,
  105,
  103,
  104,
  116,
  119,
  111,
  104,
  114],
 [55, 116, 119, 111, 54, 56],
 [110,
  105,
  110,
  101,
  55,
  116,
  119,
  111,
  115,
  108,
  115,
  101,
  118,
  101,
  110,
  52,
  115,
  102,
  111,
  117,
  114,
  115,
  105,
  120],
 [102, 105, 118, 101, 109, 110, 106, 120, 98, 114, 110, 115, 118, 108, 51],
 [51,
  113,
  99,
  102,
  120,
  103,

In [34]:
# Line lengths of the full input; the maximum becomes the padded row width.
lengths = list(map(len, input))
max_len = max(lengths)
max_len

49

In [35]:
# Rectangular integer copy of the input: one line per row, right-padded with 0
# up to the length of the longest line.
input_padded = np.zeros((len(input), max_len), dtype=int)
for row_idx, codes in enumerate(input):
    input_padded[row_idx, :len(codes)] = codes
input_padded

array([[114, 104, 113, ...,   0,   0,   0],
       [ 50,  98, 109, ...,   0,   0,   0],
       [102, 111, 117, ...,   0,   0,   0],
       ...,
       [101, 105, 103, ...,   0,   0,   0],
       [116, 119, 111, ...,   0,   0,   0],
       [115, 115, 101, ...,   0,   0,   0]])

In [36]:
# Spelled-out digits in ascending order, so the word at index j stands for
# the digit j + 1.
words = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
words

['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

In [37]:
# Same encoding as the input lines: each word becomes a list of character codes.
words_ascii = [list(map(ord, word)) for word in words]
words_ascii

[[111, 110, 101],
 [116, 119, 111],
 [116, 104, 114, 101, 101],
 [102, 111, 117, 114],
 [102, 105, 118, 101],
 [115, 105, 120],
 [115, 101, 118, 101, 110],
 [101, 105, 103, 104, 116],
 [110, 105, 110, 101]]

In [38]:
# Lengths of the encoded words; the longest word sets the padded width.
lengths = list(map(len, words_ascii))
max_len = max(lengths)
max_len

5

In [39]:
# One word per row as character codes, right-padded with 0 to a common width.
words_padded = np.zeros((len(words_ascii), max_len), dtype=int)
for row_idx, codes in enumerate(words_ascii):
    words_padded[row_idx, :len(codes)] = codes
words_padded

array([[111, 110, 101,   0,   0],
       [116, 119, 111,   0,   0],
       [116, 104, 114, 101, 101],
       [102, 111, 117, 114,   0],
       [102, 105, 118, 101,   0],
       [115, 105, 120,   0,   0],
       [115, 101, 118, 101, 110],
       [101, 105, 103, 104, 116],
       [110, 105, 110, 101,   0]])

In [40]:
# True (unpadded) length of every word, in the same order as `words`.
word_lengths = np.array(list(map(len, words)))
word_lengths

array([3, 3, 5, 4, 4, 3, 5, 5, 4])

In [41]:
# Debugging aid: uncomment to restrict the run to the first input line only.
# input_padded = input_padded[0:1]
# input_padded

In [42]:
# Character codes of the digits '1' through '9'.
word_numbers = np.arange(ord('1'), ord('9') + 1)
word_numbers

array([49, 50, 51, 52, 53, 54, 55, 56, 57])

In [43]:
input_padded.shape

(1000, 49)

In [44]:
@cuda.jit
def find_numbers_and_words(input_array, words, word_lengths, output_array):
    """CUDA kernel: overwrite every spelled-out digit with its digit character.

    Thread ``i`` handles row ``i``. For each word ``j`` and each position at
    which the word fits into the row, the row slice is compared with the
    word. On a match, all ``word_lengths[j]`` positions of the match in
    ``output_array`` are set to the character code of the digit ``j + 1``.

    Parameters
    ----------
    input_array : 2D integer array of character codes, one string per row.
        Only read.
    words : 2D integer array, one word per row as character codes.
    word_lengths : 1D integer array with the real length of each word; only
        the first ``word_lengths[j]`` codes of ``words[j]`` are compared.
    output_array : 2D array with the same shape as ``input_array``; written
        in place. Positions that are not part of any match are left as is.

    Notes
    -----
    Matching always reads from ``input_array`` and never from
    ``output_array``, so words that share letters are still all detected
    even after an earlier match has been overwritten in the output.
    """
    i = cuda.grid(1)
    # The grid is rounded up to whole blocks; surplus threads have no row.
    if i >= input_array.shape[0]:
        return

    row_len = input_array.shape[1]
    n_words = min(words.shape[0], word_lengths.shape[0])

    for j in range(n_words):
        word_len = word_lengths[j]
        digit = j + 1 + ord('0')

        # Slide the word over every start position at which it still fits.
        for shift in range(row_len - word_len + 1):
            same = True
            for idx in range(word_len):
                if input_array[i, shift + idx] != words[j, idx]:
                    same = False
                    break  # first mismatch settles it
            if same:
                for idx in range(word_len):
                    output_array[i, shift + idx] = digit

In [45]:
output_b = input_padded.copy()
output_b

array([[114, 104, 113, ...,   0,   0,   0],
       [ 50,  98, 109, ...,   0,   0,   0],
       [102, 111, 117, ...,   0,   0,   0],
       ...,
       [101, 105, 103, ...,   0,   0,   0],
       [116, 119, 111, ...,   0,   0,   0],
       [115, 115, 101, ...,   0,   0,   0]])

In [46]:
# One thread per input line, rounded up to whole blocks.
# NOTE(review): 7 threads per block is unusually small; block sizes are
# normally chosen as a multiple of the 32-thread warp. Confirm this value is
# intentional.
threadsperblock = 7
blockspergrid = np.ceil(len(input_padded) / threadsperblock).astype('int')
blockspergrid

143

In [47]:
find_numbers_and_words[blockspergrid, threadsperblock](input_padded, words_padded, word_lengths, output_b)



In [48]:
input_padded

array([[114, 104, 113, ...,   0,   0,   0],
       [ 50,  98, 109, ...,   0,   0,   0],
       [102, 111, 117, ...,   0,   0,   0],
       ...,
       [101, 105, 103, ...,   0,   0,   0],
       [116, 119, 111, ...,   0,   0,   0],
       [115, 115, 101, ...,   0,   0,   0]])

In [49]:
output_b

array([[114, 104, 113, ...,   0,   0,   0],
       [ 50,  98, 109, ...,   0,   0,   0],
       [ 52,  52,  52, ...,   0,   0,   0],
       ...,
       [ 56,  56,  56, ...,   0,   0,   0],
       [ 50,  50,  50, ...,   0,   0,   0],
       [115,  55,  55, ...,   0,   0,   0]])

In [50]:
output_b_2 = np.zeros(len(output_b))
output_b_2

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [51]:
find_numbers[blockspergrid, threadsperblock](output_b, output_b_2)

In [52]:
output_b_2

array([23., 22., 45., 22., 78., 96., 53., 31., 97., 92., 52., 44., 73.,
       15., 86., 82., 59., 32., 45., 51., 37., 39., 76., 62., 11., 45.,
       73., 21., 37., 14., 48., 66., 75., 58., 16., 39., 75., 13., 87.,
       84., 37., 65., 32., 46., 28., 22., 66., 87., 19., 24., 38., 88.,
       96., 53., 55., 22., 51., 91., 28., 93., 81., 82., 17., 57., 19.,
       46., 62., 33., 37., 57., 97., 15., 78., 15., 91., 63., 86., 33.,
       55., 19., 36., 38., 86., 18., 44., 86., 26., 36., 29., 57., 21.,
       91., 44., 61., 59., 52., 67., 53., 11., 81., 66., 82., 61., 11.,
       55., 18., 56., 18., 55., 87., 96., 55., 51., 41., 57., 19., 23.,
       45., 21., 47., 61., 36., 68., 65., 38., 79., 62., 77., 84., 88.,
       96., 92., 98., 72., 55., 96., 91., 48., 76., 57., 53., 31., 13.,
       38., 23., 29., 91., 52., 16., 22., 24., 23., 12., 99., 36., 17.,
       56., 53., 65., 42., 28., 67., 42., 26., 71., 51., 21., 71., 95.,
       83., 28., 44., 64., 42., 21., 17., 95., 72., 43., 23., 77

In [53]:
np.sum(output_b_2)

54578.0