# **Fully Connected Neural Network: A `CUDA` and `C++` Implementation**

## **Prepare workspace**

In [18]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount("/content/drive")
# Switch the working directory to the project folder on Drive so that the
# relative paths used by later cells (mnist/, Makefile, bin/) resolve inside it.
%cd /content/drive/MyDrive/CUDA

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CUDA


## **Extract `.gz` data (if needed)**

In [None]:
# Extract data from `.gz`
# Only need to run once!
!pip install patool
import patoolib
patoolib.extract_archive("mnist/t10k-images-idx3-ubyte.gz", outdir="mnist")
patoolib.extract_archive("mnist/t10k-labels-idx1-ubyte.gz", outdir="mnist")
patoolib.extract_archive("mnist/train-images-idx3-ubyte.gz", outdir="mnist")
patoolib.extract_archive("mnist/train-labels-idx1-ubyte.gz", outdir="mnist")

Collecting patool
  Downloading patool-3.1.0-py2.py3-none-any.whl.metadata (4.3 kB)
Downloading patool-3.1.0-py2.py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.4/98.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patool
Successfully installed patool-3.1.0


INFO patool: Extracting mnist/t10k-images-idx3-ubyte.gz ...
INFO:patool:Extracting mnist/t10k-images-idx3-ubyte.gz ...
INFO patool: running /usr/bin/7z e -omnist -- mnist/t10k-images-idx3-ubyte.gz
INFO:patool:running /usr/bin/7z e -omnist -- mnist/t10k-images-idx3-ubyte.gz
INFO patool: ... mnist/t10k-images-idx3-ubyte.gz extracted to `mnist'.
INFO:patool:... mnist/t10k-images-idx3-ubyte.gz extracted to `mnist'.
INFO patool: Extracting mnist/t10k-labels-idx1-ubyte.gz ...
INFO:patool:Extracting mnist/t10k-labels-idx1-ubyte.gz ...
INFO patool: running /usr/bin/7z e -omnist -- mnist/t10k-labels-idx1-ubyte.gz
INFO:patool:running /usr/bin/7z e -omnist -- mnist/t10k-labels-idx1-ubyte.gz
INFO patool: ... mnist/t10k-labels-idx1-ubyte.gz extracted to `mnist'.
INFO:patool:... mnist/t10k-labels-idx1-ubyte.gz extracted to `mnist'.
INFO patool: Extracting mnist/train-images-idx3-ubyte.gz ...
INFO:patool:Extracting mnist/train-images-idx3-ubyte.gz ...
INFO patool: running /usr/bin/7z e -omnist -- mni

'mnist'

## **Edit `Makefile`**

In [None]:
%%writefile Makefile

# Compilers
# NOTE: CXX and CXX_FLAGS are defined but not used by any rule below;
# every source file is compiled and linked with NVCC.
CXX := g++
CXX_FLAGS := -std=c++17 -ggdb
NVCC := nvcc

# Folders
BIN := bin
SRC := src
INCLUDE := include

EXECUTABLE := nn_main

# Files compiled into the executable.
SOURCES := $(wildcard $(SRC)/*.cu $(SRC)/*.cpp)
# Headers are prerequisites only, so editing one triggers a rebuild.
HEADERS := $(wildcard $(INCLUDE)/*.h $(INCLUDE)/*.hpp $(INCLUDE)/*.cuh)

# These targets name commands, not files; always run them when requested.
.PHONY: all run clean

# Default target: build the executable.
all: $(BIN)/$(EXECUTABLE)

# Rebuild from scratch, then launch the program.
run: clean all
	clear
	./$(BIN)/$(EXECUTABLE)

# Compile and link every source in one NVCC invocation.
# Only $(SOURCES) is passed to the compiler; the headers are found via -I.
$(BIN)/$(EXECUTABLE): $(SOURCES) $(HEADERS)
	@mkdir -p $(BIN)
	$(NVCC) -I $(INCLUDE) $(SOURCES) -o $@

# Remove build outputs; -f keeps this quiet when $(BIN) is already empty,
# and the leading '-' keeps make going if rm still reports an error.
clean:
	-rm -f $(BIN)/*

Overwriting Makefile


## **Compile and run**

In [22]:
# Compile: run the Makefile's default target (`all`), which builds bin/nn_main.
# make does nothing when it considers the executable already up to date.
!make

make: Nothing to be done for 'all'.


In [23]:
# Run the program
# Usage: ./bin/nn_main <#-neurons> <#-epochs> <learning-rate>
# Here: 20 neurons, 3 epochs, learning rate 0.5.
!./bin/nn_main 20 3 0.5

-- # neurons: 20
-- # epochs: 3
-- learning rate: 0.5
Train Images: 60000 with size 784
Train Labels: 60000 labels loaded
Test Images: 10000 with size 784
Test Labels: 10000 labels loaded


CPU Train start...
-- number of epochs: 3
- layer 0 forward time: 3026.636719 ms
- layer 1 forward time: 84.778114 ms
- layer 2 forward time: 53.116928 ms
>>> Epoch 1 CEE loss: 13.8162
- layer 0 forward time: 4367.266602 ms
- layer 1 forward time: 84.238655 ms
- layer 2 forward time: 54.450562 ms
>>> Epoch 2 CEE loss: 14.2307
- layer 0 forward time: 3020.112061 ms
- layer 1 forward time: 84.411232 ms
- layer 2 forward time: 50.453152 ms
>>> Epoch 3 CEE loss: 14.4916
TRAIN TIME: 25052.363281 ms


GPU Train start...
-- number of epochs: 3
- layer 0 forward time: 51.684544 ms
- layer 1 forward time: 6.116736 ms
- layer 2 forward time: 2.822240 ms
>>> Epoch 1 CEE loss: 7.42336
- layer 0 forward time: 51.637470 ms
- layer 1 forward time: 5.341312 ms
- layer 2 forward time: 2.619584 ms
>>> Epoch 2 CEE los