In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Author: Raquel Ricoy

#Benchmark to study Kaggle's GPUs, CPUs and TPUs potential.
#It's going to use Pytorch and to stablish a script to calculate its performance and GFLOPS.

#Install pytorch
#!conda install -y pytorch torchvision -c pytorch

import torch
import platform

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
#print(os.listdir("../input"))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#Importing Libraries needed for use torch
import timeit
import torch.utils.benchmark as benchmark

In [2]:
#Information about system
print('Platform processor:', platform.processor())
print('Platform architecture:', platform.architecture())

#Number of threads
num_cores = os.cpu_count()
print('Number of cores:',num_cores)
torch.set_num_threads(num_cores)
num_threads = num_cores

Platform processor: x86_64
Platform architecture: ('64bit', '')
Number of cores: 4


In [3]:
#Functions obtained from Torch Webpages por PyTorch Benchmarks
def batched_dot_mul_sum(a, b):
    '''Computes batched dot by multiplying and summing'''
    return a.mul(b).sum(-1)


def batched_dot_bmm(a, b):
    '''Computes batched dot by reducing to bmm'''
    a = a.reshape(-1, 1, a.shape[-1])
    b = b.reshape(-1, b.shape[-1], 1)
    return torch.bmm(a, b).flatten(-3)

In [4]:
# Method that do the benchmark and compare results with dot mul sum implementations and vectorSum
def benchMark(sizes,nThreads):
    results = []
    if(len(sizes) == 0):
        print("Parameter 'sizes' has to a have minumun of 1 parameters")
        return
    if(len(nThreads)==0):
        print("Parameter 'nThreads' has to a have minumun of 1 parameters")
    
    for n in sizes:
        # label and sub_label are the rows
        # description is the column
        label = 'Batched dot'
        sub_label = f'[{n}, {n}]'
        x = torch.ones((n, n))
        for num_threads in nThreads:
            results.append(benchmark.Timer(
                stmt='batched_dot_mul_sum(x, x)',
                setup='from __main__ import batched_dot_mul_sum',
                globals={'x': x},
                num_threads=num_threads,
                label=label,
                sub_label=sub_label,
                description='mul/sum',
            ).blocked_autorange())
            results.append(benchmark.Timer(
                stmt='batched_dot_bmm(x, x)',
                setup='from __main__ import batched_dot_bmm',
                globals={'x': x},
                num_threads=num_threads,
                label=label,
                sub_label=sub_label,
                description='bmm',
            ).blocked_autorange())   
    compare = benchmark.Compare(results)
    compare.print()
    return compare

In [5]:
#The limit dimension of the sizes is with matrix of 16384x16384. It is running out of memory with that sizes
sizes = [512,1024,2048,4096,8192,16384,32768]
threads = range(1,num_threads+1)
compares = []

#The benchmark execute 5 times to gather data and afterwards 
for i in range(0,5):
    print("Benchmark execution: ",i+1, "\n")
    compares.insert(i,benchMark(sizes,threads))

Benchmark execution:  1 

[---------------- Batched dot -----------------]
                      |   mul/sum   |     bmm   
1 threads: -------------------------------------
      [512, 512]      |       58.8  |      379.2
      [1024, 1024]    |      234.8  |     1533.8
      [2048, 2048]    |     2980.8  |     5923.0
      [4096, 4096]    |    27599.4  |    23608.1
      [8192, 8192]    |   106999.8  |   100657.1
      [16384, 16384]  |   452855.9  |   391097.6
      [32768, 32768]  |  1730799.5  |  1497240.4
2 threads: -------------------------------------
      [512, 512]      |       45.4  |      216.5
      [1024, 1024]    |      120.8  |      794.2
      [2048, 2048]    |     1300.9  |     3120.0
      [4096, 4096]    |    18656.1  |    12283.5
      [8192, 8192]    |    73262.4  |    48122.6
      [16384, 16384]  |   289714.9  |   191590.8
      [32768, 32768]  |  1138733.6  |   762893.7
3 threads: -------------------------------------
      [512, 512]      |       55.3  |      

In [6]:
import time #-> time.time() returns the time in seconds

sizes = [512,1024,2048,4096,8192,16384,32768] # maximun size withou running out memory -> 65536

#Firstly batched_dot_mul_sum
for i in range(0,5):
    print("\nBenchmark execution for batched_dot_mul_sum: ",i+1, "\n")
    for thread in threads:
        print("\nNumber of threads: ",thread, "\n")
        torch.set_num_threads(thread)
        for n in sizes:
            timeInit = time.time()

            xCPU = torch.ones(n, n)
            batched_dot_mul_sum(xCPU,xCPU)

            timeFinish = time.time()

            print(f"size matrix [{n}] -> {(timeFinish - timeInit):0.8f} s")


Benchmark execution for batched_dot_mul_sum:  1 


Number of threads:  1 

size matrix [512] -> 0.00142360 s
size matrix [1024] -> 0.00203443 s
size matrix [2048] -> 0.00999641 s
size matrix [4096] -> 0.04697704 s
size matrix [8192] -> 0.17838335 s
size matrix [16384] -> 0.71890712 s
size matrix [32768] -> 2.79828310 s

Number of threads:  2 

size matrix [512] -> 0.16695285 s
size matrix [1024] -> 0.00138998 s
size matrix [2048] -> 0.00683355 s
size matrix [4096] -> 0.03448224 s
size matrix [8192] -> 0.11792541 s
size matrix [16384] -> 0.43686557 s
size matrix [32768] -> 1.87386990 s

Number of threads:  3 

size matrix [512] -> 0.16946530 s
size matrix [1024] -> 0.00147009 s
size matrix [2048] -> 0.00527644 s
size matrix [4096] -> 0.02475953 s
size matrix [8192] -> 0.09903407 s
size matrix [16384] -> 0.39970922 s
size matrix [32768] -> 1.55395746 s

Number of threads:  4 

size matrix [512] -> 0.18396664 s
size matrix [1024] -> 0.00126934 s
size matrix [2048] -> 0.00461555 s
size ma

In [7]:
#Generate a file.out with the results.
#Benchmark from pytorch just generate a print from the sdtout, so we need to change the stdout to write it in a file.
#import sys

#original_stdout = sys.stdout # Save a reference to the original standard output

#with open('output_benchmark.out', 'w') as file:
#    sys.stdout = file # Change the standard output to the file we created.
#    i=1
#    for compare in compares:
#        print("Benchmark execution: ",i, "\n")
#        compare.print()
#        i += 1
#    sys.stdout = original_stdout # Reset the standard output to its original value
