In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Author: Raquel Ricoy

#Benchmark to study Kaggle's GPUs, CPUs and TPUs potential.
#It uses PyTorch to establish a script that measures performance and GFLOPS.

#Install pytorch
#!conda install -y pytorch torchvision -c pytorch

import torch
import platform

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
#print(os.listdir("../input"))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#Import the libraries needed for benchmarking with torch
import timeit
import torch.utils.benchmark as benchmark
from itertools import product

In [2]:
#Information about system
print('Platform processor:', platform.processor())
print('Platform architecture:', platform.architecture())

#Number of threads
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to 1 so torch.set_num_threads() always receives a valid integer.
num_cores = os.cpu_count() or 1
print('Number of cores:',num_cores)
# Let PyTorch use as many intra-op threads as there are reported CPUs.
torch.set_num_threads(num_cores)
# Upper bound for the thread counts swept by the benchmark cells below.
num_threads = num_cores

Platform processor: x86_64
Platform architecture: ('64bit', '')
Number of cores: 4


In [3]:
#Functions obtained from the PyTorch web pages for PyTorch benchmarks
def batched_dot_mul_sum(a, b):
    '''Batched dot product: element-wise multiply, then sum over the last dimension.

    Args:
        a: tensor whose last dimension holds the vector components.
        b: tensor multiplied element-wise with ``a``.

    Returns:
        Tensor of the products summed along the last dimension.
    '''
    elementwise_product = a * b
    return elementwise_product.sum(dim=-1)


def batched_dot_bmm(a, b):
    '''Batched dot product expressed as a batched matrix multiplication.

    Each vector of ``a`` becomes a 1 x d row matrix and each vector of ``b``
    a d x 1 column matrix, so ``torch.bmm`` yields one 1 x 1 product per pair.

    Args:
        a: tensor whose last dimension holds the vector components.
        b: tensor whose last dimension holds the vector components.

    Returns:
        1-D tensor with one dot product per vector pair (all leading
        dimensions are collapsed by the reshape).
    '''
    width_a = a.shape[-1]
    width_b = b.shape[-1]
    row_matrices = a.reshape(-1, 1, width_a)
    column_matrices = b.reshape(-1, width_b, 1)
    pairwise = torch.bmm(row_matrices, column_matrices)
    # Collapse the (batch, 1, 1) result into a flat vector of dot products.
    return torch.flatten(pairwise, start_dim=-3)

#Custom function: element-wise sum of two vectors, returned as a new float32 tensor
def sumVector(aVector,bVector):
    '''Add two vectors element by element.

    The addition is done with a single vectorized tensor operation instead of
    a Python-level loop over indices.

    Args:
        aVector: first operand; its length determines the length of the result.
        bVector: second operand; must be at least as long as ``aVector``.
            Elements beyond ``len(aVector)`` are ignored.

    Returns:
        A new tensor of dtype ``torch.float`` holding ``aVector[i] + bVector[i]``.

    Raises:
        IndexError: if ``bVector`` is shorter than ``aVector``.
    '''
    lengthVectors = len(aVector)
    if len(bVector) < lengthVectors:
        raise IndexError(
            f"bVector has {len(bVector)} elements but aVector has {lengthVectors}"
        )
    first = torch.as_tensor(aVector)
    # Only the first len(aVector) entries of bVector take part in the sum.
    second = torch.as_tensor(bVector)[:lengthVectors]
    # Add in the operands' own dtype first, then convert the result to float32.
    return (first + second).to(torch.float)

In [30]:
# Benchmark the two batched-dot implementations over several sizes and thread counts
def benchMark(sizes,nThreads):
    '''Time ``batched_dot_mul_sum`` and ``batched_dot_bmm`` and print a comparison table.

    For every ``n`` in ``sizes`` an ``n x n`` tensor of ones is created and both
    implementations are timed on it once per entry of ``nThreads``. The collected
    measurements are printed with ``benchmark.Compare``.

    Args:
        sizes: sized collection of integers; each ``n`` produces an ``[n, n]`` input.
        nThreads: sized collection of thread counts passed to ``benchmark.Timer``.

    Returns:
        None. If either argument is empty a message is printed and nothing
        is benchmarked.
    '''
    if len(sizes) == 0:
        print("Parameter 'sizes' has to a have minumun of 1 parameters")
        return
    if len(nThreads) == 0:
        print("Parameter 'nThreads' has to a have minumun of 1 parameters")
        # Stop here: without thread counts there is nothing to measure.
        return

    # (statement, setup, column description) for each implementation under test.
    candidates = (
        ('batched_dot_mul_sum(x, x)', 'from __main__ import batched_dot_mul_sum', 'mul/sum'),
        ('batched_dot_bmm(x, x)', 'from __main__ import batched_dot_bmm', 'bmm'),
    )

    # label and sub_label are the rows
    # description is the column
    label = 'Batched dot'
    results = []
    for n in sizes:
        sub_label = f'[{n}, {n}]'
        x = torch.ones((n, n))
        for thread_count in nThreads:
            for stmt, setup, description in candidates:
                timer = benchmark.Timer(
                    stmt=stmt,
                    setup=setup,
                    globals={'x': x},
                    num_threads=thread_count,
                    label=label,
                    sub_label=sub_label,
                    description=description,
                )
                results.append(timer.blocked_autorange(min_run_time=1))

    compare = benchmark.Compare(results)
    compare.print()

In [31]:
# Matrix dimensions to benchmark; each n is benchmarked as an [n, n] tensor.
# The practical upper limit is around [100000, 100000]: at that size the run goes out of memory.
sizes = [512,1024,2048,4096,5120]
# Sweep every thread count from 1 up to num_threads (set in the system-information cell).
threads = range(1,num_threads+1)

# This method is not suitable for benchmarking GPUs; it would have to be adapted first.
benchMark(sizes,threads)

[------------- Batched dot --------------]
                    |  mul/sum  |    bmm  
1 threads: -------------------------------
      [512, 512]    |    139.9  |    821.8
      [1024, 1024]  |    606.3  |   3194.9
      [2048, 2048]  |   4427.8  |  12577.0
      [4096, 4096]  |  19489.2  |  49718.9
      [5120, 5120]  |  29783.5  |  77762.1
2 threads: -------------------------------
      [512, 512]    |     63.6  |    431.4
      [1024, 1024]  |    320.5  |   1625.0
      [2048, 2048]  |   1988.8  |   6451.8
      [4096, 4096]  |  10112.6  |  25135.2
      [5120, 5120]  |  15452.0  |  39218.4
3 threads: -------------------------------
      [512, 512]    |    101.4  |    387.5
      [1024, 1024]  |    426.7  |   1386.2
      [2048, 2048]  |   2487.1  |   5538.8
      [4096, 4096]  |  12022.9  |  20877.9
      [5120, 5120]  |  18719.1  |  32533.7
4 threads: -------------------------------
      [512, 512]    |     68.3  |    299.8
      [1024, 1024]  |    317.2  |   1060.1
      [2048