In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Author: Raquel Ricoy

#Benchmark to study Kaggle's GPUs, CPUs and TPUs potential.
#It uses PyTorch to establish a script that measures their performance and GFLOPS.

#Install pytorch
#!conda install -y pytorch torchvision -c pytorch

import torch
import platform

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
#print(os.listdir("../input"))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#Importing Libraries needed for use torch
import timeit
import torch.utils.benchmark as benchmark
from itertools import product

In [2]:
# Information about the system the notebook is running on
print('Platform processor:', platform.processor())
print('Platform architecture:', platform.architecture())

# Number of threads: let PyTorch use every CPU reported by the OS.
# os.cpu_count() returns None when the count cannot be determined, which
# would break torch.set_num_threads() and any later range() built from
# num_threads, so fall back to a single thread in that case.
num_cores = os.cpu_count() or 1
print('Number of cores:', num_cores)
torch.set_num_threads(num_cores)
num_threads = num_cores

Platform processor: x86_64
Platform architecture: ('64bit', '')
Number of cores: 4


In [3]:
#Functions taken from the PyTorch Benchmark tutorial on the PyTorch website
def batched_dot_mul_sum(a, b):
    '''Batched dot product: multiply element-wise, then sum over the last dimension.'''
    elementwise_product = torch.mul(a, b)
    return torch.sum(elementwise_product, dim=-1)


def batched_dot_bmm(a, b):
    '''Batched dot product expressed as a batched matrix multiplication (bmm).'''
    # View every row of `a` as a (1 x d) matrix and every row of `b` as a
    # (d x 1) matrix so that bmm yields one (1 x 1) product per batch entry.
    row_matrices = a.reshape(-1, 1, a.shape[-1])
    column_matrices = b.reshape(-1, b.shape[-1], 1)
    products = torch.bmm(row_matrices, column_matrices)
    # Collapse the three dimensions of the bmm result into a flat vector.
    return products.flatten(-3)

#Custom function written by the author: sums two vectors element-wise and returns the result as a new vector
def sumVector(aVector,bVector):
    '''Element-wise sum of two vectors, returned as a new float32 tensor.

    The addition is done with a single vectorized tensor operation instead
    of a Python loop over every element.

    Args:
        aVector: First operand (tensor or sequence of numbers). Its length
            determines the length of the result. Intended for 1-D input.
        bVector: Second operand. Must be at least as long as ``aVector``;
            any elements beyond ``len(aVector)`` are ignored.

    Returns:
        A tensor of dtype ``torch.float`` with ``len(aVector)`` elements
        where element ``i`` equals ``aVector[i] + bVector[i]``. The sum is
        produced by tensor arithmetic, so it lives wherever the operands do.

    Raises:
        IndexError: If ``bVector`` has fewer elements than ``aVector``.
    '''
    lengthVectors = len(aVector)
    if len(bVector) < lengthVectors:
        # Indexing past the end of bVector is an IndexError; raise it up front.
        raise IndexError(
            f"bVector has {len(bVector)} elements but aVector has {lengthVectors}"
        )
    first = torch.as_tensor(aVector)
    # Only the first len(aVector) entries of bVector take part in the sum.
    second = torch.as_tensor(bVector)[:lengthVectors]
    # Add in the operands' promoted dtype, then cast to float32 for the output.
    return (first + second).to(torch.float)

In [4]:
# Function that runs the benchmark and compares the results of the mul/sum and bmm batched-dot implementations
def benchMark(sizes,nThreads):
    '''Benchmark both batched-dot implementations and print a comparison table.

    For every size ``n`` in ``sizes`` an ``n x n`` tensor of ones is created,
    and for every thread count in ``nThreads`` both ``batched_dot_mul_sum``
    and ``batched_dot_bmm`` are timed on it with
    ``torch.utils.benchmark.Timer.blocked_autorange(min_run_time=1)``.

    Args:
        sizes: Sized collection of matrix side lengths; must not be empty.
        nThreads: Sized collection of thread counts; must not be empty.

    Returns:
        The ``benchmark.Compare`` object holding all measurements (its table
        is also printed), or ``None`` when either argument is empty.
    '''
    if len(sizes) == 0:
        print("Parameter 'sizes' has to a have minumun of 1 parameters")
        return None
    if len(nThreads) == 0:
        print("Parameter 'nThreads' has to a have minumun of 1 parameters")
        # Without this return the function would go on to build and print a
        # Compare object with no measurements in it.
        return None

    # (column description, timed statement, callable used by the statement).
    # The callables are handed to the Timer through `globals`, so the timed
    # statement does not depend on them being importable from `__main__`.
    implementations = (
        ('mul/sum', 'batched_dot_mul_sum(x, x)', batched_dot_mul_sum),
        ('bmm', 'batched_dot_bmm(x, x)', batched_dot_bmm),
    )

    results = []
    for n in sizes:
        # label and sub_label are the rows
        # description is the column
        label = 'Batched dot'
        sub_label = f'[{n}, {n}]'
        x = torch.ones((n, n))
        for num_threads in nThreads:
            for description, stmt, func in implementations:
                timer = benchmark.Timer(
                    stmt=stmt,
                    globals={'x': x, func.__name__: func},
                    num_threads=num_threads,
                    label=label,
                    sub_label=sub_label,
                    description=description,
                )
                results.append(timer.blocked_autorange(min_run_time=1))

    compare = benchmark.Compare(results)
    compare.print()
    return compare

In [5]:
# Matrix side lengths to benchmark. The original author's note reports that
# 100000x100000 matrices run out of memory, so the sizes stay well below that.
sizes = [512, 1024, 2048, 4096, 5120]
# Benchmark every thread count from 1 up to num_threads (inclusive).
threads = range(1, num_threads + 1)
compares = []

# Run the whole benchmark 5 times and keep the Compare object of every run
# in `compares`, in execution order.
for run_number in range(1, 6):
    print("Benchmark execution: ", run_number, "\n")
    compares.append(benchMark(sizes, threads))

Benchmark execution:  1 

[------------- Batched dot --------------]
                    |  mul/sum  |    bmm  
1 threads: -------------------------------
      [512, 512]    |    125.3  |    923.9
      [1024, 1024]  |    587.5  |   3661.0
      [2048, 2048]  |   4758.0  |  14325.8
      [4096, 4096]  |  48614.0  |  56336.2
      [5120, 5120]  |  77514.7  |  87857.0
2 threads: -------------------------------
      [512, 512]    |     57.7  |    483.7
      [1024, 1024]  |    317.8  |   1853.4
      [2048, 2048]  |   2361.1  |   7247.8
      [4096, 4096]  |  27958.4  |  28501.6
      [5120, 5120]  |  46861.4  |  46678.7
3 threads: -------------------------------
      [512, 512]    |     94.2  |    347.8
      [1024, 1024]  |    402.3  |   1306.0
      [2048, 2048]  |   2730.7  |   5057.1
      [4096, 4096]  |  28783.1  |  19682.8
      [5120, 5120]  |  43937.8  |  30589.1
4 threads: -------------------------------
      [512, 512]    |     65.0  |    286.3
      [1024, 1024]  |    283

In [6]:
# Write the results of every benchmark run to 'output_benchmark.out'.
# compare.print() only writes to stdout, so stdout is redirected to the
# file while the tables are printed. redirect_stdout restores the previous
# stdout when the block exits, even if printing raises, so the notebook is
# never left writing to a closed file.
import sys  # not used below; kept so any other code relying on `sys` still finds it
from contextlib import redirect_stdout

with open('output_benchmark.out', 'w') as file, redirect_stdout(file):
    for i, compare in enumerate(compares, start=1):
        print("Benchmark execution: ", i, "\n")
        compare.print()
