In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Author: Raquel Ricoy

#Benchmark to study Kaggle's GPUs, CPUs and TPUs potential.
#It's going to use Pytorch and to stablish a script to calculate its performance and GFLOPS.

#Install pytorch
#!conda install -y pytorch torchvision -c pytorch

#Install openpyxl to import an excel with the operations in pandas
!conda install -y openpyxl

import torch
import platform

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
#print(os.listdir("../input"))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#Importing Libraries needed for use torch
import timeit
import torch.utils.benchmark as benchmark

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Solving environment: / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | 

In [2]:
#Information about system
print('Platform processor:', platform.processor())
print('Platform architecture:', platform.architecture())

#Number of threads
num_cores = os.cpu_count()
print('Number of cores:',num_cores)
torch.set_num_threads(num_cores)
num_threads = num_cores

Platform processor: x86_64
Platform architecture: ('64bit', '')
Number of cores: 4


In [3]:
#Functions obtained from Torch Webpages por PyTorch Benchmarks
def batched_dot_mul_sum(a, b):
    '''Computes batched dot by multiplying and summing'''
    return a.mul(b).sum(-1)


def batched_dot_bmm(a, b):
    '''Computes batched dot by reducing to bmm'''
    a = a.reshape(-1, 1, a.shape[-1])
    b = b.reshape(-1, b.shape[-1], 1)
    return torch.bmm(a, b).flatten(-3)

In [4]:
# Method that do the benchmark and compare results with dot mul sum implementations and vectorSum
def benchMark(sizes,nThreads):
    results = []
    if(len(sizes) == 0):
        print("Parameter 'sizes' has to a have minumun of 1 parameters")
        return
    if(len(nThreads)==0):
        print("Parameter 'nThreads' has to a have minumun of 1 parameters")
    
    for n in sizes:
        # label and sub_label are the rows
        # description is the column
        label = 'Batched dot'
        sub_label = f'[{n}, {n}]'
        x = torch.ones((n, n))
        for num_threads in nThreads:
            results.append(benchmark.Timer(
                stmt='batched_dot_mul_sum(x, x)',
                setup='from __main__ import batched_dot_mul_sum',
                globals={'x': x},
                num_threads=num_threads,
                label=label,
                sub_label=sub_label,
                description='mul/sum',
            ).blocked_autorange())
            results.append(benchmark.Timer(
                stmt='batched_dot_bmm(x, x)',
                setup='from __main__ import batched_dot_bmm',
                globals={'x': x},
                num_threads=num_threads,
                label=label,
                sub_label=sub_label,
                description='bmm',
            ).blocked_autorange())   
    compare = benchmark.Compare(results)
    compare.print()
    return compare

In [5]:
#The limit dimension of the sizes is with matrix of 16384x16384. It is running out of memory with that sizes
sizes = [512,1024,2048,4096,8192,16384,32768]
threads = range(1,num_threads+1)

#The benchmark execute 5 times to gather data and afterwards 
for i in range(0,5):
    print("Benchmark execution: ",i+1, "\n")
    benchMark(sizes,threads)

Benchmark execution:  1 

[---------------- Batched dot -----------------]
                      |   mul/sum   |     bmm   
1 threads: -------------------------------------
      [512, 512]      |      147.4  |     1033.3
      [1024, 1024]    |      601.7  |     4038.3
      [2048, 2048]    |     4174.2  |    16077.3
      [4096, 4096]    |    50495.4  |    63381.5
      [8192, 8192]    |   202793.2  |   253407.5
      [16384, 16384]  |   815654.0  |  1004948.7
      [32768, 32768]  |  3269393.3  |  4040062.7
2 threads: -------------------------------------
      [512, 512]      |       66.4  |      524.2
      [1024, 1024]    |      312.0  |     2044.3
      [2048, 2048]    |     1879.7  |     8183.8
      [4096, 4096]    |    28383.4  |    31700.5
      [8192, 8192]    |   113162.3  |   126763.5
      [16384, 16384]  |   451245.8  |   506287.1
      [32768, 32768]  |  1841218.6  |  2025048.7
3 threads: -------------------------------------
      [512, 512]      |      101.8  |      

In [6]:
def ownBenchmark(sizes,threads,writerCSV,operation):        
    for i in range(0,5):
        print("\nBenchmark execution for ",operation,": ",i+1, "\n")
        for thread in threads:
            print("\nNumber of threads: ",thread, "\n")
            torch.set_num_threads(thread)
            for n in sizes:
                timeInit = time.time()

                xCPU = torch.ones(n, n)

                if(operation == "mul_sum"):
                    batched_dot_mul_sum(xCPU,xCPU)
                else:
                    batched_dot_bmm(xCPU,xCPU)

                timeFinish = time.time()
                print(f"size matrix [{n}] -> {(timeFinish - timeInit):0.8f} s")
                writer.writerow([operation, n, i+1,thread,(timeFinish - timeInit)])

In [7]:
#Now my own benchmark. With this i going to measure Speed ups and efficiencies. The pytorch benchmark give us too good results to be true...

import time #-> time.time() returns the time in seconds
import csv #We are going to generate an csv with the results to work with pandas

sizes = [512,1024,2048,4096,8192,16384,32768] # maximun size withou running out memory -> 65536
threads = range(1,num_threads+1)

with open('results_cpu.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["operation", "sizeMatrix", "numberCase","nThreads","timeElpased"])
    ownBenchmark(sizes,threads,writer,"mul_sum")
    ownBenchmark(sizes,threads,writer,"bmm")


Benchmark execution for  mul_sum :  1 


Number of threads:  1 

size matrix [512] -> 0.00445342 s
size matrix [1024] -> 0.01218557 s
size matrix [2048] -> 0.05881882 s
size matrix [4096] -> 0.30668116 s
size matrix [8192] -> 1.10526419 s
size matrix [16384] -> 3.31922936 s
size matrix [32768] -> 10.42859626 s

Number of threads:  2 

size matrix [512] -> 0.16462946 s
size matrix [1024] -> 0.00319481 s
size matrix [2048] -> 0.01174021 s
size matrix [4096] -> 0.05181456 s
size matrix [8192] -> 0.19295168 s
size matrix [16384] -> 0.79300761 s
size matrix [32768] -> 3.42784548 s

Number of threads:  3 

size matrix [512] -> 0.16488266 s
size matrix [1024] -> 0.00346351 s
size matrix [2048] -> 0.01179934 s
size matrix [4096] -> 0.05033088 s
size matrix [8192] -> 0.19379497 s
size matrix [16384] -> 0.73534966 s
size matrix [32768] -> 2.81769228 s

Number of threads:  4 

size matrix [512] -> 0.17831373 s
size matrix [1024] -> 0.00265169 s
size matrix [2048] -> 0.00859118 s
size matrix [409

In [8]:
#Generate the excel and giving a little of format
#TODO include the calculate of FLOPS in excel/dataFrame
import pandas as pd

df = pd.read_csv("results_cpu.csv")
df.info()

df.sort_values(by=["operation","numberCase","nThreads"])

df[df.nThreads==1].to_excel("results_cpu_excel_1Thread.xlsx")

df[df.nThreads==2].to_excel("results_cpu_excel_2Thread.xlsx")

df[df.nThreads==3].to_excel("results_cpu_excel_3Thread.xlsx")

df[df.nThreads==4].to_excel("results_cpu_excel_4Thread.xlsx")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   operation    280 non-null    object 
 1   sizeMatrix   280 non-null    int64  
 2   numberCase   280 non-null    int64  
 3   nThreads     280 non-null    int64  
 4   timeElpased  280 non-null    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 11.1+ KB
