<a href="https://colab.research.google.com/github/SathishRama/parallelization_tests/blob/main/cpu_vs_gpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
import timeit
import tensorflow as tf

In [80]:
#file_name = "sample_data/dataset_10M_rows.csv"
file_name = "sample_data/card_payments.csv"  # 500 rows 
#file_name1 = "sample_data/dataset_100K_rows.csv"
file_name1 = "sample_data/dataset_1M_rows.csv"    # 1.2m rows

In [81]:
tf.debugging.set_log_device_placement(False)

In [116]:
num_gpus = len(tf.config.list_physical_devices('GPU'))
num_cpus = len(tf.config.list_physical_devices('CPU'))
print(f"Number of cpu's:{num_cpus} gpu's:{num_gpus} in the runtime.")

Number of cpu's:1 gpu's:1 in the runtime.


In [124]:
# CPU : Python Looping code Test on pre-loaded tf ( i.e time to load the dataset is excluded)
percent_util = []
file = open(file_name)
next(file)
num_rows_python = len(file.readlines())
def python_calc():
    for line in file:
        line = line.split(",")
        percent_util.append(float(line[3]) * float(line[1]))
    res = sum(percent_util)
num_executions_python = 2424
total_time_looping = round(timeit.timeit(python_calc, number=num_executions_python),3)
print(f"Total Time using python looping code:{total_time_looping} for {num_executions_python} iterations with {num_rows_python} rows in each iteration")

Total Time using python looping code:0.009 for 2424 iterations with 490 rows in each iteration


In [None]:
#load the file1 with 1.2m rows
df_1m = pd.read_csv(file_name1)
df_1m = df_1m.drop(columns = ['card_id','bill_date'])
#df.head()
df_1m = df_1m.astype('float')
#convert the pandas df to tf 
my_tf_1m = tf.convert_to_tensor(df_1m)
num_rows_1m = my_tf_1m.shape[0]
tf1_1m = tf.reshape(my_tf_1m[:,0],[1,num_rows_1m])
tf2_1m = tf.reshape(my_tf_1m[:,1],[num_rows_1m,1])

In [121]:
# CPU : Test 1.2M rows using tf multiply
def tf_cpu_preloaded():
  with tf.device('/CPU:0'):
    res = tf.math.reduce_sum(tf.math.multiply(my_tf_1m[:,0], my_tf_1m[:,1]),0)
    print(res)
    return
num_executions = 1
total_time = timeit.timeit(tf_cpu_preloaded, number=num_executions)
avg_cpu_time = total_time/num_executions
print(f"Avg Time: {avg_cpu_time} for {num_rows_1m}")

Executing op StridedSlice in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op StridedSlice in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Sum in device /job:localhost/replica:0/task:0/device:CPU:0
tf.Tensor(1058531308404000.0, shape=(), dtype=float64)
Avg Time: 0.03172968699982448 for 1187760


In [125]:
# GPU : Test 1.2M rows using tf matmul. matmul operation kernel has gpu implementation 
def tf_gpu_preloaded():
  with tf.device('/GPU:0'):
    res = tf.matmul(tf1_1m,tf2_1m)   
    #print(res)
    return
num_executions = 1
total_time = timeit.timeit(tf_gpu_preloaded, number=num_executions)
avg_gpu_time = total_time/num_executions
print(f"Avg Time: {avg_gpu_time} for {num_rows_1m}")

Executing op MatMul in device /job:localhost/replica:0/task:0/device:GPU:0
Avg Time: 0.0029357990006246837 for 1187760


In [126]:
#Summarize Looping vs using vector computation on cpu & gpu
print(f"Number of cpu's:{num_cpus} gpu's:{num_gpus} in the runtime.")
print(f"Time with Loop logic: {total_time_looping} for {num_executions_python} iterations with {num_rows_python} rows in each iteration. Rows processed {num_executions_python*num_rows_python}")
if num_gpus:
  print(f"GPU time: {avg_gpu_time} for {num_rows_1m} rows")
y = round(total_time_looping/avg_cpu_time,3)
print(f"Using Looping code vs TF CPU : {y} x improvement for {num_rows_1m} rows processed")  
if num_gpus:
  x = round(total_time_looping/avg_gpu_time,3)
  print(f"Using Looping code vs TF GPU : {x} x improvement for {num_rows_1m} rows processed")

Number of cpu's:1 gpu's:1 in the runtime.
Time with Loop logic: 0.009 for 2424 iterations with 490 rows in each iteration. Rows processed 1187760
GPU time: 0.0029357990006246837 for 1187760 rows
Using Looping code vs TF CPU : 0.284 x improvement for 1187760 rows processed
Using Looping code vs TF GPU : 3.066 x improvement for 1187760 rows processed
