# Import modules

In [1]:
import time
import csv
import cudf
import cupy as cp
import numpy as np
import pandas as pd
from csvToInteger import csvToInteger
from csvToBitInteger import csvToBitInteger

# Set a file to test on

In [2]:
# Dataset path shared by the cells below (machine-specific absolute path; adjust before running elsewhere).
fileName = "/home/tarun/test.csv"

# Wrap the database-reading functions for benchmarking

In [3]:
def cpugpudf(file):
    """Read a parquet file with pandas on the host, then copy it to the GPU.

    Prints the elapsed time of the read plus the host-to-device transfer,
    followed by the deep memory usage of the intermediate pandas frame.

    Parameters
    ----------
    file
        Anything ``pd.read_parquet`` accepts as its first argument.

    Returns
    -------
    The frame produced by ``cudf.DataFrame.from_pandas``.
    """
    # perf_counter is monotonic and high resolution; the wall clock used
    # previously can jump if the system time is adjusted mid-measurement.
    start = time.perf_counter()

    host_df = pd.read_parquet(file)
    # Not named "gpudf": that would shadow the sibling gpudf() helper inside this function.
    device_df = cudf.DataFrame.from_pandas(host_df)

    end = time.perf_counter()

    print("Read time + transfer to GPU: ", end - start)
    print("memory usage of df:",host_df.memory_usage(deep=True).sum())

    return device_df

def gpudf(file, bytes = 0):
    """Read a parquet file directly with cuDF and report timing and memory.

    Prints the elapsed read time and the deep memory usage of the
    resulting frame.

    Parameters
    ----------
    file
        Anything ``cudf.read_parquet`` accepts as its first argument.
    bytes : int, default 0
        When positive, forwarded to ``cudf.read_parquet`` as the keyword
        ``bytes_per_thread``. The earlier code passed it positionally, where
        it landed on the ``engine`` parameter instead of acting as a byte
        setting. NOTE(review): ``bytes_per_thread`` is the assumed intent --
        confirm against the installed cuDF version.

    Returns
    -------
    The frame returned by ``cudf.read_parquet``.
    """
    # Build the optional keyword outside the timed region.
    read_kwargs = {}
    if bytes > 0:
        read_kwargs["bytes_per_thread"] = bytes

    # perf_counter is monotonic and high resolution, which suits benchmarking.
    start = time.perf_counter()
    df = cudf.read_parquet(file, **read_kwargs)
    end = time.perf_counter()

    print("Read time on GPU: ", end - start)
    print("memory usage of df:",df.memory_usage(deep=True).sum())

    return df

# Create a synthetic database to test additional performance

In [4]:
# PAMI's temporal-database generator, aliased to gtd for brevity.
from PAMI.extras.generateDatabase.generateTemporalDatabase import generateTemporalDatabase as gtd

# Build a generator targeting fileName with a tab separator.
# NOTE(review): the numeric arguments are positional and opaque from here. 50000 / 20 / 10
# presumably mean database size, number of items and maximum transaction length (they match
# the statistics printed by the following cell) -- confirm against the PAMI signature, along
# with the meaning of 50 and "Database".
a = gtd(50000, 20, 10, fileName, 50, '\t', "Database")
# Presumably writes the generated database to fileName, which later cells read -- confirm.
a.createTemporalFile()

# Get the synthetic database stats

# Run the tests

In [5]:
# PAMI's statistics helper for temporal databases, aliased to alg.
from PAMI.extras.dbStats.temporalDatabaseStats import temporalDatabaseStats as alg

# Point the helper at the file in fileName, run it, then print the summary
# (database size, item count, transaction sizes, periodicity, sparsity, ...).
obj = alg(fileName)
obj.run()
obj.printStats()

Database size : 50000
Number of items : 20
Minimum Transaction Size : 1
Average Transaction Size : 5.51876
Maximum Transaction Size : 10
Minimum Inter Arrival Period : 1
Average Inter Arrival Period : 1.0
Maximum Inter Arrival Period : 1
Minimum periodicity : 27
Average periodicity : 31.1
Maximum periodicicty : 44
Standard Deviation Transaction Size : 2.872568199782209
Variance : 8.251813098661973
Sparsity : 0.724062


In [6]:
from csvToInteger import csvToInteger
from csvToBitInteger import csvToBitInteger

In [7]:
# Convert the database into the two encodings compared below. Both results are later
# handed to the parquet readers (cpugpudf / gpudf), so they are presumably paths to
# the converted files -- confirm against csvToInteger / csvToBitInteger.
intpar = csvToInteger(fileName)
bitpar = csvToBitInteger(fileName)

No output file specified. Using default name. /home/tarun/Temporal_kosarak.csv
Done creating integer representation. fileName: /home/tarun/Temporal_kosarak.csv_int
No output file specified. Using default name. /home/tarun/Temporal_kosarak.csv
Done creating bit integer representation. fileName: /home/tarun/Temporal_kosarak.csv_bitInt


In [8]:
# Benchmark both read paths on the integer representation.
# The second call rebinds df, so only the frame read directly by cuDF is kept.
df = cpugpudf(intpar)
df = gpudf(intpar)

Read time + transfer to GPU:  0.6050150394439697
memory usage of df: 187840
Read time on GPU:  0.579683780670166
memory usage of df: 187840


In [9]:
# Benchmark both read paths on the bit-integer representation.
# The second call rebinds df, so only the frame read directly by cuDF is kept.
df = cpugpudf(bitpar)
df = gpudf(bitpar)

In [None]:
# Benchmark: parse the ragged tab-separated file with pandas, zero-fill the gaps,
# cast to int32 and copy the result to the GPU. The timer covers all of these steps,
# including the preliminary scan for the widest row.
start = time.perf_counter()  # monotonic, high-resolution clock for benchmarking

# Scan the whole file once to find the widest row. Parsing with the tab delimiter
# (rather than splitting the first comma-separated field) means blank lines count
# as zero columns instead of raising IndexError, and commas in the data cannot
# truncate the count. newline='' is what the csv module expects for file objects.
with open(fileName, 'r', newline='') as f:
    max_cols = max((len(row) for row in csv.reader(f, delimiter='\t')), default=0)

# One positional column name per field of the widest row: col1, col2, ...
col_names = [f'col{i+1}' for i in range(max_cols)]

# Supplying the full set of names lets shorter rows load with NaN in the trailing columns.
df = pd.read_csv(fileName, names=col_names, delimiter="\t", header=None, engine='python')

# Replace the missing trailing values with 0 so every column can be stored as int32.
df = df.fillna(0).astype('int32')

# Move the finished frame to the GPU.
df = cudf.from_pandas(df)

end = time.perf_counter()
print(end - start)
print("memory usage of df:",df.memory_usage(deep=True).sum())

0.8496224880218506
memory usage of df: 2400000


In [None]:
# Benchmark: parse the tab-separated file natively with cuDF.
# Only the read_csv call sits between the two timestamps.
start = time.time()
df = cudf.read_csv(fileName, header=None, delimiter='\t')
end = time.time()

# Cast every column to int32 in one call, after the timer has stopped.
# NOTE(review): unlike the pandas cell, the cast is not timed and missing values are
# not filled with 0, so the two reported numbers measure different work -- confirm
# whether that is intended.
df = df.astype('int32')

print(end - start)
print("memory usage of df:",df.memory_usage(deep=True).sum())

0.01169443130493164
memory usage of df: 2050176


In [None]:
# convert cells in df to numeric
# df = cudf.to_numeric(df, errors='coerce')
