In [None]:
# Upload the input CSVs (input_*.csv) and the reference solutions (sol_*.txt)
# through the Colab file picker.
# NOTE: when a file with the same name already exists, Colab stores the new
# copy as "name (1).ext" (see the saved-file log below), so later cells that
# refer to the plain name keep reading the older copy.
from google.colab import files 
uploaded = files.upload() # uploaded to the content folder 

Saving input_10000.csv to input_10000.csv
Saving input_100000.csv to input_100000.csv
Saving input_1000000.csv to input_1000000.csv
Saving sol_10000.txt to sol_10000 (1).txt
Saving sol_100000.txt to sol_100000 (1).txt
Saving sol_1000000.txt to sol_1000000 (1).txt


In [None]:
# Print the CUDA compiler and profiler versions available in this runtime.
!nvcc --version
!nvprof --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
nvprof: NVIDIA (R) Cuda command line profiler
Copyright (c) 2012 - 2019 NVIDIA Corporation
Release version 10.1.243 (21)


In [None]:
%%writefile compareResults.c 
#include <stdio.h>
#include <stdlib.h>


/*
 * Compare two text files character by character and print every mismatch.
 *
 * file_name1, file_name2: paths of the files to compare.
 *
 * For each differing character a "Line Number / Error Position" line is
 * printed. If one file ends before the other, that is reported as one
 * additional error. The summary "Total Errors : N" is printed last.
 * If either file cannot be opened, a message is written to stderr and the
 * function returns without comparing anything.
 *
 * Adapted from
 * https://www.tutorialspoint.com/c-program-to-compare-two-files-and-report-mismatches
 */
void compareFiles(char *file_name1, char *file_name2)
{
    FILE *fp1 = fopen(file_name1, "r");
    FILE *fp2 = fopen(file_name2, "r");

    if (fp1 == NULL || fp2 == NULL)
    {
        fprintf(stderr, "Cannot open %s\n",
                fp1 == NULL ? file_name1 : file_name2);
        if (fp1 != NULL)
            fclose(fp1);
        if (fp2 != NULL)
            fclose(fp2);
        return;
    }

    /* getc returns int: storing it in a char would make a 0xFF data byte
       look like EOF, or make EOF undetectable where char is unsigned. */
    int ch1 = getc(fp1);
    int ch2 = getc(fp2);

    /* error counts mismatches, pos is the column within the current line,
       line is the current (1-based) line number. */
    int error = 0, pos = 0, line = 1;

    /* walk both files in lock step until either one is exhausted */
    while (ch1 != EOF && ch2 != EOF)
    {
        pos++;

        /* a new line starts only when both files break at the same spot */
        if (ch1 == '\n' && ch2 == '\n')
        {
            line++;
            pos = 0;
        }

        if (ch1 != ch2)
        {
            error++;
            printf("Line Number : %d \tError"
                   " Position : %d \n", line, pos);
        }

        ch1 = getc(fp1);
        ch2 = getc(fp2);
    }

    /* Exactly one of the files still has data: the contents differ even if
       the common prefix matched. */
    if (ch1 != ch2)
    {
        error++;
        printf("Line Number : %d \tFiles differ in length (%s ends first)\n",
               line, ch1 == EOF ? file_name1 : file_name2);
    }

    printf("Total Errors : %d\t", error);

    fclose(fp1);
    fclose(fp2);
}

/* Entry point: expects the two file paths to compare as argv[1] and argv[2]. */
int main(int argc, char *argv[])
{
    if (argc >= 3)
    {
        compareFiles(argv[1], argv[2]);
        return 0;
    }

    /* fewer than two paths supplied */
    printf("Require two files\n");
    exit(1);
}


Overwriting compareResults.c


In [None]:
%%writefile gputimer.h

// Taken from Udacity Parallel Programming course

#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

// Small stopwatch built on a pair of CUDA events.
// Usage: Start(), enqueue GPU work, Stop(), then Elapsed() to read the time.
struct GpuTimer
{
      cudaEvent_t start;   // recorded by Start()
      cudaEvent_t stop;    // recorded by Stop()

      // Creates both events; they live as long as the timer object.
      GpuTimer()
      {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
      }

      // Releases both events.
      ~GpuTimer()
      {
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
      }

      // Records the start event on stream 0.
      void Start() { cudaEventRecord(start, 0); }

      // Records the stop event on stream 0.
      void Stop() { cudaEventRecord(stop, 0); }

      // Waits for the stop event to complete, then returns the time between
      // the two events (callers in this project print it as milliseconds).
      float Elapsed()
      {
            float ms;
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&ms, start, stop);
            return ms;
      }
};

#endif  /* __GPU_TIMER_H__ */

Overwriting gputimer.h


In [None]:
%%writefile parallel_unified.cu

#include "gputimer.h"
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define AND 0
#define OR 1
#define NAND 2
#define NOR 3
#define XOR 4
#define XNOR 5

__global__ void classify(char* data, int SIZE, char* results) {
	// One thread evaluates one logic gate, so SIZE is the number of rows.
	// Each input row is 6 chars: operand, ',', operand, ',', gate id, '\n'.
	// Each output row is 2 chars: the result digit followed by '\n'.
	const int row = threadIdx.x + blockIdx.x * blockDim.x;
	if (row >= SIZE) {
		return;
	}

	const char* record = data + row * 6;
	const int lhs = record[0] - '0';
	const int rhs = record[2] - '0';

	int value;
	switch (record[4]) {
	case '0': value = (lhs & rhs);  break;	// AND
	case '1': value = (lhs | rhs);  break;	// OR
	case '2': value = !(lhs & rhs); break;	// NAND
	case '3': value = !(lhs | rhs); break;	// NOR
	case '4': value = (lhs ^ rhs);  break;	// XOR
	case '5': value = !(lhs ^ rhs); break;	// XNOR
	default:
		// unrecognised gate id: this row's output slots are left untouched
		return;
	}

	results[2 * row] = (value + '0');
	results[2 * row + 1] = '\n';
}


// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t err, const char* what) {
	if (err != cudaSuccess) {
		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
		return false;
	}
	return true;
}

// Classifies every row of fp_in on the GPU using unified (managed) memory and
// writes one result line per row to fp_out.
//
// fp_in  : input stream positioned at the first row; each row is 6 chars
//          (3 digits, 2 commas, 1 '\n').
// length : expected number of input bytes (rows * 6).
// fp_out : output stream; receives 2 chars per row (result digit and '\n').
//
// Prints the kernel time measured with GpuTimer. On any allocation, launch or
// execution failure a message is written to stderr and nothing is written to
// fp_out.
void parallel_unified(FILE* fp_in, int length, FILE* fp_out) {
	char* data = NULL;
	char* results = NULL;
	// timer_kernel records time for kernel function
	GpuTimer timer_kernel;

	if (length < 6) {
		fprintf(stderr, "Input length %d is shorter than one 6-byte row; nothing to do.\n", length);
		return;
	}
	// the output needs 2 bytes for every complete 6-byte input row
	const size_t outCapacity = (size_t)(length / 6) * 2;

	// Unified memory allocation methods
	bool ok = cudaSucceeded(cudaMallocManaged(&data, length), "cudaMallocManaged(data)")
		&& cudaSucceeded(cudaMallocManaged(&results, outCapacity), "cudaMallocManaged(results)");

	int rows = 0;
	if (ok) {
		// Only classify what was really read: a short file must not make the
		// kernel consume uninitialised memory.
		const size_t bytesRead = fread(data, 1, length, fp_in);
		rows = (int)(bytesRead / 6);
		if (bytesRead != (size_t)length) {
			fprintf(stderr, "Warning: expected %d bytes but read %lu; classifying %d complete rows.\n",
				length, (unsigned long)bytesRead, rows);
		}
		ok = rows > 0;
	}

	if (ok) {
		// Fixed block size with a rounded-up grid. The kernel's `i < SIZE`
		// guard masks the surplus threads of the last block, so the row count
		// does not have to be divisible by the block size.
		const int threadsPerBlock = 1024;
		const int totalBlocks = (rows + threadsPerBlock - 1) / threadsPerBlock;
		timer_kernel.Start();
		classify <<<totalBlocks, threadsPerBlock >>> (data, rows, results);
		timer_kernel.Stop();
		ok = cudaSucceeded(cudaGetLastError(), "classify launch")
			&& cudaSucceeded(cudaDeviceSynchronize(), "classify execution");
	}

	if (ok) {
		printf("Time for kernel functions: %f ms\n", timer_kernel.Elapsed());

		// results is not NUL-terminated, so write an explicit byte count
		// instead of using fputs.
		const size_t outBytes = (size_t)rows * 2;
		if (fwrite(results, 1, outBytes, fp_out) != outBytes) {
			fprintf(stderr, "Failed to write all results to the output file.\n");
		}
	}

	cudaFree(data);
	cudaFree(results);
}

// Entry point.
//   argv[1] : input file path
//   argv[2] : number of rows (lines) in the input file
//   argv[3] : output file path
int main(int argc, char* argv[])
{
	if (argc < 4)
	{
		fprintf(stderr, "Usage: %s <input_file> <line_count> <output_file>\n",
			argc > 0 ? argv[0] : "parallel_unified");
		exit(1);
	}

	// Validate the row count up front: it is multiplied by 6 below, so it
	// must be positive and small enough not to overflow int.
	const long maxRows = (long)(~0u >> 1) / 6;	// INT_MAX / 6
	char* endPtr = NULL;
	const long rows = strtol(argv[2], &endPtr, 10);
	if (endPtr == argv[2] || *endPtr != '\0' || rows <= 0 || rows > maxRows)
	{
		fprintf(stderr, "Invalid line count '%s'.\n", argv[2]);
		exit(EXIT_FAILURE);
	}

	char* fileName = argv[1];
	FILE* input = fopen(fileName, "r");
	if (input == NULL)
	{
		// report first, then exit (the message used to be unreachable)
		fprintf(stderr, "No input file.\n");
		exit(EXIT_FAILURE);
	}

	char* outputFileName = argv[3];
	FILE* output = fopen(outputFileName, "w");
	if (output == NULL)
	{
		fprintf(stderr, "No output file.\n");
		fclose(input);
		exit(EXIT_FAILURE);
	}

	// each line has 6 elements, including 3 numbers, 2 commas and 1 '\n'
	// as a result the length passed in is rows * 6
	printf("For input file %s \n", fileName);
	parallel_unified(input, (int)rows * 6, output);

	fclose(input);
	fclose(output);

	return 0;
}

Overwriting parallel_unified.cu


In [None]:
# Build the unified-memory version and run it on the three uploaded inputs.
# Program arguments: <input csv> <number of rows> <output file>.
!nvcc parallel_unified.cu -o parallel_unified
!./parallel_unified input_10000.csv 10000 output_parallel_unified_10000.txt
!./parallel_unified input_100000.csv 100000 output_parallel_unified_100000.txt
!./parallel_unified input_1000000.csv 1000000 output_parallel_unified_1000000.txt

For input file input_10000.csv 
Time for kernel functions: 0.518176 ms
For input file input_100000.csv 
Time for kernel functions: 0.725664 ms
For input file input_1000000.csv 
Time for kernel functions: 2.660480 ms


In [None]:
# Profile the unified-memory build on the 10,000-row input with nvprof.
!nvprof ./parallel_unified input_10000.csv 10000 output_parallel_unified_10000.txt

For input file input_10000.csv 
==283== NVPROF is profiling process 283, command: ./parallel_unified input_10000.csv 10000 output_parallel_unified_10000.txt
Time for kernel functions: 0.484320 ms
==283== Profiling application: ./parallel_unified input_10000.csv 10000 output_parallel_unified_10000.txt
==283== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  451.62us         1  451.62us  451.62us  451.62us  classify(char*, int, char*)
      API calls:   89.63%  185.91ms         2  92.957ms     968ns  185.91ms  cudaEventCreate
                    9.83%  20.386ms         2  10.193ms  21.254us  20.364ms  cudaMallocManaged
                    0.22%  460.22us         1  460.22us  460.22us  460.22us  cudaDeviceSynchronize
                    0.17%  344.29us         1  344.29us  344.29us  344.29us  cuDeviceTotalMem
                    0.07%  134.84us        97  1.3900us     137ns  56.733us  cuDeviceGetAttribute
      

In [None]:
# Profile the unified-memory build on the 100,000-row input with nvprof.
!nvprof ./parallel_unified input_100000.csv 100000 output_parallel_unified_100000.txt

For input file input_100000.csv 
==294== NVPROF is profiling process 294, command: ./parallel_unified input_100000.csv 100000 output_parallel_unified_100000.txt
Time for kernel functions: 1.015680 ms
==294== Profiling application: ./parallel_unified input_100000.csv 100000 output_parallel_unified_100000.txt
==294== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  984.00us         1  984.00us  984.00us  984.00us  classify(char*, int, char*)
      API calls:   89.49%  189.11ms         2  94.554ms  1.0300us  189.11ms  cudaEventCreate
                    9.65%  20.394ms         2  10.197ms  26.933us  20.367ms  cudaMallocManaged
                    0.47%  993.84us         1  993.84us  993.84us  993.84us  cudaDeviceSynchronize
                    0.19%  393.29us         1  393.29us  393.29us  393.29us  cuDeviceTotalMem
                    0.08%  178.38us         2  89.191us  39.098us  139.29us  cudaFree
           

In [None]:
# Profile the unified-memory build on the 1,000,000-row input with nvprof.
!nvprof ./parallel_unified input_1000000.csv 1000000 output_parallel_unified_1000000.txt

For input file input_1000000.csv 
==305== NVPROF is profiling process 305, command: ./parallel_unified input_1000000.csv 1000000 output_parallel_unified_1000000.txt
Time for kernel functions: 3.928640 ms
==305== Profiling application: ./parallel_unified input_1000000.csv 1000000 output_parallel_unified_1000000.txt
==305== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.8932ms         1  3.8932ms  3.8932ms  3.8932ms  classify(char*, int, char*)
      API calls:   87.48%  177.59ms         2  88.793ms     944ns  177.58ms  cudaEventCreate
                   10.03%  20.354ms         2  10.177ms  50.062us  20.304ms  cudaMallocManaged
                    1.92%  3.9058ms         1  3.9058ms  3.9058ms  3.9058ms  cudaDeviceSynchronize
                    0.28%  563.05us         2  281.53us  160.67us  402.38us  cudaFree
                    0.17%  346.27us         1  346.27us  346.27us  346.27us  cuDeviceTotalMem
    

In [None]:
%%writefile parallel_explicit.cu

#include "gputimer.h"
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <ctime>

#define AND 0
#define OR 1
#define NAND 2
#define NOR 3
#define XOR 4
#define XNOR 5

__global__ void classify(char* d_data, int SIZE, char* d_results) {
	// One thread evaluates one logic gate, so SIZE is the number of rows.
	// Each input row is 6 chars: operand, ',', operand, ',', gate id, '\n'.
	// Each output row is 2 chars: the result digit followed by '\n'.
	const int row = threadIdx.x + blockIdx.x * blockDim.x;
	if (row >= SIZE) {
		return;
	}

	const char* record = d_data + row * 6;
	const int lhs = record[0] - '0';
	const int rhs = record[2] - '0';

	int value;
	switch (record[4]) {
	case '0': value = (lhs & rhs);  break;	// AND
	case '1': value = (lhs | rhs);  break;	// OR
	case '2': value = !(lhs & rhs); break;	// NAND
	case '3': value = !(lhs | rhs); break;	// NOR
	case '4': value = (lhs ^ rhs);  break;	// XOR
	case '5': value = !(lhs ^ rhs); break;	// XNOR
	default:
		// unrecognised gate id: this row's output slots are left untouched
		return;
	}

	d_results[2 * row] = (value + '0');
	d_results[2 * row + 1] = '\n';
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t err, const char* what) {
	if (err != cudaSuccess) {
		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
		return false;
	}
	return true;
}

// Classifies every row of fp_in on the GPU using explicit host/device buffers
// and cudaMemcpy, and writes one result line per row to fp_out.
//
// fp_in  : input stream positioned at the first row; each row is 6 chars
//          (3 digits, 2 commas, 1 '\n').
// length : expected number of input bytes (rows * 6).
// fp_out : output stream; receives 2 chars per row (result digit and '\n').
//
// Prints the kernel time and the combined host<->device copy time. On any
// allocation, copy or launch failure a message is written to stderr and
// nothing is written to fp_out.
void parallel_explicit(FILE* fp_in, int length, FILE* fp_out) {
	char* data = NULL, * d_data = NULL, * results = NULL, * d_results = NULL;
	// timer_kernel records time for kernel function
	// timer_migration_* record explicit data migration time (host <-> device copies)
	GpuTimer timer_kernel, timer_migration_write, timer_migration_read;

	if (length < 6) {
		fprintf(stderr, "Input length %d is shorter than one 6-byte row; nothing to do.\n", length);
		return;
	}
	// the output needs 2 bytes for every complete 6-byte input row
	const size_t outCapacity = (size_t)(length / 6) * 2;

	data = (char*)malloc(length);
	results = (char*)malloc(outCapacity);
	bool ok = (data != NULL && results != NULL);
	if (!ok) {
		fprintf(stderr, "Host memory allocation failed.\n");
	}
	ok = ok && cudaSucceeded(cudaMalloc(&d_data, length), "cudaMalloc(d_data)");
	ok = ok && cudaSucceeded(cudaMalloc(&d_results, outCapacity), "cudaMalloc(d_results)");

	int rows = 0;
	if (ok) {
		// Only classify what was really read: a short file must not make the
		// kernel consume uninitialised memory.
		const size_t bytesRead = fread(data, 1, length, fp_in);
		rows = (int)(bytesRead / 6);
		if (bytesRead != (size_t)length) {
			fprintf(stderr, "Warning: expected %d bytes but read %lu; classifying %d complete rows.\n",
				length, (unsigned long)bytesRead, rows);
		}
		ok = rows > 0;
	}

	const size_t inBytes = (size_t)rows * 6;
	const size_t outBytes = (size_t)rows * 2;

	if (ok) {
		// Only the input travels host -> device. The result buffer holds no
		// meaningful data before the kernel runs, so it is not uploaded.
		timer_migration_write.Start();
		ok = cudaSucceeded(cudaMemcpy(d_data, data, inBytes, cudaMemcpyHostToDevice),
			"cudaMemcpy(host to device)");
		timer_migration_write.Stop();
	}

	if (ok) {
		// Fixed block size with a rounded-up grid. The kernel's `i < SIZE`
		// guard masks the surplus threads of the last block, so the row count
		// does not have to be divisible by the block size.
		const int threadsPerBlock = 1024;
		const int totalBlocks = (rows + threadsPerBlock - 1) / threadsPerBlock;
		timer_kernel.Start();
		classify <<<totalBlocks, threadsPerBlock>>> (d_data, rows, d_results);
		timer_kernel.Stop();
		ok = cudaSucceeded(cudaGetLastError(), "classify launch");
	}

	if (ok) {
		// Only the results travel device -> host. The kernel never modifies
		// the input, so it is not copied back.
		timer_migration_read.Start();
		ok = cudaSucceeded(cudaMemcpy(results, d_results, outBytes, cudaMemcpyDeviceToHost),
			"cudaMemcpy(device to host)");
		timer_migration_read.Stop();
	}

	if (ok) {
		printf("Time for kernel functions: %f ms\n", timer_kernel.Elapsed());
		printf("Time for explicit data migration: %f ms\n", timer_migration_write.Elapsed() + timer_migration_read.Elapsed());

		// results is not NUL-terminated, so write an explicit byte count
		// instead of using fputs.
		if (fwrite(results, 1, outBytes, fp_out) != outBytes) {
			fprintf(stderr, "Failed to write all results to the output file.\n");
		}
	}

	cudaFree(d_data);
	cudaFree(d_results);
	free(data);
	free(results);
}

// Entry point.
//   argv[1] : input file path
//   argv[2] : number of rows (lines) in the input file
//   argv[3] : output file path
int main(int argc, char* argv[])
{
	if (argc < 4)
	{
		fprintf(stderr, "Usage: %s <input_file> <line_count> <output_file>\n",
			argc > 0 ? argv[0] : "parallel_explicit");
		exit(1);
	}

	// Validate the row count up front: it is multiplied by 6 below, so it
	// must be positive and small enough not to overflow int.
	const long maxRows = (long)(~0u >> 1) / 6;	// INT_MAX / 6
	char* endPtr = NULL;
	const long rows = strtol(argv[2], &endPtr, 10);
	if (endPtr == argv[2] || *endPtr != '\0' || rows <= 0 || rows > maxRows)
	{
		fprintf(stderr, "Invalid line count '%s'.\n", argv[2]);
		exit(EXIT_FAILURE);
	}

	char* fileName = argv[1];
	FILE* input = fopen(fileName, "r");
	if (input == NULL)
	{
		// report first, then exit (the message used to be unreachable)
		fprintf(stderr, "No input file.\n");
		exit(EXIT_FAILURE);
	}

	char* outputFileName = argv[3];
	FILE* output = fopen(outputFileName, "w");
	if (output == NULL)
	{
		fprintf(stderr, "No output file.\n");
		fclose(input);
		exit(EXIT_FAILURE);
	}

	// each line has 6 elements, including 3 numbers, 2 commas and 1 '\n'
	// as a result the length passed in is rows * 6
	printf("For input file %s \n", fileName);
	parallel_explicit(input, (int)rows * 6, output);

	fclose(input);
	fclose(output);

	return 0;
}

Writing parallel_explicit.cu


In [None]:
# Build the explicit-copy version and run it on the three uploaded inputs.
# Program arguments: <input csv> <number of rows> <output file>.
!nvcc parallel_explicit.cu -o parallel_explicit
!./parallel_explicit input_10000.csv 10000 output_parallel_explicit_10000.txt
!./parallel_explicit input_100000.csv 100000 output_parallel_explicit_100000.txt
!./parallel_explicit input_1000000.csv 1000000 output_parallel_explicit_1000000.txt

For input file input_10000.csv 
Time for kernel functions: 0.017152 ms
Time for explicit data migration: 0.183040 ms
For input file input_100000.csv 
Time for kernel functions: 0.024416 ms
Time for explicit data migration: 0.522976 ms
For input file input_1000000.csv 
Time for kernel functions: 0.127552 ms
Time for explicit data migration: 3.883904 ms


In [None]:
# Profile the explicit-copy build on the 10,000-row input with nvprof.
!nvprof ./parallel_explicit input_10000.csv 10000 output_parallel_explicit_10000.txt

For input file input_10000.csv 
==364== NVPROF is profiling process 364, command: ./parallel_explicit input_10000.csv 10000 output_parallel_explicit_10000.txt
Time for kernel functions: 0.027968 ms
Time for explicit data migration: 0.130688 ms
==364== Profiling application: ./parallel_explicit input_10000.csv 10000 output_parallel_explicit_10000.txt
==364== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   37.97%  9.3440us         2  4.6720us  2.9120us  6.4320us  [CUDA memcpy DtoH]
                   37.97%  9.3440us         2  4.6720us  2.9440us  6.4000us  [CUDA memcpy HtoD]
                   24.06%  5.9200us         1  5.9200us  5.9200us  5.9200us  classify(char*, int, char*)
      API calls:   99.48%  177.72ms         6  29.620ms     504ns  177.72ms  cudaEventCreate
                    0.19%  343.29us         1  343.29us  343.29us  343.29us  cuDeviceTotalMem
                    0.07%  132.49us         2  66.243us 

In [None]:
# Profile the explicit-copy build on the 100,000-row input with nvprof.
!nvprof ./parallel_explicit input_100000.csv 100000 output_parallel_explicit_100000.txt

For input file input_100000.csv 
==375== NVPROF is profiling process 375, command: ./parallel_explicit input_100000.csv 100000 output_parallel_explicit_100000.txt
Time for kernel functions: 0.029248 ms
Time for explicit data migration: 0.509088 ms
==375== Profiling application: ./parallel_explicit input_100000.csv 100000 output_parallel_explicit_100000.txt
==375== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   46.88%  71.392us         2  35.696us  19.328us  52.064us  [CUDA memcpy HtoD]
                   42.47%  64.672us         2  32.336us  17.344us  47.328us  [CUDA memcpy DtoH]
                   10.65%  16.224us         1  16.224us  16.224us  16.224us  classify(char*, int, char*)
      API calls:   99.23%  179.65ms         6  29.941ms     544ns  179.65ms  cudaEventCreate
                    0.28%  498.98us         4  124.74us  77.995us  158.66us  cudaMemcpy
                    0.20%  353.42us         1  353.42us

In [None]:
# Profile the explicit-copy build on the 1,000,000-row input with nvprof.
!nvprof ./parallel_explicit input_1000000.csv 1000000 output_parallel_explicit_1000000.txt

For input file input_1000000.csv 
==386== NVPROF is profiling process 386, command: ./parallel_explicit input_1000000.csv 1000000 output_parallel_explicit_1000000.txt
Time for kernel functions: 0.139296 ms
Time for explicit data migration: 5.072160 ms
==386== Profiling application: ./parallel_explicit input_1000000.csv 1000000 output_parallel_explicit_1000000.txt
==386== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   58.14%  1.4851ms         2  742.53us  384.16us  1.1009ms  [CUDA memcpy HtoD]
                   36.88%  941.98us         2  470.99us  180.00us  761.98us  [CUDA memcpy DtoH]
                    4.98%  127.30us         1  127.30us  127.30us  127.30us  classify(char*, int, char*)
      API calls:   96.59%  181.43ms         6  30.239ms     537ns  181.43ms  cudaEventCreate
                    2.75%  5.1630ms         4  1.2908ms  699.07us  2.0591ms  cudaMemcpy
                    0.22%  413.17us         2  2

In [None]:
%%writefile parallel_prefetch.cu

#include "gputimer.h"
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define AND 0
#define OR 1
#define NAND 2
#define NOR 3
#define XOR 4
#define XNOR 5

__global__ void classify(char* data, int SIZE, char* results) {
	// One thread evaluates one logic gate, so SIZE is the number of rows.
	// Each input row is 6 chars: operand, ',', operand, ',', gate id, '\n'.
	// Each output row is 2 chars: the result digit followed by '\n'.
	const int row = threadIdx.x + blockIdx.x * blockDim.x;
	if (row >= SIZE) {
		return;
	}

	const char* record = data + row * 6;
	const int lhs = record[0] - '0';
	const int rhs = record[2] - '0';

	int value;
	switch (record[4]) {
	case '0': value = (lhs & rhs);  break;	// AND
	case '1': value = (lhs | rhs);  break;	// OR
	case '2': value = !(lhs & rhs); break;	// NAND
	case '3': value = !(lhs | rhs); break;	// NOR
	case '4': value = (lhs ^ rhs);  break;	// XOR
	case '5': value = !(lhs ^ rhs); break;	// XNOR
	default:
		// unrecognised gate id: this row's output slots are left untouched
		return;
	}

	results[2 * row] = (value + '0');
	results[2 * row + 1] = '\n';
}


// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t err, const char* what) {
	if (err != cudaSuccess) {
		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
		return false;
	}
	return true;
}

// Classifies every row of fp_in on the GPU using unified (managed) memory that
// is prefetched to the device before the kernel runs, and writes one result
// line per row to fp_out.
//
// fp_in  : input stream positioned at the first row; each row is 6 chars
//          (3 digits, 2 commas, 1 '\n').
// length : expected number of input bytes (rows * 6).
// fp_out : output stream; receives 2 chars per row (result digit and '\n').
//
// Prints the kernel time measured with GpuTimer. On any allocation, launch or
// execution failure a message is written to stderr and nothing is written to
// fp_out. A failed prefetch is only reported as a warning and the kernel
// still runs.
void parallel_prefetch(FILE* fp_in, int length, FILE* fp_out) {
	char* data = NULL;
	char* results = NULL;
	// timer_kernel records time for kernel function
	GpuTimer timer_kernel;

	if (length < 6) {
		fprintf(stderr, "Input length %d is shorter than one 6-byte row; nothing to do.\n", length);
		return;
	}
	// the output needs 2 bytes for every complete 6-byte input row
	const size_t outCapacity = (size_t)(length / 6) * 2;

	// Unified memory allocation methods
	bool ok = cudaSucceeded(cudaMallocManaged(&data, length), "cudaMallocManaged(data)")
		&& cudaSucceeded(cudaMallocManaged(&results, outCapacity), "cudaMallocManaged(results)");

	int rows = 0;
	if (ok) {
		// Only classify what was really read: a short file must not make the
		// kernel consume uninitialised memory.
		const size_t bytesRead = fread(data, 1, length, fp_in);
		rows = (int)(bytesRead / 6);
		if (bytesRead != (size_t)length) {
			fprintf(stderr, "Warning: expected %d bytes but read %lu; classifying %d complete rows.\n",
				length, (unsigned long)bytesRead, rows);
		}
		ok = rows > 0;
	}

	const size_t inBytes = (size_t)rows * 6;
	const size_t outBytes = (size_t)rows * 2;

	if (ok) {
		// Prefetch the data to the GPU
		int device = -1;
		ok = cudaSucceeded(cudaGetDevice(&device), "cudaGetDevice");
		if (ok) {
			cudaError_t prefetchErr = cudaMemPrefetchAsync(data, inBytes, device, NULL);
			if (prefetchErr == cudaSuccess) {
				prefetchErr = cudaMemPrefetchAsync(results, outBytes, device, NULL);
			}
			if (prefetchErr != cudaSuccess) {
				fprintf(stderr, "Warning: cudaMemPrefetchAsync failed (%s); continuing without prefetch.\n",
					cudaGetErrorString(prefetchErr));
				// clear the recorded error so the launch check below is not misled
				(void)cudaGetLastError();
			}
		}
	}

	if (ok) {
		// Fixed block size with a rounded-up grid. The kernel's `i < SIZE`
		// guard masks the surplus threads of the last block, so the row count
		// does not have to be divisible by the block size.
		const int threadsPerBlock = 1024;
		const int totalBlocks = (rows + threadsPerBlock - 1) / threadsPerBlock;
		timer_kernel.Start();
		classify <<<totalBlocks, threadsPerBlock >>> (data, rows, results);
		timer_kernel.Stop();
		ok = cudaSucceeded(cudaGetLastError(), "classify launch")
			&& cudaSucceeded(cudaDeviceSynchronize(), "classify execution");
	}

	if (ok) {
		printf("Time for kernel functions: %f ms\n", timer_kernel.Elapsed());

		// results is not NUL-terminated, so write an explicit byte count
		// instead of using fputs.
		if (fwrite(results, 1, outBytes, fp_out) != outBytes) {
			fprintf(stderr, "Failed to write all results to the output file.\n");
		}
	}

	cudaFree(data);
	cudaFree(results);
}

// Entry point.
//   argv[1] : input file path
//   argv[2] : number of rows (lines) in the input file
//   argv[3] : output file path
int main(int argc, char* argv[])
{
	if (argc < 4)
	{
		fprintf(stderr, "Usage: %s <input_file> <line_count> <output_file>\n",
			argc > 0 ? argv[0] : "parallel_prefetch");
		exit(1);
	}

	// Validate the row count up front: it is multiplied by 6 below, so it
	// must be positive and small enough not to overflow int.
	const long maxRows = (long)(~0u >> 1) / 6;	// INT_MAX / 6
	char* endPtr = NULL;
	const long rows = strtol(argv[2], &endPtr, 10);
	if (endPtr == argv[2] || *endPtr != '\0' || rows <= 0 || rows > maxRows)
	{
		fprintf(stderr, "Invalid line count '%s'.\n", argv[2]);
		exit(EXIT_FAILURE);
	}

	char* fileName = argv[1];
	FILE* input = fopen(fileName, "r");
	if (input == NULL)
	{
		// report first, then exit (the message used to be unreachable)
		fprintf(stderr, "No input file.\n");
		exit(EXIT_FAILURE);
	}

	char* outputFileName = argv[3];
	FILE* output = fopen(outputFileName, "w");
	if (output == NULL)
	{
		fprintf(stderr, "No output file.\n");
		fclose(input);
		exit(EXIT_FAILURE);
	}

	// each line has 6 elements, including 3 numbers, 2 commas and 1 '\n'
	// as a result the length passed in is rows * 6
	printf("For input file %s \n", fileName);
	parallel_prefetch(input, (int)rows * 6, output);

	fclose(input);
	fclose(output);

	return 0;
}

Writing parallel_prefetch.cu


In [None]:
# Build the unified-memory-with-prefetch version and run it on the three uploaded inputs.
# Program arguments: <input csv> <number of rows> <output file>.
!nvcc parallel_prefetch.cu -o parallel_prefetch
!./parallel_prefetch input_10000.csv 10000 output_parallel_prefetch_10000.txt
!./parallel_prefetch input_100000.csv 100000 output_parallel_prefetch_100000.txt
!./parallel_prefetch input_1000000.csv 1000000 output_parallel_prefetch_1000000.txt

For input file input_10000.csv 
Time for kernel functions: 0.011680 ms
For input file input_100000.csv 
Time for kernel functions: 0.022592 ms
For input file input_1000000.csv 
Time for kernel functions: 0.124448 ms


In [None]:
# Profile the prefetch build on the 10,000-row input with nvprof.
!nvprof ./parallel_prefetch input_10000.csv 10000 output_parallel_prefetch_10000.txt

For input file input_10000.csv 
==446== NVPROF is profiling process 446, command: ./parallel_prefetch input_10000.csv 10000 output_parallel_prefetch_10000.txt
Time for kernel functions: 0.036672 ms
==446== Profiling application: ./parallel_prefetch input_10000.csv 10000 output_parallel_prefetch_10000.txt
==446== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  6.6880us         1  6.6880us  6.6880us  6.6880us  classify(char*, int, char*)
      API calls:   89.13%  176.83ms         2  88.415ms     900ns  176.83ms  cudaEventCreate
                   10.29%  20.417ms         2  10.209ms  25.492us  20.392ms  cudaMallocManaged
                    0.20%  405.36us         1  405.36us  405.36us  405.36us  cuDeviceTotalMem
                    0.16%  308.48us         2  154.24us  47.286us  261.19us  cudaMemPrefetchAsync
                    0.10%  201.87us         2  100.93us  30.754us  171.11us  cudaFree
               

In [None]:
# Profile the prefetch build on the 100,000-row input with nvprof.
!nvprof ./parallel_prefetch input_100000.csv 100000 output_parallel_prefetch_100000.txt

For input file input_100000.csv 
==459== NVPROF is profiling process 459, command: ./parallel_prefetch input_100000.csv 100000 output_parallel_prefetch_100000.txt
Time for kernel functions: 0.023168 ms
==459== Profiling application: ./parallel_prefetch input_100000.csv 100000 output_parallel_prefetch_100000.txt
==459== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  17.312us         1  17.312us  17.312us  17.312us  classify(char*, int, char*)
      API calls:   89.08%  175.97ms         2  87.986ms     944ns  175.97ms  cudaEventCreate
                   10.31%  20.374ms         2  10.187ms  25.080us  20.349ms  cudaMallocManaged
                    0.19%  372.03us         1  372.03us  372.03us  372.03us  cuDeviceTotalMem
                    0.19%  369.44us         2  184.72us  126.39us  243.05us  cudaMemPrefetchAsync
                    0.07%  138.00us        97  1.4220us     132ns  57.642us  cuDeviceGetAttrib

In [None]:
# Profile the prefetch build on the 1,000,000-row input with nvprof.
!nvprof ./parallel_prefetch input_1000000.csv 1000000 output_parallel_prefetch_1000000.txt

For input file input_1000000.csv 
==471== NVPROF is profiling process 471, command: ./parallel_prefetch input_1000000.csv 1000000 output_parallel_prefetch_1000000.txt
Time for kernel functions: 0.157088 ms
==471== Profiling application: ./parallel_prefetch input_1000000.csv 1000000 output_parallel_prefetch_1000000.txt
==471== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  128.48us         1  128.48us  128.48us  128.48us  classify(char*, int, char*)
      API calls:   88.06%  166.96ms         2  83.478ms     936ns  166.96ms  cudaEventCreate
                   10.78%  20.433ms         2  10.216ms  46.027us  20.387ms  cudaMallocManaged
                    0.52%  988.09us         2  494.05us  28.125us  959.97us  cudaMemPrefetchAsync
                    0.28%  529.51us         2  264.75us  163.20us  366.31us  cudaFree
                    0.18%  344.38us         1  344.38us  344.38us  344.38us  cuDeviceTotalMem
 

Comparison For Unified Memory With Prefetching

In [None]:
# Build the file comparer and check each prefetch output against its reference solution.
# Each run prints "Total Errors : N" followed by a tab (no newline), so the
# three summaries appear on one line.
!gcc compareResults.c -o compareResults
!./compareResults output_parallel_prefetch_10000.txt sol_10000.txt
!./compareResults output_parallel_prefetch_100000.txt sol_100000.txt
!./compareResults output_parallel_prefetch_1000000.txt sol_1000000.txt

Total Errors : 0	Total Errors : 0	Total Errors : 0	