# This process handles the large zipped data set. Its steps are to:

- access the zipped files in a folder 'DATA'
- iterate through each zipped file to extract the '.csv' file contained within
- apply a common ',' delimiter to each CSV file
- change from byte-strings to normal-strings
- eliminate the redundant double quotes ("") wrapping each element in each record
- replace improper values (NULL, \\N) with empty elements
- correct specifically identified product descriptions that contain the ',' delimiter (i.e., which create 'artificial' elements)
- create a folder to hold the "cleaned" versions of the CSVs
- create/populate the new CSVs in the new folder, changing each name from '__.csv' to '__clean.csv'
- set a timer to determine total processing time to iterate over all 53 input zipped files

In [1]:
import os

In [2]:
# List the entries of DATA/ into "zip_files". os.listdir() returns names in
# arbitrary order, so sort them to make the processing order (and the printed
# output) reproducible from run to run.
zip_files = sorted(os.listdir("DATA/"))

In [3]:
from zipfile import ZipFile  # tool to enable accessing zipped files more readily

In [4]:
import csv  # for the purpose of using the "Sniffer" tool to identify delimiters

In [5]:
# Ensure the output folder for the cleaned CSVs exists; a no-op when it is already there.
os.makedirs("clean_full", exist_ok=True)

In [6]:
from timeit import default_timer as timer  # wall-clock timer for this cell

start = timer()  # timestamp taken before any archive is opened

# Column names of the transaction CSVs; defined here but not used by the loop below.
headers = [
    'datetime', 'register_no', 'emp_no', 'trans_no', 'upc',
    'description', 'trans_type', 'trans_subtype', 'trans_status', 'department',
    'quantity', 'Scale', 'cost', 'unitPrice', 'total',
    'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp',
    'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype',
    'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume',
    'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff',
    'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag',
    'batchHeaderID', 'local', 'organic', 'display', 'receipt',
    'card_no', 'store', 'branch', 'match_id', 'trans_id',
]

# Print the first raw line of every file contained in every archive under DATA/.
for zip_file in zip_files:
    with ZipFile(f"DATA/{zip_file}", 'r') as my_zip_file:
        for zipped_file in my_zip_file.namelist():
            sniffer = csv.Sniffer()  # delimiter sniffer; created per member, not used in this cell

            print(f"PROCESSING {zipped_file} NOW.")

            with my_zip_file.open(zipped_file, 'r') as input_file:
                # Archive members are read as bytes, so the line prints as a bytes literal.
                first_line = next(iter(input_file), None)
                if first_line is not None:  # an empty member prints nothing
                    print(first_line)

end = timer()  # timestamp taken after every archive has been visited
print(end - start)  # total seconds elapsed for this cell

PROCESSING transArchive_201001_201003.csv NOW.
b'"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"\n'
PROCESSING transArchive_201004_201006.csv NOW.
b'"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","v

In [None]:
# NOTE: the files for 201511 through 201701 have no header row