# Process applied to the large zipped data set:

- access the zipped files in a folder 'DATA'
- iterate through each zipped file to extract the '.csv' file contained within
- apply a common ',' delimiter to each CSV file
- change from byte-strings to normal-strings
- strip the redundant double quotes (") wrapping each element in each record
- replace improper values (NULL, \\N) with empty elements
- correct specifically identified product descriptions which contained ',' delimiter (ie, 'artificial' elements)
- create a folder to hold the "cleaned" versions of the CSVs
- create/populate the new CSVs in that folder, changing each name from '__.csv' to '__clean.csv'
- time the whole run to determine the total processing time across all 53 input zipped files

In [1]:
import os

In [2]:
# Names of the entries found in DATA/ (expected to be the zipped archives).
# os.listdir() returns names in arbitrary order, so sort them to get a
# reproducible processing order (and therefore a reproducible log).
zip_files = sorted(os.listdir("DATA/"))

In [3]:
from zipfile import ZipFile  # tool to enable accessing zipped files more readily

In [4]:
import csv  # for the purpose of using the "Sniffer" tool to identify delimiters

In [5]:
# Make sure the output folder for the cleaned CSVs exists. exist_ok=True makes
# this a no-op when the folder is already there, so no separate existence
# check (and no check-then-create race) is needed.
os.makedirs("clean_full", exist_ok=True)

In [6]:
from itertools import chain, islice  # look-ahead sampling of each CSV without re-opening it
from timeit import default_timer as timer  # wall-clock timer for the whole run
from zipfile import is_zipfile  # lets the loop skip stray non-zip entries in DATA/

start = timer()  # begin timer for this code block (elapsed seconds are printed at the end)

# The 50 column names of the transaction archives. This cell does not write
# them to the cleaned files; the list is kept available at module level.
headers = ['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type',
           'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice',
           'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount',
           'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty',
           'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag',
           'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic',
           'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']

# Previously identified rows (11 in total) whose product description contains the
# delimiter, so the description is split across elements 5, 6 and 7. Keyed by the
# CSV name inside the zip; values are the 0-based line indices within that CSV.
ROWS_WITH_SPLIT_DESCRIPTION = {
    'transArchive_201207_201209.csv': frozenset({326345}),
    'transArchive_201301_201303.csv': frozenset({1327813}),
    'transArchive_201304_201306.csv': frozenset({255261, 255487}),
    'transArchive_201307_201309.csv': frozenset({913669, 2151692}),
    'transArchive_201310_201312.csv': frozenset({1239520}),
    'transArchive_201401_201403.csv': frozenset({392934, 2402865}),
    'transArchive_201404_201406.csv': frozenset({2461306}),
    'transArchive_201407_201409.csv': frozenset({41527}),
}

# Number of leading lines of each CSV handed to csv.Sniffer to detect the delimiter.
SNIFF_SAMPLE_LINES = 20


def _detect_delimiter(sample_lines, fallback=","):
    """Return the delimiter csv.Sniffer finds in *sample_lines*.

    *sample_lines* is an iterable of decoded text lines (line endings included).
    *fallback* is returned when the sample is empty or the sniffer cannot decide,
    so one undetectable file does not abort the whole run.
    """
    sample = "".join(sample_lines)
    if not sample.strip():
        return fallback
    try:
        return csv.Sniffer().sniff(sample).delimiter
    except csv.Error:
        return fallback


def _clean_fields(fields):
    """Return *fields* with every '"', 'NULL' and '\\N' occurrence removed from each element."""
    # NOTE(review): these are substring removals, so a 'NULL' embedded in a longer
    # value is removed too -- kept exactly as in the original cleaning rules.
    return [field.replace('"', '').replace('NULL', '').replace('\\N', '') for field in fields]


for zip_file in zip_files:  # iterate over every entry found in DATA/
    zip_path = "DATA/" + zip_file
    if not is_zipfile(zip_path):  # stray files/folders would otherwise raise BadZipFile
        print(f"SKIPPING {zip_file}: not a zip archive.")
        continue

    with ZipFile(zip_path, 'r') as my_zip_file:
        for zipped_file in my_zip_file.namelist():  # each member stored in the archive
            if zipped_file.endswith("/"):  # directory entry: nothing to clean
                continue

            print(f"PROCESSING {zipped_file} NOW.")

            # Use only the base name so members stored under a folder inside the zip
            # still land directly in clean_full/ (and cannot escape it).
            output_file_name = os.path.basename(zipped_file).replace(".csv", "_clean.csv")
            rows_to_merge = ROWS_WITH_SPLIT_DESCRIPTION.get(zipped_file, frozenset())

            with my_zip_file.open(zipped_file, 'r') as input_file:
                # The input is decoded as UTF-8, so write UTF-8 explicitly instead of
                # relying on the platform's default encoding.
                with open("clean_full/" + output_file_name, 'w', encoding="utf-8") as outfile:

                    # Detect the delimiter once per file from a multi-line sample instead
                    # of sniffing every record: far cheaper, and a single unusual line
                    # can no longer flip the delimiter or raise csv.Error mid-file.
                    sample = list(islice(input_file, SNIFF_SAMPLE_LINES))
                    delimiter = _detect_delimiter(raw.decode("utf-8") for raw in sample)

                    rows_printed = 0         # number of records written to the clean CSV so far
                    file_has_header = False  # set on the first line if it turns out to be a header

                    # chain() replays the sampled lines before continuing with the rest of the file
                    for idx, raw_line in enumerate(chain(sample, input_file)):
                        # byte-string -> str, then split on the detected delimiter.
                        # NOTE(review): strip() also removes leading/trailing spaces and tabs,
                        # not just the line ending -- kept as in the original; confirm intended.
                        line = _clean_fields(raw_line.decode("utf-8").strip().split(delimiter))

                        if idx in rows_to_merge:
                            # re-join the three description fragments => back to 50 elements/row
                            line[5:8] = [' -'.join(line[5:8])]

                        if idx == 0 and 'datetime' in line[0]:
                            file_has_header = True  # first column name found => first line is a header

                        if not (file_has_header and idx == 0):  # the header row is not copied
                            outfile.write(",".join(line) + "\n")
                            rows_printed += 1

                        if rows_printed <= 1:  # echo the header (if any) and the first data row
                            print(line)

end = timer()  # stop the timer after all archives have been processed
print(end - start)  # total seconds elapsed for this code block

PROCESSING transArchive_201001_201003.csv NOW.
['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']
['2010-01-01 09:04:09', '5', '17', '2', '0005385200400', 'Medium Salsa 16oz GMG', 'I', ' ', ' ', '1', '1', '0', '2.6480', '2.9900', '2.9900', '3.9900', '0.0000', '0', '0', '1', '0', '1.0000', '0.0000', '1', '1', '0', '0.00000000', '1', '0', '0', '0.0000', '0', '0', '', '0', '2', '0', '0', '0', '0', '', '0', '', '', '0', '3', '1', '0', '0', 

PROCESSING transArchive_201110_201112.csv NOW.
['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']
['2011-10-01 08:57:23', '16', '54', '6', '0000000040423', 'CC Chai 12oz', 'I', ' ', ' ', '14', '1', '0', '0.6407', '2.9900', '2.9900', '2.9900', '0.0000', '1', '0', '0', '0', '0.0000', '0.0000', '1', '0', '0', '0.00000000', '1', '0', '0', '0.0000', '0', '0', '', '', '5', '0', '0', '', '0', '', '0', '0', ' ', '0', '13381', '1', '0', '0', '1'

PROCESSING transArchive_201301_201303.csv NOW.
['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']
['2013-01-01 07:17:09', '16', '54', '1', '0000000009506', 'Offsite: Plain Croissant', 'I', ' ', ' ', '17', '12', '0', '0.2100', '1.3900', '16.6800', '1.3900', '0.0000', '0', '0', '1', '0', '0.0000', '0.0000', '1', '0', '0', '', '12', '0', '0', '0.0000', '0', '0', '', '', '1', '0', '0', '', '0', '', '0', '0', ' ', '0', '50056', '32', '0', '0

PROCESSING transArchive_201401_201403_inactive.csv NOW.
['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']
['2014-01-01 09:27:15', '16', '54', '24', '0000000040432', 'Latte/Cappuccino 16oz', 'I', ' ', ' ', '14', '1', '0', '0.5810', '3.4900', '3.4900', '3.4900', '0.0000', '1', '0', '0', '0', '0.0000', '0.0000', '1', '0', '0', '0.00000000', '1', '0', '0', '0.0000', '0', '0', '', '', '5', '0', '0', '', '0', '', '0', '0', ' ', '0', '47893',

PROCESSING transArchive_201507_201509.csv NOW.
['datetime', 'register_no', 'emp_no', 'trans_no', 'upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'charflag', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']
['2015-07-01 07:01:36', '51', '94', '2', '0020128000000', 'Co-op Curried Couscous', 'I', ' ', ' ', '8', '0.380476', '0', '5.0000', '7.9900', '3.0400', '7.9900', '', '0', '0', '1', '', '0.0000', '0.0000', '1', '0', '0', '10.00000000', '1', '0', '0', '0.0000', '0', '0', '0', '', '1', '0', '0', '', '', '', '0', '0', '', '0', '17105', '1', '3',