# The Wedge

## Task 1: Building a Transaction Database in Google BigQuery


## Python Modules

In [1]:
import os
import io
import shutil
import re
import datetime 
import csv

import pandas as pd
import numpy as np
import pandas_gbq
import janitor

from zipfile import ZipFile # usually you'd do all these imports at the beginning

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account

## Define Global Variables

In [13]:
# Archive to process. Set the WEDGE_ZIP_FILE environment variable to switch
# data sets without editing this cell. Archives used so far:
#   "WedgeZipOfZips.zip"              - full data set (default)
#   "WedgeZipOfZips_Small.zip"        - small file sample
#   "WedgeFiles_Clean.zip"            - clean data set
#   "WedgeZipOfZips_Small_Clean.zip"  - small clean data set
zip_file_name = os.environ.get("WEDGE_ZIP_FILE", "WedgeZipOfZips.zip")

# Working directory (included in .gitignore) that receives the extracted files.
# Set WEDGE_WORKING_DIR to point at a different location, e.g.
#   "/media/psf/Home/Repos/BMKT670.V60-72020-Fall2022-Wedge-Project/eggs/"
# Joining with "" guarantees a trailing path separator, which the rest of the
# notebook relies on when it concatenates this value with a file name.
working_directory = os.path.join(
    os.environ.get(
        "WEDGE_WORKING_DIR",
        "/home/blackvwgolf95/BMKT670.V60-72020-Fall2022-Wedge-Project/eggs/",
    ),
    "",
)

## Define Functions

In [6]:
def _is_archive_noise(member_name):
    """Return True for zip members that should be ignored.

    Ignored entries are ``.DS_Store`` files, anything whose name starts with
    ``__`` (e.g. the ``__MACOSX`` folder) and directory entries (names ending
    in ``/``).
    """
    return (
        member_name.endswith('.DS_Store')
        or member_name.startswith('__')
        or member_name.endswith('/')
    )


def _local_path(base_dir, member_name):
    """Return the on-disk path of ``member_name`` below ``base_dir``."""
    # os.path.join(base_dir, "") appends a separator only when one is missing,
    # so the result is identical to plain concatenation when base_dir already
    # ends in "/" and still correct when it does not.
    return os.path.join(base_dir, "") + member_name


def _extract_missing(zf, target_dir, collected):
    """Extract every relevant member of ``zf`` that is not on disk yet.

    Each relevant member name is appended to ``collected`` whether it was
    extracted now or already present.
    """
    for member_name in zf.namelist():
        if _is_archive_noise(member_name):
            continue

        if os.path.exists(_local_path(target_dir, member_name)):
            print("File Exists, skipping")
            print(member_name)
        else:
            print("Need to Extract")
            print(member_name)
            zf.extract(member_name, target_dir)

        collected.append(member_name)

    return collected


def extract_zip(zf, target_dir=None, collected=None):
    """Extract the outer archive and record the names of its members.

    Parameters
    ----------
    zf : zipfile.ZipFile
        Open archive to read from.
    target_dir : str, optional
        Directory to extract into. Defaults to the notebook-level
        ``working_directory``.
    collected : list, optional
        List that receives the member names. Defaults to the notebook-level
        ``zip_files`` list, which must then exist before the call.

    Returns
    -------
    list
        The list the member names were appended to.
    """
    print('Extracting all the files now...')
    if target_dir is None:
        target_dir = working_directory
    if collected is None:
        collected = zip_files
    return _extract_missing(zf, target_dir, collected)


def extract_single_zip(zf, target_dir=None, collected=None):
    """Extract one inner archive and record the names of its members.

    Same contract as ``extract_zip`` except that no banner line is printed and
    ``collected`` defaults to the notebook-level ``data_files`` list.
    """
    if target_dir is None:
        target_dir = working_directory
    if collected is None:
        collected = data_files
    return _extract_missing(zf, target_dir, collected)


def display_zip_contents(zipped_files, base_dir=None):
    """Print the name and on-disk size (bytes) of each relevant file.

    Parameters
    ----------
    zipped_files : iterable of str
        File names relative to ``base_dir``; hidden/system entries and
        directory entries are skipped.
    base_dir : str, optional
        Directory the files live in. Defaults to the notebook-level
        ``working_directory``.

    Raises
    ------
    OSError
        From ``os.path.getsize`` if a listed file does not exist on disk.
    """
    if base_dir is None:
        base_dir = working_directory
    for file_name in zipped_files:
        if _is_archive_noise(file_name):
            continue
        print("File: ", file_name," Size:", os.path.getsize(_local_path(base_dir, file_name)))


def display_file_contents(files, base_dir=None):
    """Print the name and on-disk size of each file; see ``display_zip_contents``."""
    display_zip_contents(files, base_dir)

def get_delimiter(file_name, base_dir=None):
    """Detect the field delimiter of a text file from its first line.

    Parameters
    ----------
    file_name : str
        File name relative to ``base_dir``.
    base_dir : str, optional
        Directory the file lives in. Defaults to the notebook-level
        ``working_directory``.

    Returns
    -------
    str
        One of ``","``, ``";"`` or tab, as chosen by ``csv.Sniffer``.

    Raises
    ------
    csv.Error
        If the sniffer cannot determine a delimiter from the first line.
    """
    if base_dir is None:
        base_dir = working_directory

    # os.path.join(base_dir, "") adds a separator only when one is missing.
    # The context manager closes the file even when sniffing fails.
    with open(os.path.join(base_dir, "") + file_name, 'r') as input_file:
        first_line = input_file.readline()

    # Only the first line is sampled; candidates are restricted to , ; and tab.
    dialect = csv.Sniffer().sniff(sample=first_line, delimiters=",;\t")
    return dialect.delimiter

def get_header(file_name, base_dir=None):
    """Report whether the first line of a file is a ``datetime`` header row.

    Parameters
    ----------
    file_name : str
        File name relative to ``base_dir``.
    base_dir : str, optional
        Directory the file lives in. Defaults to the notebook-level
        ``working_directory``.

    Returns
    -------
    int or None
        ``0`` when the first line starts with ``datetime`` (bare, double-quoted
        or single-quoted), otherwise ``None``.
        NOTE(review): presumably meant to be passed as ``header=`` to
        ``pandas.read_csv`` -- confirm against the caller.
    """
    if base_dir is None:
        base_dir = working_directory

    # os.path.join(base_dir, "") adds a separator only when one is missing.
    with open(os.path.join(base_dir, "") + file_name) as f:
        first_line = f.readline()

    header_prefixes = ('datetime', '"datetime"', "'datetime'")
    return 0 if first_line.startswith(header_prefixes) else None

def upload_data(data):
    """Placeholder for the monthly upload: groups ``data`` by month and prints.

    No upload is performed. The function splits ``data`` into month-end bins
    with ``pd.Grouper(freq='M')`` (grouping approach from
    https://stackoverflow.com/a/24083253) and prints ``"Data Uploaded!"`` once
    per bin.

    TODO: for each monthly subset, build the destination table id and send the
    subset to BigQuery (e.g. with ``pandas_gbq.to_gbq``).

    NOTE(review): ``pd.Grouper(freq=...)`` without ``key`` groups on the index,
    so ``data`` needs a datetime-like index. The ``'M'`` alias is deprecated in
    favour of ``'ME'`` in recent pandas releases -- confirm the target version
    before changing it.
    """
    monthly_bins = data.groupby(pd.Grouper(freq='M'))
    for _month_end, _month_rows in monthly_bins:
        print("Data Uploaded!")
        
def cleanup_data(data):
    """Return ``data`` with its column names cleaned by ``janitor.clean_names``.

    Only the column names are touched; no values, dtypes or the index are
    modified here.

    TODO: add any value-level cleaning (currency strings to floats, type
    casts, datetime index) once the required columns are confirmed.
    """
    renamed = janitor.clean_names(data)
    return renamed
        


## GBQ Setup

In [11]:
# --- BigQuery connection settings ---------------------------------------
# The first three values are machine/project specific; change them to match
# your own service-account key file and GCP project.
# Earlier setups used, for example:
#   service_path = "/Users/chandler/Dropbox/Teaching/"
#   service_path = "/media/psf/Home/Repos/"
#   service_file = 'umt-msba-037daf11ee16.json', gbq_proj_id = 'umt-msba'
service_path = "/home/blackvwgolf95/"
service_file = 'bmkt670-fall2022-wedge-project-6ce4398b80e4.json'
gbq_proj_id = 'bmkt670-fall2022-wedge-project'
dataset_id = 'wedgedataset'

# Full path of the service-account key file (this should stay the same).
private_key = service_path + service_file

# Load the credentials so that Python has permission to access the project...
credentials = service_account.Credentials.from_service_account_file(private_key)

# ...and open the BigQuery connection used by the rest of the notebook.
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

# To check the connection, list the datasets visible to these credentials:
# for item in client.list_datasets() :
#    print(item.full_dataset_id)

## Phase 1, Upload Clean Files

In [16]:
# Names of the inner .zip archives contained in the outer archive.
# extract_zip() appends to this list, so it must exist before the call and is
# reset here so that re-running the cell does not accumulate duplicates.
zip_files = []

# Unpack the outer "zip of zips" into the working directory; members that are
# already on disk are skipped.
with ZipFile(zip_file_name) as outer_zip:
    extract_zip(outer_zip)
    print('Done Extracting!')


print("Done building file list")

Extracting all the files now...
File Exists, skipping
transArchive_201001_201003.zip
File Exists, skipping
transArchive_201004_201006.zip
File Exists, skipping
transArchive_201007_201009.zip
File Exists, skipping
transArchive_201010_201012.zip
File Exists, skipping
transArchive_201101_201103.zip
File Exists, skipping
transArchive_201104.zip
File Exists, skipping
transArchive_201105.zip
File Exists, skipping
transArchive_201106.zip
File Exists, skipping
transArchive_201107_201109.zip
File Exists, skipping
transArchive_201110_201112.zip
File Exists, skipping
transArchive_201201_201203.zip
File Exists, skipping
transArchive_201201_201203_inactive.zip
File Exists, skipping
transArchive_201204_201206.zip
File Exists, skipping
transArchive_201204_201206_inactive.zip
File Exists, skipping
transArchive_201207_201209.zip
File Exists, skipping
transArchive_201207_201209_inactive.zip
File Exists, skipping
transArchive_201210_201212.zip
File Exists, skipping
transArchive_201210_201212_inactive.zip

## Verify ZIP Files

In [17]:
# Sanity check: print the name and on-disk size of every inner zip recorded in
# zip_files. A missing file raises here, before the slower steps below.
display_file_contents(zip_files)

File:  transArchive_201001_201003.zip  Size: 99503110
File:  transArchive_201004_201006.zip  Size: 105550247
File:  transArchive_201007_201009.zip  Size: 99939988
File:  transArchive_201010_201012.zip  Size: 101366220
File:  transArchive_201101_201103.zip  Size: 100781163
File:  transArchive_201104.zip  Size: 32868743
File:  transArchive_201105.zip  Size: 32785831
File:  transArchive_201106.zip  Size: 27652309
File:  transArchive_201107_201109.zip  Size: 101917553
File:  transArchive_201110_201112.zip  Size: 107752470
File:  transArchive_201201_201203.zip  Size: 102376566
File:  transArchive_201201_201203_inactive.zip  Size: 8652264
File:  transArchive_201204_201206.zip  Size: 104625854
File:  transArchive_201204_201206_inactive.zip  Size: 8318216
File:  transArchive_201207_201209.zip  Size: 98925996
File:  transArchive_201207_201209_inactive.zip  Size: 6652257
File:  transArchive_201210_201212.zip  Size: 99601626
File:  transArchive_201210_201212_inactive.zip  Size: 5771070
File:  tra

## Extract Inner Zips


In [18]:
# Names of the data files found inside the inner archives.
# extract_single_zip() appends to this list; it is reset here so that
# re-running the cell does not accumulate duplicates.
data_files = []

for inner_zip_file_name in zip_files :
    # Only .zip members are opened; anything else in the outer archive is skipped.
    if inner_zip_file_name.endswith( '.zip' ):
        with ZipFile( working_directory + inner_zip_file_name, 'r') as zf :
            extract_single_zip(zf)

Need to Extract
transArchive_201001_201003.csv
Need to Extract
transArchive_201004_201006.csv
Need to Extract
transArchive_201007_201009.csv
Need to Extract
transArchive_201010_201012.csv
Need to Extract
transArchive_201101_201103.csv
Need to Extract
transArchive_201104.csv
Need to Extract
transArchive_201105.csv
Need to Extract
transArchive_201106.csv
Need to Extract
transArchive_201107_201109.csv
Need to Extract
transArchive_201110_201112.csv
Need to Extract
transArchive_201201_201203.csv
Need to Extract
transArchive_201201_201203_inactive.csv
Need to Extract
transArchive_201204_201206.csv
Need to Extract
transArchive_201204_201206_inactive.csv
Need to Extract
transArchive_201207_201209.csv
Need to Extract
transArchive_201207_201209_inactive.csv
Need to Extract
transArchive_201210_201212.csv
Need to Extract
transArchive_201210_201212_inactive.csv
Need to Extract
transArchive_201301_201303.csv
Need to Extract
transArchive_201301_201303_inactive.csv
Need to Extract
transArchive_201304_

## Verify Data Files

In [19]:
# Sanity check: print the name and on-disk size of every extracted data file
# recorded in data_files.
display_file_contents(data_files)


File:  transArchive_201001_201003.csv  Size: 872986330
File:  transArchive_201004_201006.csv  Size: 924799264
File:  transArchive_201007_201009.csv  Size: 870731849
File:  transArchive_201010_201012.csv  Size: 871310253
File:  transArchive_201101_201103.csv  Size: 864157752
File:  transArchive_201104.csv  Size: 227905318
File:  transArchive_201105.csv  Size: 228086734
File:  transArchive_201106.csv  Size: 211641602
File:  transArchive_201107_201109.csv  Size: 887540984
File:  transArchive_201110_201112.csv  Size: 921964153
File:  transArchive_201201_201203.csv  Size: 882675809
File:  transArchive_201201_201203_inactive.csv  Size: 73077877
File:  transArchive_201204_201206.csv  Size: 909110503
File:  transArchive_201204_201206_inactive.csv  Size: 70658122
File:  transArchive_201207_201209.csv  Size: 862091646
File:  transArchive_201207_201209_inactive.csv  Size: 56592162
File:  transArchive_201210_201212.csv  Size: 854750338
File:  transArchive_201210_201212_inactive.csv  Size: 48461658

### Checking for and deleting previous tables

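We'll list all the tables in our Wedge dataset (`wedgedataset`) whose names match the `transArchive_` pattern, then delete them so that the upload starts from a clean slate. Matching on the pattern keeps us from accidentally deleting other tables in the dataset, such as the item lookup table that we added in class.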


In [20]:
# Regex for the tables this notebook manages: names that begin with the
# literal prefix "transArchive_".
# A trailing "*" must NOT be added: in a regex it is a quantifier, so "_*"
# means "zero or more underscores" and would match every table that merely
# starts with "transArchive" (it is not a glob wildcard).
transArchive_pattern = re.compile(r"^transArchive_")

tables = client.list_tables(dataset_id)

for table in tables:

    print(f'Looking at {table.table_id}')

    # Only tables whose name matches the pattern are removed; everything else
    # in the dataset is left untouched.
    if transArchive_pattern.match(table.table_id):
        print(f'She swiped right, we have a MATCH! {table.table_id}')
        # WARNING: destructive and active -- this permanently deletes the
        # table. not_found_ok=True makes a table that is already gone a no-op.
        client.delete_table(table, not_found_ok=True)
        print(f"She blocked us, all hope is lost {table.table_id}.")


Looking at transArchive_201001_201003
She swiped right, we have a MATCH! transArchive_201001_201003
She blocked us, all hope is lost transArchive_201001_201003.
Looking at transArchive_201004_201006
She swiped right, we have a MATCH! transArchive_201004_201006
She blocked us, all hope is lost transArchive_201004_201006.
Looking at transArchive_201007_201009
She swiped right, we have a MATCH! transArchive_201007_201009
She blocked us, all hope is lost transArchive_201007_201009.
Looking at transArchive_201010_201012
She swiped right, we have a MATCH! transArchive_201010_201012
She blocked us, all hope is lost transArchive_201010_201012.
Looking at transArchive_201101_201103
She swiped right, we have a MATCH! transArchive_201101_201103
She blocked us, all hope is lost transArchive_201101_201103.
Looking at transArchive_201104
She swiped right, we have a MATCH! transArchive_201104
She blocked us, all hope is lost transArchive_201104.
Looking at transArchive_201105
She swiped right, we hav

She blocked us, all hope is lost transArchive_201612.
Looking at transArchive_201701
She swiped right, we have a MATCH! transArchive_201701
She blocked us, all hope is lost transArchive_201701.


## Uploading


## Clean Up All Local Files

In [22]:
# Remove the working directory and everything extracted into it.
# Reference: https://linuxize.com/post/python-delete-files-and-directories/
#
# NOTE: the rmtree call is currently commented out, so this cell deletes
# nothing and only prints the status messages. Uncomment it to actually remove
# working_directory; an OSError from the removal is reported by the handler.
try:
    # shutil.rmtree(working_directory)
    print('Done Cleanup')
    print("Completed Exit Code 0")
except OSError as e:
    # e.strerror carries the operating-system error text for the failure.
    print("Error: %s : %s" % (working_directory, e.strerror))
    print("Completed Exit Code -1")


Done Cleanup
Completed Exit Code 0
