# The Wedge

## Task 1: Building a Transaction Database in Google BigQuery!


## Python Modules

In [12]:
import os
import io
import shutil
import re
import datetime 
import csv

import pandas as pd
import numpy as np
import pandas_gbq
import janitor

from zipfile import ZipFile # usually you'd do all these imports at the beginning

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account

# Threading and timing helpers
import threading
import time

from multiprocessing.pool import ThreadPool as Pool

import multiprocessing

import enum

## Define Global Variables

In [2]:
# Notebook-wide settings. `zip_file_name` has several alternatives below;
# keep exactly one of them uncommented.
# # Small File Sample
# zip_file_name = "WedgeZipOfZips_Small.zip"

## Full data Set
zip_file_name = "WedgeZipOfZips.zip"

# Clean data Set
# zip_file_name = "WedgeFiles_Clean.zip"

# Small Clean Data Set
# zip_file_name = "WedgeZipOfZips_Small_Clean.zip"

# CSV File Name (created inside `working_directory`)
csv_file_name = "sample_transactions.csv"

# Working Directory included in .gitignore
# working_directory = "/media/psf/Home/Repos/BMKT670.V60-72020-Fall2022-Wedge-Project/eggs/"
# The default is specific to one machine; set the WEDGE_WORKING_DIR environment
# variable to use another location without editing the notebook.
# os.path.join(path, "") guarantees a trailing separator, which matters because
# other cells build paths with plain `working_directory + file_name`.
working_directory = os.path.join(
    os.environ.get("WEDGE_WORKING_DIR")
    or "/home/blackvwgolf95/BMKT670.V60-72020-Fall2022-Wedge-Project/eggs/",
    "",
)

## Define Functions

In [47]:
def display_file_contents(file_name):
    """Print the size of a file in the working directory, in bytes and in MB.

    The path is formed by concatenating the global ``working_directory`` and
    ``file_name``. Names starting with ``__`` (such as ``__MACOSX`` entries)
    and names ending with ``/`` (folders) are skipped without printing.

    Args:
        file_name: Name of the file, relative to ``working_directory``.

    Returns:
        None.
    """
    # Skip __MACOSX-style hidden entries and folders.
    if file_name.startswith('__') or file_name.endswith('/'):
        return

    size_in_bytes = os.path.getsize(working_directory + file_name)
    size_in_mb = convert_unit(size_in_bytes, SIZE_UNIT.MB)
    print("File: ", file_name, " Size:", size_in_bytes, "Bytes ", size_in_mb, "MB")
    
#https://thispointer.com/python-get-file-size-in-kb-mb-or-gb-human-readable-format/
# Enum for size units
class SIZE_UNIT(enum.Enum):
    """Units in which a byte count can be expressed."""

    BYTES = 1
    KB = 2
    MB = 3
    GB = 4


def convert_unit(size_in_bytes, unit):
    """Convert a size in bytes to the requested unit.

    Args:
        size_in_bytes: The size to convert, in bytes.
        unit: A ``SIZE_UNIT`` member. KB, MB and GB use 1024-based divisors.

    Returns:
        ``size_in_bytes`` divided by the divisor for ``unit``; for any other
        value of ``unit`` (including ``SIZE_UNIT.BYTES``) the input is
        returned unchanged.
    """
    scaled_units = (
        (SIZE_UNIT.KB, 1024),
        (SIZE_UNIT.MB, 1024 ** 2),
        (SIZE_UNIT.GB, 1024 ** 3),
    )
    for candidate, divisor in scaled_units:
        if unit == candidate:
            return size_in_bytes / divisor
    return size_in_bytes

## GBQ Setup

In [5]:
# These first two values will be different on your machine. 
# service_path = "/Users/chandler/Dropbox/Teaching/"
# service_file = 'umt-msba-037daf11ee16.json' # change this to your authentication information  
# gbq_proj_id = 'umt-msba' # change this to your project. 
# service_path = "/media/psf/Home/Repos/"
service_path = "/home/blackvwgolf95/"
service_file = 'bmkt670-fall2022-wedge-project-6ce4398b80e4.json' # change this to your authentication information  
gbq_proj_id = 'bmkt670-fall2022-wedge-project' # change this to your project. 
dataset_id = 'wedgedataset'

# And this should stay the same. 
# Full path of the service-account key file. Set the WEDGE_SERVICE_KEY
# environment variable to point at a different key without editing this cell.
private_key = os.environ.get("WEDGE_SERVICE_KEY") or (service_path + service_file)

# Now we pass in our credentials so that Python has permission to access our project.
# `private_key` is the single source for the key path (it was previously
# computed and then ignored in favour of a second concatenation).
credentials = service_account.Credentials.from_service_account_file(private_key)

# And finally we establish our connection
client = bigquery.Client(credentials=credentials, project=gbq_proj_id)

# for item in client.list_datasets() : 
#    print(item.full_dataset_id)

## Phase 2, Query DB

In [37]:
# Download every row (all transArchive_* tables) whose card_no is among those
# returned by the inner query, which picks 8 random rows excluding card_no '3'.
# The table path is built from gbq_proj_id / dataset_id so that changing the
# project settings also retargets this query.
# NOTE(review): ORDER BY RAND() is unseeded, so each run draws a different sample.
# NOTE(review): the inner query limits to 8 *rows*, not 8 distinct card numbers,
# so duplicates can yield fewer than 8 cards. Confirm this is intended.
query = f"""
    SELECT * FROM `{gbq_proj_id}.{dataset_id}.transArchive_*`
    WHERE `card_no` IN (
      SELECT `card_no` FROM `{gbq_proj_id}.{dataset_id}.transArchive_*` -- TABLESAMPLE SYSTEM (1 PERCENT) 
      WHERE `card_no` NOT IN ( '3' )
      ORDER BY RAND()
      LIMIT 8
    )
"""

# Pass the service-account credentials explicitly; without them pandas-gbq
# falls back to default/interactive authentication instead of the key loaded
# in the GBQ setup cell.
transactions = pandas_gbq.read_gbq(query, project_id=gbq_proj_id, credentials=credentials)

# transactions.head(12)

Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1091937/1091937 [05:06<00:00, 3564.91rows/s]


In [40]:
# print(transactions.shape)

## Write CSV

In [38]:
# The working directory is git-ignored and may not exist yet; to_csv() does not
# create missing parent folders, so make sure it is there first.
os.makedirs(working_directory, exist_ok=True)
transactions.to_csv(working_directory + csv_file_name, index=False)

In [48]:
# Report the size of the CSV in the working directory as a quick sanity check.
display_file_contents(csv_file_name)

File:  sample_transactions.csv  Size: 220668087 Bytes  210.4454870223999 MB


## Cleanup ALL Local Files

In [None]:
# https://linuxize.com/post/python-delete-files-and-directories/
# Deletion is opt-in: set this to True to remove working_directory and
# everything inside it.
remove_local_files = False

try:
    if remove_local_files:
        shutil.rmtree(working_directory)
        print('Done Cleanup')
    else:
        # Say so explicitly instead of claiming a cleanup that did not happen.
        print('Cleanup skipped (remove_local_files is False)')
    print("Completed Exit Code 0")
except OSError as e:
    print("Error: %s : %s" % (working_directory, e.strerror))
    print("Completed Exit Code -1")
