## Connect to the appropriate data in Google BigQuery

In [1]:
from google.cloud import bigquery
from google.oauth2 import service_account
import random

In [2]:
# Folder holding the service-account key file (note the trailing backslash:
# the file name is appended to this string directly).
service_path = (
    "C:\\Users\\Craig\\Documents\\000000000 UofM fall 2021\\"
    "BMIS670 Chandler AppliedDataAnalytics\\Wedge project\\"
)
# Service-account key file used to authenticate against GBQ.
service_file = "wedge-project-fall2021-92691-f3182a53adb6.json"
# GBQ project id the client is created for.
gbq_proj_id = "wedge-project-fall2021-92691"
# Dataset holding the transArchive* tables (the queries spell this name out literally).
gbq_dataset_id = "Wedge_FULL"

In [3]:
# Build the full path to the key file, then load the service-account credentials from it.
key_file = service_path + service_file
credentials = service_account.Credentials.from_service_account_file(key_file)

# Client used for every request sent to GBQ, bound to the project id above.
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

## Create queries to access owner-identified information, excluding the large owner 00003.0

In [4]:
query = '''SELECT DISTINCT(card_no) FROM `wedge-project-fall2021-92691.Wedge_FULL.transArchive*`
WHERE card_no!=3.0'''  # establishes a query to pull owners via their heading 'card_no'

In [5]:
# Submit the distinct-owner query to GBQ, run in the US location.
owners = client.query(
    query,
    location="US",
)

In [6]:
# Collect the first (and only selected) column, card_no, of every result row
# into 'owner_list'; the query already made the values DISTINCT.
owner_list = [row[0] for row in owners]

In [7]:
len(owner_list)  # number of distinct card_no values the owner query returned (card_no 3.0 is excluded by the query)

27207

In [8]:
# Draw 400 owners at random, without replacement, for further analysis.
owner_sample = random.sample(owner_list, k=400)

In [9]:
owner_trans = '''SELECT * FROM `wedge-project-fall2021-92691.Wedge_FULL.transArchive*`
WHERE card_no IN ('''

In [10]:
# Append every sampled owner to the query text, each one followed by a comma.
# The last comma is deliberately left in place; it is stripped when the
# IN (...) list is closed.
owner_trans += "".join(str(card) + "," for card in owner_sample)

In [11]:
# Finish the IN (...) list: drop the final character (the trailing comma),
# then add the closing parenthesis.
owner_trans = owner_trans[:-1]
owner_trans += ")"

In [12]:
# Run the sampled-owner query in GBQ (US location). 'random_trans' yields only
# the records whose card_no appears in the IN (...) list held in 'owner_trans'.
random_trans = client.query(
    owner_trans,
    location="US",
)

In [13]:
from timeit import default_timer as timer  # wall-clock timer for this cell

start = timer()  # start of the timed section

# Take only the first record returned for the sampled owners and show it as a
# single tab-delimited line; nothing is printed if there are no records.
trans = next(iter(random_trans), None)
if trans is not None:
    print(*trans, sep='\t')

end = timer()  # end of the timed section
print(end - start)  # seconds elapsed in this cell

2013-01-04 20:50:54+00:00	7.0	33.0	41.0	0	Change	T	CA	 	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	8.0	None	0.0	0.0	0.0	0.0	0.0	0.0	None	None	0.0	0.0	0.0	None	0.0	None	0.0	None	None	0.0	14187.0	1.0	0.0	0.0	47.0
16.698439200000003


In [14]:
# Display the first transaction as returned by the client: a Row holding the
# field values together with the column-name -> position map.
# next(iter(...)) stops after the first row, instead of building a list of
# every result row only to index element 0.
# (Raises StopIteration rather than IndexError if the query returned no rows.)
next(iter(random_trans))

Row((datetime.datetime(2013, 1, 4, 20, 50, 54, tzinfo=<UTC>), 7.0, 33.0, 41.0, '0', 'Change', 'T', 'CA', ' ', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.0, None, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, None, None, 0.0, 0.0, 0.0, None, 0.0, None, 0.0, None, None, 0.0, 14187.0, 1.0, 0.0, 0.0, 47.0), {'datetime': 0, 'register_no': 1, 'emp_no': 2, 'trans_no': 3, 'upc': 4, 'description': 5, 'trans_type': 6, 'trans_subtype': 7, 'trans_status': 8, 'department': 9, 'quantity': 10, 'Scale': 11, 'cost': 12, 'unitPrice': 13, 'total': 14, 'regPrice': 15, 'altPrice': 16, 'tax': 17, 'taxexempt': 18, 'foodstamp': 19, 'wicable': 20, 'discount': 21, 'memDiscount': 22, 'discountable': 23, 'discounttype': 24, 'voided': 25, 'percentDiscount': 26, 'ItemQtty': 27, 'volDiscType': 28, 'volume': 29, 'VolSpecial': 30, 'mixMatch': 31, 'matched': 32, 'memType': 33, 'staff': 34, 'numflag': 35, 'itemstatus': 36, 'tenderstatus': 37, 'charflag': 38, 'varflag': 39, 'batchHeaderID': 40, 'lo

In [15]:
# Total number of transactions associated with the sampled owners.
# Counting while iterating avoids holding every Row in a throwaway list.
# NOTE(review): `random_trans.result().total_rows` presumably gives the same
# number without downloading the rows at all - confirm against the installed
# google-cloud-bigquery version before switching.
sum(1 for _ in random_trans)

921316

## Write the output of the transaction query to a local file

In [16]:
# Column names written as the first line of the output file (50 names).
# Kept as one whitespace-separated literal and split into a list of strings.
headers = """
    datetime register_no emp_no trans_no upc description trans_type
    trans_subtype trans_status department quantity Scale cost unitPrice
    total regPrice altPrice tax taxexempt foodstamp wicable discount
    memDiscount discountable discounttype voided percentDiscount ItemQtty
    volDiscType volume VolSpecial mixMatch matched memType staff numflag
    itemstatus tenderstatus charflag varflag batchHeaderID local organic
    display receipt card_no store branch match_id trans_id
""".split()

In [17]:
import csv
from timeit import default_timer as timer  # wall-clock timer for this cell

start = timer()  # start of the timed section

# Write the header line followed by one comma-separated line per transaction.
# - csv.writer quotes any field that itself contains a comma, a quote or a
#   line break, so such values no longer shift the columns of their row.
# - newline="" is what the csv module asks for on files it writes to; rows
#   end with the writer's default "\r\n".
# - encoding="utf-8" makes the output independent of the platform default
#   encoding.
with open("sample_owner_trans.txt", "w", newline="", encoding="utf-8") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(headers)
    # str() keeps the same text form for every value as before
    # (e.g. None is still written as "None").
    writer.writerows([str(item) for item in row] for row in random_trans)

end = timer()  # end of the timed section
print(end - start)  # seconds elapsed in this cell

283.82671159999995
