## Task 1: Cleaning the Wedge transaction data files and uploading them to GBQ.

In [1]:
# This line imports the os module, which provides functions for interacting with the operating system. 
# It allows you to perform operations such as file and directory manipulation.

import os

# This line imports the re module, which provides support for regular expressions. 
# Regular expressions are a powerful tool for pattern matching and manipulation of strings.

import re

# This line imports the datetime module, which provides classes for manipulating dates and times. 
# It allows you to work with dates, times, time intervals, and perform various operations on them.

import datetime 

# This line imports the ZipFile class from the zipfile module. It allows you to create, read, write, and 
# extract files from ZIP archives.

from zipfile import ZipFile

# This line imports the pandas library and assigns it the alias pd. pandas is a powerful data manipulation and 
# analysis library in Python. It provides data structures and functions for efficiently working with structured 
# data, such as tables or CSV files.

import pandas as pd

# This line imports the numpy library and assigns it the alias np. numpy is a fundamental package for scientific 
# computing in Python. It provides support for large, multi-dimensional arrays and matrices, along with a collection 
# of mathematical functions to operate on them.

import numpy as np

# This line imports the pandas_gbq module, which provides functionality for working with Google BigQuery from within 
# the pandas library. It allows you to read data from and write data to BigQuery tables.

import pandas_gbq

# This line imports the janitor library, which provides additional cleaning and data manipulation functions for 
# pandas DataFrames. It extends the functionality of pandas and allows for more streamlined data cleaning operations.

import janitor

# This line imports the shutil module, which provides high-level file and directory operations. It allows you to 
# perform operations such as copying, moving, and deleting files or directories.

import shutil

# This line imports the glob module, which provides a function for searching directories and retrieving file paths 
# that match specified patterns. It allows for flexible file path matching based on wildcards and patterns.

import glob

# This line imports the bigquery module from the google.cloud package. It provides functionality for interacting with 
# Google BigQuery, a fully-managed, serverless data warehouse.

from google.cloud import bigquery

# This line imports the service_account module from the google.oauth2 package. It provides support for authenticating 
# and authorizing access to Google Cloud services using a service account key.

from google.oauth2 import service_account

# This line imports the warnings module and will be used to stop warnings from printing to output for all code chunks.

import warnings

In [2]:
# Ignoring all warnings

# warnings.filterwarnings("ignore")

#### This next section of cells is necessary to generate lists of delimiters and headers that correspond with the .csv file names for use later on in the code in this notebook.

In [3]:
# Creates list of zip files in this directory.

zip_files = os.listdir("WedgeZipOfZips/")

In [4]:
# Looking at the list of zip files

zip_files

['transArchive_201001_201003.zip',
 'transArchive_201004_201006.zip',
 'transArchive_201007_201009.zip',
 'transArchive_201010_201012.zip',
 'transArchive_201101_201103.zip',
 'transArchive_201104.zip',
 'transArchive_201105.zip',
 'transArchive_201106.zip',
 'transArchive_201107_201109.zip',
 'transArchive_201110_201112.zip',
 'transArchive_201201_201203.zip',
 'transArchive_201201_201203_inactive.zip',
 'transArchive_201204_201206.zip',
 'transArchive_201204_201206_inactive.zip',
 'transArchive_201207_201209.zip',
 'transArchive_201207_201209_inactive.zip',
 'transArchive_201210_201212.zip',
 'transArchive_201210_201212_inactive.zip',
 'transArchive_201301_201303.zip',
 'transArchive_201301_201303_inactive.zip',
 'transArchive_201304_201306.zip',
 'transArchive_201304_201306_inactive.zip',
 'transArchive_201307_201309.zip',
 'transArchive_201307_201309_inactive.zip',
 'transArchive_201310_201312.zip',
 'transArchive_201310_201312_inactive.zip',
 'transArchive_201401_201403.zip',
 'tr

In [5]:
# Checking to see what delimiters were used for each .csv file

import csv # module for handling csv files
import io # module for handling input/output operations

# Here, a dictionary called delimiters is initialized. 
# This dictionary will store the filenames as keys and their corresponding delimiters as values.

delimiters = dict() 

# Iterating over the list of filenames in the directory

for this_zf in zip_files :
    
    # Within the loop, the code opens a ZIP file specified by 
    # "WedgeZipOfZips/" + this_zf in read mode ('r'). It assigns the opened ZIP file object to the variable zf. 
    # The with statement ensures that the ZIP file is properly closed after the code block is executed.
    
    with ZipFile("WedgeZipOfZips/" + this_zf,'r') as zf : # r-read,w-write,a-append
        
        # This line retrieves a list of filenames contained within the currently opened ZIP file (zf) using the 
        # namelist() method. The list of filenames is assigned to the variable zipped_files.
        
        zipped_files = zf.namelist()
        
        # Another for loop is initiated within the previous loop, iterating over each file_name in the zipped_files list.
        
        for file_name in zipped_files :
            
            # Here, the code opens the current file_name within the ZIP file (zf) in read mode ('r'). 
            # The opened file object is assigned to the variable input_file.
            
            input_file = zf.open(file_name,'r')
            
            # This line wraps the input_file object with a TextIOWrapper from the io module, which allows 
            # reading the file in a text mode. It specifies the encoding as UTF-8.
            
            input_file = io.TextIOWrapper(input_file,encoding="utf-8")
            
            # Here, the code uses the csv.Sniffer() class to automatically determine the delimiter used in the CSV file. 
            # It does this by reading a single line (sample) from the input_file and analyzing the characters that 
            # separate the values. The potential delimiters are specified as a list [",", ";", "\t"].
            
            dialect = csv.Sniffer().sniff(sample=input_file.readline(), delimiters=[",",";","\t"])
            
            # This line updates the delimiters dictionary. It sets the value for the current file_name as the delimiter 
            # detected in the dialect object.
            
            delimiters[file_name] = dialect.delimiter
            
            # These lines print a message indicating the detected delimiter for the current file. The join() method 
            # concatenates the strings within the provided list, separated by spaces, and the resulting message 
            # is printed to the console. 
            
            print(zipped_files)
            print(input_file)
            print(dialect.delimiter)
            print(" ".join(["It looks like", file_name, "has delimiter", dialect.delimiter, "."]))
            print()
            
            # This line closes the input_file object to free up system resources.
            
            input_file.close() # Tidy up.      

['transArchive_201001_201003.csv']
<_io.TextIOWrapper name='transArchive_201001_201003.csv' encoding='utf-8'>
,
It looks like transArchive_201001_201003.csv has delimiter , .

['transArchive_201004_201006.csv']
<_io.TextIOWrapper name='transArchive_201004_201006.csv' encoding='utf-8'>
,
It looks like transArchive_201004_201006.csv has delimiter , .

['transArchive_201007_201009.csv']
<_io.TextIOWrapper name='transArchive_201007_201009.csv' encoding='utf-8'>
,
It looks like transArchive_201007_201009.csv has delimiter , .

['transArchive_201010_201012.csv']
<_io.TextIOWrapper name='transArchive_201010_201012.csv' encoding='utf-8'>
,
It looks like transArchive_201010_201012.csv has delimiter , .

['transArchive_201101_201103.csv']
<_io.TextIOWrapper name='transArchive_201101_201103.csv' encoding='utf-8'>
,
It looks like transArchive_201101_201103.csv has delimiter , .

['transArchive_201104.csv']
<_io.TextIOWrapper name='transArchive_201104.csv' encoding='utf-8'>
,
It looks like transArc

In [6]:
delimiters

{'transArchive_201001_201003.csv': ',',
 'transArchive_201004_201006.csv': ',',
 'transArchive_201007_201009.csv': ',',
 'transArchive_201010_201012.csv': ',',
 'transArchive_201101_201103.csv': ',',
 'transArchive_201104.csv': ',',
 'transArchive_201105.csv': ',',
 'transArchive_201106.csv': ',',
 'transArchive_201107_201109.csv': ',',
 'transArchive_201110_201112.csv': ',',
 'transArchive_201201_201203.csv': ',',
 'transArchive_201201_201203_inactive.csv': ';',
 'transArchive_201204_201206.csv': ',',
 'transArchive_201204_201206_inactive.csv': ';',
 'transArchive_201207_201209.csv': ',',
 'transArchive_201207_201209_inactive.csv': ';',
 'transArchive_201210_201212.csv': ',',
 'transArchive_201210_201212_inactive.csv': ';',
 'transArchive_201301_201303.csv': ',',
 'transArchive_201301_201303_inactive.csv': ';',
 'transArchive_201304_201306.csv': ',',
 'transArchive_201304_201306_inactive.csv': ';',
 'transArchive_201307_201309.csv': ',',
 'transArchive_201307_201309_inactive.csv': ';'

In [7]:
# Checking for headers

# These lines initialize an empty dictionary called headers. This dictionary will store the filenames as keys and a 
# boolean value indicating whether the file has a header as values.

headers = dict()

# This line starts a for loop that iterates over each item (this_zf) in the zip_files list.

for this_zf in zip_files :
    
    # Within the loop, the code opens a ZIP file specified by "WedgeZipOfZips/" + this_zf in read mode ('r'). 
    # It assigns the opened ZIP file object to the variable zf. The with statement ensures that the ZIP file 
    # is properly closed after the code block is executed.
    
    with ZipFile("WedgeZipOfZips/" + this_zf,'r') as zf :
        
        # This line retrieves a list of filenames contained within the currently opened ZIP file (zf) using the 
        # namelist() method. The list of filenames is assigned to the variable zipped_files.
        
        zipped_files = zf.namelist()
        
        # Another for loop is initiated within the previous loop, iterating over each file_name in the zipped_files list.
        
        for file_name in zipped_files :
            
            # Here, the code opens the current file_name within the ZIP file (zf) in read mode ('r'). The opened 
            # file object is assigned to the variable input_file.
            
            input_file = zf.open(file_name,'r')
            
            # This line wraps the input_file object with a TextIOWrapper from the io module, which allows reading the 
            # file in a text mode. It specifies the encoding as UTF-8.
            
            input_file = io.TextIOWrapper(input_file,encoding="utf-8")
            
            # This line retrieves the delimiter associated with the current file_name from the delimiters dictionary 
            # and assigns it to the variable this_delimiter.
            
            this_delimiter = delimiters[file_name]
            
            # Just printing these so we know what file and such the first printed row is associated with
            
            print(zipped_files)
            print(input_file)
            print(this_delimiter)
            
            # Within this loop, the code iterates over each line in the input_file. However, it only processes the 
            # first line (break statement is used to exit the loop after the first iteration). The line itself is 
            # stripped of leading and trailing whitespace and then split into a list of values using the this_delimiter.
            
            for line in input_file :
                print(input_file)
                print(line.strip().split(this_delimiter))
                print()
                break     
                
            # This line assigns a boolean value to the headers dictionary for the current file_name. It checks if the 
            # string "datetime" is present in the last line read (line) and stores the result (True or False) as the 
            # value in the dictionary.
            
            headers[file_name] = "datetime" in line
            
            # This line closes the input_file object to free up system resources.       
            
            input_file.close() # Tidy up.

['transArchive_201001_201003.csv']
<_io.TextIOWrapper name='transArchive_201001_201003.csv' encoding='utf-8'>
,
<_io.TextIOWrapper name='transArchive_201001_201003.csv' encoding='utf-8'>
['"datetime"', '"register_no"', '"emp_no"', '"trans_no"', '"upc"', '"description"', '"trans_type"', '"trans_subtype"', '"trans_status"', '"department"', '"quantity"', '"Scale"', '"cost"', '"unitPrice"', '"total"', '"regPrice"', '"altPrice"', '"tax"', '"taxexempt"', '"foodstamp"', '"wicable"', '"discount"', '"memDiscount"', '"discountable"', '"discounttype"', '"voided"', '"percentDiscount"', '"ItemQtty"', '"volDiscType"', '"volume"', '"VolSpecial"', '"mixMatch"', '"matched"', '"memType"', '"staff"', '"numflag"', '"itemstatus"', '"tenderstatus"', '"charflag"', '"varflag"', '"batchHeaderID"', '"local"', '"organic"', '"display"', '"receipt"', '"card_no"', '"store"', '"branch"', '"match_id"', '"trans_id"']

['transArchive_201004_201006.csv']
<_io.TextIOWrapper name='transArchive_201004_201006.csv' encoding=

In [8]:
# Just having a look.

print(zipped_files)
print(input_file)
print(this_delimiter)

['transArchive_201701.csv']
<_io.TextIOWrapper name='transArchive_201701.csv' encoding='utf-8'>
,


In [9]:
# Just checking it out.

headers

{'transArchive_201001_201003.csv': True,
 'transArchive_201004_201006.csv': True,
 'transArchive_201007_201009.csv': True,
 'transArchive_201010_201012.csv': True,
 'transArchive_201101_201103.csv': True,
 'transArchive_201104.csv': True,
 'transArchive_201105.csv': True,
 'transArchive_201106.csv': True,
 'transArchive_201107_201109.csv': True,
 'transArchive_201110_201112.csv': True,
 'transArchive_201201_201203.csv': True,
 'transArchive_201201_201203_inactive.csv': True,
 'transArchive_201204_201206.csv': True,
 'transArchive_201204_201206_inactive.csv': True,
 'transArchive_201207_201209.csv': True,
 'transArchive_201207_201209_inactive.csv': True,
 'transArchive_201210_201212.csv': True,
 'transArchive_201210_201212_inactive.csv': True,
 'transArchive_201301_201303.csv': True,
 'transArchive_201301_201303_inactive.csv': True,
 'transArchive_201304_201306.csv': True,
 'transArchive_201304_201306_inactive.csv': True,
 'transArchive_201307_201309.csv': True,
 'transArchive_201307_20

#### Next, the 'Wedge_Unzipped' directory is deleted if it exists. Then we need an extraction of the original zipped data into a new directory 'Wedge_Unzipped'

In [10]:
folderPath = 'Wedge_Unzipped';
    
# This line checks if the folder specified by folderPath exists or not. It uses the os.path.exists() function from 
# the os module to determine if the folder exists.

if os.path.exists(folderPath):
      
    # If the folder exists, this line removes the folder and all its contents. It uses the shutil.rmtree() 
    # function from the shutil module to recursively delete the directory specified by folderPath.
    
    shutil.rmtree(folderPath)
  
    print("The folder has been deleted successfully!")
else:
    print("Cannot delete the folder as it doesn't exists")

Cannot delete the folder as it doesn't exists


In [11]:
# Extracts all zips in 'WedgeZipOfZips' to 'Wedge_Unzipped'.

# This line starts a for loop that iterates over each item (zipf) in the zip_files list.

for zipf in zip_files :
    
    # Within the loop, the code opens a ZIP file specified by "WedgeZipOfZips/" + zipf in read mode ('r'). 
    # It assigns the opened ZIP file object to the variable zf. The with statement ensures that the ZIP file is 
    # properly closed after the code block is executed.
    
    with ZipFile("WedgeZipOfZips/" + zipf,'r') as zf :  
        print(zf.namelist())        
        print('Extracting this file now...')
        
        # This line extracts all the files and directories from the currently opened ZIP file (zf) and saves them 
        # to the 'Wedge_Unzipped' directory. The extractall() method is used to perform the extraction.
        
        zf.extractall('Wedge_Unzipped')
        print('Done!')
        print()
        #break

['transArchive_201001_201003.csv']
Extracting this file now...
Done!

['transArchive_201004_201006.csv']
Extracting this file now...
Done!

['transArchive_201007_201009.csv']
Extracting this file now...
Done!

['transArchive_201010_201012.csv']
Extracting this file now...
Done!

['transArchive_201101_201103.csv']
Extracting this file now...
Done!

['transArchive_201104.csv']
Extracting this file now...
Done!

['transArchive_201105.csv']
Extracting this file now...
Done!

['transArchive_201106.csv']
Extracting this file now...
Done!

['transArchive_201107_201109.csv']
Extracting this file now...
Done!

['transArchive_201110_201112.csv']
Extracting this file now...
Done!

['transArchive_201201_201203.csv']
Extracting this file now...
Done!

['transArchive_201201_201203_inactive.csv']
Extracting this file now...
Done!

['transArchive_201204_201206.csv']
Extracting this file now...
Done!

['transArchive_201204_201206_inactive.csv']
Extracting this file now...
Done!

['transArchive_201207_2

#### Now, the files in 'Wedge_Unzipped' that do not have headers need to be moved to a new directory called 'O_Headers' after deleting this directory if it exists. The code then converts the headerless csv files to files with headers and moves them back to 'Wedge_Unzipped'. After, we clean up 'O_Headers' as it is no longer needed.

In [12]:
folderPath = 'O_Headers';
    
# Check if folder exists or not.

if os.path.exists(folderPath):
      
    # Delete Folder.
    
    shutil.rmtree(folderPath)
  
    print("The folder has been deleted successfully!")
else:
    print("Cannot delete the folder as it doesn't exists")

Cannot delete the folder as it doesn't exists


In [13]:
# This cell converts the headers dictionary built earlier on into a dataframe.

keys = []
values = []

# This for loop iterates over the values of the headers dictionary. It retrieves each value and appends it to the 
# values list.

for value in headers.values():
    values.append(value)
    
# This for loop iterates over the keys of the headers dictionary. It retrieves each key and appends it to the keys list. 

for key in headers.keys():
    keys.append(key)

# Column name list. 

col_names =  ['file', 'headers']
  
# Create an empty dataframe.
# Add columns.

headers_df  = pd.DataFrame(columns = col_names)
headers_df.file = keys
headers_df.headers = values

# Show the dataframe.

headers_df

Unnamed: 0,file,headers
0,transArchive_201001_201003.csv,True
1,transArchive_201004_201006.csv,True
2,transArchive_201007_201009.csv,True
3,transArchive_201010_201012.csv,True
4,transArchive_201101_201103.csv,True
5,transArchive_201104.csv,True
6,transArchive_201105.csv,True
7,transArchive_201106.csv,True
8,transArchive_201107_201109.csv,True
9,transArchive_201110_201112.csv,True


In [14]:
# Converts boolean to float.

headers_df['headers'] = headers_df['headers'].astype(float)
headers_df

Unnamed: 0,file,headers
0,transArchive_201001_201003.csv,1.0
1,transArchive_201004_201006.csv,1.0
2,transArchive_201007_201009.csv,1.0
3,transArchive_201010_201012.csv,1.0
4,transArchive_201101_201103.csv,1.0
5,transArchive_201104.csv,1.0
6,transArchive_201105.csv,1.0
7,transArchive_201106.csv,1.0
8,transArchive_201107_201109.csv,1.0
9,transArchive_201110_201112.csv,1.0


In [15]:
# Creates list of files with no headers.

# This line filters the headers_df dataframe to select rows where the value in the 'headers' column is equal to 0. 
# It creates a new dataframe called headless that contains only those rows.

headless = headers_df[(headers_df.headers == 0)]

# Pulling 'file' from headless and reassigning headless

headless = headless.file

# Creating empty list

file_names_headless = []

# taking all file names from list of files without headers and putting them in the empty list

for file_name in headless :
    file_names_headless.append(file_name)

In [16]:
# Creates 'O_Headers' directory.

# This line assigns the string "O_Headers" to the variable path. It represents the path of the directory that will 
# be created if it doesn't already exist.

path = "O_Headers"

# This line checks if the directory specified by path exists or not. It uses the os.path.exists() function from the 
# os module to determine if the directory exists. The result is stored in the variable isExist.

isExist = os.path.exists(path)

# This line starts an if statement that checks if the isExist variable is False, indicating that the directory does 
# not exist.

if not isExist:
    
    # If the directory does not exist, this line creates the directory specified by path using the os.makedirs() 
    # function from the os module. It creates any necessary parent directories as well.
    
    os.makedirs(path)
    print("The new directory is created!")

The new directory is created!


In [17]:
# Moves files with no headers to 'O_Headers'.

# These lines define the source folder, destination folder, and the list of files to move. source_folder represents 
# the path of the folder where the files are currently located. destination_folder represents the path of the 
# folder where the files will be moved to. files_to_move is a list of file names that need to be moved.

source_folder = r"Wedge_Unzipped\\"
destination_folder = r"O_Headers\\"
files_to_move = file_names_headless

# This line starts a for loop that iterates over each file name in the files_to_move list.

for file in files_to_move:
    
    # These lines construct the full file paths for the source and destination of each file to be moved. They 
    # concatenate the source_folder and destination_folder paths with the current file name to create the full paths.
    
    source = source_folder + file
    destination = destination_folder + file
    
    # This line moves the file from the source path to the destination path using the shutil.move() function from the 
    # shutil module. It effectively performs the file move operation.
    
    shutil.move(source, destination)
    print('Moved:', file)

Moved: transArchive_201511.csv
Moved: transArchive_201512.csv
Moved: transArchive_201601.csv
Moved: transArchive_201602.csv
Moved: transArchive_201603.csv
Moved: transArchive_201604.csv
Moved: transArchive_201605.csv
Moved: transArchive_201606.csv
Moved: transArchive_201607.csv
Moved: transArchive_201608.csv
Moved: transArchive_201609.csv
Moved: transArchive_201610.csv
Moved: transArchive_201611.csv
Moved: transArchive_201612.csv
Moved: transArchive_201701.csv


In [18]:
# This line uses the os.listdir() function to retrieve a list of file names in the directory specified as "O_Headers". 
# It assigns the list of file names to the variable headless_list.

headless_list = os.listdir("O_Headers")

# This for loop iterates over each file name in the headless_list. It reads each CSV file located in the "O_headers" 
# directory using pd.read_csv() from the pandas library. It assigns the resulting DataFrame to the variable big_heads. 
# The header=None argument is used to indicate that the CSV file does not have a header row.

for headless in headless_list :
    big_heads = pd.read_csv('O_headers\\'+headless, header = None)
    
# This for loop iterates over each file name in the headless_list. 

for headless in headless_list :
    
    # It specifies the output path as 'Wedge_Unzipped\\'
    
    path = 'Wedge_Unzipped\\'
    
    # and uses the to_csv() method on the big_heads DataFrame to write the DataFrame to a CSV file with the same name as 
    # the original file. # The header argument is used to provide a list of column names for the output CSV file.
    
    big_heads.to_csv(
        path + headless,
        header=["datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype",
                "trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice",
                "tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype",
                "voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched",
                "memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local",
                "organic","display","receipt","card_no","store","branch","match_id","trans_id"],
        index=False) # The index=False argument ensures that the index column is not included in the output file.

  big_heads = pd.read_csv('O_headers\\'+headless, header = None)
  big_heads = pd.read_csv('O_headers\\'+headless, header = None)
  big_heads = pd.read_csv('O_headers\\'+headless, header = None)
  big_heads = pd.read_csv('O_headers\\'+headless, header = None)
  big_heads = pd.read_csv('O_headers\\'+headless, header = None)
  big_heads = pd.read_csv('O_headers\\'+headless, header = None)
  big_heads = pd.read_csv('O_headers\\'+headless, header = None)
  big_heads = pd.read_csv('O_headers\\'+headless, header = None)


In [19]:
# This chunk deletes the now useless 'O_Headers' directory.

# This line assigns the string 'O_Headers' to the variable folderPath. It represents the path of the directory that 
# will be deleted if it exists.

folderPath = 'O_Headers';
    
# This line checks if the directory specified by folderPath exists or not. It uses the os.path.exists() function 
# from the os module to determine if the directory exists.

if os.path.exists(folderPath):
      
    # If the directory exists, this line removes the directory and its contents using the shutil.rmtree() function 
    # from the shutil module. It effectively deletes the directory and all its subdirectories and files.
    
    shutil.rmtree(folderPath)
  
    print("The folder has been deleted successfully!")
else:
    print("Cannot delete the folder as it doesn't exists")

The folder has been deleted successfully!


#### Now we need to upload the data to GBQ after establishing a connection.

In [20]:
# Building the private key.

# This line assigns the string representing the path to the JSON file to the variable service_path. It specifies the 
# directory path where the JSON file is located.

service_path = "C:\\Users\\rsmcd\\OneDrive\\Desktop\\MSBA Fall 2022\\" # Path to json file.

# This line assigns the string representing the name of the JSON file to the variable service_file. It specifies the 
# name of the JSON file, including its extension.

service_file = 'reese-msba-9558fdd20984.json' # Name of json file.

# This line assigns the string representing the name of the Google BigQuery project to the variable gbq_proj_id. It 
# specifies the project ID associated with the Google BigQuery service.

gbq_proj_id = 'reese-msba' # Name of project.

# This line concatenates the service_path and service_file variables to form the complete file path to the JSON file. 
# It assigns this concatenated path string to the variable private_key. The private_key variable now holds the full 
# path to the JSON file, which can be used to authenticate and access the Google BigQuery service.

private_key =service_path + service_file # Creates single variable that leads to json file.

In [21]:
# A connection to Google BigQuery is established here using the credentials from a service account JSON file. The code then 
# lists the datasets available in the connected BigQuery project.

# Now we pass in our credentials so that Python has permission to access our project. This line uses the 
# from_service_account_file() method from the service_account module to create credentials for authentication. It takes 
# the complete file path to the service account JSON file (service_path + service_file) and generates the credentials 
# object.

credentials = service_account.Credentials.from_service_account_file(service_path + service_file)

# And finally we establish our connection. This line creates a BigQuery client object using the Client() constructor 
# from the bigquery module. It passes the credentials object and the gbq_proj_id as parameters to authenticate the 
# client connection. This establishes the connection to the specified BigQuery project.

client = bigquery.Client(credentials = credentials, project=gbq_proj_id)

# Look at list of data sets in client. This code block uses a for loop to iterate over the datasets in the connected 
# BigQuery project. The client.list_datasets() method retrieves a list of all datasets, and each item represents a 
# dataset. The item.full_dataset_id attribute is printed, which contains the full identifier of the dataset, including 
# the project ID and dataset name.

for item in client.list_datasets() : 
    print(item.full_dataset_id)

reese-msba:dram_shop
reese-msba:wedge_transactions


#### This body of code cleans up the files and uploads to GBQ.

In [22]:
# This line assigns the string 'reese-msba' to the variable gbq_proj_id. It represents the Google BigQuery project ID to 
# which you want to connect.

gbq_proj_id = 'reese-msba'

# This line assigns the string 'wedge_transactions' to the variable dataset_id. It represents the dataset ID within the 
# specified Google BigQuery project where you want to perform operations.

dataset_id = 'wedge_transactions'

# This line uses the os.listdir() function to retrieve a list of filenames within the directory "Wedge_Unzipped". It 
# assigns the list of filenames to the variable unzipped_files. The os.listdir() function returns a list of all files 
# and directories present in the specified directory.

unzipped_files = os.listdir("Wedge_Unzipped")

# Iterating through the list of unzipped files that all should now have headers. The following are cleaning operations
# that unify data types and correct inconsistencies in entry of booleans and nulls. After, the data is loaded into GBQ.

for uz_file in unzipped_files :    
    
    # This line reads the CSV file specified by 'Wedge_Unzipped\\' + uz_file into a DataFrame called big_wedge. The sep 
    # parameter is set to the delimiter specific to the file (delimiters[uz_file]), and the encoding parameter is set 
    # to "utf-8" to ensure proper character encoding.
    
    big_wedge = pd.read_csv('Wedge_Unzipped\\'+uz_file,sep=delimiters[uz_file], encoding = "utf-8") 
    
    # Replaces all \N and ' ' with gbq ready null values.
    
    big_wedge = big_wedge.replace(r'\N', np.nan).replace(r' ', np.nan) 
    
    # This code block iterates over each column in the big_wedge DataFrame using enumerate(). It checks the data type 
    # of each column and performs type conversions if necessary. If the column's data type is object, it converts the 
    # column to the str data type using astype() and replaces the string 'nan' with NaN values. If the column's data 
    # type is "int64", it converts the column to the float data type using astype().
    
    for idx, column in enumerate(big_wedge) :
        
        # Converts all object columns to strings and replaces 'nan' with gbq ready null values.
        
        if big_wedge[column].dtypes == object : 
            big_wedge = big_wedge.astype({column :'str'}).replace('nan', np.nan)
            
        # Converts all integer columns to floats.  
        
        if big_wedge[column].dtypes == "int64" : 
            big_wedge = big_wedge.astype({column :'float'})
            
    # Converts datetime columns to timestamp.  
    
    big_wedge['datetime'] = pd.to_datetime(big_wedge['datetime']) 
    
    # List of columns to convert to float.
    
    cols = ['wicable','taxexempt','percentDiscount','receipt','match_id','local','organic','itemstatus','tenderstatus'] 
    
    # Converts cols to float.
    
    for idx, col in enumerate(cols) :
        big_wedge[col] = pd.to_numeric(big_wedge[col])
        
    # List of columns to convert to boolean. 
    
    cols2 = ['memType','staff','batchHeaderID','display', 'wicable', 'taxexempt', 'local'] 
    
    # Converts 1 and 0 to True and False by mapping, then converts to boolean while preserving null values.
    
    for idx, col in enumerate(cols2) :
        big_wedge[col] = big_wedge[col].map({1:True, 0:False}).astype('boolean') 
#     break

    # Uploading all data files to GBQ.
    
    # These three lines extract the table name from each file name, construct the table ID 
    # using the project ID, dataset ID, and table name, and then write the DataFrame (big_wedge) to the specified 
    # BigQuery table using the pandas_gbq.to_gbq() function.
    
    # This line splits the uz_file variable at the period (.) and assigns the first part to table_name. The 
    # underscore _ is used as a placeholder to ignore the second part of the split result.
    
    table_name, _ = uz_file.split(".")
    
    # This line joins the gbq_proj_id, dataset_id, and table_name together using periods (.) as separators. It 
    # creates a full table identifier in the format of project_id.dataset_id.table_name. The resulting string is 
    # assigned to the table_id variable.
    
    table_id = ".".join([gbq_proj_id,dataset_id,table_name])
    
    # This line uses the to_gbq() function from the pandas_gbq module to write the big_wedge DataFrame to the 
    # specified BigQuery table.
    
    pandas_gbq.to_gbq(big_wedge, table_id, project_id=gbq_proj_id, if_exists="replace")
    
    # It takes the following arguments:

    # big_wedge: The DataFrame to be written to BigQuery.
    # table_id: The fully qualified table identifier in the format of project_id.dataset_id.table_name.
    # project_id: The project ID associated with the BigQuery table.
    # if_exists="replace": This parameter specifies the behavior if the table already exists. In this case, it is set 
    # to "replace", which means that if the table already exists, it will be replaced with the new data from the DataFrame.
    
#     break    

GenericGBQException: Reason: 403 POST https://bigquery.googleapis.com/bigquery/v2/projects/reese-msba/datasets/wedge_transactions/tables?prettyPrint=false: Billing has not been enabled for this project. Enable billing at https://console.cloud.google.com/billing. Datasets must have a default expiration time and default partition expiration time of less than 60 days while in sandbox mode.