Google Big Query is a distributed data warehouse built on a serverless architecture . We’ll discuss this framework in class. In this task you’ll upload all Wedge transaction records to Google Big Query. You’ll want to make sure that the column data types are correctly specified and you’ve properly handled the null values. 
The requirements for this task change depending on the grade you’re going for. 
Note: this assignment can be done manually or programmatically. Naturally I’d prefer it be done programmatically so that you get more practice, but that’s not required to get full credit. 

In [1]:
import pandas as pd
import zipfile
from google.cloud import bigquery
from google.oauth2 import service_account
from google.api_core.exceptions import NotFound
from google.cloud.bigquery import SchemaField
import os
import numpy as np 



In [2]:

zip_path = 'Data\wedge-clean-files.zip'  # Replace with your zip file path
extract_path = 'Data\Clean'   # Replace with your desired extract path

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [3]:
#export GOOGLE_APPLICATION_CREDENTIALS='wedge-project-403222-85fe5b35980b.json'

service_path = ""
service_file = 'wedge-project-403222-80aeb3085a6a.json' # change this to your authentication information  

gbq_proj_id = 'wedge-project-403222'  

# And this should stay the same. 
private_key = service_path + service_file

# Now we pass in our credentials so that Python has permission to access our project.
credentials = service_account.Credentials.from_service_account_file(private_key)

# And finally we establish our connection
client = bigquery.Client(credentials = credentials, project=gbq_proj_id)


In [4]:
# Check if the dataset exists
dataset_id = 'Transactions'
dataset_ref = client.dataset(dataset_id)

try:
    client.get_dataset(dataset_ref)
    print(f"Dataset {dataset_id} already exists.")
except NotFound:
    # Create the dataset if it does not exist
    dataset = bigquery.Dataset(dataset_ref)
    dataset = client.create_dataset(dataset)
    print(f"Dataset {dataset_id} created.")



Dataset Transactions already exists.


In [5]:
folder_path = 'Data\Clean\clean-files'
data_types = {
    'datetime': 'str',
    'register_no': 'Float64',
    'emp_no': 'Float64',
    'trans_no': 'Float64',
    'upc': 'str',
    'description': 'str',
    'trans_type': 'str',
    'trans_subtype': 'str',
    'trans_status': 'str',
    'department': 'Float64',
    'quantity': 'Float64',
    'Scale': 'Float64',
    'cost': 'Float64',
    'unitPrice': 'Float64',
    'total': 'Float64',
    'regPrice': 'Float64',
    'altPrice': 'Float64',
    'tax': 'Float64',
    'taxexempt': 'Float64',
    'foodstamp': 'Float64',
    'wicable': 'Float64',
    'discount': 'Float64',
    'memDiscount': 'Float64',
    'discountable': 'Float64',
    'discounttype': 'Float64',
    'voided': 'Float64',
    'percentDiscount': 'Float64',
    'ItemQtty': 'Float64',
    'volDiscType': 'Float64',
    'volume': 'Float64',
    'VolSpecial': 'Float64',
    'mixMatch': 'Float64',
    'matched': 'Float64',
    'memType': 'Float64',
    'staff': 'Float64',
    'numflag': 'Float64',
    'itemstatus': 'Float64',
    'tenderstatus': 'Float64',
    'charflag': 'str',
    'varflag': 'Float64',
    'batchHeaderID': 'Float64',
    'local': 'Float64',
    'organic': 'Float64',
    'display': 'Float64',
    'receipt': 'Float64',
    'card_no': 'Float64',
    'store': 'Float64',
    'branch': 'Float64',
    'match_id': 'Float64',
    'trans_id': 'Float64',

}

na_values =[' ']

for filename in os.listdir(folder_path):
    if filename.endswith('.csv'): 
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, dtype=data_types, na_values=na_values)

        for column, dtype in data_types.items():
            if dtype == 'Float64':
                df[column] = pd.to_numeric(df[column], errors='coerce')

In [7]:
# Convert to string
string_columns = ['trans_subtype', 'trans_status', 'charflag']
for col in string_columns:
    df[col] = df[col].astype(str)

# Convert to numeric (float), using NaN for non-convertible values
numeric_columns = ['taxexempt', 'wicable', 'percentDiscount', 'memType', 
                   'itemstatus', 'tenderstatus', 'local', 'organic', 
                   'receipt', 'match_id']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill NaN values with a default value if required, e.g., 0.0
df.fillna(0.0, inplace=True)

In [8]:
types_df = df.applymap(type)
unique_types_per_column = types_df.nunique()

# Identifying columns with more than one data type
mixed_type_columns = unique_types_per_column[unique_types_per_column > 1]
print("Columns with mixed types:\n", mixed_type_columns)

  types_df = df.applymap(type)


Columns with mixed types:
 Series([], dtype: int64)


In [9]:
schema1 = [
    SchemaField("datetime", "STRING", mode="NULLABLE"),
    SchemaField("register_no", "FLOAT", mode="NULLABLE"),
    SchemaField("emp_no", "FLOAT", mode="NULLABLE"),
    SchemaField("trans_no", "FLOAT", mode="NULLABLE"),
    SchemaField("upc", "STRING", mode="NULLABLE"),
    SchemaField("description", "STRING", mode="NULLABLE"),
    SchemaField("trans_type", "STRING", mode="NULLABLE"),
    SchemaField("trans_subtype", "STRING", mode="NULLABLE"),
    SchemaField("trans_status", "STRING", mode="NULLABLE"),
    SchemaField("department", "FLOAT", mode="NULLABLE"),
    SchemaField("quantity", "FLOAT", mode="NULLABLE"),
    SchemaField("Scale", "FLOAT", mode="NULLABLE"),
    SchemaField("cost", "FLOAT", mode="NULLABLE"),
    SchemaField("unitPrice", "FLOAT", mode="NULLABLE"),
    SchemaField("total", "FLOAT", mode="NULLABLE"),
    SchemaField("regPrice", "FLOAT", mode="NULLABLE"),
    SchemaField("altPrice", "FLOAT", mode="NULLABLE"),
    SchemaField("tax", "FLOAT", mode="NULLABLE"),
    SchemaField("taxexempt", "FLOAT", mode="NULLABLE"),
    SchemaField("foodstamp", "FLOAT", mode="NULLABLE"),
    SchemaField("wicable", "FLOAT", mode="NULLABLE"),
    SchemaField("discount", "FLOAT", mode="NULLABLE"),
    SchemaField("memDiscount", "FLOAT", mode="NULLABLE"),
    SchemaField("discountable", "FLOAT", mode="NULLABLE"),
    SchemaField("discounttype", "FLOAT", mode="NULLABLE"),
    SchemaField("voided", "FLOAT", mode="NULLABLE"),
    SchemaField("percentDiscount", "FLOAT", mode="NULLABLE"),
    SchemaField("ItemQtty", "FLOAT", mode="NULLABLE"),
    SchemaField("volDiscType", "FLOAT", mode="NULLABLE"),
    SchemaField("volume", "FLOAT", mode="NULLABLE"),
    SchemaField("VolSpecial", "FLOAT", mode="NULLABLE"),
    SchemaField("mixMatch", "FLOAT", mode="NULLABLE"),
    SchemaField("matched", "FLOAT", mode="NULLABLE"),
    SchemaField("memType", "FLOAT", mode="NULLABLE"),
    SchemaField("staff", "FLOAT", mode="NULLABLE"),
    SchemaField("numflag", "FLOAT", mode="NULLABLE"),
    SchemaField("itemstatus", "FLOAT", mode="NULLABLE"),
    SchemaField("tenderstatus", "FLOAT", mode="NULLABLE"),
    SchemaField("charflag", "STRING", mode="NULLABLE"),
    SchemaField("varflag", "FLOAT", mode="NULLABLE"),
    SchemaField("batchHeaderID", "FLOAT", mode="NULLABLE"),
    SchemaField("local", "FLOAT", mode="NULLABLE"),
    SchemaField("organic", "FLOAT", mode="NULLABLE"),
    SchemaField("display", "FLOAT", mode="NULLABLE"),
    SchemaField("receipt", "FLOAT", mode="NULLABLE"),
    SchemaField("card_no", "FLOAT", mode="NULLABLE"),
    SchemaField("store", "FLOAT", mode="NULLABLE"),
    SchemaField("branch", "FLOAT", mode="NULLABLE"),
    SchemaField("match_id", "FLOAT", mode="NULLABLE"),
    SchemaField("trans_id", "FLOAT", mode="NULLABLE")
]

In [11]:
# BigQuery client
#client = bigquery.Client()

# Path to the directory where files are extracted
files_path = 'Data\Clean\clean-files' # Update this to your path

# Loop through the files and upload each to BigQuery
for filename in os.listdir(files_path):
    if filename.endswith('.csv'):  # Assuming files are in CSV format
 
        file_path = os.path.join(files_path, filename)
        dataframe = pd.read_csv(file_path, low_memory=False)


        float_columns = ['register_no', 'emp_no', 'trans_no', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 
        'total', 'regPrice', 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 
        'discounttype', 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 
        'memType', 'staff', 'numflag', 'itemstatus', 'tenderstatus', 'varflag', 'batchHeaderID', 'local', 'organic', 'display', 
        'receipt', 'card_no', 'store', 'branch', 'match_id', 'trans_id']  # Add other float column names here
        for col in float_columns:
            dataframe[col] = pd.to_numeric(dataframe[col], errors='coerce')
        dataframe.fillna(0.0, inplace=True)

        project_id = 'wedge-project-403222'
        dataset_id = 'Transactions'
        table_id = os.path.splitext(filename)[0]

        # Define the full table ID
        table_full_id = f"{client.project}.{dataset_id}.{table_id}"

        # If the table does not exist, it will be created. If it exists, data will be appended.
        job = client.load_table_from_dataframe(dataframe, table_full_id, job_config=bigquery.LoadJobConfig(schema=schema1))

        # Wait for the job to complete
        job.result()
        print(f"Uploaded {filename} to {table_full_id}")

ArrowTypeError: Expected bytes, got a 'float' object