# Read, Clean, Upload Wedge Files

In [1]:
import pandas as pd
import os
import csv

import re
import datetime
import io
from zipfile import ZipFile

import pandas as pd
import numpy as np
import pandas_gbq
import janitor

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account

# Connect to GBQ

In [2]:
# Machine-specific settings: point these at your own key file and project.
service_path = "C:\\Users\\ofano\\Documents\\MSBA Folder\\MSBA Folder\\"  # folder that holds the key file
service_file = 'msba-project-2022-75bb8251ef6f.json'  # service-account key file name
gbq_proj_id = 'msba-project-2022'  # BigQuery project id

# Full path to the key file, built from the two pieces above (no edit needed).
private_key = f"{service_path}{service_file}"

In [3]:
# Load the service-account key so Python has permission to access our project.
# `private_key` is the full key-file path (service_path + service_file).
credentials = service_account.Credentials.from_service_account_file(private_key)

In [4]:
# Open the BigQuery client for our project using the credentials loaded above.
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

In [5]:
# Sanity check: print the full id of every dataset the client can list.
for dataset in client.list_datasets():
    print(dataset.full_dataset_id)

msba-project-2022:dram_shop
msba-project-2022:wedge_dataset


# Specify Google BigQuery location

In [6]:
# Pattern used to pick which tables get deleted before re-uploading.
# The Wedge tables listed in this dataset are named "transArchive_<start>_<end>",
# so anchor on that prefix. The previous pattern r"(\D{12})" matched ANY table id
# containing 12 consecutive non-digit characters, which could delete unrelated tables.
name_pattern = re.compile(r"^transArchive_")

In [7]:
# BigQuery dataset (inside the project) that this notebook lists, clears and uploads to.
dataset_id = 'wedge_dataset'

In [8]:
# Show which tables currently exist in the dataset.
for existing_table in client.list_tables(dataset_id):
    print(f'Looking at {existing_table.table_id}')

Looking at transArchive_201001_201003
Looking at transArchive_201004_201006
Looking at transArchive_201007_201009


In [9]:
# Delete every table in the dataset whose id matches `name_pattern`,
# so the upload further down starts from a clean slate.
for table in client.list_tables(dataset_id):
    if not name_pattern.search(table.table_id):
        continue  # not one of the tables we manage; leave it alone

    # Fully qualified id in "project.dataset.table" form.
    table_id = f"{gbq_proj_id}.{dataset_id}.{table.table_id}"
    client.delete_table(table_id, not_found_ok=True)

    print(f"Deleted {table.table_id}.")

Deleted transArchive_201001_201003.
Deleted transArchive_201004_201006.
Deleted transArchive_201007_201009.


# Reading in and Cleaning the zips

Now we need to read in our dirty zip files

In [10]:
# Collect the names of the .zip archives in the Wedge_zips folder.
# - Filtering on the extension keeps stray files (e.g. .DS_Store, desktop.ini)
#   away from ZipFile, which would raise on anything that is not a zip.
# - Dropping the trailing backslash lets the listing work on non-Windows machines.
# - Sorting makes the processing order deterministic between runs.
zip_files = sorted(
    entry for entry in os.listdir("Wedge_zips") if entry.lower().endswith(".zip")
)

Now let's open the files, find their delimiters and headers, and read each of them into a pandas DataFrame.

In [11]:
# Column names applied to files that do not carry their own header row (50 columns, in file order).
headers = [
    "datetime", "register_no", "emp_no", "trans_no", "upc",
    "description", "trans_type", "trans_subtype", "trans_status", "department",
    "quantity", "Scale", "cost", "unitPrice", "total",
    "regPrice", "altPrice", "tax", "taxexempt", "foodstamp", "wicable",
    "discount", "memDiscount", "discountable", "discounttype", "voided",
    "percentDiscount", "ItemQtty", "volDiscType", "volume",
    "VolSpecial", "mixMatch", "matched", "memType", "staff",
    "numflag", "itemstatus", "tenderstatus", "charflag", "varflag",
    "batchHeaderID", "local", "organic", "display", "receipt",
    "card_no", "store", "branch", "match_id", "trans_id",
]

# Cleaning Files and Uploading to Google BigQuery

In [1]:
# For every archive: sniff each CSV's delimiter, detect whether it has a header row,
# load it into a DataFrame, fix the column types and upload it to BigQuery.


def _read_wedge_csv(zf, file_name):
    """Read one member of an open ZipFile into a DataFrame.

    Only the first line is inspected: it is used to sniff the delimiter and to
    decide whether the file carries its own header row (the line starts with
    ``d`` or ``"d`` -- presumably the ``datetime`` column name). The member is
    then opened a second time so pandas parses it from the very first line.
    Peeking and parsing on the same stream would silently drop leading rows.

    Parameters
    ----------
    zf : zipfile.ZipFile
        The open archive.
    file_name : str
        Name of the member inside the archive.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or None when the member has no content
        (for example an empty file or a directory entry).

    Raises
    ------
    csv.Error
        If the delimiter cannot be determined from the first line.
    """
    with io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8") as peek:
        first_line = peek.readline()

    if not first_line.strip():
        return None

    delimiter = csv.Sniffer().sniff(sample=first_line).delimiter
    has_header = first_line.startswith(("d", '"d'))

    with io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8") as text_file:
        if has_header:
            # The file names its own columns.
            return pd.read_csv(text_file, sep=delimiter)
        # No header row: every line is data, so supply the column names ourselves.
        return pd.read_csv(text_file, sep=delimiter, names=headers)


def _clean_wedge_frame(df):
    """Apply the type fixes needed before upload and return the cleaned frame.

    - ``datetime`` is parsed with the format ``%Y-%m-%d %H:%M:%S``.
    - ``local`` has missing values and the literal text ``\\N`` replaced by 0,
      then is cast to int.
    - ``altPrice`` is cast to str.
    - Every other column has NaN replaced by an empty string; columns that are
      (or thereby become) object dtype are cast to str.
    - Column names are normalised with ``janitor.clean_names``.

    The three special columns are converted once, up front, instead of once per
    column of the frame.
    """
    df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')
    df['local'] = df['local'].fillna(0).replace('\\N', 0).astype(int)
    df['altPrice'] = df['altPrice'].astype(str)

    for column in df.columns:
        if column in ('datetime', 'local', 'altPrice'):
            continue  # already converted above; must not be blanked or re-cast
        df[column] = df[column].replace(np.nan, '', regex=True)
        if df[column].dtype == object:
            df[column] = df[column].astype(str)

    return janitor.clean_names(df)


for zip_name in zip_files:
    with ZipFile(os.path.join("Wedge_zips", zip_name), 'r') as zf:
        for file_name in zf.namelist():
            df = _read_wedge_csv(zf, file_name)
            if df is None:
                # Nothing to upload; never fall back to the previous file's frame.
                print(f"Skipping empty file {file_name}.")
                continue

            df = _clean_wedge_frame(df)

            # The table is named after the CSV file, e.g. transArchive_201001_201003.
            table_name = file_name.replace('.csv', '')
            table_id = ".".join([gbq_proj_id, dataset_id, table_name])
            pandas_gbq.to_gbq(df, table_id, project_id=gbq_proj_id, if_exists="replace")

In [24]:
#df.dtypes