In [1]:
import pandas as pd
import os
from datetime import timedelta

In [2]:
# Read the five source CSV files (relative paths, so they are resolved against
# the current working directory) into one DataFrame each, in the order listed.
(
    df_customers,
    df_date,
    df_products,
    df_stores,
    df_transactions,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'DimCustomer.csv',
        'DimDate.csv',
        'DimProduct.csv',
        'DimStore.csv',
        'FactTransaction.csv',
    )
)

In [3]:
# Add errors/duplicates to simulate a dirty dataset.
#
# Every random draw below passes an explicit random_state, so re-running the
# notebook on the same input files injects the same rows. The only remaining
# run-to-run variation is in the future-dated rows, whose dates are offset from
# the moment this cell executes.
SEED = 42

# Bad values to inject. The number of rows sampled for each defect is taken
# from the length of the matching list, so the two cannot drift apart.
AGE_OUTLIER_VALUES = [3, 140, 125, 8, 111]
DISCOUNT_OUTLIER_VALUES = [0.5, -0.1, 0.75, 1.0, -0.25]
N_FUTURE_ROWS = 5

# 1. Duplicate rows: append copies of 10 customers and 50 transactions.
df_customers_dirty = pd.concat(
    [df_customers, df_customers.sample(10, random_state=SEED)],
    ignore_index=True,
)
df_transactions_dirty = pd.concat(
    [df_transactions, df_transactions.sample(50, random_state=SEED)],
    ignore_index=True,
)

# 2. Age outliers: copies of existing customers with implausible ages.
# Each further draw from the same frame gets its own seed (SEED + 1, SEED + 2).
age_outliers = df_customers.sample(
    len(AGE_OUTLIER_VALUES), random_state=SEED + 1
).copy()
age_outliers['age'] = AGE_OUTLIER_VALUES
df_customers_dirty = pd.concat([df_customers_dirty, age_outliers], ignore_index=True)

# 3. Future signup dates: copies of existing customers dated 120 days ahead.
# NOTE(review): the injected value is a full timestamp (date plus time of day);
# confirm that this is the intended format for signup_date in the output CSV.
future_customers = df_customers.sample(N_FUTURE_ROWS, random_state=SEED + 2).copy()
future_customers['signup_date'] = pd.Timestamp.today() + timedelta(days=120)
df_customers_dirty = pd.concat([df_customers_dirty, future_customers], ignore_index=True)

# 4. Invalid discounts: copies of existing transactions with the discount
# overwritten by the values listed in DISCOUNT_OUTLIER_VALUES.
discount_outliers = df_transactions.sample(
    len(DISCOUNT_OUTLIER_VALUES), random_state=SEED + 1
).copy()
discount_outliers['discount_applied'] = DISCOUNT_OUTLIER_VALUES
df_transactions_dirty = pd.concat([df_transactions_dirty, discount_outliers], ignore_index=True)

# 5. Future transaction dates: copies of existing transactions dated 90 days ahead.
# NOTE(review): same timestamp-format caveat as for signup_date above.
future_txn = df_transactions.sample(N_FUTURE_ROWS, random_state=SEED + 2).copy()
future_txn['transaction_date'] = pd.Timestamp.today() + timedelta(days=90)
df_transactions_dirty = pd.concat([df_transactions_dirty, future_txn], ignore_index=True)

# Save to new dirty dataset directory
dirty_path = "cooked/"
os.makedirs(dirty_path, exist_ok=True)

# Output file name -> frame to write. Only the customer and transaction frames
# were modified above; products, stores and dates are written out unchanged.
dirty_outputs = {
    "DimCustomer_dirty.csv": df_customers_dirty,
    "DimProduct_dirty.csv": df_products,
    "DimStore_dirty.csv": df_stores,
    "DimDate_dirty.csv": df_date,
    "FactTransaction_dirty.csv": df_transactions_dirty,
}
for csv_name, frame in dirty_outputs.items():
    # os.path.join does not depend on dirty_path ending with a separator.
    frame.to_csv(os.path.join(dirty_path, csv_name), index=False)

dirty_path


'cooked/'

In [None]:
from google.cloud import bigquery

# BigQuery client built with no explicit arguments, so project and credentials
# come from the environment.
client = bigquery.Client()

# Dataset that receives every table loaded below.
dataset_id = "cust_analytics"

# CSV file name (inside dirty_path) -> destination table name.
file_table_map = {
    "DimCustomer_dirty.csv": "dim_customers",
    "DimProduct_dirty.csv": "dim_products",
    "DimStore_dirty.csv": "dim_stores",
    "DimDate_dirty.csv": "dim_date",
    "FactTransaction_dirty.csv": "fact_transactions"
}

# Upload the CSV files one at a time, each into its own table.
for file_name, table_name in file_table_map.items():
    table_id = ".".join((client.project, dataset_id, table_name))
    file_path = os.path.join(dirty_path, file_name)

    # CSV with one header row, schema auto-detected, existing table contents
    # replaced (WRITE_TRUNCATE).
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(file_path, "rb") as source_file:
        load_job = client.load_table_from_file(
            source_file,
            table_id,
            job_config=job_config,
        )

    # Block until the load job has finished before reporting success.
    load_job.result()
    print(f"Loaded {file_name} to {table_id}")

DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.

In [1]:
%pip install nbconvert


Collecting nbconvert
  Downloading nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting beautifulsoup4 (from nbconvert)
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting bleach!=5.0.0 (from bleach[css]!=5.0.0->nbconvert)
  Downloading bleach-6.2.0-py3-none-any.whl.metadata (30 kB)
Collecting defusedxml (from nbconvert)
  Downloading defusedxml-0.7.1-py2.py3-none-any.whl.metadata (32 kB)
Collecting jinja2>=3.0 (from nbconvert)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting jupyterlab-pygments (from nbconvert)
  Downloading jupyterlab_pygments-0.3.0-py3-none-any.whl.metadata (4.4 kB)
Collecting markupsafe>=2.0 (from nbconvert)
  Downloading MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting mistune<4,>=2.0.3 (from nbconvert)
  Downloading mistune-3.1.3-py3-none-any.whl.metadata (1.8 kB)
Collecting nbclient>=0.5.0 (from nbconvert)
  Downloading nbclient-0.10.2-py3-none-any.whl.metadata (8.3 kB)
Collec


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
