In [1]:
import os
import json
import random
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone
from typing import Dict, Any

from google.cloud import bigquery
from google.cloud.bigquery import LoadJobConfig, SourceFormat, WriteDisposition
from google.api_core.exceptions import NotFound

In [2]:
# Google Cloud project and BigQuery dataset that hold the warehouse tables.
PROJECT_ID = "analytics-pipeline-assessment"
DATASET_ID = "analytics_dw"

# Single BigQuery client bound to PROJECT_ID, shared by the helpers in this notebook.
bq = bigquery.Client(project=PROJECT_ID)

Logging function

In [3]:
def log(step: str, **kwargs):
  """Print one structured JSON log line for a pipeline step.

  The record contains a ``ts`` field (current time in IST, UTC+05:30, as an
  ISO-8601 string), a ``Step`` field holding ``step``, and one field per extra
  keyword argument.

  Args:
    step: Name of the pipeline step being reported, e.g. ``"ddl.applied"``.
    **kwargs: Extra fields to add to the record. Values that JSON cannot
      encode natively (dates, paths, ...) are written using ``str(value)``
      so that a log call never aborts the step it is reporting on.
  """
  ist = timezone(timedelta(hours=5, minutes=30))
  ist_time = datetime.now(ist).isoformat()

  # default=str: fall back to the string form instead of raising TypeError.
  print(json.dumps({"ts": ist_time, "Step": step, **kwargs}, default=str))

Attendance data generator

In [4]:
def gen_attendance(num_records: int = 3_000_000,
                   output_file: str = "attendance_dataset_3m.csv",
                   chunk_size: int = 100_000,
                   seed=None):
  """Write a synthetic staff-attendance CSV.

  Each row holds StaffID, Name, Region, Country, Department, Date, Status,
  CheckInTime and CheckOutTime. Rows are produced and written in chunks of
  ``chunk_size`` so only one chunk is held in memory at a time.

  Args:
    num_records: Number of data rows to write.
    output_file: Path of the CSV file to create (overwritten if present).
    chunk_size: Rows generated and written per batch; must be positive.
    seed: Optional seed. When given, a private ``random.Random(seed)`` is
      used so the output is reproducible; when ``None`` the module-level
      ``random`` generator is used.

  Raises:
    ValueError: If ``chunk_size`` is not positive.
  """
  if chunk_size <= 0:
      raise ValueError("chunk_size must be a positive integer")

  # Seeded runs get their own generator; unseeded runs keep using the global one.
  rng = random.Random(seed) if seed is not None else random

  # Sample data
  regions = ["North America", "Europe", "Asia", "South America", "Africa", "Oceania"]
  countries = {
      "North America": ["USA", "Canada", "Mexico"],
      "Europe": ["Germany", "France", "UK", "Italy"],
      "Asia": ["China", "India", "Japan", "Singapore"],
      "South America": ["Brazil", "Argentina", "Chile"],
      "Africa": ["South Africa", "Nigeria", "Egypt"],
      "Oceania": ["Australia", "New Zealand"]
  }
  departments = ["IT", "Sales", "Marketing", "HR", "Finance", "Operations"]
  first_names = ["Alice", "Bob", "Chen", "Daniela", "Ethan", "Fatima", "George", "Hiro", "Isabella", "Juan"]
  last_names = ["Johnson", "Smith", "Wei", "Lopez", "Brown", "Hassan", "Wilson", "Tanaka", "Rossi", "Martinez"]
  statuses = ["Present", "Absent", "Remote"]

  def generate_attendance_data(first_id, count):
      """Yield ``count`` rows whose StaffID numbers start at ``first_id``."""
      # Numbering continues from the chunk offset; restarting at 1 for every
      # chunk would repeat the same IDs in each chunk.
      for i in range(first_id, first_id + count):
          staff_id = f"ST{i:07d}"
          name = f"{rng.choice(first_names)} {rng.choice(last_names)}"
          region = rng.choice(regions)
          country = rng.choice(countries[region])
          department = rng.choice(departments)
          # Offset of 0..1825 days from 2020-01-01 (roughly five years).
          date = datetime(2020, 1, 1) + timedelta(days=rng.randint(0, 1825))
          status = rng.choices(statuses, weights=[0.7, 0.1, 0.2])[0]  # more likely to be Present
          if status in ("Present", "Remote"):
              check_in = f"{rng.randint(8, 10):02d}:{rng.randint(0, 59):02d}"
              check_out = f"{rng.randint(16, 18):02d}:{rng.randint(0, 59):02d}"
          else:
              # Absent staff have no clock times; "-" is the placeholder.
              check_in = "-"
              check_out = "-"

          yield [
              staff_id, name, region, country, department, date.strftime("%Y-%m-%d"),
              status, check_in, check_out
          ]

  # Write CSV in chunks
  columns = ["StaffID", "Name", "Region", "Country", "Department", "Date", "Status", "CheckInTime", "CheckOutTime"]

  # newline="" stops the text layer from translating the line endings pandas
  # writes (otherwise rows end in "\r\r\n" on Windows).
  with open(output_file, "w", encoding="utf-8", newline="") as f:
      # Header row only, written by pandas so it uses the same line ending as the data.
      pd.DataFrame(columns=columns).to_csv(f, index=False)
      for start in range(0, num_records, chunk_size):
          count = min(chunk_size, num_records - start)
          chunk = list(generate_attendance_data(start + 1, count))
          df = pd.DataFrame(chunk, columns=columns)
          df.to_csv(f, header=False, index=False)

  print(f"✅ Attendance dataset generated: {output_file}")
  log("gen.attendance.done", file = output_file)

Sales data generator

In [5]:
def gen_sales(num_records: int = 3_000_000,
              output_file: str = "sales_dataset_3m.csv",
              chunk_size: int = 100_000,
              seed=None):
  """Write a synthetic sales CSV.

  Each row holds SaleID, Region, Country, Product, Date, Currency, Quantity,
  UnitPrice and TotalSales (quantity * unit price, rounded to 2 decimals).
  Rows are produced and written in chunks of ``chunk_size`` so only one chunk
  is held in memory at a time.

  Args:
    num_records: Number of data rows to write.
    output_file: Path of the CSV file to create (overwritten if present).
    chunk_size: Rows generated and written per batch; must be positive.
    seed: Optional seed. When given, a private ``random.Random(seed)`` is
      used so the output is reproducible; when ``None`` the module-level
      ``random`` generator is used.

  Raises:
    ValueError: If ``chunk_size`` is not positive.
  """
  if chunk_size <= 0:
      raise ValueError("chunk_size must be a positive integer")

  # Seeded runs get their own generator; unseeded runs keep using the global one.
  rng = random.Random(seed) if seed is not None else random

  # Sample lists
  regions = ["North America", "Europe", "Asia", "South America", "Africa", "Oceania"]
  countries = {
      "North America": ["USA", "Canada", "Mexico"],
      "Europe": ["Germany", "France", "UK", "Italy"],
      "Asia": ["China", "India", "Japan", "Singapore"],
      "South America": ["Brazil", "Argentina", "Chile"],
      "Africa": ["South Africa", "Nigeria", "Egypt"],
      "Oceania": ["Australia", "New Zealand"]
  }
  currencies = {
      "USA": "USD", "Canada": "CAD", "Mexico": "MXN",
      "Germany": "EUR", "France": "EUR", "UK": "GBP", "Italy": "EUR",
      "China": "CNY", "India": "INR", "Japan": "JPY", "Singapore": "SGD",
      "Brazil": "BRL", "Argentina": "ARS", "Chile": "CLP",
      "South Africa": "ZAR", "Nigeria": "NGN", "Egypt": "EGP",
      "Australia": "AUD", "New Zealand": "NZD"
  }
  products = ["Software", "Hardware", "Consulting", "Cloud Services", "Licenses"]

  def generate_sales_data(first_id, count):
      """Yield ``count`` rows whose SaleID numbers start at ``first_id``."""
      # Numbering continues from the chunk offset so SaleID stays unique
      # across the whole file instead of restarting at 1 in every chunk.
      for i in range(first_id, first_id + count):
          region = rng.choice(regions)
          country = rng.choice(countries[region])
          product = rng.choice(products)
          currency = currencies[country]
          # Offset of 0..1825 days from 2020-01-01 (roughly five years).
          date = datetime(2020, 1, 1) + timedelta(days=rng.randint(0, 1825))
          quantity = rng.randint(1, 50)
          unit_price = round(rng.uniform(100, 5000), 2)
          total_sales = round(quantity * unit_price, 2)

          yield [
              f"S{i:07d}", region, country, product, date.strftime("%Y-%m-%d"),
              currency, quantity, unit_price, total_sales
          ]

  # Write CSV in chunks
  columns = ["SaleID", "Region", "Country", "Product", "Date", "Currency", "Quantity", "UnitPrice", "TotalSales"]

  # newline="" stops the text layer from translating the line endings pandas
  # writes (otherwise rows end in "\r\r\n" on Windows).
  with open(output_file, "w", encoding="utf-8", newline="") as f:
      # Header row only, written by pandas so it uses the same line ending as the data.
      pd.DataFrame(columns=columns).to_csv(f, index=False)
      for start in range(0, num_records, chunk_size):
          count = min(chunk_size, num_records - start)
          chunk = list(generate_sales_data(start + 1, count))
          df = pd.DataFrame(chunk, columns=columns)
          df.to_csv(f, header=False, index=False)

  print(f"✅ Sales dataset generated: {output_file}")
  log("gen.sales.done", file = output_file)

Finance data generator

In [6]:
def gen_finance(num_records: int = 3_000_000,
                output_file: str = "financial_dataset_3m.csv",
                chunk_size: int = 100_000,
                seed=None):
  """Write a synthetic financial-transactions CSV.

  Each row holds TransactionID, Region, Country, Product, Date, Currency,
  Revenue, Expense and Profit. Expense is 40-90% of revenue and profit is
  revenue minus expense; all three are rounded to 2 decimals. Rows are
  produced and written in chunks of ``chunk_size`` so only one chunk is held
  in memory at a time.

  Args:
    num_records: Number of data rows to write.
    output_file: Path of the CSV file to create (overwritten if present).
    chunk_size: Rows generated and written per batch; must be positive.
    seed: Optional seed. When given, a private ``random.Random(seed)`` is
      used so the output is reproducible; when ``None`` the module-level
      ``random`` generator is used.

  Raises:
    ValueError: If ``chunk_size`` is not positive.
  """
  if chunk_size <= 0:
      raise ValueError("chunk_size must be a positive integer")

  # Seeded runs get their own generator; unseeded runs keep using the global one.
  rng = random.Random(seed) if seed is not None else random

  regions = ["North America", "Europe", "Asia", "South America", "Africa", "Oceania"]
  countries = {
      "North America": ["USA", "Canada", "Mexico"],
      "Europe": ["Germany", "France", "UK", "Italy"],
      "Asia": ["China", "India", "Japan", "Singapore"],
      "South America": ["Brazil", "Argentina", "Chile"],
      "Africa": ["South Africa", "Nigeria", "Egypt"],
      "Oceania": ["Australia", "New Zealand"]
  }
  currencies = {
      "USA": "USD", "Canada": "CAD", "Mexico": "MXN",
      "Germany": "EUR", "France": "EUR", "UK": "GBP", "Italy": "EUR",
      "China": "CNY", "India": "INR", "Japan": "JPY", "Singapore": "SGD",
      "Brazil": "BRL", "Argentina": "ARS", "Chile": "CLP",
      "South Africa": "ZAR", "Nigeria": "NGN", "Egypt": "EGP",
      "Australia": "AUD", "New Zealand": "NZD"
  }
  products = ["Software", "Hardware", "Consulting", "Cloud Services", "Licenses"]

  def generate_data(first_id, count):
      """Yield ``count`` rows whose TransactionID numbers start at ``first_id``."""
      # Numbering continues from the chunk offset so TransactionID stays unique
      # across the whole file instead of restarting at 1 in every chunk.
      for i in range(first_id, first_id + count):
          region = rng.choice(regions)
          country = rng.choice(countries[region])
          currency = currencies[country]
          product = rng.choice(products)
          # Offset of 0..1825 days from 2020-01-01 (roughly five years).
          date = datetime(2020, 1, 1) + timedelta(days=rng.randint(0, 1825))
          revenue = round(rng.uniform(1000, 100000), 2)
          expense = round(revenue * rng.uniform(0.4, 0.9), 2)
          # Round the difference too: subtracting two 2-decimal floats can
          # leave binary noise such as 1234.5600000000004.
          profit = round(revenue - expense, 2)

          yield [
              f"T{i:07d}", region, country, product, date.strftime("%Y-%m-%d"),
              currency, revenue, expense, profit
          ]

  columns = ["TransactionID", "Region", "Country", "Product", "Date", "Currency", "Revenue", "Expense", "Profit"]

  # newline="" stops the text layer from translating the line endings pandas
  # writes (otherwise rows end in "\r\r\n" on Windows).
  with open(output_file, "w", encoding="utf-8", newline="") as f:
      # Header row only, written by pandas so it uses the same line ending as the data.
      pd.DataFrame(columns=columns).to_csv(f, index=False)
      for start in range(0, num_records, chunk_size):
          count = min(chunk_size, num_records - start)
          chunk = list(generate_data(start + 1, count))
          df = pd.DataFrame(chunk, columns=columns)
          df.to_csv(f, header=False, index=False)

  print(f"✅ Finance dataset generated: {output_file}")
  log("gen.finance.done", file = output_file)

In [7]:
def data_gen():
  """Generate the three synthetic source CSVs: attendance, then sales, then finance."""
  gen_attendance()
  gen_sales()
  gen_finance()

data_gen()

✅ Attendance dataset generated: attendance_dataset_3m.csv
{"ts": "2025-10-11T23:25:32.840996+05:30", "Step": "gen.attendance.done", "file": "attendance_dataset_3m.csv"}
✅ Sales dataset generated: sales_dataset_3m.csv
{"ts": "2025-10-11T23:26:17.594463+05:30", "Step": "gen.sales.done", "file": "sales_dataset_3m.csv"}
✅ Finance dataset generated: financial_dataset_3m.csv: financial_dataset_3m.csv
{"ts": "2025-10-11T23:27:04.719752+05:30", "Step": "gen.finance.done", "file": "financial_dataset_3m.csv"}


Ensuring dataset existence

In [8]:
def ensure_dataset():
  """Make sure the warehouse dataset exists, creating it when it is missing.

  Looks up ``PROJECT_ID.DATASET_ID`` with the shared BigQuery client. If the
  lookup reports that the dataset does not exist, the dataset is created.
  Either outcome is reported through ``log``.

  Raises:
    Any error other than ``NotFound`` raised by the lookup (for example an
    authentication or permission failure) is propagated unchanged instead of
    being treated as "dataset missing".
  """
  ds = f"{PROJECT_ID}.{DATASET_ID}"

  try:
    bq.get_dataset(ds)
  except NotFound:
    # exists_ok=True keeps this safe if the dataset appears between the
    # lookup above and this call.
    bq.create_dataset(bigquery.Dataset(ds), exists_ok = True)
    log("dataset.created", dataset = ds)
  else:
    log("dataset.exists", dataset = ds)

SQL query executor

In [9]:
def run_sql(sql: str):
    """Submit ``sql`` through the shared BigQuery client and wait for it.

    Args:
        sql: SQL text to execute.

    Returns:
        The value returned by calling ``.result()`` on the submitted query job.
    """
    query_job = bq.query(sql)
    return query_job.result()

In [10]:
# DDL script for the warehouse, sent to BigQuery as one multi-statement query.
# It creates the schema plus:
#   - dimension tables: dim_date, dim_location, dim_product, dim_employee, dim_currency
#   - staging tables (columns mirror the generated CSV headers): stg_attendance, stg_sales, stg_finance
#   - fact tables: fact_attendance, fact_sales, fact_finance
# Every statement uses IF NOT EXISTS, so re-running this cell does not fail on
# objects that are already present.
DDL_SQL = f"""
CREATE SCHEMA IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}`;

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.dim_date` (
  date_key DATE,
  year INT64,
  quarter INT64,
  month INT64,
  month_name STRING,
  day_of_month INT64,
  day_of_week INT64,
  day_name STRING
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.dim_location` (
  location_key STRING,
  region STRING,
  country STRING
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.dim_product` (
  product_key STRING,
  product_name STRING
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.dim_employee` (
  employee_key STRING,
  staffid STRING,
  name STRING,
  department STRING,
  home_country STRING,
  home_region STRING
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.dim_currency` (
  currency_key STRING,
  currency_code STRING,
  date_to_usd NUMERIC
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.stg_attendance` (
  StaffID STRING,
  Name STRING,
  Region STRING,
  Country STRING,
  Department STRING,
  Date DATE,
  Status STRING,
  CheckInTime STRING,
  CheckOutTime STRING
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.stg_sales` (
  SaleID STRING,
  Region STRING,
  Country STRING,
  Product STRING,
  Date DATE,
  Currency STRING,
  Quantity INT64,
  UnitPrice NUMERIC,
  TotalSales NUMERIC
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.stg_finance` (
  TransactionID STRING,
  Region STRING,
  Country STRING,
  Product STRING,
  Date DATE,
  Currency STRING,
  Revenue NUMERIC,
  Expense NUMERIC,
  Profit NUMERIC
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.fact_attendance` (
  attendance_key STRING,
  employee_key STRING,
  location_key STRING,
  date_key DATE,
  status STRING,
  checkin_time TIMESTAMP,
  checkout_time TIMESTAMP
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.fact_sales` (
  saleid STRING,
  product_key STRING,
  location_key STRING,
  date_key DATE,
  currency_key STRING,
  quantity INT64,
  conversion_rate_to_usd NUMERIC,
  unit_price_usd NUMERIC,
  total_sales_usd NUMERIC
);

CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.fact_finance` (
  transaction_id STRING,
  product_key STRING,
  location_key STRING,
  date_key DATE,
  currency_key STRING,
  conversion_rate_to_usd NUMERIC,
  revenue_usd NUMERIC,
  expense_usd NUMERIC,
  profit_usd NUMERIC
);
"""

# Apply the whole script in one call (run_sql waits for the job's result),
# then record that the DDL step finished.
run_sql(DDL_SQL)
log("ddl.applied")

{"ts": "2025-10-11T23:27:09.965021+05:30", "Step": "ddl.applied"}


Upload finance data to BQ

In [11]:
def upload_finance_to_bq(file_path: str = "financial_dataset_3m.csv"):
    """Load the finance CSV into the ``stg_finance`` staging table.

    Reads the whole CSV into a DataFrame, drops rows that are exact
    duplicates of an earlier row, and loads the result into
    ``PROJECT_ID.DATASET_ID.stg_finance`` with WRITE_TRUNCATE, so each run
    replaces the table's previous contents. Row counts are reported via
    ``log`` and ``print``.

    Args:
        file_path: Path of the finance CSV to upload. Defaults to the file
            name used by the finance data generator in this notebook.
    """
    table_id = f"{PROJECT_ID}.{DATASET_ID}.stg_finance"

    # Read the CSV
    df = pd.read_csv(file_path)

    # Remove rows that are identical in every column, keeping the first copy
    before = len(df)
    df = df.drop_duplicates(keep="first")
    after = len(df)
    removed = before - after

    # Upload cleaned data to BigQuery, replacing whatever the table held
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
    )
    # .result() blocks until the load job has finished (and raises if it failed)
    bq.load_table_from_dataframe(df, table_id, job_config=job_config).result()

    # Log and print results
    log("upload.finance.done", table=table_id, rows_uploaded=after, duplicates_removed=removed)
    print(f"✅ Uploaded {after:,} rows to {table_id}")
    print(f"🧹 Removed {removed:,} duplicate rows before upload.")

upload_finance_to_bq()

{"ts": "2025-10-11T23:27:39.648633+05:30", "Step": "upload.finance.done", "table": "analytics-pipeline-assessment.analytics_dw.stg_finance", "rows_uploaded": 3000000, "duplicates_removed": 0}
✅ Uploaded 3,000,000 rows to analytics-pipeline-assessment.analytics_dw.stg_finance
🧹 Removed 0 duplicate rows before upload.
