In [6]:
import pandas as pd
from faker import Faker
import random
import os


# Build a synthetic "bookings" dataset: one row per fake booking with a UUID key,
# a date string, a metric value, a country code and a category.

# Seed both random sources so that re-running the notebook regenerates the same
# data: Faker.seed() covers the Faker providers, random.seed() covers the stdlib
# calls used below for metric values and categories.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Initialize Faker to generate random data
fake = Faker()

# Set the number of rows for the dataframe
num_rows = 60000

# Generate random IDs and dates (dates are stored as 'YYYY-MM-DD' strings).
# The date window is relative to the day the cell runs, so the dates still
# shift with the run date even though the generator is seeded.
# NOTE(review): in Faker's relative-date syntax a lowercase 'm' appears to mean
# minutes (months would be 'M') -- confirm whether '-1m' is the intended upper bound.
ids = [fake.uuid4() for _ in range(num_rows)]
dates = [fake.date_between(start_date='-1y', end_date='-1m').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10),2) for _ in range(num_rows)]

# Generate random country codes
country_codes = [fake.country_code() for _ in range(num_rows)]

# Generate random categories
categories = [random.choice(['Category A', 'Category B', 'Category C']) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values, 'country_code': country_codes, 'category': categories})

# Print the dataframe (pandas truncates the output to the first/last rows)
print(ultra_large_df)

                                 unique_key        date  metric_value  \
0      1d22cad3-2106-40be-a0fd-deb80d7806c7  2022-10-22          3.02   
1      99af204e-82fa-43aa-bb2c-fc73b7fb9b58  2023-08-01          9.47   
2      4fda8d75-718d-4fed-ace1-92750ced8ed2  2022-09-30          1.48   
3      a70d5e70-abf9-4f78-8580-6f33cb5d4387  2023-01-17          1.46   
4      1e5a2c3e-2b9b-4097-ac92-8571cc5f394d  2022-12-14          2.12   
...                                     ...         ...           ...   
59995  2daf24d1-576e-4276-adca-7d7b6c7df9a8  2023-01-22          7.13   
59996  cab271b9-1269-4418-853d-b01832a7ff75  2022-10-31          0.16   
59997  52f56de8-7b90-40ee-810e-c29a5aaf5c89  2022-12-12          4.29   
59998  9040565e-4f1b-4d3b-8578-2208021a70b0  2023-04-25          5.26   
59999  2e3c45af-9bfb-4675-ba33-c384f1cf5113  2022-10-19          8.57   

      country_code    category  
0               GA  Category A  
1               ML  Category B  
2               MD  Cate

In [7]:
from sqlalchemy import create_engine
# Connection URL for the target Postgres database. It can be overridden through
# the DBT_DUMMY_PROJECT_DB_URL environment variable so the notebook is not tied
# to one developer's local user/database; the default keeps the original value.
db_url = os.environ.get(
    'DBT_DUMMY_PROJECT_DB_URL',
    'postgresql://sammyteillet@localhost:5432/dbt-dummy-project',
)
engine = create_engine(db_url)

# Load the generated dataframe as the 'source_bookings' table, replacing any
# existing table of that name; the dataframe index is not written as a column.
ultra_large_df.to_sql(name='source_bookings', con=engine, if_exists='replace', index=False)

1000

In [8]:
import numpy as np

# Introduce drift: re-draw the metric value of a random subset of rows and
# publish the modified copy as the new 'source_bookings' table.

# Number of rows whose metric value is changed
num_drift_rows = 100

# Set the seed for reproducibility (covers both the row selection and the new
# values, since both are drawn from NumPy's global generator)
np.random.seed(42)

# Work on a copy so the originally generated dataframe stays untouched
ultra_large_df2 = ultra_large_df.copy()

# Select num_drift_rows distinct random indices for the metric value update
random_indices_metric = np.random.choice(ultra_large_df2.index, size=num_drift_rows, replace=False)
print(random_indices_metric)

# Update metric value with random values between 0 and 10 (2 decimals)
ultra_large_df2.loc[random_indices_metric, 'metric_value'] = np.round(
    np.random.uniform(0, 10, size=num_drift_rows), 2
)

# Write the drifted copy -- not the original dataframe -- so the table actually
# contains the modified metric values
ultra_large_df2.to_sql(name='source_bookings', con=engine, if_exists='replace', index=False)

[12628 37730 39991  8525  8279 51012 14871 15127  9366 33322 53390 21819
  5026 23428 45297 26354 30195 47038 20731 34047 26064 42469 29746 14522
 31572 54949 19368  3803 53325 14300 51301  9008 47521 25224 48921 37978
 44171 26303 19458  5369 50291 25951 54908 56362 32218  2885 36559  8966
 46574 10530 44628   273 19269 36911 10121 13290 57606 47189 29209 42187
 25386 17005 10981 47313 27070  6685 54960 58125 40700 13902 31539 49716
 49519 51923  3502 39336  2218 18505 10689 21377  1866 20192 28870 52203
  3867  3222 21785 20984 48539 40694  8440  9951  1334 32572 28344 46503
 34482 36271 36874 11512]


1000