In [6]:
import pandas as pd
from faker import Faker
import random
import os


# Build a synthetic bookings dataframe: one row per booking with a random
# UUID key, a date string, a metric value, a country code and a category.

# Initialize Faker to generate random data
fake = Faker()

# Seed both random sources so that Restart & Run All regenerates the same
# data: Faker.seed() drives the fake.* providers, random.seed() drives the
# random.uniform / random.choice calls below.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Set the number of rows for the dataframe
num_rows = 60000

# Generate random IDs and dates (dates are kept as 'YYYY-MM-DD' strings)
ids = [fake.uuid4() for _ in range(num_rows)]
# NOTE(review): Faker's relative-date strings appear to treat lowercase 'm'
# as minutes (months are uppercase 'M'), so end_date='-1m' is effectively
# "now" rather than one month ago. Confirm the intended range and switch to
# '-1M' if a one-month gap was meant.
dates = [fake.date_between(start_date='-1y', end_date='-1m').strftime('%Y-%m-%d') for _ in range(num_rows)]

# Generate random metric values between 0 and 10, rounded to 2 decimals
metric_values = [round(random.uniform(0, 10), 2) for _ in range(num_rows)]

# Generate random country codes
country_codes = [fake.country_code() for _ in range(num_rows)]

# Generate random categories
categories = [random.choice(['Category A', 'Category B', 'Category C']) for _ in range(num_rows)]

# Create the dataframe
ultra_large_df = pd.DataFrame({'unique_key': ids, 'date': dates, 'metric_value': metric_values, 'country_code': country_codes, 'category': categories})

# Print the dataframe (pandas truncates the output to the first/last rows)
print(ultra_large_df)

                                 unique_key        date  metric_value  \
0      460b0428-0d68-49e4-9392-0fa9393e9a56  2022-12-23          9.67   
1      4531543d-b30a-46c4-9dfc-372b57e7eaba  2023-04-20          9.92   
2      47133571-86b3-40f2-a9f7-c8ecc725780b  2022-10-07          5.95   
3      a0158801-3e47-48a8-becf-e40fcdd61a7e  2023-09-21          9.88   
4      58eb6983-7356-4e7e-b9e8-509304f898e6  2023-06-02          9.33   
...                                     ...         ...           ...   
59995  9cc16de7-b197-49de-a0d9-c2bceecd677a  2023-02-23          0.88   
59996  e4ff2ab9-45b8-4bca-b0cb-2165a5067266  2023-07-28          8.69   
59997  548d6acb-d0e5-4799-b5fc-a04c0c56aef3  2023-05-01          0.85   
59998  5717e7be-e09d-40a9-959e-2227af1cf3e1  2022-12-14          2.60   
59999  35b93a5f-b7d6-4072-84ab-6bf8b71cf66d  2022-11-06          8.79   

      country_code    category  
0               FJ  Category A  
1               LB  Category B  
2               EG  Cate

In [7]:
from sqlalchemy import create_engine
# Connect to the Postgres database used by the dbt dummy project.
# The URL can be overridden through the DBT_DUMMY_DB_URL environment variable
# so the notebook is not tied to one developer's machine; when the variable
# is unset the original local connection is used.
db_url = os.environ.get(
    'DBT_DUMMY_DB_URL',
    'postgresql://lucasdevries@localhost:5432/dbt-dummy-project',
)
engine = create_engine(db_url)

# Load the generated dataframe into the source_bookings table. An existing
# table is dropped and recreated (if_exists='replace'); the dataframe index
# is not written as a column.
ultra_large_df.to_sql(name='source_bookings', con=engine, if_exists='replace', index=False)

1000

In [8]:
import numpy as np

# Introduce drift: change the metric value of a random sample of rows and
# reload the drifted data into the source_bookings table.

# Set the seed for reproducibility. np.random drives both the row selection
# and the replacement values below, so the whole cell is deterministic.
np.random.seed(42)

# Work on a copy so the originally loaded dataframe stays untouched
ultra_large_df2 = ultra_large_df.copy()

# Number of rows whose metric value is changed (capped at the dataframe size,
# because sampling without replacement cannot draw more rows than exist)
num_drift_rows = min(100, len(ultra_large_df2))

# Select random, distinct indices for the metric value update
random_indices_metric = np.random.choice(ultra_large_df2.index, size=num_drift_rows, replace=False)
print(random_indices_metric)

# Update metric value with random values between 0 and 10, rounded to 2 decimals
ultra_large_df2.loc[random_indices_metric, 'metric_value'] = np.random.uniform(0, 10, size=num_drift_rows).round(2)

# Write the drifted copy (not the original dataframe) so the drift actually
# reaches the database; the existing table is replaced.
ultra_large_df2.to_sql(name='source_bookings', con=engine, if_exists='replace', index=False)

[12628 37730 39991  8525  8279 51012 14871 15127  9366 33322 53390 21819
  5026 23428 45297 26354 30195 47038 20731 34047 26064 42469 29746 14522
 31572 54949 19368  3803 53325 14300 51301  9008 47521 25224 48921 37978
 44171 26303 19458  5369 50291 25951 54908 56362 32218  2885 36559  8966
 46574 10530 44628   273 19269 36911 10121 13290 57606 47189 29209 42187
 25386 17005 10981 47313 27070  6685 54960 58125 40700 13902 31539 49716
 49519 51923  3502 39336  2218 18505 10689 21377  1866 20192 28870 52203
  3867  3222 21785 20984 48539 40694  8440  9951  1334 32572 28344 46503
 34482 36271 36874 11512]


1000