### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import clickhouse_connect
from dotenv import dotenv_values
import random
from datetime import datetime, timedelta

### Load Credentials

In [2]:
# Read the ClickHouse connection settings from the local .env file.
env_vars = dotenv_values('/root/text2sql/Credentials/.env')
host, port, username, password = (
    env_vars['host'],
    int(env_vars['port']),  # cast to int before it is handed to the client
    env_vars['user'],
    env_vars['password'],
)

### Construct ClickHouse Connection Client

In [3]:
# Build the ClickHouse client from the settings read from the .env file
# (the connection is requested with secure=True).
client = clickhouse_connect.get_client(
    host=host,
    port=port,
    secure=True,
    username=username,
    password=password,
)

## Simulating and Creating Dummy Tables

### Table: Agency Customers (CustomerId Primary Key, Name, Email, Status, CreatedAt) 

In [4]:
# Generate random customer names
def gen_random_name():
    """Pick a random first name and a random last name.

    Returns:
        tuple[str, str]: ``(first_name, last_name)``, each chosen
        independently from the fixed pools defined in this function.
    """
    first_pool = ['John', 'Alice', 'Robert', 'Emily', 'Michael', 'Emma', 'David', 'Olivia', 'Daniel', 'Sophia',
                  'James', 'Lily', 'William', 'Grace', 'Benjamin', 'Charlotte', 'Andrew', 'Ava', 'Joseph', 'Mia',
                  'Henry', 'Ella', 'Alexander', 'Lucy']

    last_pool = ['Smith', 'Johnson', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor', 'Anderson', 'Thomas',
                 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson', 'Clark', 'Rodriguez',
                 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen']

    # The first name is drawn before the last name.
    chosen_first = random.choice(first_pool)
    chosen_last = random.choice(last_pool)
    return chosen_first, chosen_last

# Generate random email addresses
def gen_random_email(first_name, last_name):
    """Build a lower-cased ``first.last@domain`` address with a random domain.

    Args:
        first_name (str): First name; lower-cased for the local part.
        last_name (str): Last name; lower-cased for the local part.

    Returns:
        str: ``"<first>.<last>@<domain>"`` where the domain is chosen at
        random from a fixed list.
    """
    domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'example.com', 'domain.com']
    local_part = first_name.lower() + "." + last_name.lower()
    chosen_domain = random.choice(domains)
    return local_part + "@" + chosen_domain

# Generate random status
def gen_random_status():
    """Return ``'active'`` or ``'inactive'``, chosen at random.

    Returns:
        str: One of the two customer status labels.
    """
    return random.choice(['active', 'inactive'])

# Generate random created_at dates between November 1, 2021 and June 27, 2023
def gen_random_created_at():
    """Return a random creation date as a ``YYYY-MM-DD`` string.

    The date is drawn from 2021-11-01 through 2023-06-27 with both
    endpoints included.

    Returns:
        str: The chosen date formatted with ``%Y-%m-%d``.
    """
    start_date = datetime(2021, 11, 1)
    end_date = datetime(2023, 6, 27)
    span_in_days = (end_date - start_date).days
    # randrange() never returns its stop value, so add 1 to make end_date reachable.
    random_number_of_days = random.randrange(span_in_days + 1)
    random_date = start_date + timedelta(days=random_number_of_days)
    return random_date.strftime('%Y-%m-%d')

# Simulate 100 customers, each with a distinct random id
num_values = 100
used_customer_ids = set()
data = []

while len(data) < num_values:
    customer_id = random.randint(1, 1000000)
    if customer_id in used_customer_ids:
        # Id already taken: draw again without producing a row
        continue
    used_customer_ids.add(customer_id)

    first_name, last_name = gen_random_name()
    data.append([
        customer_id,
        f"{first_name} {last_name}",
        gen_random_email(first_name, last_name),
        gen_random_status(),
        gen_random_created_at(),
    ])

# Assemble the rows into a DataFrame
customer_columns = ['Customer_id', 'name', 'email', 'status', 'created_at']
df_customers = pd.DataFrame(data, columns=customer_columns)
# created_at arrives as a 'YYYY-MM-DD' string; convert it to datetime.date objects
df_customers['created_at'] = pd.to_datetime(df_customers['created_at']).dt.date

In [5]:
# Create (or replace) the Customers table in ClickHouse.
# The adjacent string literals below are concatenated by Python into one DDL
# statement before it is sent through client.command().
client.command('CREATE OR REPLACE TABLE Customers ('
               'CustomerId UInt32,'
               'Name String,'
               'Email String,'
               'Status String,'
               'CreatedAt Date,'
               'PRIMARY KEY (CustomerId))'
               'ENGINE = MergeTree()'
               'ORDER BY CustomerId')

''

In [6]:
# Load the simulated customers into ClickHouse. The rows are plain lists, so
# each value lines up with column_names by position rather than by DataFrame column name.
data_customers = df_customers.to_numpy().tolist()
client.insert(
    'Customers',
    data_customers,
    column_names=['CustomerId', 'Name', 'Email', 'Status', 'CreatedAt'],
)

### Table: Users (UserId Primary Key, RegDate, Status)

In [17]:
def gen_user_id():
    """Return a random integer user id between 1 and 10000 (inclusive).

    Returns:
        int: The drawn id. Uniqueness is not checked here.
    """
    lowest_id, highest_id = 1, 10000
    return random.randint(lowest_id, highest_id)

def gen_registration_date():
    """Return a random registration date as a ``YYYY-MM-DD`` string.

    The date lies between 2021-01-01 and 2023-06-27, both endpoints included.

    Returns:
        str: The chosen date formatted with ``%Y-%m-%d``.
    """
    window_open = datetime(2021, 1, 1)
    window_close = datetime(2023, 6, 27)
    window_days = (window_close - window_open).days
    # randint includes both bounds, so window_close itself can be drawn
    day_offset = random.randint(0, window_days)
    chosen_day = window_open + timedelta(days=day_offset)
    return chosen_day.strftime("%Y-%m-%d")

def gen_status():
    """Return ``'active'`` or ``'passive'``, chosen at random.

    Returns:
        str: One of the two user status labels.
    """
    status_options = ['active', 'passive']
    return random.choice(status_options)

# Simulate 50 users, each with a distinct random id
num_values = 50
table_values = []
used_user_ids = set()

for _ in range(num_values):
    # UserId is meant to be the table's key, so redraw on a collision instead
    # of storing a duplicate (ClickHouse itself would not reject it).
    user_id = gen_user_id()
    while user_id in used_user_ids:
        user_id = gen_user_id()
    used_user_ids.add(user_id)

    reg_date = gen_registration_date()
    status = gen_status()
    table_values.append((user_id, reg_date, status))

# Creating the final DataFrame
df_users = pd.DataFrame(table_values, columns=['UserID', 'RegDate', 'Status'])
# RegDate arrives as a 'YYYY-MM-DD' string; convert it to datetime.date objects
df_users['RegDate'] = pd.to_datetime(df_users['RegDate']).dt.date
assert df_users['UserID'].is_unique, "duplicate user ids were generated"

In [18]:
# Create (or replace) the Users table in ClickHouse.
# The adjacent string literals below are concatenated by Python into one DDL
# statement before it is sent through client.command().
client.command('CREATE OR REPLACE TABLE Users ('
               'UserId UInt32,'
               'RegDate Date,'
               'Status String,'
               'PRIMARY KEY (UserId))'
               'ENGINE = MergeTree()'
               'ORDER BY UserId')

''

In [19]:
# Insert data into table
# Load the simulated users into ClickHouse; values line up with column_names by position.
data_users = df_users.to_numpy().tolist()
client.insert(
    'Users',
    data_users,
    column_names=['UserId', 'RegDate', 'Status'],
)

### Table: UserActivity (VisitId Primary Key, UserId, VisitDate, Click, CampaignId)

In [64]:
# Distinct user ids that simulated visits can be attributed to
user_ids = [*df_users['UserID'].unique()]

def gen_visit_id(visit_ids):
    """Draw a random visit id in 1..10000 that is not already in ``visit_ids``.

    Args:
        visit_ids: Collection of ids already handed out. It is only read here
            (``len`` and membership tests); the caller records the new id.

    Returns:
        int: An id between 1 and 10000 (inclusive) that is absent from
        ``visit_ids``.

    Raises:
        ValueError: If every id in 1..10000 is already taken. Without this
            guard the retry loop below would never terminate.
    """
    lowest_id, highest_id = 1, 10000
    pool_size = highest_id - lowest_id + 1
    # Fewer taken ids than the pool size guarantees a free one, so the full
    # scan only runs when exhaustion is actually possible.
    if len(visit_ids) >= pool_size and all(
        candidate in visit_ids for candidate in range(lowest_id, highest_id + 1)
    ):
        raise ValueError("no unused visit id left in the range 1-10000")

    visit_id = random.randint(lowest_id, highest_id)
    while visit_id in visit_ids:
        visit_id = random.randint(lowest_id, highest_id)
    return visit_id

def gen_visit_date():
    """Return a random visit date as a ``YYYY-MM-DD`` string.

    The date lies between 2021-01-01 and 2023-06-27, both endpoints included.

    Returns:
        str: The chosen date formatted with ``%Y-%m-%d``.
    """
    earliest = datetime(2021, 1, 1)
    latest = datetime(2023, 6, 27)
    total_days = (latest - earliest).days
    # randint includes both bounds, so the latest date itself can be drawn
    visit_day = earliest + timedelta(days=random.randint(0, total_days))
    return visit_day.strftime("%Y-%m-%d")

def gen_through_click():
    """Decide at random whether a visit counts as a click-through.

    Returns:
        bool: ``True`` or ``False``, picked with ``random.choice``.
    """
    outcomes = [True, False]
    return random.choice(outcomes)

def gen_campaign_id(through_click):
    """Return the campaign id to record for a visit.

    Args:
        through_click: Truthy when the visit is a click-through.

    Returns:
        int: A random id between 1 and 30 (inclusive) for a click-through,
        otherwise the fixed placeholder 999.
    """
    # A random number is only drawn for click-through visits.
    return random.randint(1, 30) if through_click else 999

def _simulate_visit(taken_ids):
    """Create one visit row and record its id in ``taken_ids``."""
    new_id = gen_visit_id(taken_ids)
    taken_ids.add(new_id)
    visitor = random.choice(user_ids)
    visited_on = gen_visit_date()
    clicked = gen_through_click()
    return (new_id, visitor, visited_on, clicked, gen_campaign_id(clicked))


# Simulate 1000 visits with distinct visit ids
num_samples = 1000
visit_ids = set()
visit_data = [_simulate_visit(visit_ids) for _ in range(num_samples)]

# Assemble the rows into a DataFrame
activity_columns = ['VisitId', 'UserId', 'VisitDate', 'Click', 'CampaignId']
df_activity = pd.DataFrame(visit_data, columns=activity_columns)
# VisitDate arrives as a 'YYYY-MM-DD' string; convert it to datetime.date objects
df_activity['VisitDate'] = pd.to_datetime(df_activity['VisitDate']).dt.date

In [65]:
# Preview the last rows of the simulated visit data
df_activity.tail()

Unnamed: 0,VisitId,UserId,VisitDate,Click,CampaignId
995,1033,575,2022-12-23,False,999
996,9563,3717,2021-04-12,False,999
997,9669,774,2022-10-26,True,3
998,8481,1322,2022-06-11,False,999
999,3882,5417,2021-03-28,True,28


In [66]:
# Create (or replace) the UserActivity table in ClickHouse.
# The adjacent string literals below are concatenated by Python into one DDL
# statement before it is sent through client.command().
client.command('CREATE OR REPLACE TABLE UserActivity ('
               'VisitId UInt32,'
               'UserId UInt32,'
               'VisitDate Date,'
               'Click UInt8,'
               'CampaignId UInt32,'
               'PRIMARY KEY (VisitId))'
               'ENGINE = MergeTree()'
               'ORDER BY VisitId')

''

In [67]:
# Insert data into table
# Load the simulated visits into ClickHouse; values line up with column_names by position.
data_activity = df_activity.to_numpy().tolist()
client.insert(
    'UserActivity',
    data_activity,
    column_names=['VisitId', 'UserId', 'VisitDate', 'Click', 'CampaignId'],
)

In [45]:
# Number of distinct campaign ids in the visit data (the 999 placeholder counts as one)
len(df_activity['CampaignId'].unique())

31

### Table: CampaignActivity (CampaignId Primary Key, Platform, AdStartDate, AdEndDate, Cost)

In [68]:
# Campaign ids that occur in the visit data, minus the 999 placeholder recorded
# for visits without a click-through. Filtering (instead of list.remove) does
# not raise ValueError when the placeholder happens to be absent.
campaign_ids = [
    campaign_id
    for campaign_id in df_activity['CampaignId'].unique()
    if campaign_id != 999
]

def generate_platform():
    """Pick one advertising platform name at random.

    Returns:
        str: ``'Facebook'``, ``'Google'``, ``'LinkedIn'`` or ``'Bing'``.
    """
    return random.choice(['Facebook', 'Google', 'LinkedIn', 'Bing'])

def generate_ad_dates():
    """Return a random ad start date and an end date 1 to 30 days later.

    The start date lies between 2021-01-01 and 2023-06-27 (inclusive), so the
    end date can fall after 2023-06-27.

    Returns:
        tuple[str, str]: ``(start, end)``, both formatted with ``%Y-%m-%d``.
    """
    window_open = datetime(2021, 1, 1)
    window_close = datetime(2023, 6, 27)
    # The start offset is drawn first, then the run length.
    start_offset = random.randint(0, (window_close - window_open).days)
    ad_start = window_open + timedelta(days=start_offset)
    run_length = random.randint(1, 30)
    ad_end = ad_start + timedelta(days=run_length)
    return ad_start.strftime("%Y-%m-%d"), ad_end.strftime("%Y-%m-%d")

def generate_total_cost():
    """Return a random campaign cost rounded to two decimals.

    Returns:
        float: A value drawn with ``random.uniform(100, 1000)`` and rounded
        to 2 decimal places.
    """
    raw_cost = random.uniform(100, 1000)
    return round(raw_cost, 2)

def _simulate_campaign(campaign_id):
    """Create one campaign row (platform, ad dates, cost) for ``campaign_id``."""
    platform = generate_platform()
    ad_start_date, ad_end_date = generate_ad_dates()
    return (campaign_id, platform, ad_start_date, ad_end_date, generate_total_cost())


# Put the campaign ids in random order (the list is shuffled in place)
random.shuffle(campaign_ids)

# One simulated row per campaign id
campaign_data = [_simulate_campaign(campaign_id) for campaign_id in campaign_ids]

# Assemble the rows into a DataFrame and fix up the column types
campaign_columns = ['CampaignId', 'Platform', 'AdStartDate', 'AdEndDate', 'Cost']
df_campaign = pd.DataFrame(campaign_data, columns=campaign_columns)
for date_column in ('AdStartDate', 'AdEndDate'):
    # Dates arrive as 'YYYY-MM-DD' strings; convert them to datetime.date objects
    df_campaign[date_column] = pd.to_datetime(df_campaign[date_column]).dt.date
df_campaign['CampaignId'] = df_campaign['CampaignId'].astype('UInt32')
df_campaign['Cost'] = df_campaign['Cost'].astype(np.float32)

In [69]:
# Create (or replace) the CampaignActivity table in ClickHouse.
# The adjacent string literals below are concatenated by Python into one DDL
# statement before it is sent through client.command().
client.command('CREATE OR REPLACE TABLE CampaignActivity ('
               'CampaignId UInt32,'
               'Platform String,'
               'AdStartDate Date,'
               'AdEndDate Date,'
               'Cost Float32,'
               'PRIMARY KEY (CampaignId))'
               'ENGINE = MergeTree()'
               'ORDER BY CampaignId')

''

In [70]:
# Insert data into table
# Load the simulated campaigns into ClickHouse; values line up with column_names by position.
data_campaign = df_campaign.to_numpy().tolist()
client.insert(
    'CampaignActivity',
    data_campaign,
    column_names=['CampaignId', 'Platform', 'AdStartDate', 'AdEndDate', 'Cost'],
)

In [29]:
# df_customers.to_csv('/root/text2sql/data_csv/df_customers.csv')
# df_users.to_csv('/root/text2sql/data_csv/df_users.csv')
# df_activity.to_csv('/root/text2sql/data_csv/df_activity.csv')
# df_campaign.to_csv('/root/text2sql/data_csv/df_campaign.csv')