## Connect to ClickHouse

In [6]:
import pandas as pd
import numpy as np
import clickhouse_connect
from dotenv import dotenv_values

In [7]:
env_vars = dotenv_values('/root/text2sql/Credentials/.env')
host = env_vars['host']
port = int(env_vars['port'])
username = env_vars['user']
password = env_vars['password']

In [8]:
client = clickhouse_connect.get_client(host=host, port=port, secure=True, username=username, password=password)

## Script for the Text2SQL Model

In [9]:
from langchain.llms import OpenAI

In [10]:
api_key = env_vars['OPENAI_API_KEY']

In [183]:
llm = OpenAI(model_name='gpt-3.5-turbo', openai_api_key=api_key, temperature=0.0)



In [204]:
user_text = """Get the best ad name by clicks from Facebook, Google, and LinkedIn for 2022 in terms of lowest CPC"""

In [200]:
def_terminology = """
CPC - Cost per Click (calculated as sum of total cost of advertisement/campaign divided by total number of clicks)
"""

In [205]:
worker_prompt = f"""
You are a very experienced data engineer whose job is to write correct SQL queries. Given below is a description of four tables delimited with <>. The descriptions contain information about how tables were created,
what are the columns, their types, definitions, references to other tables using Foreign Keys and finally, first three rows as an example. 

<
Table 1: Users (Information about users, their registration date and activity status)

CREATE TABLE "Users" (
    "UserId" INTEGER NOT NULL (unique identifier of an user),
    "RegDate" DATE NOT NULL (date of registration),
    "Status" NVARCHAR(220) (status of the user: active or passive),
    PRIMARY KEY ("UserId")
)

SELECT * FROM "Users" LIMIT 3;
UserId RegDate Status
120	2023-03-04	passive
345	2023-03-19	active
533	2021-07-24	passive
>

<
Table 2: UserActivity (Information about users visit to the website. It contains history of dates of visitis, channels of visit: direct visit or through clicking an advertisement of marketing campaign. \ 
If visit happened by clicking the ad then corresponding campaign Id is also provided.)

CREATE TABLE "UserActivity" (
    "VisitId" INTEGER NOT NULL (unique identifier of a user's visit to website),
    "UserId" INTEGER NOT NULL (Id of an user),
    "VisitDate" DATE (date of visit),
    "Click" BOOLEAN (if user visited website after clicking an advertisement of marketing campaign on some platform (Google, LinkedIn, Facebook, Bing) then 1, otherwise 0),
    "CampaignId" INTEGER (Id of marketing campaign. If user arrived at website directly without advertisement then CampaignId is 999),
    PRIMARY KEY ("VisitId"),
    FOREIGN KEY("UserId") REFERENCES "Users" ("UserId"),
    FOREIGN KEY("CampaignId") REFERENCES "CampaignActivity" ("CampaignId")
)

SELECT * FROM "UserActivity" LIMIT 3;
VisitId UserId VisitDate Click CampaignId
23	5259	2021-11-27	1	25
24	708	    2023-05-18	1	29
46	7601	2022-11-04	0	7
>

<
Table 3: CampaignActivity (Information about unique marketing campaigns with starting and ending dates, cost of campaign and the platform where the advertisements/campaigns are/were running (LinkedIn, Google, Facebook, Bing))

CREATE TABLE "CampaignActivity" (
    "CampaignId" INTEGER NOT NULL (unique id of marketing campaign),
    "Platform" TEXT NOT NULL (a platform/social media where the advertisement/campaign is/was running),
    "AdStartDate" DATE (start date of advertisement/campaign),
    "AdEndDate" DATE (end date of advertisement/campaign),
    "Cost" REAL (cost of given advertisement/campaign in USD),
    PRIMARY KEY ("CampaignId")
)

SELECT * FROM "CampaignActivity" LIMIT 3;
CampaignId Platform AdStartDate AdEndDate Cost
1	Google	 2022-06-22	 2022-06-27	154.74
2	Facebook 2023-02-14	 2023-03-12	894.79
3	Google	 2022-12-20	 2023-01-18	897.17
>

<
TABLE 4: Customers (Information about clients/customers of marketing agency. Customers are not users. Customers pay money to marketing agency for advertisements/campaigns.)

CREATE TABLE "Customers" (
    "CustomerId" INTEGER NOT NULL (unique identifier of client/customer),
    "Name" TEXT NOT NULL (full name of the customer),
    "Email" TEXT NOT NULL (email of the customer),
    "Status" TEXT NOT NULL (status of the customer: active or passive),
    "CreatedAt" DATAE  (date of account creation/registration),
    PRIMARY KEY ("CustomerId")
)

SELECT * FROM "Customers" LIMIT 3;
CustomerId Name Email Status CreatedAt
36868	Ella Lewis	ella.lewis@example.com	inactive	2022-06-17
49449	Ava Miller	ava.miller@example.com	active	    2021-12-18
50287	Michael Rodriguez	michael.rodriguez@gmail.com	inactive	2022-07-03
>

Carefully analyze tables above and write proper SQL query for the following instructions delimited by triple backticks ```{user_text}```

For the definition of specific terminology you can use following: {def_terminology}

Write query in ClickHouse SQL.

Do not hallucinate. Don't use columns that aren't available in table. Use joins to other tables to find appropriate columns.

result must be just a sql query and nothing else!
"""

In [243]:
response  = llm.predict(worker_prompt)
print(response)

SELECT 
    ca.Platform,
    ca.CampaignId,
    ca.AdStartDate,
    ca.AdEndDate,
    SUM(ca.Cost) / COUNT(ua.VisitId) AS CPC
FROM 
    CampaignActivity ca
JOIN 
    UserActivity ua ON ca.CampaignId = ua.CampaignId
JOIN 
    Users u ON ua.UserId = u.UserId
WHERE 
    ca.Platform IN ('Facebook', 'Google', 'LinkedIn') AND
    ca.AdStartDate >= '2022-01-01' AND ca.AdEndDate <= '2022-12-31'
GROUP BY 
    ca.Platform,
    ca.CampaignId,
    ca.AdStartDate,
    ca.AdEndDate
ORDER BY 
    CPC ASC
LIMIT 1;


In [236]:
from clickhouse_driver.errors import ServerException, NetworkError

In [222]:
try:
    result = client.command("""
    SELECT 
        ca.Platform,
        ca.CampaignId,
        ca.adadad,
        ca.AdEndDate,
        SUM(ca.Cost) / COUNT(ua.VisitId) AS CPC
    FROM 
        CampaignActivity ca
    JOIN 
        UserActivity ua ON ca.CampaignId = ua.CampaignId
    JOIN 
        Users u ON ua.UserId = u.UserId
    WHERE 
        ca.Platform IN ('Facebook', 'Google', 'LinkedIn') AND
        ca.AdStartDate >= '2022-01-01' AND ca.AdEndDate <= '2022-12-31'
    GROUP BY 
        ca.Platform,
        ca.CampaignId,
        ca.AdStartDate,
        ca.AdEndDate
    ORDER BY 
        CPC ASC
    LIMIT 1;
    """)
except ServerException as e:
    # Handle ClickHouse server exceptions (e.g., query execution errors)
    print("ClickHouse server error occurred:", str(e))

except NetworkError as e:
    # Handle ClickHouse network-related exceptions (e.g., connection errors)
    print("ClickHouse network error occurred:", str(e))

except Exception as e:
    # Handle other exceptions
    print("An error occurred:", str(e))

Code: 47. DB::Exception: There's no column 'ca.adadad' in table 'ca': While processing SELECT Platform AS `ca.Platform`, `--ca.CampaignId` AS `ca.CampaignId`, ca.adadad, ca.AdEndDate, SUM(ca.Cost) / COUNT(ua.VisitId) AS CPC FROM CampaignActivity AS ca INNER JOIN UserActivity AS ua ON ca.CampaignId = ua.CampaignId INNER JOIN Users AS u ON ua.UserId = u.UserId WHERE (ca.Platform IN ('Facebook', 'Google', 'LinkedIn')) AND (ca.AdStartDate >= '2022-01-01') AND (ca.AdEndDate <= '2022-12-31') GROUP BY ca.Platform, ca.CampaignId, ca.AdStartDate, ca.AdEndDate ORDER BY CPC ASC LIMIT 1. (UNKNOWN_IDENTIFIER) (version 23.5.1.34446 (official build))



An error occurred: :HTTPDriver for https://ppio7dt65z.europe-west4.gcp.clickhouse.cloud:443 returned response code 404)
 Code: 47. DB::Exception: There's no column 'ca.adadad' in table 'ca': While processing SELECT Platform AS `ca.Platform`, `--ca.CampaignId` AS `ca.CampaignId`, ca.adadad, ca.AdEndDate, SUM(ca.Cost) / COUNT(ua.VisitId) AS CPC FROM CampaignAct


In [238]:
user_input = """"
What's the total marketing revenue from Twitter in 2023?
"""

In [239]:
context = f"""
<
Table 1: Users (Information about users, their registration date and activity status)

CREATE TABLE "Users" (
    "UserId" INTEGER NOT NULL (unique identifier of an user),
    "RegDate" DATE NOT NULL (date of registration),
    "Status" NVARCHAR(220) (status of the user: active or passive),
    PRIMARY KEY ("UserId")
)

SELECT * FROM "Users" LIMIT 3;
UserId RegDate Status
120	2023-03-04	passive
345	2023-03-19	active
533	2021-07-24	passive
>

<
Table 2: UserActivity (Information about users visit to the website. It contains history of dates of visitis, channels of visit: direct visit or through clicking an advertisement of marketing campaign. \ 
If visit happened by clicking the ad then corresponding campaign Id is also provided.)

CREATE TABLE "UserActivity" (
    "VisitId" INTEGER NOT NULL (unique identifier of a user's visit to website),
    "UserId" INTEGER NOT NULL (Id of an user),
    "VisitDate" DATE (date of visit),
    "Click" BOOLEAN (if user visited website after clicking an advertisement of marketing campaign on some platform (Google, LinkedIn, Facebook, Bing) then 1, otherwise 0),
    "CampaignId" INTEGER (Id of marketing campaign. If user arrived at website directly without advertisement then CampaignId is 999),
    PRIMARY KEY ("VisitId"),
    FOREIGN KEY("UserId") REFERENCES "Users" ("UserId"),
    FOREIGN KEY("CampaignId") REFERENCES "CampaignActivity" ("CampaignId")
)

SELECT * FROM "UserActivity" LIMIT 3;
VisitId UserId VisitDate Click CampaignId
23	5259	2021-11-27	1	25
24	708	    2023-05-18	1	29
46	7601	2022-11-04	0	7
>

<
Table 3: CampaignActivity (Information about unique marketing campaigns with starting and ending dates, cost of campaign and the platform where the advertisements/campaigns are/were running (LinkedIn, Google, Facebook, Bing))

CREATE TABLE "CampaignActivity" (
    "CampaignId" INTEGER NOT NULL (unique id of marketing campaign),
    "Platform" TEXT NOT NULL (a platform/social media where the advertisement/campaign is/was running),
    "AdStartDate" DATE (start date of advertisement/campaign),
    "AdEndDate" DATE (end date of advertisement/campaign),
    "Cost" REAL (cost of given advertisement/campaign in USD),
    PRIMARY KEY ("CampaignId")
)

SELECT * FROM "CampaignActivity" LIMIT 3;
CampaignId Platform AdStartDate AdEndDate Cost
1	Google	 2022-06-22	 2022-06-27	154.74
2	Facebook 2023-02-14	 2023-03-12	894.79
3	Google	 2022-12-20	 2023-01-18	897.17
>

<
TABLE 4: Customers (Information about clients/customers of marketing agency. Customers are not users. Customers pay money to marketing agency for advertisements/campaigns.)

CREATE TABLE "Customers" (
    "CustomerId" INTEGER NOT NULL (unique identifier of client/customer),
    "Name" TEXT NOT NULL (full name of the customer),
    "Email" TEXT NOT NULL (email of the customer),
    "Status" TEXT NOT NULL (status of the customer: active or passive),
    "CreatedAt" DATAE  (date of account creation/registration),
    PRIMARY KEY ("CustomerId")
)

SELECT * FROM "Customers" LIMIT 3;
CustomerId Name Email Status CreatedAt
36868	Ella Lewis	ella.lewis@example.com	inactive	2022-06-17
49449	Ava Miller	ava.miller@example.com	active	    2021-12-18
50287	Michael Rodriguez	michael.rodriguez@gmail.com	inactive	2022-07-03
>

For the definition of specific terminology you can use following: {def_terminology}
"""

In [244]:
context_question = f""""
You're the best data engineer in the world. You are an expert in analytics and SQL. Given the database tables described in triple backticks ```{context}```, can you answer to the following question: {user_input} \
your answer must be Yes or No and nothing more or less!
"""

In [245]:
print(context_question)

"
You're the best data engineer in the world. You are an expert in analytics and SQL. Given the database tables described in triple backticks ```
<
Table 1: Users (Information about users, their registration date and activity status)

CREATE TABLE "Users" (
    "UserId" INTEGER NOT NULL (unique identifier of an user),
    "RegDate" DATE NOT NULL (date of registration),
    "Status" NVARCHAR(220) (status of the user: active or passive),
    PRIMARY KEY ("UserId")
)

SELECT * FROM "Users" LIMIT 3;
UserId RegDate Status
120	2023-03-04	passive
345	2023-03-19	active
533	2021-07-24	passive
>

<
Table 2: UserActivity (Information about users visit to the website. It contains history of dates of visitis, channels of visit: direct visit or through clicking an advertisement of marketing campaign. \ 
If visit happened by clicking the ad then corresponding campaign Id is also provided.)

CREATE TABLE "UserActivity" (
    "VisitId" INTEGER NOT NULL (unique identifier of a user's visit to website),


In [246]:
response  = llm.predict(context_question)
print(response)

No.
