## Convert a database schema (.db) to the source datatype format

The datatype data format should look like this:
```json
{
  "JOIN_KEY": {
    "PK": [
      "PK1",
      "PK2"
    ],
    "FK": {
      "FK1": {
        "Ref_table": "Ref_column"
      }
    }
  },
  "COLUMNS": {
    "Col1": "Type",
    "Col2": "Type",
    "Col3": "Type"
  }
}
```


In [1]:
import os, json, sqlite3

In [2]:
def transform_schema(schema_db_path, output_path):
    """Dump one ``<table>_datatype.json`` file per table of a SQLite database.

    Every file contains the table's primary-key columns, its foreign keys as
    ``{from_column: {referenced_table: referenced_column}}`` and a
    ``{column_name: declared_type}`` mapping. Progress is printed to stdout.

    Args:
        schema_db_path: Path of the SQLite database file to inspect.
        output_path: Directory that receives the JSON files; it is created
            when it does not exist yet.
    """

    def quote_identifier(name):
        # PRAGMA arguments cannot be bound as parameters, so the table name is
        # embedded as a double-quoted identifier (embedded quotes doubled).
        return '"' + name.replace('"', '""') + '"'

    # An empty output_path means "current directory"; nothing to create then.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    connection = sqlite3.connect(schema_db_path)
    try:
        cursor = connection.cursor()

        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        for table in tables:

            schema = {
                "JOIN_KEY": {
                    "PK": [],
                    "FK": {},
                },
                "COLUMNS": {},
            }
            table_name = table[0]
            quoted_table = quote_identifier(table_name)
            print(f"Table: {table_name}")
            schema_file_name = f"{table_name}_datatype.json"
            output_file_path = os.path.join(output_path, schema_file_name)

            cursor.execute(f"PRAGMA table_info({quoted_table});")
            columns = cursor.fetchall()

            for column in columns:
                # Row layout: (cid, name, type, notnull, dflt_value, pk)
                _, cname, ctype, _, _, pk_sq = column
                schema['COLUMNS'][cname] = ctype

                # pk is non-zero when the column is part of the primary key
                if pk_sq:
                    schema['JOIN_KEY']['PK'].append(cname)

                print(f"\tColumn: {cname} {ctype} {pk_sq}")

            # Get foreign keys for the table
            cursor.execute(f"PRAGMA foreign_key_list({quoted_table});")
            foreign_keys = cursor.fetchall()
            print()

            if foreign_keys:
                print("Foreign Keys:")
                for fk in foreign_keys:
                    # Row layout: (id, seq, table, from, to, on_update, on_delete, match)
                    _, _, to_table, fk_column, to_column, _, _, _ = fk
                    schema['JOIN_KEY']['FK'][fk_column] = {to_table: to_column}
                    print(f"\t{fk_column} REFERENCES {to_table}({to_column})")
                print()

            with open(output_file_path, "w", encoding="utf-8") as file:
                json.dump(schema, file, indent=2)
                print(f"Dump {output_file_path} sucess")

        cursor.close()
    finally:
        # Release the database handle even when a query or a file write fails.
        connection.close()

# Embedded Domain schema type and description

## JSON Structure Description

The JSON embedded_data object has the following structure:
```json
{
  "name": "domain name",
  "tables": {
    "table_1" : {
      "description" : {
        "text" : "this is a description of table_1",
        "vector" : [Vector]
      },
      "datatypes" : {
        "JOIN_KEY" : {
          "PK" : ["column_1", "column_3"],
          "FK" : {
            "column_3" : { "table_2" : "column_1" }
          }
        },
        "COLUMNS" : {
          "column_1" : "number",
          "column_2" : "text",
          "column_3" : "text"
        }
      },
      "class_labels" : {
        "class_label_1" : {
          "text" : "This is a class description for identify column type",
          "vector" : [Vector]
        },
        "class_label_2" : {
          "text" : "This is a class description for identify column type",
          "vector" : [Vector]
        }
      },
      "columns" : {
        "column_1" : {
          "text" : "This is a description of column_1",
          "vector" : [Vector],
          "column_classes" : ["class_label_1"]
        },
        "column_2" : {
          "text" : "This is a description of column_2",
          "vector" : [Vector],
          "column_classes" : ["class_label_2"]
        },
        "column_3" : {
          "text" : "This is a description of column_3",
          "vector" : [Vector],
          "column_classes" : ["class_label_1", "class_label_2"]
        }
      }
    },
    "table_2" : {...}
  }
}
```

In [6]:
import json
import torch
from torch import Tensor
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

In [7]:
# Load the tokenizer and causal language model from "models/nsql-350M"
# (presumably a local checkpoint directory — TODO confirm).
tokenizer = AutoTokenizer.from_pretrained("models/nsql-350M")
model = AutoModelForCausalLM.from_pretrained("models/nsql-350M")
# Sentence-embedding model; encode() below uses it to turn text into vectors.
sen_emb = SentenceTransformer("models/all-MiniLM-L6-v2")

def encode(text):
    """Embed ``text`` with the module-level ``sen_emb`` model.

    The model output is converted with ``.tolist()`` so the result is made of
    plain Python lists/numbers.
    """
    embedding = sen_emb.encode(text)
    return embedding.tolist()

In [4]:
def cos_sim(a: Tensor, b: Tensor):
    """
    Pairwise cosine similarity between the rows of ``a`` and the rows of ``b``.

    Inputs that are not tensors are converted with ``torch.tensor``; a 1-D
    input is treated as a matrix with a single row.
    :return: Matrix ``res`` with ``res[i][j] = cos_sim(a[i], b[j])``
    """
    def _as_matrix(value):
        # Coerce to a tensor, then promote a lone vector to a 1 x dim matrix.
        tensor = value if isinstance(value, torch.Tensor) else torch.tensor(value)
        return tensor.unsqueeze(0) if len(tensor.shape) == 1 else tensor

    left = _as_matrix(a)
    right = _as_matrix(b)

    # With unit-length rows, the matrix product yields the cosine of each pair.
    left_unit = torch.nn.functional.normalize(left, p=2, dim=1)
    right_unit = torch.nn.functional.normalize(right, p=2, dim=1)
    return torch.mm(left_unit, right_unit.transpose(0, 1))

In [8]:
def most_relate_topic(text:str, table_classes:dict,topic_threshold_score:float=0.4) -> list:
    """Return the class labels whose description is most similar to ``text``.

    Args:
        text: Text to classify; it is embedded with ``encode``.
        table_classes: Mapping ``label -> info`` where ``info['vector']`` is
            the embedding compared against the text embedding.
        topic_threshold_score: Minimum cosine similarity for a label to be
            selected.

    Returns:
        Every label scoring at least ``topic_threshold_score``, in mapping
        order. When none reaches the threshold, the single best-scoring label
        is returned. An empty ``table_classes`` gives an empty list.
    """
    # Nothing to rank: np.argmax would raise on an empty score list.
    if not table_classes:
        return []

    labels = list(table_classes)
    text_vec = encode(text)
    topic_scores = np.array(
        [float(cos_sim(info['vector'], text_vec)) for info in table_classes.values()]
    )
    related_topic_indices = np.where(topic_scores >= topic_threshold_score)[0]

    # If no topics meet the threshold, return the topic with maximum score
    if not len(related_topic_indices):
        related_topic_indices = [int(np.argmax(topic_scores))]

    return [labels[i] for i in related_topic_indices]

In [25]:
def new_embed_domain(domain_name,
                     domain_description_folder_path,
                     domain_datatype_folder_path,
                     domain_schema_class_path):
    """Build the embedded representation of a domain.

    Args:
        domain_name: Value stored under the ``'name'`` key of the result.
        domain_description_folder_path: Folder of JSON files, each read for
            its ``'table'``, ``'description'`` and ``'columns'`` keys.
        domain_datatype_folder_path: Folder of datatype JSON files whose
            content is copied unchanged under ``'datatypes'``.
        domain_schema_class_path: JSON file mapping
            ``table name -> {class label -> class description}``.

    Returns:
        ``{'name': ..., 'tables': {table_name: {'description', 'datatypes',
        'class_labels', 'columns'}}}`` with every text accompanied by its
        embedding vector.

    Raises:
        KeyError: A described table has no entry in the class file.
    """
    with open(domain_schema_class_path, "r") as file:
        schema_classes = json.load(file)

    # Embed every class description once, grouped by table.
    schema_classes_vector = {
        table_name: {
            class_label: {"text": class_description,
                          "vector": encode(class_description)}
            for class_label, class_description in table_classes.items()
        }
        for table_name, table_classes in schema_classes.items()
    }

    description_files = sorted(os.listdir(domain_description_folder_path))
    datatype_files = sorted(os.listdir(domain_datatype_folder_path))
    known_datatype_files = set(datatype_files)

    domain = {'name': domain_name, 'tables': {}}

    for description, positional_datatype in zip(description_files,
                                                datatype_files):

        with open(os.path.join(domain_description_folder_path, description), 'r') as file:
            description_body = json.load(file)

        table_name = description_body['table']

        # Sorted positions of the two folders need not line up (extra files,
        # differing suffixes), so prefer the datatype file named after the
        # table and only fall back to the positional partner.
        named_datatype = f"{table_name}_datatype.json"
        if named_datatype in known_datatype_files:
            datatype = named_datatype
        else:
            datatype = positional_datatype

        with open(os.path.join(domain_datatype_folder_path, datatype), 'r') as file:
            datatype_body = json.load(file)

        column_classes = schema_classes_vector[table_name]
        description_text = description_body['description']

        columns = {
            col: {
                'text': desc,
                'vector': encode(desc),
                'column_classes': most_relate_topic(desc, column_classes),
            }
            for col, desc in description_body['columns'].items()
        }

        domain['tables'][table_name] = {
            'description': {'text': description_text,
                            'vector': encode(description_text)},
            'datatypes': datatype_body,
            'class_labels': column_classes,
            'columns': columns,
        }

    return domain

### Example usage

In [None]:
# Domain to embed; the input paths under App/domain/ below are built from it.
domain_name = 'coffee_shop'

In [26]:
# Input locations for the selected domain and the output file of the run.
domain_description_folder_path = f'App/domain/{domain_name}/Schema/descriptions'
domain_datatype_folder_path = f'App/domain/{domain_name}/Schema/datatypes'
domain_schema_class_path = f'App/domain/{domain_name}/{domain_name}_classes.json'
destination_path = 'src/test/embedded.json'

# Build the domain before opening the destination: opening with 'w' truncates
# the file, so a failure while embedding would otherwise wipe earlier output.
embedded_domain = new_embed_domain(domain_name=domain_name,
                                   domain_description_folder_path=domain_description_folder_path,
                                   domain_datatype_folder_path=domain_datatype_folder_path,
                                   domain_schema_class_path=domain_schema_class_path)

with open(destination_path, 'w') as file:
    json.dump(embedded_domain, file, indent=2)