In [1]:
import io
import os
import random
from pathlib import Path

In [2]:
%pwd

'c:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\BIRD_benchmark_SQL\\notebook'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Asus\\Machine_learning\\LLM\\Projects\\BIRD_benchmark_SQL'

In [5]:
from Bird_bench_SQL import logger

In [6]:
from dataclasses import dataclass
from pathlib import Path

In [7]:
@dataclass(frozen=True)
class DataProcessingConfig:
    """Immutable settings consumed by the data-processing step.

    Instances are built by ``ConfigurationManager.get_data_processing_config``
    from the project's config and params files.
    """

    # --- file locations -------------------------------------------------
    data_file_path: Path
    train_file_path: Path      # JSON records the few-shot examples are drawn from
    test_file_path: Path
    few_shot_file_path: Path   # where the generated few-shot examples are written

    # --- sampling / retrieval -------------------------------------------
    few_shot_file_size: int    # number of records sampled into the few-shot file
    k: int                     # passed as ``k`` to the semantic example selector

    # --- identifiers ----------------------------------------------------
    db_id_name: str            # filled from ``params.db_id``
    embedding_model: str       # model name handed to ``HuggingFaceEmbeddings``


In [8]:
from Bird_bench_SQL.constants import *
from Bird_bench_SQL.utils.common import read_yaml,create_directories
from Bird_bench_SQL.config.configuration import ConfigurationManager

In [9]:
from dotenv import load_dotenv
from Bird_bench_SQL.entity.config_entity import DatabaseAndModelConfig

In [10]:
class ConfigurationManager:
    """Load the project's YAML files and build typed config objects from them.

    NOTE(review): this notebook-local definition shadows the
    ``ConfigurationManager`` imported from
    ``Bird_bench_SQL.config.configuration`` in an earlier cell.
    """

    def __init__(self,
                 config_filepath    = CONFIG_FILE_PATH,
                 params_filepath    = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the config YAML; defaults to ``CONFIG_FILE_PATH``.
            params_filepath: Path of the params YAML; defaults to ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_database_and_model_config(self) -> DatabaseAndModelConfig:
        """Build the database/model configuration.

        The API key is read from the ``GROQ_API_KEY`` environment variable
        after ``load_dotenv()`` has run. A warning is logged when the variable
        is missing or empty; the value is still passed through unchanged.

        Returns:
            DatabaseAndModelConfig: database path, model name, temperature and API key.
        """
        config = self.config.database_and_model
        params = self.params

        logger.info('Database and model config initialized')
        load_dotenv()

        api_key = os.getenv("GROQ_API_KEY")
        if not api_key:
            # Surface the misconfiguration here instead of as an opaque
            # authentication failure when the model is first used.
            logger.warning("GROQ_API_KEY is missing or empty")

        database_and_model_config = DatabaseAndModelConfig(SQLite_database_path = config.database_path,
                                                           Model_name           = params.MODEL_NAME,
                                                           temperature          = params.TEMPERATURE,
                                                           api_key              = api_key)
        logger.info("database and model config finished")
        return database_and_model_config

    def get_data_processing_config(self) -> DataProcessingConfig:
        """Build the data-processing configuration.

        Returns:
            DataProcessingConfig: file locations from the config file plus
            sampling/retrieval settings from the params file.
        """
        config = self.config.data_processing
        params = self.params
        logger.info('Data preprocessing config initialized')
        data_processing_config = DataProcessingConfig(data_file_path     = config.data_file_path,
                                                      train_file_path    = config.train_file_path,
                                                      test_file_path     = config.test_file_path,
                                                      few_shot_file_path = config.few_shots_path,
                                                      few_shot_file_size = params.FEW_SHOTS_SIZE,
                                                      db_id_name         = params.db_id,
                                                      k                  = params.K,
                                                      # NOTE(review): key looks like a misspelling of
                                                      # EMBEDDING_MODEL; it must match the params file,
                                                      # so rename both together if at all.
                                                      embedding_model    = params.EMBEDDING_MDOEL
                                                      )
        logger.info('Data preprocessing config finished')
        return data_processing_config


In [11]:
# Build both configuration objects once from the YAML files.
manager = ConfigurationManager()
processing_config = manager.get_data_processing_config()
model_config = manager.get_database_and_model_config()

# Convenience handles for the train/test file locations.
train_path, test_path = (
    processing_config.train_file_path,
    processing_config.test_file_path,
)

[2024-09-10 13:12:36,817: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-10 13:12:36,818: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-10 13:12:36,818: INFO: common: created directory at: artifacts]
[2024-09-10 13:12:36,818: INFO: 1897769260: Data preprocessing config initialized]
[2024-09-10 13:12:36,818: INFO: 1897769260: Data preprocessing config finished]
[2024-09-10 13:12:36,826: INFO: 1897769260: Database and model config initialized]
[2024-09-10 13:12:36,830: INFO: 1897769260: database and model config finished]


In [12]:
import random
from Bird_bench_SQL.utils.common import load_json,save_json
from Bird_bench_SQL.components.database_and_model import DatabaseAndModel
from Bird_bench_SQL.entity.config_entity import DatabaseAndModelConfig

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import SemanticSimilarityExampleSelector

In [13]:
class DataProcessing(DatabaseAndModel):
    """Generate the few-shot example file and a semantic example selector over it.

    Extends ``DatabaseAndModel``; the LLM, database connection and database
    object are obtained from ``database_and_model_setup()``.
    """

    def __init__(self,processing_config : DataProcessingConfig,model_config:DatabaseAndModelConfig):
        """Store the processing config and set up the model and database.

        Args:
            processing_config: File locations and sampling/retrieval settings.
            model_config: Settings forwarded to ``DatabaseAndModel``.
        """
        super().__init__(model_config)
        self.processing_config          = processing_config
        self.llm, self.conn, self.db    = self.database_and_model_setup()

    def data_processing(self):
        """Regenerate the few-shot example file from the training records.

        Any existing few-shot file is deleted first, so the file is always
        rebuilt. Each training record's SQL is executed and the first column of
        every result row is joined with commas into the record's ``Answer``.
        A random subset of records is then reshaped and saved as JSON.

        ``self.conn`` is closed when the queries finish (or fail), so this
        method cannot be run twice on the same instance.
        """
        few_shot_path = self.processing_config.few_shot_file_path

        # Always start from a clean slate: a stale file is removed and rebuilt.
        if os.path.exists(few_shot_path):
            os.remove(few_shot_path)
            logger.info(f"{few_shot_path} has been deleted.")

        logger.info(f"Data processing has started")
        train_data  = self.processing_config.train_file_path
        data_file   = load_json(train_data)
        # Log the path, not the loaded records, to keep the log readable.
        logger.info(f"{train_data} has loaded succesfully completed")

        conn = self.conn
        try:
            cursor = conn.cursor()
            logger.info("----added cursor----")

            for data in data_file:
                # Records are accessed as mappings everywhere else, so use
                # item access here too (attribute access fails on plain dicts).
                results         = cursor.execute(data['SQL'])
                data['Answer']  = ",".join(str(ans[0]) for ans in results)
        finally:
            # Release the connection even when one of the queries raises.
            conn.close()
            logger.info("----database closeed----")

        logger.info("----Data alternation started----")

        # random.sample already draws uniformly without replacement, so no
        # prior shuffle is needed. Clamp the size so a small dataset yields
        # fewer examples instead of a ValueError.
        requested_size  = self.processing_config.few_shot_file_size
        sample_size     = min(requested_size, len(data_file))
        if sample_size < requested_size:
            logger.warning(f"Requested {requested_size} few shots but only {len(data_file)} records are available")
        few_shots       = random.sample(data_file, sample_size)
        logger.info(f"----Pick {sample_size} Random samples from dataset----")

        logger.info(f"----Data alternation of few shots started----")
        few_shots_data = [
            {
                "Question"  : f"{item['question']} ==>> {item['evidence']}",
                "SQLQuery"  : item['SQL'],
                "SQLResult" : "Result of the SQL query",  # Placeholder for the actual SQL result if needed
                "Answer"    : item['Answer']
            }
            for item in few_shots
        ]
        logger.info(f"----Data alternation of few shots completed----")
        save_json(path  = few_shot_path,
                  data  = few_shots_data)
        logger.info(f"----Saved {len(few_shots_data)} few shots to {few_shot_path}----")


    def sematic_similarity_example_selector(self):
        """Build a semantic-similarity example selector over the few-shot file.

        Each few-shot record's values are joined into one text, embedded with
        the configured HuggingFace model and stored in a Chroma vector store,
        with the record itself attached as metadata.

        Returns:
            SemanticSimilarityExampleSelector: selector configured with
            ``k = processing_config.k``.
        """
        logger.info(f"Sematic similarity example selector begin")
        few_shot_path = self.processing_config.few_shot_file_path

        embeddings    = HuggingFaceEmbeddings(model_name=self.processing_config.embedding_model)
        logger.info(f"----Embedding model----{self.processing_config.embedding_model}----setup successfully completed")

        # Load once and reuse for both the texts and their metadata.
        few_shots     = load_json(few_shot_path)
        to_vectorize  = [' '.join(str(value) for value in sent.values()) for sent in few_shots]
        logger.info(f"----loaded few_shot-data from----{few_shot_path}----")

        # NOTE(review): re-running this in the same kernel may add the same
        # texts to the store again - confirm against Chroma's defaults.
        vectorstore   = Chroma.from_texts(to_vectorize,embeddings,metadatas=few_shots)
        logger.info("----successfully completed vectorstore----")

        example_selector = SemanticSimilarityExampleSelector(vectorstore  = vectorstore,k= self.processing_config.k,)
        return example_selector
  

In [14]:
# End-to-end run: load the configs, rebuild the few-shot file, then build the
# example selector. Errors are left to propagate unwrapped so the traceback
# points directly at the failing call.
manager                 = ConfigurationManager()
model_config            = manager.get_database_and_model_config()
data_processing_config  = manager.get_data_processing_config()
data_processing         = DataProcessing(processing_config  = data_processing_config,
                                         model_config       = model_config)
data_processing.data_processing()
example_selector        = data_processing.sematic_similarity_example_selector()

[2024-09-10 13:12:38,735: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-10 13:12:38,743: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-10 13:12:38,746: INFO: common: created directory at: artifacts]
[2024-09-10 13:12:38,746: INFO: 1897769260: Database and model config initialized]
[2024-09-10 13:12:38,749: INFO: 1897769260: database and model config finished]
[2024-09-10 13:12:38,749: INFO: 1897769260: Data preprocessing config initialized]
[2024-09-10 13:12:38,753: INFO: 1897769260: Data preprocessing config finished]
[2024-09-10 13:12:38,753: INFO: database_and_model: Model setup initialized]
[2024-09-10 13:12:39,992: INFO: database_and_model: model----llama3-70b-8192----created]
[2024-09-10 13:12:40,000: INFO: database_and_model: connection----movies_4.sqlite----created]
[2024-09-10 13:12:40,015: INFO: database_and_model: engine----movies_4.sqlite----created]
[2024-09-10 13:12:40,073: INFO: database_and_model: database----created]
[20

  embeddings    = HuggingFaceEmbeddings(model_name=self.processing_config.embedding_model)
  from tqdm.autonotebook import tqdm, trange


[2024-09-10 13:12:51,760: INFO: SentenceTransformer: Use pytorch device_name: cuda]
[2024-09-10 13:12:51,760: INFO: SentenceTransformer: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2]




[2024-09-10 13:12:56,836: INFO: 4210093409: ----Embedding model----sentence-transformers/all-MiniLM-L6-v2----setup successfully completed]
[2024-09-10 13:12:56,853: INFO: common: json file loaded succesfully from: artifacts/data/few_shots.json]
[2024-09-10 13:12:56,853: INFO: 4210093409: ----loaded few_shot-data from----artifacts/data/few_shots.json----]
[2024-09-10 13:12:56,853: INFO: common: json file loaded succesfully from: artifacts/data/few_shots.json]
[2024-09-10 13:12:57,882: INFO: posthog: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.]
[2024-09-10 13:12:58,606: INFO: 4210093409: ----successfully completed vectorstore----]


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [15]:
# Smoke test: fetch the stored few-shot examples closest to a sample question.
example_selector.select_examples(
    {"question": "What are the genres of Sky Captain "}
)

[{'Answer': "City of Angels,It's a Wonderful Life,Dogma,The Prophecy,Frailty,Legion,The Mortal Instruments: City of Bones,The Christmas Candle",
  'Question': 'Look for the movie title with the keyword of "angel". ==>> keyword of "angel" refers to keyword_name = \'angel\'',
  'SQLQuery': "SELECT T1.title FROM movie AS T1 INNER JOIN movie_keywords AS T2 ON T1.movie_id = T2.movie_id INNER JOIN keyword AS T3 ON T2.keyword_id = T3.keyword_id WHERE T3.keyword_name = 'angel'",
  'SQLResult': 'Result of the SQL query'},
 {'Answer': 'Michael Bay',
  'Question': "Who is the director for the movie 'Transformers?' ==>> the director refers to person_name where job = 'Director'; movie 'Transformers' refers to title = 'Transformers'",
  'SQLQuery': "SELECT T3.person_name FROM movie AS T1 INNER JOIN movie_crew AS T2 ON T1.movie_id = T2.movie_id INNER JOIN person AS T3 ON T2.person_id = T3.person_id WHERE T1.title = 'Transformers' AND T2.job = 'Director'",
  'SQLResult': 'Result of the SQL query'}]