In [2]:
from langchain.llms import CTransformers
from langchain.chains import QAGenerationChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
from accelerate import Accelerator
from ctransformers import AutoConfig
import os 
import json
import time
import aiofiles
import csv


model_name = "TheBloke/Amethyst-13B-Mistral-GGUF"

In [3]:
def load_llm():
    # Load the locally downloaded model here

    # Configuration du modèle
    config = {
        'max_new_tokens': 3000,
        'temperature': 0.01,
        'context_length': 7000,
        'gpu_layers': 0
    }
    
    # Initialisation du modèle avec indication explicite pour l'utilisation du GPU
    llm = CTransformers(
        model=model_name,
        model_type="mistral",
        max_new_tokens=1048,
        temperature=0.3,
        gpu_layers=0,
        config=config,
    )

    accelerator = Accelerator()
    llm, config = accelerator.prepare(llm, config)
    return llm


In [4]:

def file_processing(file_path):
    # Load data from text file
    with open(file_path, 'r') as file:
        data = file.read()

    question_gen = data

    splitter_ques_gen = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 100
    )

    chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

    document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

    splitter_ans_gen = RecursiveCharacterTextSplitter(
        chunk_size = 300,
        chunk_overlap = 30
    )

    document_answer_gen = splitter_ans_gen.split_documents(
        document_ques_gen
    )

    return document_ques_gen, document_answer_gen


In [5]:
def llm_pipeline(file_path):

    document_ques_gen, document_answer_gen = file_processing(file_path)



    llm_ques_gen_pipeline = load_llm()

    prompt_template = """
    You are an expert at creating questions based on documentation.
    Your goal is to prepare an LLM for training and build a dataset
    You do this by asking standalone questions about the text below.The question must be standalone, questions should not be repeated
    Here is the text:

    ------------
    {text}
    ------------

    Create questions that will help creating the dataset for LLM. 
    Make sure not to lose any important concepts.

    QUESTIONS:
    """

    PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

    refine_template = ("""
    You are an expert at creating practice questions based on documentation.
    Your goal is to help preparing a dataset for training an LLM.
    We have received some practice questions to a certain extent: {existing_answer}.
    We have the option to refine the existing questions or add new ones.
    (only if necessary) with some more context below.
    ------------
    {text}
    ------------

    Given the new context, refine the original questions in English.
    If the context is not helpful, please provide the original questions.
    QUESTIONS:
    """
    )

    REFINE_PROMPT_QUESTIONS = PromptTemplate(
        input_variables=["existing_answer", "text"],
        template=refine_template,
    )

    ques_gen_chain = load_summarize_chain(llm = llm_ques_gen_pipeline, 
                                            chain_type = "refine", 
                                            verbose = True, 
                                            question_prompt=PROMPT_QUESTIONS, 
                                            refine_prompt=REFINE_PROMPT_QUESTIONS)

    ques = ques_gen_chain.run(document_ques_gen)

    embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

    vector_store = FAISS.from_documents(document_answer_gen, embeddings)

    llm_answer_gen = load_llm()

    ques_list = ques.split("\n")
    filtered_ques_list = [element for element in ques_list if element.endswith('?') or element.endswith('.')]

    answer_generation_chain = RetrievalQA.from_chain_type(llm=llm_answer_gen, 
                                                chain_type="stuff", 
                                                retriever=vector_store.as_retriever())

    return answer_generation_chain, filtered_ques_list

def get_csv (file_path):
    answer_generation_chain, ques_list = llm_pipeline(file_path)
    base_folder = "generated"
    if not os.path.isdir(base_folder):
        os.mkdir(base_folder)
    output_file = base_folder+"QA.csv"
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Question", "Answer"])  # Writing the header row

        for question in ques_list:
            print("Question: ", question)
            answer = answer_generation_chain.run(question)
            print("Answer: ", answer)
            print("--------------------------------------------------\n\n")

            # Save answer to CSV file
            csv_writer.writerow([question, answer])
    return output_file

In [6]:
get_csv("test.txt")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

  warn_deprecated(




[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an expert at creating questions based on documentation.
    Your goal is to prepare an LLM for training and build a dataset
    You do this by asking standalone questions about the text below.The question must be standalone, questions should not be repeated
    Here is the text:

    ------------
    # [Constructor](#constructor)
Constructors are a special type of function that runs only once when deploying a contract, and can be used to initialize the state of the contract. Your contract must not have more than one constructor, and that constructor function must be annotated with the `#[constructor]` attribute. Also, a good practice consists in naming that function `constructor`.


Here's a simple example that demonstrates how to initialize the state of a contract on deployment by defining logic inside a constructor.



```
#[starknet::co

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Question:  1. What is a constructor in StarkNet contracts?
Answer:   A constructor in StarkNet contracts is a special type of function that runs only once when deploying a contract and can be used to initialize the state of the contract. It's important for contract developers to understand how constructors work, as they provide an opportunity to set up the initial conditions of their smart contract.

--------------------------------------------------


Question:      2. Why do we use constructors in StarkNet contracts?
Answer:    Constructors are used to initialize the state of a contract on deployment. They run only once when deploying a contract and can be used to set up initial values for variables or call other functions. This helps ensure that the contract's state is properly initialized before it starts executing code.
--------------------------------------------------


Question:      3. How can we initialize the state of a contract using a constructor?
Answer:   To initialize t

'generatedQA.csv'