# 1-2. Поступает запрос (разбираю худший случай), который надо преобразовать в правильный запрос и из нормального запроса надо извлечь ключевые запросы для поиска

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
class MyLLM:
    def __init__(self, max_new_tokens = 512, do_sample = False, top_k = 50, top_p = 0.95, num_return_sequences = 1):
        print("-"*50)
        print("Load tokenizer")
        self.tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-7b-instruct-v1.5", trust_remote_code=True)
        print("-"*50)
        print("Load model")
        self.model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-7b-instruct-v1.5", trust_remote_code=True).cuda()
        print("-"*50)
        print("Finish load")
        
        self.messages=[
            # { 'role': 'user', 'content': "write a quick sort algorithm in python."}
            { 'role': 'user', 'content': "Hello! How are you?"}
        ]
        self.max_new_tokens = max_new_tokens
        self.do_sample = do_sample
        self.top_k = top_k
        self.top_p = top_p
        self.eos_token_id = self.tokenizer.eos_token_id
        self.num_return_sequences = num_return_sequences
        
    def invoke(self, message=None):
        self.messages = message if message is not None else self.messages
        inputs = self.tokenizer.apply_chat_template(self.messages, add_generation_prompt=True, return_tensors="pt").to(self.model.device)
        
        outputs = self.model.generate(
            inputs, 
            max_new_tokens=self.max_new_tokens, 
            do_sample=self.do_sample, 
            top_k=self.top_k, 
            top_p=self.top_p, 
            num_return_sequences=self.num_return_sequences, 
            eos_token_id=self.eos_token_id
        )
        
        answer = self.tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
        print(answer)

        return answer

In [3]:
# llm = MyLLM()

In [4]:
# answer = llm.invoke()

In [5]:
# answer

In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [7]:
class Query:
    def __init__(self, user_query):
        # self.llm_name = "gpt-3.5-turbo"
        self.embeddings = HuggingFaceEmbeddings()
        self.llm = MyLLM()
        self.user_query = user_query
        self.correct_query = None

    def get_correct_query(self):
        messages=[
            { 'role': 'system', 'content': "You are an expert at finding repositories on GitHub. A person who does not know how to make such requests has contacted you, he writes a request in simple words, and your task is to make a competent and relevant request. As a response, you need to give only the text of the request that you recommend. Answer only english language"},
            { 'role': 'user', 'content': "User request: " + self.user_query}
        ]

        answer = self.llm.invoke(messages)

        return answer

    def get_key_queries_from_correct_query(self):
        self.correct_query = self.get_correct_query()

        messages=[
            { 'role': 'system', 'content': "You are an expert at finding repositories on GitHub. You have been contacted by the person who made up the query, and your task is to use keywords from the user's query to write 5 short queries that the user can best search for. Use no more than 3 words in one search query. Be brief. Write requests separated by commas. Answer only english language"},
            { 'role': 'user', 'content': "User request: " + self.correct_query}
        ]

        answer = self.llm.invoke(messages)

        return answer

In [8]:
# Предположим, что мне пришёл пользовательский запрос
uq="Я хочу сделать вот такой красивый розовый сайт для продажи собачек разных парод и чтобы сайт был красивым и привлекательным, а так же сайт должен быть продающим"

In [9]:
query = Query(user_query=uq)

--------------------------------------------------
Load tokenizer


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


--------------------------------------------------
Load model


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

--------------------------------------------------
Finish load


In [10]:
key_queries = query.get_key_queries_from_correct_query().split(',')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100015 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100015 for open-end generation.


To create a beautiful, appealing, and selling dog breed selling website, you would need a combination of web development skills, design expertise, and marketing knowledge. Here's a high-level request that you can use to find the appropriate repositories on GitHub:

1. **Frontend Development**: You'll need a modern and responsive frontend framework to ensure your website looks good on all devices. Frameworks like React, Vue.js, or Angular can help you build interactive and dynamic user interfaces.

2. **Design**: You'll need a good set of design assets, including images, logos, and color schemes. You might want to look for repositories that contain open-source design resources or templates that you can use as a starting point.

3. **E-commerce Features**: If you want to sell products online, you'll need an e-commerce platform. WooCommerce, Magento, or Shopify are popular choices for WordPress, PHP, and JavaScript respectively.

4. **SEO Optimization**: To make your website attractive to

In [11]:
key_queries

['1. "React frontend development"',
 '\n2. "Open-source design resources"',
 '\n3. "WooCommerce e-commerce platform"',
 '\n4. "SEO optimization tools"',
 '\n5. "Marketing tools for website"',
 '\n6. "WordPress backend development"',
 '\n7. "Vue.js frontend development"',
 '\n8. "Magento e-commerce platform"',
 '\n9. "Shopify e-commerce platform"',
 '\n10. "Django backend development"',
 '\n11. "Ruby on Rails backend development"',
 '\n12. "SEO best practices"',
 '\n13. "Email marketing platforms"',
 '\n14. "Social media integration tools"',
 '\n15. "Analytics tools for website".\n']

# 3. Поступает список readme который надо преобразовать в embedding и добавить в общую базу данных, если их там нет, если данный readme уже есть в базе данных, то забираем его оттуда. По итогу этого пункта, формируется база данных документов с которыми будет работать LLM

In [23]:
def fill_database(markdown_path):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=150
    )

    loader = UnstructuredMarkdownLoader(markdown_path)
    doc = loader.load()
    split_docs = text_splitter.split_documents(doc)

    # Add the unique documents to your database
    return Chroma.from_documents(split_docs, query.embeddings)

# 4-6. Проверить соответствует ли каждый документ правильному запросу или же нет. Если соответствует, то с помощью семилярить получить численное значение на сколько оно соответствует, чтобы выплюнуть ранжированный список. Сделать человеко читаемую выжимку по этому readme на том же языке, что и поступил первичный запрос.

In [24]:
# Предположим, что ко мне пришёл список строк с путями к .md файлам
paths = ['./README.md', './README1.md']

Здесь score почему-то работает наоборот и чем ближе он к нлю, тем больше похожи друг на друга два документа. При возврате надо будет делать 1 - score.

In [25]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use only yes or no. Keep the answer as concise as possible. As a context, you will receive a repository "readme" file and you need to answer whether this repository corresponds to the question or not. 
{context}
Question: Does the repository match the user's search query: {question}?
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [26]:
out_meta = []
out_description = []
out_raiting = []

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [37]:
# local_path = ("/home/user/models/Meta-Llama-3-8B.Q6_K.gguf")
local_path = ("/home/kama/models/mistral-7b-instruct-v0.2.Q6_K.gguf")
callbacks = [StreamingStdOutCallbackHandler()]
llm2 = GPT4All(model=local_path, callbacks=callbacks, verbose=True)

In [38]:
for path in paths:
    vectordb = fill_database(path)

    # Run chain
    qa_chain = RetrievalQA.from_chain_type(
        llm2,
        retriever=vectordb.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    )

    question = query.correct_query  # Этот запрос по факту нужен, тот что внизу написан для теста
    # question = "I need project with ml for predict delivery food"

    result = qa_chain({"query": question})
    if result['result'] == "Yes":
        out_meta.append(result['source_documents'][0].metadata['source'])  # save path from right document
        chain = RetrievalQA.from_chain_type(
            llm=llm2,
            chain_type="map_reduce",
            retriever=vectordb.as_retriever()
        )
        q = "What about this text? Explain it as for a person who does not have a technical education. Explain in simple words and to the point, but do not write too much - a maximum of 5 sentences."
        result = chain({"query": q})
        out_description.append(result['result'])
        # забираем максимальное значение сходства для этого документа
        out_raiting.append(1 - max([doc[1] for doc in [docs for docs in vectordb.similarity_search_with_score(question)]]))
        continue

  warn_deprecated(


 No, this repository does not match the user's search query as it focuses on LangChain, a language model framework, rather than creating a dog breed selling website. No, this repository does not match the user's search query as it focuses on predicting order cancellation in a food delivery system using machine learning algorithms rather than creating a dog breed selling website.

In [39]:
out_description

[]

In [40]:
out_meta

[]

In [41]:
out_raiting

[]