In [31]:
import chromadb, google.generativeai as genai
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings

In [32]:
# --- Retrieval settings ---------------------------------------------------
# Name of the sentence-embedding model handed to SentenceTransformer.
MODEL_NAME = 'all-MiniLM-L6-v2'
# Location of the persisted Chroma store and the collection to open in it.
CHROMA_PATH = '../lab5/chroma_db'
COLL_NAME = 'StackOverflowQnA'

# --- Generation settings --------------------------------------------------
# Gemini model identifier and the instruction text prepended to every prompt.
LLM_NAME = 'gemini-2.0-flash'
SYSTEM_MSG = "Ты - эксперт по программированию и ИТ. Отвечай точно, используя только предоставленный контекст."

# --- Credentials ----------------------------------------------------------
# The API key is read from a local file so it never appears in the notebook.
# NOTE(review): the file name ends with a trailing dot ('api.txt.') — confirm
# this is intentional; on most non-Windows systems it is a distinct name
# from 'api.txt'.
with open('api.txt.', mode='r') as key_file:
    raw_key = key_file.read()
GOOGLE_API_KEY = raw_key.strip()

In [33]:
# Authenticate with Google first, then create the handle used for generation.
genai.configure(api_key=GOOGLE_API_KEY)
gemini = genai.GenerativeModel(LLM_NAME)

# Embedding model used to turn query text into vectors.
model = SentenceTransformer(MODEL_NAME)

# Open the on-disk Chroma store with anonymized telemetry switched off and
# fetch the already-existing collection (get_collection does not create one).
chroma_settings = Settings(anonymized_telemetry=False)
client = chromadb.PersistentClient(path=CHROMA_PATH, settings=chroma_settings)
collection = client.get_collection(COLL_NAME)

In [34]:
# Sanity check: display the collection object to confirm it was opened.
collection

Collection(name=StackOverflowQnA)

In [35]:
# Sanity check: display how many entries the collection reports.
collection.count()

28672

In [36]:
def semantic_search(query: str, top_n: int = 5):
    """Return the ``top_n`` collection entries nearest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level ``collection``.

    Args:
        query: Free-text search string.
        top_n: Number of nearest entries to request from the collection.

    Returns:
        A list of dicts, in the order the collection returned them, each
        with the keys ``question``, ``answer``, ``score`` (computed as
        ``1 - distance``) and ``answer_score``.
    """
    # encode() works on batches: send a one-element batch and unwrap it.
    vectors = model.encode([query], batch_size=256, convert_to_numpy=True)
    query_vector = vectors[0].tolist()

    response = collection.query(
        query_embeddings=[query_vector],
        n_results=top_n,
        include=["metadatas", "distances"],
    )

    # Only one query embedding was sent, so only the first result list matters.
    metadatas = response["metadatas"][0]
    distances = response["distances"][0]

    # NOTE(review): 1 - distance is a similarity only if the collection was
    # built with cosine distance — confirm against the indexing notebook.
    return [
        {
            "question": meta["clean_question"],
            "answer": meta["clean_answers"],
            "score": 1 - dist,
            "answer_score": meta["Score_answers"],
        }
        for meta, dist in zip(metadatas, distances)
    ]

In [37]:
search_query = "Deploying SQL Server"
results = semantic_search(search_query, top_n=5)

# Print a short report per hit, numbered from 1; the answer preview is the
# first 150 characters followed by an ellipsis.
for i, hit in enumerate(results, start=1):
    report_lines = [
        f"\nРезультат {i}:",
        f"Вопрос: {hit['question']}",
        f"Ответ: {hit['answer'][:150]}...",
        f"Оценка ответа: {hit['answer_score']}",
        f"Оценка сходства: {hit['score']:.4f}",
    ]
    print("\n".join(report_lines))


Результат 1:
Вопрос: wonder guys manage deployment database sql servers specifically sql server development live buildscript standard windows batch current complexity scripts switch powershell later enterprise manager management studio express count copy mdf file attach bit careful working binary data compatiblity issue development live run version server time given lack explain create table sql exports existing database sql scripts run target server yes tool automatically dump given database sql queries runs command line enterprise manager management studio express count lastly given fact live database contains data deployment involve creating tables checking difference structure alter table live ones instead need data verification conversion existing fields change hear lot great stuff red gate products hobby projects price bit steep automatically deploy sql server databases test live
Ответ: currently working thing deploying sql server databases test live include process local integr

In [38]:
def make_prompt(user_q: str, ctx: list[dict], max_chars: int = 500) -> str:
    """Build the prompt text from a question and retrieved snippets.

    Args:
        user_q: The user's question, inserted verbatim.
        ctx: Retrieved hits; only each hit's ``'answer'`` string is used.
        max_chars: Maximum number of characters kept from each answer.
            Longer answers are cut to this length and marked with a
            trailing '…'. Defaults to 500.

    Returns:
        A single string: the question, a "Knowledge snippets:" header, one
        numbered ``[i] text`` entry per hit (numbering starts at 1), and a
        closing instruction, separated by blank lines.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars < 0:
        raise ValueError(f'max_chars must be non-negative, got {max_chars}')

    parts = [f'User question:\n{user_q}\n', 'Knowledge snippets:']

    for i, s in enumerate(ctx, 1):
        answer = s['answer']
        # Mark truncation only when text was actually dropped.
        txt = (answer[:max_chars] + '…') if len(answer) > max_chars else answer
        parts.append(f'[{i}] {txt}')

    parts.append('\nCompose a concise, correct answer citing the snippets.')
    return '\n\n'.join(parts)

In [39]:
def rag(user_q: str, k_ctx: int = 3, temperature: float = 0.2):
    """Answer ``user_q`` with Gemini using retrieved snippets as context.

    Args:
        user_q: The user's question.
        k_ctx: Number of hits to retrieve via ``semantic_search``.
        temperature: Sampling temperature passed to the generation config.

    Returns:
        A ``(answer_text, hits)`` tuple: the response's ``.text`` and the
        list of hits that were placed into the prompt.
    """
    snippets = semantic_search(user_q, k_ctx)

    # The system message is prepended to the prompt and everything is sent
    # as one user turn.
    prompt_text = f'{SYSTEM_MSG}\n\n{make_prompt(user_q, snippets)}'
    messages = [{'role': 'user', 'parts': [prompt_text]}]

    # Output length is capped at 512 tokens.
    generation_settings = {'temperature': temperature, 'max_output_tokens': 512}

    reply = gemini.generate_content(messages, generation_config=generation_settings)
    return reply.text, snippets

In [40]:
# Sample questions used to eyeball retrieval quality and the generated answer.
tests = [
    'how do you install pandas?',
    'Deploying SQL Server?',
    'How do you open a file in C++?',
    'How do I connect to a database and loop over a recordset in C#?',
    'Convert HashBytes to VarChar'
]

# Longest answer prefix shown per retrieved hit in the report below.
SNIPPET_CHARS = 150

for query in tests:
    # Show the top-5 hits for inspection; rag() below runs its own retrieval
    # with k_ctx=3 for the actual prompt.
    results = semantic_search(query, top_n=5)

    print('?:')
    print(f' {query}')
    print('?')

    for i, hit in enumerate(results, 1):
        answer_text = hit['answer']
        # Append the ellipsis only when the text was really cut. The previous
        # check used a 70-character threshold with a 150-character slice, so
        # answers of 71-150 characters were shown in full yet still got '...'.
        if len(answer_text) > SNIPPET_CHARS:
            answer_snippet = answer_text[:SNIPPET_CHARS] + '...'
        else:
            answer_snippet = answer_text
        score_info = f"(similarity: {hit['score']:.3f})"

        if 'answer_score' in hit:
            score_info += f", (score: {hit['answer_score']})"

        print(f" ![context{i}] → {answer_snippet}  {score_info}")

    answer, ctx_used = rag(query, k_ctx=3)
    print('\n! Ответ Gemini:\n', answer)
    print('\n' + '=' * 100 + '\n')

?:
 how do you install pandas?
?
 ![context1] → panda wrote altho better nice python based api  (similarity: 0.446), (score: 1.0)
 ![context2] → helps guide installing python windows vista  (similarity: 0.397), (score: 1.0)
 ![context3] → jeff hardy blog post django ironpython mentioned tony meyer useful read jeff posts series struggles ironpython easy install zlib solving zlib problem ...  (similarity: 0.378), (score: 25.0)
 ![context4] → try debug pdb issue likely easy install method check site dir seeks easy install pth...  (similarity: 0.369), (score: 0.0)
 ![context5] → temporary change change python path temporarily interactive session append sys path like import sys sys path program files pyscripter lib rpyc zip win...  (similarity: 0.369), (score: 2.0)

! Ответ Gemini:
 Based on the provided snippets, none directly explain how to install pandas. Snippet [1] mentions pandas as a "python based api". Therefore, a general answer would be: You can install pandas using pip, a package