In [1]:
from sentence_transformers import SentenceTransformer
import polars as pl
import chromadb
import torch
import os
from tqdm import tqdm


In [2]:
os.environ["DNNL_MAX_CPU_ISA"] = "AVX512_CORE_AMX"
os.environ["OMP_NUM_THREADS"] = str(os.cpu_count())

In [3]:
print(torch.__config__.parallel_info())

ATen/Parallel:
	at::get_num_threads() : 6
	at::get_num_interop_threads() : 6
OpenMP 2019
	omp_get_max_threads() : 6
Intel(R) oneAPI Math Kernel Library Version 2025.1-Product Build 20250306 for Intel(R) 64 architecture applications
	mkl_get_max_threads() : 6
Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)
std::thread::hardware_concurrency() : 12
Environment variables:
	OMP_NUM_THREADS : 12
	MKL_NUM_THREADS : [not set]
ATen parallel backend: OpenMP



In [4]:
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

In [5]:
df = pl.read_csv(
    "clear.csv",
    columns=["Id_questions", "clean_question", "clean_answers", "Score_answers"]
)

In [6]:
df.shape

(2176164, 4)

In [7]:
df.schema

Schema([('Id_questions', Int64),
        ('Score_answers', Float64),
        ('clean_question', String),
        ('clean_answers', String)])

In [8]:
df = df.with_columns(
    pl.col("clean_question").cast(pl.Utf8).fill_null(""),
    pl.col("clean_answers").cast(pl.Utf8).fill_null("")
).with_columns(
    (pl.col("clean_question") + " " + pl.col("clean_answers")).alias("text")
)

In [9]:
df.head()

Id_questions,Score_answers,clean_question,clean_answers,text
i64,f64,str,str,str
80,12.0,"""written database generation sc…","""wound kind hack actually works…","""written database generation sc…"
80,6.0,"""written database generation sc…","""sqlite api function called lik…","""written database generation sc…"
80,1.0,"""written database generation sc…","""making delimiter little comple…","""written database generation sc…"
90,13.0,"""good tutorials explaining bran…","""version control subversion goo…","""good tutorials explaining bran…"
90,2.0,"""good tutorials explaining bran…","""try version control standalone…","""good tutorials explaining bran…"


In [10]:
df['Id_questions'].n_unique()

1264216

In [11]:
df.filter(pl.col("text").is_null()).height

0

In [12]:
df.null_count()

Id_questions,Score_answers,clean_question,clean_answers,text
u32,u32,u32,u32,u32
0,161648,0,0,0


In [13]:
texts = df["text"].to_numpy()

In [14]:
embeddings = model.encode(
    texts,
    convert_to_numpy=True,
    convert_to_tensor=False,
    show_progress_bar=True
)

Batches:   0%|          | 0/68006 [00:00<?, ?it/s]

In [15]:
client = chromadb.Client()

In [16]:
collection = client.create_collection(
    name="StackOverflowQnA",
    metadata={"hnsw:space": "cosine"}
)

In [33]:
ids = pl.int_range(0, df.height, eager=True).cast(pl.Utf8).to_list()
metadatas = df.select(["clean_question", "clean_answers"]).rows(named=True)

In [34]:
batch_size = 4096
with tqdm(total=len(ids), desc="Загрузка в Chroma") as pbar:
    for i in range(0, len(ids), batch_size):
        batch = {
            "ids": ids[i:i + batch_size],
            "embeddings": embeddings[i:i + batch_size].tolist(),
            "metadatas": metadatas[i:i + batch_size]
        }
        try:
            collection.add(**batch)
        except Exception as e:
            print(f"Ошибка в батче {i}-{i + batch_size}: {str(e)}")
        pbar.update(len(batch["ids"]))


Загрузка в Chroma: 100%|██████████| 2176164/2176164 [04:15<00:00, 8511.53it/s]


In [36]:
def semantic_search(query: str, top_n: int = 5):
    query_embedding = model.encode(
        [query],
        batch_size=256,
        convert_to_numpy=True
    )[0].tolist()

    return collection.query(
        query_embeddings=[query_embedding],
        n_results=top_n,
        include=["metadatas", "distances"]
    )

In [37]:
results = semantic_search("Deploying SQL Server")
for meta, dist in zip(results["metadatas"][0], results["distances"][0]):
    print(f"Вопрос: {meta['clean_question']}")
    print(f"Ответ: {meta['clean_answers'][:150]}...\n")

Вопрос: created windows application end visual studio end microsoft sql server net framework deployment installing application clients machine necessary install net framework visualstudio sql server installing application module building set file filesystem editor registry editor etc attach database application deployment help tips tracks deployment thanks advance
Ответ: depends want client sqlserver installed locally usually enterprise scenario server box sql server running need deploy database having sql server runni...

Вопрос: package developed ssis visual studio possible deploy attach package sql server possible licence sql server matter
Ответ: run version dtexec certainly deploy catalog install minimum need run ssis server execute package filesystem version dtexec http msdn microsoft com lib...

Вопрос: created windows application end visual studio end microsoft sql server net framework deployment installing application clients machine necessary install net framework visualstudio

In [38]:
results = semantic_search("How do you open a file in C++?")
for meta, dist in zip(results["metadatas"][0], results["distances"][0]):

    print(f"Вопрос: {meta['clean_question']}")
    print(f"Ответ: {meta['clean_answers'][:150]}...\n")

Вопрос: standard cross platform analogue file int open const char pathname int flags mode mode
Ответ: int open const char pathname int flags mode mode pure use std fstream http www cplusplus com reference fstream fstream include int main std fstream op...

Вопрос: happened run code example reading text file follows int char filename text txt ifstream fin filename fin code actually opens reads space delimited text don understand works file opened open read command way rewind beginning file trying create dynamically allocated array don know counted file number scores
Ответ: file opened open read command ifstream constructor open file way rewind beginning file use seekg eof reached need clear calling seekg trying create dy...

Вопрос: want open file reading way need able text files involve sort read line function binary files provide way read raw data char buffer
Ответ: include include namespace std void main ifstream stream fstream command initiate stream command char filename variable f

In [39]:
results = semantic_search("install pandas")
for meta, dist in zip(results["metadatas"][0], results["distances"][0]):

    print(f"Вопрос: {meta['clean_question']}")
    print(f"Ответ: {meta['clean_answers'][:150]}...\n")

Вопрос: having great difficulty installing pandas python python oct msc bit amd win windows computer help appreciated tried different solutions avail details needed inform
Ответ: tags guess windows people use anaconda comes packages including pandas line easy install use ide...

Вопрос: tried installing pandas pip install pandas python scripts pip install pandas gave following error system find path specified know downloaded installed python
Ответ: ...

Вопрос: trying install pandas pip install pandas error command python setup egg info failed error code users username appdata local temp pip build username pandas followed answer given installed setup incident error pip install pandas thanks help provide information let know
Ответ: easiest way install pandas dependencies windows download relevant packages christoph gohlke python extension packages windows repository find files pa...

Вопрос: tried installing pandas easy install claimed successfully installed pandas package python direct