In [28]:
from match_hybrid_embeddings import match_hybrid_embeddings

# Run hybrid matching through the helper imported from the
# `match_hybrid_embeddings` module, keeping up to 100 results per query
# (top_k) with an alpha of 0.5.
# NOTE(review): all three paths are absolute and machine-specific (and mix
# "\" with "/" separators); consider deriving them from a configurable base
# directory so the notebook can be re-run elsewhere.
match_hybrid_embeddings(
    queries_path=r"C:\Users\Reem Darawsheh\Desktop\PythonProject/data/hybrid_queries.joblib",
    hybrid_chunks_dir=r"C:\Users\Reem Darawsheh\Desktop\PythonProject/data/hybrid_chunks",
    output_path=r"C:\Users\Reem Darawsheh\Desktop\PythonProject/results/hybrid_results.json",
    top_k=100,
    alpha=0.5
)


In [29]:
# Execute: hybrid matching for the BEIR / quora test split.
# NOTE(review): the output recorded under this cell reports a path under
# C:\Users\Azzam\..., which differs from `output_path` below, so the saved
# output comes from a different run -- re-run the cell to refresh it.
if __name__ == "__main__":
    # Absolute, machine-specific locations of the query payload, the document
    # chunk directory, and the JSON file the rankings are written to.
    queries_path = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Processing\hybridQuery\BEIR\quora\test\hybird_query_data.joblib"
    hybrid_chunks_dir = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Data Representation\Hybrid\beir\quora\test\chunks"
    output_path = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Matching & Ranking\HybridMatching\hybrid_results.json"

    match_hybrid_embeddings(
        queries_path=queries_path,
        hybrid_chunks_dir=hybrid_chunks_dir,
        output_path=output_path,
        top_k=100,
        alpha=0.5
    )


🔹 تحميل الاستعلامات الهجينة...
📊 عدد الاستعلامات: 10000, حجم مفردات TF-IDF: 102029


🧩 معالجة Chunks: 100%|██████████| 105/105 [07:23<00:00,  4.22s/it]


✅ تم حفظ النتائج في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\HybridMatching\hybrid_results.json


In [25]:
import joblib

# Load the saved hybrid query payload and list its top-level keys, to check
# which fields are available before running the matching step.
queries_data = joblib.load(r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Processing\hybridQuery\BEIR\quora\test\hybird_query_data.joblib")
print(queries_data.keys())


dict_keys(['query_ids', 'original_texts', 'clean_texts', 'bert_embeddings', 'tfidf_indices', 'tfidf_values', 'bert_model_name'])


In [9]:
import joblib
from scipy.sparse import csr_matrix

def custom_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens.

    NOTE(review): presumably defined here so that the pickled vectorizer
    loaded later in this cell can resolve its tokenizer by name -- confirm.
    """
    tokens = text.split()
    return tokens

# Load the stored query payload; only the query ids and the cleaned query
# texts are used below.
queries_data = joblib.load(r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Processing\tfidf\queries_tfidf.joblib")
query_ids = queries_data["query_ids"]
clean_texts = queries_data["clean_texts"]

# Load the vectorizer that is stored alongside the document TF-IDF data.
doc_vectorizer_data = joblib.load(r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Data Representation\TF-IDF\beir\quora\test\doc\tfidf_data.joblib")
vectorizer = doc_vectorizer_data["vectorizer"]

# Re-encode the queries with the document vectorizer so that queries and
# documents share the same feature space.
query_matrix = vectorizer.transform(clean_texts)

# Save the re-encoded representation. The file name is relative, so it is
# written to the current working directory.
joblib.dump({
    "query_ids": query_ids,
    "clean_texts": clean_texts,
    "query_tfidf_matrix": query_matrix,
    "vectorizer": vectorizer  # optional, kept for documentation
}, "hybird_query_data_reencoded.joblib")

print("✅ تم حفظ الاستعلامات بتمثيل متوافق مع الوثائق.")


✅ تم حفظ الاستعلامات بتمثيل متوافق مع الوثائق.


In [4]:
import joblib
import os
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import json

def match_hybrid_embeddings(
    queries_path: str,
    hybrid_chunks_dir: str,
    output_path: str,
    top_k: int = 100,
    alpha: float = 0.5
):
    """Rank documents per query with a blended TF-IDF / BERT cosine score.

    The query payload is loaded from ``queries_path`` and must provide the
    keys ``"query_ids"``, ``"tfidf_indices"``, ``"tfidf_values"`` and
    ``"bert_embeddings"``. Every ``*.joblib`` file in ``hybrid_chunks_dir``
    is processed in sorted file-name order and must provide
    ``"tfidf_chunk"``, ``"bert_chunk"`` and ``"doc_ids"``.

    For each chunk the score is
    ``alpha * cos(tfidf) + (1 - alpha) * cos(bert)``. The best ``top_k``
    documents over all chunks are written to ``output_path`` as JSON
    (query id -> list of ``[doc_id, score]`` pairs, best first).

    Args:
        queries_path: joblib file holding the hybrid query payload.
        hybrid_chunks_dir: directory containing the document chunk files.
        output_path: destination of the JSON results file.
        top_k: maximum number of documents kept per query (must be >= 1).
        alpha: weight of the TF-IDF similarity; BERT gets ``1 - alpha``.

    Raises:
        ValueError: if ``top_k`` is smaller than 1, or a chunk's TF-IDF
            width differs from the query matrix width.
        FileNotFoundError: if the chunk directory has no ``.joblib`` files.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    print("🔹 تحميل الاستعلامات الهجينة...")
    queries_data = joblib.load(queries_path)

    query_ids = queries_data["query_ids"]
    tfidf_indices_list = queries_data["tfidf_indices"]
    tfidf_values_list = queries_data["tfidf_values"]
    bert_queries = np.array(queries_data["bert_embeddings"], dtype=np.float32)

    num_queries = len(query_ids)

    # The vocabulary size is taken from the first chunk file.
    chunk_files = sorted(f for f in os.listdir(hybrid_chunks_dir) if f.endswith(".joblib"))
    if not chunk_files:
        raise FileNotFoundError("❌ لم يتم العثور على أي ملفات joblib داخل مجلد الـ chunks.")

    first_chunk_path = os.path.join(hybrid_chunks_dir, chunk_files[0])
    # Only the width is needed; do not keep the whole chunk referenced.
    vocab_size = joblib.load(first_chunk_path)["tfidf_chunk"].shape[1]

    # Rebuild the sparse query TF-IDF matrix (CSR) from the per-query lists.
    indptr = [0]
    indices = []
    data = []

    for i in range(num_queries):
        indices.extend(tfidf_indices_list[i])
        data.extend(tfidf_values_list[i])
        indptr.append(len(indices))

    tfidf_query_matrix = csr_matrix((data, indices, indptr), shape=(num_queries, vocab_size))

    print(f"📊 عدد الاستعلامات: {num_queries}, حجم مفردات TF-IDF: {tfidf_query_matrix.shape[1]}")

    results = {qid: [] for qid in query_ids}

    for chunk_file in tqdm(chunk_files, desc="🧩 معالجة Chunks"):
        chunk_path = os.path.join(hybrid_chunks_dir, chunk_file)
        chunk_data = joblib.load(chunk_path)

        tfidf_docs = chunk_data["tfidf_chunk"]       # sparse matrix
        bert_docs = np.array(chunk_data["bert_chunk"], dtype=np.float32)
        doc_ids = chunk_data["doc_ids"]

        if tfidf_query_matrix.shape[1] != tfidf_docs.shape[1]:
            raise ValueError(
                f"❌ عدم تطابق بين أبعاد TF-IDF للاستعلامات ({tfidf_query_matrix.shape[1]}) "
                f"والوثائق ({tfidf_docs.shape[1]}) في {chunk_file}"
            )

        # An empty chunk contributes no candidates; skipping it avoids an
        # error from the similarity computation on zero samples.
        if tfidf_docs.shape[0] == 0:
            continue

        sim_tfidf = cosine_similarity(tfidf_query_matrix, tfidf_docs)
        sim_bert = cosine_similarity(bert_queries, bert_docs)
        sim_hybrid = alpha * sim_tfidf + (1 - alpha) * sim_bert

        # A chunk (typically the last one) may hold fewer than top_k
        # documents; argpartition raises if kth is out of bounds, so clamp.
        k = min(top_k, sim_hybrid.shape[1])

        for i, qid in enumerate(query_ids):
            sims = sim_hybrid[i]
            top_indices = np.argpartition(sims, -k)[-k:]
            sorted_idx = top_indices[np.argsort(-sims[top_indices])]

            candidates = results[qid]
            candidates.extend((doc_ids[idx], float(sims[idx])) for idx in sorted_idx)

            # Keep at most top_k candidates per query between chunks so the
            # memory use does not grow with the number of chunks. The sort is
            # stable and uses the same key as the final pass, so the final
            # ranking is unchanged.
            if len(candidates) > top_k:
                candidates.sort(key=lambda pair: -pair[1])
                del candidates[top_k:]

    # Final ordering: best score first, at most top_k entries per query.
    final_results = {}
    for qid, docs in results.items():
        docs.sort(key=lambda x: -x[1])
        final_results[qid] = docs[:top_k]

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_results, f, indent=2, ensure_ascii=False)

    print(f"✅ تم حفظ النتائج في: {output_path}")





In [11]:
# Execute: hybrid matching for the Antique train split.
# NOTE(review): the output recorded under this cell reports a path under
# C:\Users\Azzam\..., which differs from `output_path` below, so the saved
# output comes from a different run -- re-run the cell to refresh it.
if __name__ == "__main__":
    # Absolute, machine-specific locations of the query payload, the document
    # chunk directory, and the JSON file the rankings are written to.
    queries_path = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Processing\hybridQuery\Antique\train\hybird_query_data.joblib"
    hybrid_chunks_dir = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Data Representation\Hybrid\antique\train\chunks"
    output_path = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Matching & Ranking\HybridMatching\hybrid_results_antique.json"

    match_hybrid_embeddings(
        queries_path=queries_path,
        hybrid_chunks_dir=hybrid_chunks_dir,
        output_path=output_path,
        top_k=100,
        alpha=0.5
    )


🔹 تحميل الاستعلامات الهجينة...
📊 عدد الاستعلامات: 176, حجم مفردات TF-IDF: 250274


🧩 معالجة Chunks: 100%|██████████| 81/81 [00:18<00:00,  4.30it/s]


✅ تم حفظ النتائج في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\HybridMatching\hybrid_results_antique.json
