In [1]:
import sys

# Directory added to the import path so that `process_beir_queries` can be
# imported (machine-specific; presumably the project root — verify locally).
PROJECT_ROOT = r"C:\Users\Reem Darawsheh\Desktop\PythonProject/PythonProject"

# Notebook cells get re-run; only append once so sys.path does not accumulate
# duplicate entries across executions.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from process_beir_queries import main

# Run the entry point exposed by process_beir_queries.
main()


📦 تم حفظ تمثيل الاستعلامات الهجين في: C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Processing\hybridQuery\BEIR\quora\test\hybird_query_data.joblib


In [2]:
import os
import re
import json
import joblib
import ir_datasets
import nltk
import numpy as np
from tqdm import tqdm
from pymongo import MongoClient
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources required by the WordNet lemmatizer.
for resource in ("wordnet", "omw-1.4"):
    nltk.download(resource)

# Shared lemmatizer instance used by lemmatize_tokens().
lemmatizer = WordNetLemmatizer()

def tokenize(text):
    """Lowercase *text* and return its word tokens in order of appearance.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore); punctuation and whitespace act as separators.
    """
    return [match.group(0) for match in re.finditer(r'\b\w+\b', text.lower())]

def lemmatize_tokens(tokens):
    """Return a list with each token passed through the shared lemmatizer.

    Uses the module-level ``lemmatizer`` with its default arguments; the
    order of the input tokens is preserved.
    """
    return [lemmatizer.lemmatize(word) for word in tokens]

def clean_text(text):
    """Normalize *text* into a space-separated string of lemmatized tokens.

    The text is tokenized with tokenize(), each token is lemmatized with
    lemmatize_tokens(), and the results are joined with single spaces.
    """
    return " ".join(lemmatize_tokens(tokenize(text)))

# ------------------ Paths -------------------
# NOTE(review): these paths all point at the antique/train artifacts, while the
# queries iterated further down come from "antique/test/non-offensive".
# Confirm which split is intended; queries without a matching BERT embedding
# are skipped, so a mismatch can silently drop every query.
bert_path = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Processing\Bertquery\antique\train\query_embeddings\bert_query_embeddings.joblib"
vectorizer_path = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Data Representation\TF-IDF\antique\train\doc\tfidf_data.joblib"
output_path = r"C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Processing\hybridQuery\Antique\train\hybird_query_data.joblib"

# ------------------ Load data -------------------
# Precomputed BERT query data: a mapping with "embeddings", "model_name"
# and "query_ids" entries.
bert_data = joblib.load(bert_path)
# TF-IDF data; only its "vectorizer" entry is used here (presumably the
# vectorizer fitted on the document collection — verify).
vectorizer_data = joblib.load(vectorizer_path)
vectorizer = vectorizer_data["vectorizer"]

bert_embeddings = bert_data["embeddings"]
bert_model_name = bert_data["model_name"]
bert_query_ids = bert_data["query_ids"]

# Index the BERT embeddings by query_id (ids and embeddings are paired
# positionally).
bert_map = dict(zip(bert_query_ids, bert_embeddings))

# Load the queries.
# NOTE(review): this is the antique *test* split, whereas bert_path,
# vectorizer_path and output_path above refer to antique/train — confirm the
# intended split.
dataset = ir_datasets.load("antique/test/non-offensive")

# One MongoDB-ready record per kept query.
query_docs = []
# Parallel per-query lists, collected for the joblib dump later on.
query_ids, original_texts, clean_texts = [], [], []
tfidf_indices_list, tfidf_values_list = [], []

print("🧼 تنظيف الاستعلامات...")
for query in tqdm(dataset.queries_iter()):
    cleaned = clean_text(query.text)

    # Skip queries that are empty after cleaning, and queries that have no
    # BERT embedding.
    if not cleaned.strip() or query.query_id not in bert_map:
        continue

    # TF-IDF representation, kept sparse as (column indices, values).
    tfidf_vector = vectorizer.transform([cleaned])
    row = tfidf_vector.getrow(0).tocoo()
    indices = row.col.tolist()
    values = row.data.tolist()

    # Build the record for this query.
    query_docs.append({
        "query_id": query.query_id,
        "original_text": query.text,
        "clean_text": cleaned,
        "bert_embedding": bert_map[query.query_id].tolist(),
        "tfidf_indices": indices,
        "tfidf_values": values,
    })

    # Accumulate the same fields column-wise for the joblib file.
    query_ids.append(query.query_id)
    original_texts.append(query.text)
    clean_texts.append(cleaned)
    tfidf_indices_list.append(indices)
    tfidf_values_list.append(values)

# ------------------ Store in MongoDB -------------------
client = MongoClient("mongodb://localhost:27017/")
db = client["ir_project"]
# NOTE(review): the collection name mentions both "quora" and "antique";
# it is kept unchanged because other code may look it up by this name.
collection = db["queries_quora_test_hybrid_antique"]

# Replace the previous contents of the collection with this run's records.
collection.delete_many({})

# insert_many() raises TypeError for an empty list, which happens when every
# query was filtered out (e.g. none had a BERT embedding), so only insert
# when there is something to store.
if query_docs:
    collection.insert_many(query_docs)

print(f"✅ تم تخزين {len(query_docs)} استعلام هجين في MongoDB داخل: {collection.name}")

# ------------------ Save in joblib format -------------------
# Make sure the destination directory exists before writing.
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# Column-wise view of the kept queries; all lists share the order of
# query_ids.
hybrid_payload = {
    "query_ids": query_ids,
    "original_texts": original_texts,
    "clean_texts": clean_texts,
    "bert_embeddings": [bert_map[qid].tolist() for qid in query_ids],
    "tfidf_indices": tfidf_indices_list,
    "tfidf_values": tfidf_values_list,
    "bert_model_name": bert_model_name,
}
joblib.dump(hybrid_payload, output_path)

print(f"📦 تم حفظ تمثيل الاستعلامات الهجين في: {output_path}")



📦 تم حفظ تمثيل الاستعلامات الهجين في: C:\Users\Reem Darawsheh\Desktop\PythonProject\PythonProject\Query Processing\hybridQuery\Antique\quora\test\hybird_query_data.joblib
