In [1]:
# Cell 1: Installation (تشغيل مرة واحدة في Colab)
# ============================================

!pip install -q sentence-transformers scikit-learn pandas numpy "transformers>=4.40.0" accelerate safetensors

In [2]:
# Mount Google Drive into the Colab filesystem; the dataset path used later
# (DATA_PATH) points below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Cell 2: Imports & Settings
# ============================================

import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

import torch
from transformers import pipeline

# Do not truncate long text columns when a DataFrame is rendered.
pd.set_option("display.max_colwidth", None)

# Run on the GPU when CUDA is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"✅ Imports ready. Using device: {device}")

✅ Imports ready. Using device: cuda


In [6]:
# Cell 3: Load ShadowID dataset for RAG
# ============================================

DATA_PATH = '/content/drive/MyDrive/shadow_id_v2_English_Dataset.csv'

# Only the read itself sits inside the try, so a FileNotFoundError raised by
# anything else is never re-labelled as "CSV not found".
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Re-raise with a hint about the Drive mount; `from err` keeps the original
    # exception attached as the explicit cause.
    raise FileNotFoundError(
        f"❌ CSV file not found at: {DATA_PATH}\n"
        "تأكد من مسار الملف (mount Google Drive ثم حدد المسار الصحيح)."
    ) from err

print(f"✅ Dataset Loaded: {df.shape[0]} records, {df.shape[1]} columns.")
print("\n🧾 Columns:", df.columns.tolist())
# NOTE(review): the preview renders columns such as IDNumber, Phone and NameShort;
# clear this output before sharing the notebook if the data is not synthetic.
display(df.head(2))

✅ Dataset Loaded: 4000 records, 24 columns.

🧾 Columns: ['RowID', 'PersonType', 'PersonTypeCode', 'IDNumber', 'Nationality', 'NameShort', 'Phone', 'Location', 'Latitude', 'Longitude', 'IssueDate', 'ExpiryDate', 'TokenId', 'TokenStartTime', 'TokenEndTime', 'UsageTime', 'TokenDurationMinutes', 'UsedWithinValidity', 'DeviceId', 'State', 'StateArabic', 'RiskLabel', 'RiskReason', 'FraudType']


Unnamed: 0,RowID,PersonType,PersonTypeCode,IDNumber,Nationality,NameShort,Phone,Location,Latitude,Longitude,...,TokenEndTime,UsageTime,TokenDurationMinutes,UsedWithinValidity,DeviceId,State,StateArabic,RiskLabel,RiskReason,FraudType
0,3385,Resident,2,2676743889,Filipino,Aaron B.,545183616,Jazan,16.8895,42.57,...,10/19/2025 12:27,10/19/2025 12:17,15,1,DEV-52853004,Active,فعال,Low,,
1,2790,Citizen,1,1206743219,Saudi,Aaron B.,533094606,Dammam,26.4207,50.0888,...,6/13/2025 15:04,6/13/2025 14:54,15,1,DEV-14444657,Suspicious,مشبوه,High,Abnormal request frequency,FrequentGeneration


In [7]:
# Cell 4: Optional filtering (use only Medium / High risk)
# ==========================================================

# Build df_rag, the frame that feeds the retriever. Medium/High rows are
# preferred; the full dataset is the fallback when the label column is absent
# or when no row carries one of those labels.
if "RiskLabel" not in df.columns:
    print("ℹ️ Column 'RiskLabel' not found, using all records for RAG.")
    df_rag = df.copy()
else:
    # Case-insensitive match on the label text.
    mask = df["RiskLabel"].astype(str).str.lower().isin(["medium", "high"])
    df_rag = df.loc[mask].copy()
    if not df_rag.empty:
        print(f"✅ Using only Medium/High risk events: {len(df_rag)} records.")
    else:
        print("⚠️ No Medium/High records found, using full dataset instead.")
        df_rag = df.copy()

# Renumber rows 0..n-1 so positional and label-based lookups agree.
df_rag = df_rag.reset_index(drop=True)

✅ Using only Medium/High risk events: 1366 records.


In [8]:
# Cell 5: Build knowledge_text for each security event
# ======================================================

# Candidate columns for the retrieval text, in output order. Names that are not
# present in the loaded dataset are filtered out when available_cols is computed.
# NOTE(review): in the recorded run only 11 of these names matched the CSV, while
# columns such as RiskReason exist in the data but are not listed here — confirm
# the intended set.
COLUMNS_FOR_TEXT = [
    "EventID",
    "Timestamp",
    "PersonType",
    "PersonTypeCode",
    "Nationality",
    "Location",
    "Latitude",
    "Longitude",
    "FraudType",
    "State",
    "UsedWithinValidity",
    "IsExpiredAtUse",
    "TimeFromStartMin",
    "TokenDurationMinutes",
    "UsageHour",
    "UsageWeekday",
    "RiskLabel",
]

# Keep only the requested columns that this dataset actually contains,
# preserving the order of COLUMNS_FOR_TEXT.
available_cols = [name for name in COLUMNS_FOR_TEXT if name in df_rag.columns]
print("🧩 Columns used in knowledge_text:", available_cols)

def row_to_text(row):
    """Render one record as "col: value | col: value".

    Only the columns named in the module-level ``available_cols`` are emitted,
    in that order. A column whose value is missing (as judged by ``pd.isna``)
    is left out entirely. ``row`` only needs a ``.get(key, default)`` method.
    """
    fields = ((name, row.get(name, None)) for name in available_cols)
    return " | ".join(
        f"{name}: {value}" for name, value in fields if not pd.isna(value)
    )

# Render every retained record to its retrieval text (one string per row).
df_rag["knowledge_text"] = df_rag.apply(row_to_text, axis=1)

# Show the first rendered record as a sanity check.
print("\n🔍 Example knowledge_text:\n")
print(df_rag["knowledge_text"].iloc[0])

🧩 Columns used in knowledge_text: ['PersonType', 'PersonTypeCode', 'Nationality', 'Location', 'Latitude', 'Longitude', 'FraudType', 'State', 'UsedWithinValidity', 'TokenDurationMinutes', 'RiskLabel']

🔍 Example knowledge_text:

PersonType: Citizen | PersonTypeCode: 1 | Nationality: Saudi | Location: Dammam | Latitude: 26.4207 | Longitude: 50.0888 | FraudType: FrequentGeneration | State: Suspicious | UsedWithinValidity: 1 | TokenDurationMinutes: 15 | RiskLabel: High


In [9]:
# ============================================
# Cell 6: Build embeddings with GATE-AraBert-v1 + KNN index
# ============================================

EMB_MODEL_ID = "Omartificial-Intelligence-Space/GATE-AraBert-v1"

print(f"⏳ Loading AraBert embeddings model on {device}...")
embedder = SentenceTransformer(EMB_MODEL_ID, device=device)
print("✅ AraBert SentenceTransformer loaded.")

# One text per retained record, in df_rag row order.
knowledge_base = df_rag["knowledge_text"].tolist()

print("⏳ Encoding knowledge base with AraBert...")
corpus_embeddings = embedder.encode(knowledge_base, show_progress_bar=True)
print("✅ Encoded vectors shape:", corpus_embeddings.shape)

# Build the KNN index used for semantic search (cosine distance).
# fit() returns the estimator itself, so construction and fitting are chained.
nn_indexer = NearestNeighbors(n_neighbors=5, metric="cosine").fit(corpus_embeddings)
print("✅ KNN search index ready.")


def search_shadow_id(query, k=3):
    """Return the records most similar to ``query``.

    The query is embedded with the module-level ``embedder`` and matched against
    the module-level ``nn_indexer`` (cosine distance); hits are looked up in
    ``knowledge_base``.

    Args:
        query: Free-text user question.
        k: Maximum number of records to return. Values larger than the
            knowledge base are clamped to its size; zero or negative values
            yield an empty list. ``None`` is passed through so the index falls
            back to the ``n_neighbors`` it was constructed with.

    Returns:
        A list of ``{"score": float, "text": str}`` dicts in the order returned
        by the index, where ``score`` is ``1 - cosine distance``.
    """
    if k is not None:
        # kneighbors raises ValueError when asked for more neighbours than there
        # are indexed rows, or for a non-positive count; clamp instead of failing.
        k = min(k, len(knowledge_base))
        if k <= 0:
            return []

    query_emb = embedder.encode([query])
    distances, indices = nn_indexer.kneighbors(query_emb, n_neighbors=k)

    # Single query, so only the first row of each result is relevant.
    return [
        {
            "score": float(1 - dist),  # distance -> similarity
            "text": knowledge_base[int(idx)],
        }
        for dist, idx in zip(distances[0], indices[0])
    ]


# Quick smoke test of the retriever: print the score and the first 120
# characters of each hit.
print("✨ Test search result example:")
_test = search_shadow_id("suspicious login attempts", k=2)
for r in _test:
    print("[{:.2f}] {}...".format(r["score"], r["text"][:120]))

⏳ Loading AraBert embeddings model on cuda...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

✅ AraBert SentenceTransformer loaded.
⏳ Encoding knowledge base with AraBert...


Batches:   0%|          | 0/43 [00:00<?, ?it/s]

✅ Encoded vectors shape: (1366, 768)
✅ KNN search index ready.
✨ Test search result example:
[0.59] PersonType: Resident | PersonTypeCode: 2 | Nationality: Pakistani | Location: Hail | Latitude: 27.5114 | Longitude: 41.7...
[0.59] PersonType: Resident | PersonTypeCode: 2 | Nationality: Pakistani | Location: Tabuk | Latitude: 28.3833 | Longitude: 36....


In [10]:
# Cell 7: Load Qwen2.5-1.5B-Instruct (Smart & Fast Arabic LLM)
# ============================================

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

LLM_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

print(f"⏳ Loading {LLM_MODEL_ID}...")

# Load the tokenizer and the causal-LM weights.
# NOTE(review): the recorded run warns that `torch_dtype` is deprecated in favour
# of `dtype`; switch once the minimum supported transformers version is
# confirmed to accept `dtype`.
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID,
    device_map="auto",  # automatic placement (uses the GPU when available)
    torch_dtype="auto",
)

# Text-generation pipeline shared by the rest of the notebook.
llm_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,      # enough room for the summary and recommendations
    temperature=0.7,         # balanced
    repetition_penalty=1.1,  # discourage repeated phrasing
)

print("✅ Qwen2.5-1.5B LLM Loaded Successfully.")

⏳ Loading Qwen/Qwen2.5-1.5B-Instruct...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cuda:0


✅ Qwen2.5-1.5B LLM Loaded Successfully.


In [11]:
# Cell 8: Prompt template + Qwen Generation
# ============================================

def generate_response(user_query, retrieved_logs):
    """Build a chat prompt from the question and retrieved records, then generate a report.

    Args:
        user_query: The user's question; inserted verbatim into the user message.
        retrieved_logs: Iterable of dicts with a "text" key (the shape returned
            by search_shadow_id); each text becomes one "- " bullet of context.

    Returns:
        ``(prompt, response_only)``: the prompt string rendered by the
        tokenizer's chat template, and the model's completion with surrounding
        whitespace stripped.
    """
    # 1. Retrieved records as a bullet list.
    context_str = "\n".join(f"- {log['text']}" for log in retrieved_logs)

    # 2. Chat-format messages. The system message (Arabic) casts the model as a
    #    cybersecurity expert for the ShadowID system at the Ministry of Interior
    #    and asks for a brief Arabic report structured as: 1. status summary,
    #    2. risk analysis, 3. recommendations.
    messages = [
        {
            "role": "system",
            "content": (
                "أنت خبير أمن سيبراني لنظام ShadowID في وزارة الداخلية. "
                "مهمتك تحليل السجلات وكتابة تقرير موجز بالعربية. "
                "التزم بالهيكل: 1. ملخص الحالة 2. تحليل المخاطر 3. التوصيات."
            )
        },
        {
            "role": "user",
            "content": f"السؤال: {user_query}\n\nالسجلات المسترجعة:\n{context_str}\n\nأعطني التقرير الأمني:"
        }
    ]

    # 3. Render the messages with the tokenizer's own chat template.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # 4. Generate. return_full_text=False makes the pipeline return only the
    #    completion, so the answer does not have to be cut out of prompt+answer
    #    by searching for a chat-template-specific marker string.
    outputs = llm_pipe(prompt, return_full_text=False)
    response_only = outputs[0]["generated_text"].strip()

    return prompt, response_only

In [12]:
# Cell 9: End-to-End Test
# ============================================

def run_shadow_id_rag(query, k=4):
    """Run retrieval and generation for one question, print a transcript, return the answer.

    Args:
        query: The user's question.
        k: Number of records to request from search_shadow_id.

    Returns:
        The response text produced by generate_response.
    """
    divider = "-" * 60
    print(f"🔎 User Query: {query}")
    print(divider)

    # Step 1: retrieve the records related to the question.
    retrieved_logs = search_shadow_id(query, k=k)
    print(f"✅ Retrieved {len(retrieved_logs)} relevant records.\n")

    # Step 2: generate the report; the rendered prompt is not needed here.
    _, response = generate_response(query, retrieved_logs)

    print("🤖 Arabic Security Summary & Recommendations:\n")
    print(response)
    print(f"\n{divider}")

    return response



In [13]:
# Demo run: ask (in Arabic) for an analysis of the High Risk identities plus
# urgent recommendations, using 4 retrieved records as context.
result = run_shadow_id_rag(
    "حلل لي الهويات ذات مستوى High Risk وأعطني توصيات عاجلة.",
    k=4
)

🔎 User Query: حلل لي الهويات ذات مستوى High Risk وأعطني توصيات عاجلة.
------------------------------------------------------------
✅ Retrieved 4 relevant records.

🤖 Arabic Security Summary & Recommendations:

ملخص الحالة:

هناك سبعة هويات تحتوي على مستوى "High Risk" وهي:

1. رقم 1 (Tabuk)
2. رقم 2 (Tabuk)
3. رقم 3 (Tabuk)
4. رقم 4 (Jeddah)

هذه الهمويات تتضمن العديد من المخاطر، بما في ذلك:

- DeviceHopping: استخدام الجهاز أو الهاتف الذي تم استخدامه سابقًا.
- ImpossibleTravel: إمكانية أن تكون هناك مشاكل معقدة أو غير متوقعة في التحويلات.
- FraudType: نوع الخداع أو الفساد الذي يتم ا detection بواسطة ShadowID.

تحليل المخاطر:

- DeviceHopping: يمكن أن يكون هذا خطراً كبيراً حيث قد يتسبب في تسريب البيانات الشخصية أو الوصول إلى الأموال.
- ImpossibleTravel: قد يؤدي إلى تأخير في عمليات الدفع أو تعطيل الخدمة بشكل عام.
- FraudType: إذا كانت هذه الهمويات تتطلب دفع مبالغ كبيرة، فقد يشكل ذلك خطراً أكبر على النظام.

توصيات عاجلة:

1. إعادة تعيين الاسم والرمز لكل هوية حسب الترتيب الحالي.
2. إجراء اخت