In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
#create base directory
BASE_DIR = "/content/drive/MyDrive/rag_arxiv_project"
PDF_DIR = os.path.join(BASE_DIR, "data/pdfs")

os.makedirs(PDF_DIR, exist_ok=True)
print("Project directory ready:", PDF_DIR)


Project directory ready: /content/drive/MyDrive/rag_arxiv_project/data/pdfs


In [None]:
# install require directory
!pip install arxiv


Collecting arxiv
  Downloading arxiv-2.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.4.0-py3-none-any.whl (12 kB)
Downloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=496bae6ae4dfae8bc8d2e6552ef13149e19540a553361fc1f32565c3feeb4999
  Stored in directory: /root/.cache/pip/wheels/03/f5/1a/23761066dac1d0e8e683e5fdb27e12de53209d05a4a37e6246
Successfully built sgmllib3k
Installing collected packag

In [64]:
#download arxiv paper automatic
import arxiv
import os

search_queries = [
    "Sybil attack VANET",
    "VANET security",
    "Vehicular Ad Hoc Network Sybil detection"
]

max_results_per_query = 10 # keep small for now

client = arxiv.Client()

downloaded = 0

for query in search_queries:
    search = arxiv.Search(
        query=query,
        max_results=max_results_per_query,
        sort_by=arxiv.SortCriterion.Relevance
    )

    for result in client.results(search):
        title = result.title.replace(" ", "_").replace("/", "")
        pdf_path = os.path.join(PDF_DIR, f"{title}.pdf")

        if not os.path.exists(pdf_path):
            try:
                result.download_pdf(dirpath=PDF_DIR, filename=f"{title}.pdf")
                downloaded += 1
                print(f"Downloaded: {title}")
            except arxiv.HTTPError as e:
                print(f"Error downloading {title}: {e}")

print(f"\nTotal PDFs downloaded: {downloaded}")



Total PDFs downloaded: 0


In [65]:
#Goal of Step 2

#Extract clean text from PDFs

#Remove noise (references, page numbers)

#Split into meaningful chunks (not random)


# Step 2.1 — Install PDF & Text Tools
!pip install pymupdf langchain



In [66]:
# Step 2.2 — PDF → Text Extraction

import fitz  # PyMuPDF
import os

TEXT_DIR = os.path.join(BASE_DIR, "data/texts")
os.makedirs(TEXT_DIR, exist_ok=True)

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


for pdf_file in os.listdir(PDF_DIR):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(PDF_DIR, pdf_file)
        text = extract_text_from_pdf(pdf_path)

        txt_path = os.path.join(TEXT_DIR, pdf_file.replace(".pdf", ".txt"))
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)

print("✅ Text extraction completed.")



✅ Text extraction completed.


In [67]:
# Step 2.3 — Clean the Text (Very Important)

# We remove:

# References section

# Extra newlines

# Page artifacts

import re

def clean_text(text):
    # Remove references section
    text = re.split(r"\nreferences\n|\nReferences\n", text)[0]

    # Remove multiple newlines
    text = re.sub(r"\n{2,}", "\n", text)

    # Remove page numbers
    text = re.sub(r"\n\d+\n", "\n", text)

    return text.strip()


for txt_file in os.listdir(TEXT_DIR):
    txt_path = os.path.join(TEXT_DIR, txt_file)

    with open(txt_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    cleaned = clean_text(raw_text)

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

print("✅ Text cleaning done.")


✅ Text cleaning done.


In [68]:
# Step 2.4 — Smart Chunking (RAG Secret Sauce)

# We’ll use overlapping chunks (this improves recall a LOT).
!pip install langchain_text_splitters

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150
)

documents = []

for txt_file in os.listdir(TEXT_DIR):
    path = os.path.join(TEXT_DIR, txt_file)
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    chunks = text_splitter.split_text(text)

    for chunk in chunks:
        documents.append({
            "text": chunk,
            "source": txt_file
        })

print(f"✅ Total chunks created: {len(documents)}")


✅ Total chunks created: 2938


In [69]:
# Save Chunks for Next Step

import pickle

CHUNK_PATH = os.path.join(BASE_DIR, "data/chunks.pkl")

with open(CHUNK_PATH, "wb") as f:
    pickle.dump(documents, f)

print("✅ Chunks saved for embedding step.")


✅ Chunks saved for embedding step.


In [70]:
# STEP 3 — Embeddings + FAISS Vector Database (Core of RAG)

# This step turns your text chunks into a searchable brain.
# After this, you’ll be able to semantically retrieve relevant paper sections — not keyword search.
# Goal of Step 3

# Load saved chunks

# Generate embeddings

# Build & save a FAISS index

# Test semantic retrieval

# All Colab-safe and job-ready.


# Step 3.1 — Install Embedding & FAISS Libraries
!pip install sentence-transformers faiss-cpu




In [71]:
# Step 3.2 — Load Your Chunks

import pickle
import os

CHUNK_PATH = os.path.join(BASE_DIR, "data/chunks.pkl")

with open(CHUNK_PATH, "rb") as f:
    documents = pickle.load(f)

texts = [doc["text"] for doc in documents]
metadata = [doc["source"] for doc in documents]

print("Total chunks loaded:", len(texts))


Total chunks loaded: 2938


In [72]:
# Step 3.3 — Create Embeddings

# We’ll use a strong but lightweight model:

# all-MiniLM-L6-v2 (industry favorite)

from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = embed_model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True
)

print("Embedding shape:", embeddings.shape)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Embedding shape: (2938, 384)


In [73]:
# Step 3.4 — Build FAISS Index
import faiss
import numpy as np

dimension = embeddings.shape[1]

index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("FAISS index size:", index.ntotal)


FAISS index size: 2938


In [74]:
# Step 3.5 — Save FAISS Index (IMPORTANT for Colab)
FAISS_DIR = os.path.join(BASE_DIR, "faiss_index")
os.makedirs(FAISS_DIR, exist_ok=True)

faiss.write_index(index, os.path.join(FAISS_DIR, "index.faiss"))

with open(os.path.join(FAISS_DIR, "metadata.pkl"), "wb") as f:
    pickle.dump(metadata, f)

print("✅ FAISS index saved to Drive.")


✅ FAISS index saved to Drive.


In [75]:
# Step 3.6 — Test Semantic Retrieval

def search(query, top_k=5):
    query_embedding = embed_model.encode([query])
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for i in indices[0]:
        results.append({
            "text": texts[i][:500],
            "source": metadata[i]
        })
    return results


query = "limitations of Sybil attack detection in VANETs"
results = search(query)

for r in results:
    print("\nSOURCE:", r["source"])
    print(r["text"])




SOURCE: Efficient_Detection_of_Sybil_Attack_Based_on_Cryptography_in_Vanet.txt
additional vehicles on the road. Consequence of this attack is that every type of attack can be 
played after spoofing the positions or identities of other nodes in the network. 
3. DETECTION OF SYBIL ATTACK 
In literature, different techniques are proposed for detection of Sybil attack in VANETs. Sybil 
attacks are always possible in the absence of any logical centralized authority. As there is no 
centralized entity in VANETs, detection of Sybil attacks is very difficult. Some constraints such

SOURCE: Detecting_Sybil_Attacks_in_Vehicular_Ad_Hoc_Networks.txt
Detecting Sybil Attacks in Vehicular Ad Hoc Networks 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Salam Hamdan 
Computer Science Department 
Princess Sumaya University for technology 
Amman, Jordan 
S.hamdan@psut.edu.jo 
 
 
 
 
 
 
 
 
 
 
 
 
 
Amjad Hudaib 
Computer Science Department 
University of Jodan 
Amman, Jordan 
AHUDAIB@JU.EDU.JO 
 
 
 
 
 
 
 
 
 


In [76]:
# STEP 4 — Add the “G”: RAG Question Answering with LLM

# Now we combine:

# ✔ FAISS retrieval

# ✔ Prompt engineering

# ✔ LLM reasoning

# and turn raw chunks → clean, grounded answer.

# Step 4.1 — Install LLM Libraries
!pip install transformers accelerate torch




In [77]:
# Step 4.2 — Load an Instruction-Tuned Model

# We’ll use:

# google/flan-t5-base
# ✔ Fast
# ✔ Instruction-following

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



In [78]:
# Step 4.3 — Build RAG Prompt (VERY IMPORTANT)

def build_prompt(context_chunks, question):
    context = "\n\n".join(context_chunks)
    prompt = f"""
You are a research assistant.
Answer the question ONLY using the provided context.
If the answer is not in the context, say "Not found in the provided documents".

Context:
{context}

Question:
{question}

Answer:
"""
    return prompt


In [79]:
# Step 4.4 — RAG QA Function

import torch

def rag_qa(question, top_k=5):
    retrieved = search(question, top_k)
    context_chunks = [r["text"] for r in retrieved]

    prompt = build_prompt(context_chunks, question)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    )

    outputs = model.generate(
        **inputs,
        max_new_tokens=200
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [80]:
# Step 4.5 — Test FULL RAG

query = "What are the limitations of Sybil attack detection in VANETs?"
print(rag_qa(query))


There is no centralized entity in VANETs, detection of Sybil attacks is very difficult.


In [81]:
print(rag_qa("what is sybil attack in VANET?"))

Sybil node forges several identities such as pretending to be a police car, stealing vehicles identities or creating new identities


In [86]:
print(rag_qa("give me  research paper name on sybil attack detection?"))

Sybil Limit: A near optimal social network defense against Sybil attacks. In IEEE Symposium on Security and Privacy, 2008. [2]


In [92]:
import shutil

shutil.make_archive("/content/rag_arxiv_project", 'zip', "/content/drive/MyDrive/rag_arxiv_project")


'/content/rag_arxiv_project.zip'