In [None]:
# -*- coding: utf-8 -*-
"""
Cybersecurity RAG System with Gemini 2.0 + Gradio Custom UI
"""
!pip install PyPDF2
!pip install faiss-cpu

# Required Libraries
import PyPDF2
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import gradio as gr
import os
import requests
import json

# Configuration
PDF_PATH = 'Merged_Cybersecurity_Documents.pdf'
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# Prefer the GEMINI_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the placeholder remains the fallback value.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "your_gemini_api_key")
GEMINI_MODEL_NAME = "gemini-2.0-flash"

# Global Variables (filled in by initialize_rag_system)
text_chunks = []    # chunk strings; retrieve_chunks looks them up by FAISS row id
faiss_index = None  # FAISS index over the chunk embeddings; None until built
tokenizer = None    # Hugging Face tokenizer of the embedding model
model = None        # Hugging Face embedding model

# PDF Text Extraction
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined with newlines. If reading fails, the error is
        printed and whatever was extracted before the failure (possibly an
        empty string) is returned.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # Fall back to "" in case a page yields None instead of text.
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error reading PDF: {e}")
    # The newline separator keeps the last word of one page from fusing with
    # the first word of the next page when the text is later split into words.
    return "\n".join(page_texts)

# Chunking
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            0 <= overlap < chunk_size.

    Returns:
        A list of chunk strings (empty when the text contains no words).

    Raises:
        ValueError: If chunk_size or overlap is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the last word; a further window would only
        # repeat words that this chunk already contains.
        if start + chunk_size >= len(words):
            break
    return chunks

# Embeddings
def initialize_embedding_model(model_name):
    """Load the tokenizer and model for the given Hugging Face checkpoint.

    Both objects are stored in the module-level ``tokenizer`` and ``model``
    globals and are also returned as a ``(tokenizer, model)`` pair.
    """
    global tokenizer, model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    loaded_pair = (tokenizer, model)
    return loaded_pair

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked tokens.

    Args:
        model_output: Model output whose first element holds the token
            embeddings.
        attention_mask: Mask with non-zero entries for tokens to include.

    Returns:
        One pooled vector per sequence.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight tokens.
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * weights).sum(1)
    # The clamp avoids a division by zero for a fully masked sequence.
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

def get_embeddings(texts, batch_size=32):
    """Embed texts into L2-normalised vectors with the global embedding model.

    The texts are processed in batches so that a large corpus is not pushed
    through the model in a single forward pass.

    Args:
        texts: A string or a non-empty sequence of strings. Inputs are
            tokenized with truncation enabled, so over-long texts are cut.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A NumPy array with one embedding row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    # A bare string is a single text, not a sequence of characters.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        raise ValueError("get_embeddings requires at least one text")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    batches = []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        with torch.no_grad():
            model_out = model(**encoded)
        pooled = mean_pooling(model_out, encoded['attention_mask'])
        batches.append(torch.nn.functional.normalize(pooled, p=2, dim=1).cpu())
    return torch.cat(batches, dim=0).numpy()

# FAISS Vector Store
def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index and add all embedding rows to it.

    Args:
        embeddings: Array of vectors; its second dimension is used as the
            index dimensionality.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

def retrieve_chunks(query_embedding, top_k=3):
    """Return the text chunks nearest to a query embedding.

    Args:
        query_embedding: Embedding of the query; reshaped to a single row
            before searching the global FAISS index.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks from the global ``text_chunks`` list, in the
        order returned by the index. Empty when ``top_k`` is not positive or
        no chunks are loaded.
    """
    if top_k <= 0 or not text_chunks:
        return []
    query_embedding = query_embedding.reshape(1, -1)
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would silently index the LAST chunk.
    k = min(top_k, len(text_chunks))
    _, indices = faiss_index.search(query_embedding, k)
    return [text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks)]

# Gemini LLM API
def call_gemini_llm(prompt):
    """Send a prompt to the Gemini ``generateContent`` endpoint.

    Args:
        prompt: The full prompt text to send as a single user message.

    Returns:
        The generated text, or a string starting with "Gemini API Error:"
        when the request fails or the response contains no text. Errors are
        returned rather than raised because the result is shown in the UI.
    """
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL_NAME}:generateContent"
    headers = {
        "Content-Type": "application/json",
        # Send the key as a header rather than in the URL: HTTP error messages
        # include the URL and are returned to the (publicly shared) UI.
        "x-goog-api-key": GEMINI_API_KEY,
    }
    payload = {
        "contents": [{"role": "user", "parts": [{"text": prompt}]}],
        "generationConfig": {
            "temperature": 0.7,
            "maxOutputTokens": 500
        }
    }
    try:
        # Without a timeout a stalled connection would block the UI forever.
        res = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
        res.raise_for_status()
        data = res.json()
        candidates = data.get('candidates') or []
        if not candidates:
            # A response can come back without any candidates; report the
            # prompt feedback (if present) instead of a bare KeyError message.
            feedback = data.get('promptFeedback', 'no feedback provided')
            return f"Gemini API Error: no answer was returned ({feedback})"
        parts = candidates[0].get('content', {}).get('parts') or []
        text = "".join(part.get('text', '') for part in parts)
        if not text:
            reason = candidates[0].get('finishReason', 'unknown')
            return f"Gemini API Error: empty answer (finish reason: {reason})"
        return text
    except Exception as e:
        return f"Gemini API Error: {e}"

# RAG Answer Logic
def rag_answer_question(user_question):
    """Answer a question using retrieved PDF chunks as context for Gemini.

    Args:
        user_question: The question typed by the user.

    Returns:
        The Gemini answer, or a short notice when the question is blank or
        no chunks were retrieved.
    """
    if user_question.strip() == "":
        return "Please enter a question."

    supporting_chunks = retrieve_chunks(get_embeddings([user_question]))
    if len(supporting_chunks) == 0:
        return "No relevant information found."

    context = "\n\n".join(supporting_chunks)
    return call_gemini_llm(f"""
You are a cybersecurity expert.
Use the context to answer the question clearly and accurately.

Context:
{context}

Question: {user_question}
Answer:
""")

# Initialization
def initialize_rag_system():
    """Load the PDF, chunk it, embed the chunks and build the FAISS index.

    Results are stored in the module-level ``text_chunks``, ``tokenizer``,
    ``model`` and ``faiss_index`` globals. When no text can be extracted from
    ``PDF_PATH`` an error is printed and ``faiss_index`` is left as ``None``,
    which ``create_gradio_interface`` reports instead of launching.
    """
    global text_chunks, faiss_index, tokenizer, model
    print("Initializing RAG system...")
    text = extract_text_from_pdf(PDF_PATH)
    text_chunks = chunk_text(text)
    if not text_chunks:
        # Embedding an empty chunk list would fail inside the tokenizer with
        # an unhelpful error, so stop here with a clear message.
        faiss_index = None
        print(f"Error: no text could be extracted from {PDF_PATH}; FAISS index not built.")
        return
    tokenizer, model = initialize_embedding_model(EMBEDDING_MODEL_NAME)
    embeds = get_embeddings(text_chunks)
    faiss_index = build_faiss_index(embeds)
    print("Initialization complete.")

# Gradio Interface with Custom Theme
def create_gradio_interface():
    """Build and launch the Gradio UI for the cybersecurity RAG assistant.

    Prints an error and returns without launching when the FAISS index has
    not been built. Otherwise the app is launched with ``share=True``.
    """
    if faiss_index is None:
        print("Error: FAISS index not built.")
        return

    # text_size, radius_size and font are constructor arguments of a Gradio
    # theme. Passing them to .set() raises
    # "TypeError: Base.set() got an unexpected keyword argument 'text_size'".
    custom_theme = gr.themes.Base(
        primary_hue="green",
        secondary_hue="cyan",
        neutral_hue="slate",
        text_size="lg",
        radius_size="lg",
        font=["Roboto", "ui-sans-serif"],
    )

    with gr.Blocks(theme=custom_theme, css="""
        .gradio-container {
            background: linear-gradient(to right, #0f2027, #203a43, #2c5364);
            color: white;
        }
        h1, h2 {
            text-align: center;
            color: #00ffc8;
        }
        .input textarea {
            font-size: 1.1rem;
        }
        .output {
            font-weight: bold;
            font-size: 1.05rem;
        }
    """) as demo:

        gr.Markdown("# 🔐 Cybersecurity RAG Assistant")
        gr.Markdown("Ask questions from the **Merged Cybersecurity PDF** using RAG + Gemini AI 💡")

        with gr.Row():
            with gr.Column():
                question_input = gr.Textbox(
                    lines=3,
                    placeholder="Ask about APTs, Zero Trust, ransomware...",
                    label="🧠 Your Question",
                    # Attach the class targeted by the ".input textarea" rule.
                    elem_classes=["input"]
                )
                ask_btn = gr.Button("🔍 Get Answer")

            with gr.Column():
                answer_output = gr.Textbox(
                    lines=10,
                    label="📘 Gemini Answer",
                    interactive=False,
                    # Attach the class targeted by the ".output" rule.
                    elem_classes=["output"]
                )

        ask_btn.click(fn=rag_answer_question, inputs=question_input, outputs=answer_output)

        gr.Examples(
            examples=[
                ["What are Advanced Persistent Threats (APTs)?"],
                ["How do ransomware attacks work and what are their defenses?"],
                ["What is Zero Trust Architecture?"],
                ["What are the future trends in cybersecurity regarding AI?"],
                ["Tell me about supply chain attacks and how to defend against them."]
            ],
            inputs=question_input
        )

        gr.Markdown("### 🚀 Powered by Gemini 2.0 + Sentence Transformers + FAISS Vector DB")

    demo.launch(share=True)

# Main Runner
if __name__ == "__main__":
    # Build the index first: create_gradio_interface() refuses to launch
    # while faiss_index is still None.
    initialize_rag_system()
    create_gradio_interface()


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Initializing RAG system...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Initialization complete.


TypeError: Base.set() got an unexpected keyword argument 'text_size'

In [None]:
# -*- coding: utf-8 -*-
"""
Cybersecurity RAG System with Gemini 2.0 + Gradio Custom UI
"""

# --- INSTALL LIBRARIES (ONLY FOR COLAB USERS) ---
# Uncomment the pip line below when running in Colab; leave it commented out when running locally
# !pip install PyPDF2 faiss-cpu transformers torch gradio

# --- REQUIRED LIBRARIES ---
import PyPDF2
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import gradio as gr
import os
import requests
import json

# --- CONFIGURATION ---
PDF_FILES = {
    "PDF 1": "CYBER SECURITY (R18A0521).pdf",
    "PDF 2": "Introduction to Cybersecurity.pdf",
    "PDF 3": "cybersecuirty_sb_factsheets_all.pdf"
}
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Prefer the GEMINI_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the placeholder remains the fallback value.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "your_gemini_api_key")
GEMINI_MODEL_NAME = "gemini-2.0-flash"

# --- GLOBALS ---
vector_indexes = {}     # PDF name → FAISS index
text_chunks_map = {}    # PDF name → List of text chunks
tokenizer = None        # Hugging Face tokenizer, set by initialize_embedding_model
model = None            # Hugging Face model, set by initialize_embedding_model

# --- LOAD & CHUNK PDF ---
def extract_text_from_pdf(path):
    """Extract the text of every page of a PDF as one string.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The page texts joined with newlines. On failure no exception is
        raised; instead a message of the form "Error reading <path>: <error>"
        is returned, so callers must check for that prefix.
    """
    try:
        with open(path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # The newline keeps words from fusing across page boundaries, and
            # `or ""` covers a page that yields None instead of text.
            return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        return f"Error reading {path}: {e}"

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            0 <= overlap < chunk_size.

    Returns:
        A list of chunk strings (empty when the text contains no words).

    Raises:
        ValueError: If chunk_size or overlap is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    total = len(words)
    stride = chunk_size - overlap
    chunks = []
    start = 0
    while start < total:
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= total:
            # This chunk already reaches the last word; another window would
            # only duplicate its tail.
            break
        start += stride
    return chunks

# --- EMBEDDING ---
def initialize_embedding_model():
    """Load the tokenizer and model named by EMBEDDING_MODEL_NAME.

    Both objects are stored in the module-level ``tokenizer`` and ``model``
    globals; nothing is returned.
    """
    global tokenizer, model
    checkpoint = EMBEDDING_MODEL_NAME
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked tokens.

    Args:
        model_output: Model output whose first element holds the token
            embeddings.
        attention_mask: Mask with non-zero entries for tokens to include.

    Returns:
        One pooled vector per sequence.
    """
    token_embeddings = model_output[0]
    # Expand the mask across the embedding dimension so it weights each token.
    expanded_mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * expanded_mask).sum(1)
    # The clamp prevents a division by zero for a fully masked sequence.
    counts = expanded_mask.sum(1).clamp(min=1e-9)
    return summed / counts

def get_embeddings(texts, batch_size=32):
    """Embed texts into L2-normalised vectors with the global embedding model.

    The texts are processed in batches so that a large document is not pushed
    through the model in a single forward pass.

    Args:
        texts: A string or a non-empty sequence of strings. Inputs are
            tokenized with truncation enabled, so over-long texts are cut.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A NumPy array with one embedding row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    # A bare string is a single text, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        raise ValueError("get_embeddings requires at least one text")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    embedded_batches = []
    for offset in range(0, len(texts), batch_size):
        batch = texts[offset:offset + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_out = model(**encoded)
        pooled = mean_pooling(model_out, encoded['attention_mask'])
        embedded_batches.append(torch.nn.functional.normalize(pooled, p=2, dim=1).cpu())
    return torch.cat(embedded_batches, dim=0).numpy()

# --- FAISS INDEX ---
def build_index(chunks):
    """Embed the chunks and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Text chunks to embed with ``get_embeddings``.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    vectors = get_embeddings(chunks)
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index

def retrieve_chunks(index, chunks, query_embed, top_k=3):
    """Return the chunks nearest to a query embedding.

    Args:
        index: Object with a FAISS-style ``search(queries, k)`` method.
        chunks: Chunk strings looked up by the ids the index returns.
        query_embed: Embedding of the query; reshaped to a single row.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order returned by the index. Empty when
        ``top_k`` is not positive or ``chunks`` is empty.
    """
    if top_k <= 0 or not chunks:
        return []
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would silently index the LAST chunk.
    k = min(top_k, len(chunks))
    _, ids = index.search(query_embed.reshape(1, -1), k)
    return [chunks[i] for i in ids[0] if 0 <= i < len(chunks)]

# --- GEMINI CALL ---
def call_gemini(prompt):
    """Send a prompt to the Gemini ``generateContent`` endpoint.

    Args:
        prompt: The full prompt text to send as a single user message.

    Returns:
        The generated text, or a string starting with "⚠️ Gemini API error:"
        when the request fails or the response contains no text. Errors are
        returned rather than raised because the result is shown in the UI.
    """
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL_NAME}:generateContent"
    headers = {
        "Content-Type": "application/json",
        # Send the key as a header rather than in the URL: HTTP error messages
        # include the URL and are returned to the (publicly shared) UI.
        "x-goog-api-key": GEMINI_API_KEY,
    }
    payload = {
        "contents": [{"role": "user", "parts": [{"text": prompt}]}],
        "generationConfig": {
            "temperature": 0.7,
            "maxOutputTokens": 500
        }
    }
    try:
        # Without a timeout a stalled connection would block the UI forever.
        res = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
        res.raise_for_status()
        data = res.json()
        candidates = data.get('candidates') or []
        if not candidates:
            # A response can come back without any candidates; report the
            # prompt feedback (if present) instead of a bare KeyError message.
            feedback = data.get('promptFeedback', 'no feedback provided')
            return f"⚠️ Gemini API error: no answer was returned ({feedback})"
        parts = candidates[0].get('content', {}).get('parts') or []
        text = "".join(part.get('text', '') for part in parts)
        if not text:
            reason = candidates[0].get('finishReason', 'unknown')
            return f"⚠️ Gemini API error: empty answer (finish reason: {reason})"
        return text
    except Exception as e:
        return f"⚠️ Gemini API error: {e}"

# --- ANSWERING LOGIC ---
def rag_answer_question(user_question):
    """Answer a question separately against every indexed PDF.

    Args:
        user_question: The question typed by the user.

    Returns:
        One labelled section per indexed PDF, concatenated into a single
        string, or a notice when the question is blank.
    """
    if user_question.strip() == "":
        return "❗ Please enter a valid question."

    question_vector = get_embeddings([user_question])
    sections = []

    for pdf_label, pdf_index in vector_indexes.items():
        hits = retrieve_chunks(pdf_index, text_chunks_map[pdf_label], question_vector)

        if hits:
            context = "\n\n".join(hits)
            response = call_gemini(f"""
You are a cybersecurity expert.
Use the below context to answer the user question. If answer isn't available, clearly say so.

Context:
{context}

Question: {user_question}
Answer:
""")
            sections.append(f"📄 **{pdf_label}**:\n{response.strip()}\n\n---\n\n")
        else:
            sections.append(f"📄 **{pdf_label}**: No relevant info found.\n\n")

    return "".join(sections)

# --- INIT SYSTEM ---
def initialize_all_pdfs():
    """Load the embedding model and build one FAISS index per configured PDF.

    For every entry of ``PDF_FILES`` the text is extracted, chunked, embedded
    and stored in ``text_chunks_map`` / ``vector_indexes`` under its label.
    PDFs that cannot be read or that contain no text are reported and
    skipped, so they are never indexed or queried.
    """
    global text_chunks_map, vector_indexes
    initialize_embedding_model()
    for label, file in PDF_FILES.items():
        print(f"🔄 Processing {file}...")
        text = extract_text_from_pdf(file)
        # extract_text_from_pdf reports failures as an "Error reading <path>:"
        # string; indexing that message would feed it to Gemini as context.
        if text.startswith(f"Error reading {file}:"):
            print(f"⚠️ Skipping {label}: {text}")
            continue
        chunks = chunk_text(text)
        if not chunks:
            # Nothing to embed; building an index from no chunks would fail.
            print(f"⚠️ Skipping {label}: no text could be extracted from {file}.")
            continue
        text_chunks_map[label] = chunks
        vector_indexes[label] = build_index(chunks)
    print("✅ Initialization complete.")

# --- GRADIO UI ---
def create_gradio_ui():
    """Build and launch the Gradio UI for the multi-PDF RAG assistant.

    The app has a question box, a button that runs ``rag_answer_question``
    and a text box for the per-PDF answers. It is launched with
    ``share=True``.
    """
    theme = gr.themes.Default()

    with gr.Blocks(theme=theme, css="""
        .gradio-container {background-color: #1a1a2e; color: white;}
        textarea, input, button {font-size: 16px !important;}
        .output {font-weight: bold;}
    """) as app:
        gr.Markdown("## 🔐 Cybersecurity Q&A from Multiple PDFs (Gemini RAG)")
        # The PDFs are configured in PDF_FILES; this UI has no upload
        # component, so the text must not ask the user to upload anything.
        gr.Markdown("Ask your cybersecurity questions. Each indexed PDF is queried separately and answers are shown per document.")

        with gr.Row():
            question = gr.Textbox(lines=2, placeholder="Ask your question here", label="💬 Your Question")
            btn = gr.Button("🔍 Get Answers")

        # elem_classes attaches the class targeted by the ".output" CSS rule.
        answer = gr.Textbox(lines=20, label="📘 Gemini Answer (PDF-wise)", elem_classes=["output"])

        btn.click(fn=rag_answer_question, inputs=question, outputs=answer)

        gr.Markdown("#### ⚡ Powered by Gemini 2.0 + FAISS + Sentence Transformers")

    app.launch(share=True)

# --- MAIN ---
if __name__ == "__main__":
    # Load the embedding model and index every configured PDF before the UI
    # starts answering questions.
    initialize_all_pdfs()
    create_gradio_ui()


🔄 Processing CYBER SECURITY (R18A0521).pdf...
🔄 Processing Introduction to Cybersecurity.pdf...
🔄 Processing cybersecuirty_sb_factsheets_all.pdf...
✅ Initialization complete.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9fff911d7141137f35.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
