In [None]:
# Environment setup cell: Python packages, system binaries, and spaCy models.
# Python packages for paper search (arxiv), the LangChain/Gemini stack, the
# Chroma vector store, and PDF rasterisation + OCR (pdf2image, pytesseract).
!pip install arxiv langchain langchain-google-genai langchain-community spacy nltk chromadb pdf2image pytesseract pillow
# NOTE(review): spacy is listed on both this line and the one above.
!pip install sentence-transformers networkx scikit-learn matplotlib spacy
# NOTE(review): none of these fine-tuning packages are imported by the analyzer
# cell visible in this notebook — confirm they are still needed.
!pip install torch transformers peft datasets bitsandbytes trl
# System binaries: poppler-utils (used by pdf2image) and tesseract-ocr (used by pytesseract).
!apt-get update
!apt-get install -y poppler-utils tesseract-ocr
# spaCy English pipelines (small and medium).
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

Collecting arxiv
  Downloading arxiv-2.2.0-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.3-py3-none-any.whl.metadata (4.7 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.22-py3-none-any.whl.metadata (2.4 kB)
Collecting chromadb
  Downloading chromadb-1.0.6-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.16 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.17-py3-none-any.whl.metadata

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:

# NOTE(review): these packages and en_core_web_md are also installed by the
# first setup cell of this notebook; this cell repeats that work.
!pip install spacy sentence-transformers scikit-learn networkx matplotlib
!python -m spacy download en_core_web_md
import nltk
# Download the NLTK data packages 'punkt', 'stopwords' and 'wordnet'.
# NOTE(review): presumably consumed by enhanced_nlp.py — confirm against that module.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import os
import arxiv
import json
import asyncio
import time
import sys
from uuid import uuid4
from typing import List, Dict, Optional
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
try:
    from enhanced_nlp import clean_text, get_pos_tags, extract_named_entities
    from prompt_engineering import generate_agent_prompt, generate_summary_prompt
except ImportError as e:
    print(f"Error importing enhanced_nlp or prompt_engineering: {e}")
    print("Ensure enhanced_nlp.py and prompt_engineering.py are in the same directory.")
    sys.exit(1)
from pdf2image import convert_from_path
import pytesseract
from google.colab import files
import tempfile

sys.stdout.flush()
# Provide the key through the environment before launching, e.g.
#   export GOOGLE_API_KEY="your_api_key_here"
# When the variable is absent a placeholder string is stored instead, so later
# os.getenv("GOOGLE_API_KEY") lookups always return a string.
os.environ.setdefault("GOOGLE_API_KEY", "YOUR_API_KEY_HERE")

class BaseAgent:
    """Base class for all analysis agents.

    An agent wraps an LLM and, when both a retriever and a focus area are
    supplied, a ``RetrievalQA`` chain whose prompt comes from
    ``generate_agent_prompt(focus_area)``.

    Attributes:
        llm: The language model handed to the chain.
        retriever: Retriever used by the chain, or ``None``.
        focus_area: Label inserted into the query and echoed in every result.
        prompt_template: The ``PromptTemplate`` in use, or ``None`` when no
            chain was built.
        qa_chain: The ``RetrievalQA`` chain, or ``None`` when no chain was built.
    """

    # Retry policy for quota ("429") failures. Kept as class attributes so a
    # subclass or a single instance can tune them without editing analyze().
    MAX_RETRIES = 3
    BASE_BACKOFF_SECONDS = 30

    def __init__(self, llm, retriever=None, focus_area: Optional[str] = None):
        """Store the collaborators and build the QA chain when possible.

        Args:
            llm: Language model passed to ``RetrievalQA.from_chain_type``.
            retriever: Optional retriever; without it no chain is created.
            focus_area: Optional focus label; without it no chain is created.
        """
        self.llm = llm
        self.retriever = retriever
        self.focus_area = focus_area
        # Always define both attributes so analyze() can detect an agent that
        # has no chain instead of tripping over a missing attribute.
        self.prompt_template = None
        self.qa_chain = None
        if retriever and focus_area:
            self.prompt_template = PromptTemplate(
                template=generate_agent_prompt(focus_area),
                input_variables=["context", "question"]
            )
            self.qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.retriever,
                chain_type_kwargs={"prompt": self.prompt_template}
            )

    def _failure(self, message: str) -> Dict:
        """Build the result dictionary used for every failed analysis."""
        return {
            "focus_area": self.focus_area,
            "analysis": "",
            "error": message
        }

    async def analyze(self, query: str) -> Dict:
        """Run the QA chain for ``query`` and return a result dictionary.

        The blocking ``qa_chain.invoke`` call runs in a worker thread. An
        exception whose text contains ``"429"`` is retried with exponential
        back-off (``BASE_BACKOFF_SECONDS * 2**attempt``); any other exception,
        or the last failed attempt, is reported in the ``"error"`` field.

        Args:
            query: The user's topic or question.

        Returns:
            A dict with keys ``"focus_area"``, ``"analysis"`` and ``"error"``.
            ``"error"`` is ``None`` on success; on failure ``"analysis"`` is
            the empty string. This method does not raise for chain errors.
        """
        chain = getattr(self, "qa_chain", None)
        if chain is None:
            return self._failure(
                "No retrieval chain configured: both a retriever and a focus_area are required."
            )

        request = {
            "query": f"Analyze the content focusing on {self.focus_area} for: {query}"
        }
        # Guarantee at least one attempt even if MAX_RETRIES was overridden to 0.
        attempts = max(1, self.MAX_RETRIES)
        for attempt in range(attempts):
            try:
                result = await asyncio.to_thread(chain.invoke, request)
                return {
                    "focus_area": self.focus_area,
                    "analysis": result.get("result", ""),
                    "error": None
                }
            except Exception as e:
                if "429" in str(e) and attempt < attempts - 1:
                    wait_time = 2 ** attempt * self.BASE_BACKOFF_SECONDS
                    print(f"Quota exceeded, retrying in {wait_time} seconds...")
                    await asyncio.sleep(wait_time)
                else:
                    return self._failure(str(e))

class GapAgent(BaseAgent):
    """Retrieval agent whose focus area is fixed to "research gaps"."""

    def __init__(self, llm, retriever):
        super().__init__(llm=llm, retriever=retriever, focus_area="research gaps")

class ChallengeAgent(BaseAgent):
    """Retrieval agent whose focus area is fixed to "challenges"."""

    def __init__(self, llm, retriever):
        super().__init__(llm=llm, retriever=retriever, focus_area="challenges")

class FutureWorkAgent(BaseAgent):
    """Retrieval agent whose focus area is fixed to "future work"."""

    def __init__(self, llm, retriever):
        super().__init__(llm=llm, retriever=retriever, focus_area="future work")

class LimitationAgent(BaseAgent):
    """Retrieval agent whose focus area is fixed to "limitations"."""

    def __init__(self, llm, retriever):
        super().__init__(llm=llm, retriever=retriever, focus_area="limitations")

class SummaryAgent(BaseAgent):
    """Agent that merges the other agents' results into one summary.

    It is constructed without a retriever and calls the LLM directly with a
    prompt produced by ``generate_summary_prompt``.
    """

    def __init__(self, llm):
        super().__init__(llm, None, "summary")

    @staticmethod
    def _format_agent_outputs(agent_results: List[Dict]) -> str:
        """Render each agent result as a ``"<Focus Area>:\\n<text>"`` section.

        A result with an empty analysis is replaced by a placeholder sentence.
        The error text is appended only when one is present, so an empty
        analysis with ``error=None`` no longer raises ``TypeError`` from
        concatenating ``str`` and ``None``.
        """
        sections = []
        for result in agent_results:
            heading = str(result.get("focus_area") or "unknown").title()
            body = result.get("analysis")
            if not body:
                reason = result.get("error")
                if reason:
                    body = f"No analysis available due to error: {reason}"
                else:
                    body = "No analysis available."
            sections.append(f"{heading}:\n{body}")
        return "\n".join(sections)

    async def analyze(self, query: str, agent_results: List[Dict]) -> Dict:
        """Summarise ``agent_results`` for ``query`` with a single LLM call.

        The blocking ``llm.invoke`` call runs in a worker thread. An exception
        whose text contains ``"429"`` is retried up to three attempts with
        30 s / 60 s back-off; other exceptions are reported immediately.

        Args:
            query: The user's topic or question.
            agent_results: Result dicts with ``"focus_area"``, ``"analysis"``
                and ``"error"`` keys, as produced by the other agents.

        Returns:
            A dict with keys ``"focus_area"``, ``"analysis"`` and ``"error"``;
            ``"error"`` is ``None`` on success.
        """
        # The prompt does not change between retries, so build it once.
        try:
            prompt = generate_summary_prompt(
                query, self._format_agent_outputs(agent_results)
            )
        except Exception as e:
            return {
                "focus_area": self.focus_area,
                "analysis": "",
                "error": str(e)
            }

        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = await asyncio.to_thread(self.llm.invoke, [
                    {"role": "user", "content": prompt}
                ])
                summary = response.content if hasattr(response, 'content') else str(response)
                return {
                    "focus_area": self.focus_area,
                    "analysis": summary,
                    "error": None
                }
            except Exception as e:
                if "429" in str(e) and attempt < max_retries - 1:
                    wait_time = 2 ** attempt * 30
                    print(f"Quota exceeded for summary, retrying in {wait_time} seconds...")
                    await asyncio.sleep(wait_time)
                else:
                    return {
                        "focus_area": self.focus_area,
                        "analysis": "",
                        "error": str(e)
                    }

class ResearchPaperAnalyzer:
    """Searches arXiv and runs the multi-agent analysis over text content.

    Attributes:
        llm: ``ChatGoogleGenerativeAI`` chat model shared by all agents.
        embeddings: ``GoogleGenerativeAIEmbeddings`` used for the vector store.
        text_splitter: Splits content into 1000-character chunks with a
            200-character overlap.
        vector_store: The most recently created Chroma store, or ``None``.
    """

    def __init__(self):
        """Create the Gemini clients; print a message and exit on failure."""
        try:
            self.llm = ChatGoogleGenerativeAI(
                model="gemini-1.5-pro",
                temperature=0.5,
                google_api_key=os.getenv("GOOGLE_API_KEY")
            )
            self.embeddings = GoogleGenerativeAIEmbeddings(
                model="models/embedding-001",
                google_api_key=os.getenv("GOOGLE_API_KEY")
            )
        except Exception as e:
            print(f"Error initializing Google API: {e}")
            print("Ensure your GOOGLE_API_KEY is valid.")
            sys.exit(1)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        self.vector_store = None

    def search_papers(self, query: str, max_results: int = 10) -> List[Dict]:
        """Query arXiv by relevance and return plain-dict paper records.

        Any exception is retried up to three attempts with 5 s / 10 s
        back-off (blocking ``time.sleep``).

        Args:
            query: arXiv search string.
            max_results: Maximum number of results requested.

        Returns:
            A list of dicts with ``title``, ``authors``, ``summary``,
            ``pdf_url`` and ``published`` (``YYYY-MM-DD``); an empty list
            when every attempt failed.
        """
        max_retries = 3
        for attempt in range(max_retries):
            try:
                client = arxiv.Client()
                search = arxiv.Search(
                    query=query,
                    max_results=max_results,
                    sort_by=arxiv.SortCriterion.Relevance
                )
                papers = [
                    {
                        'title': result.title,
                        'authors': [author.name for author in result.authors],
                        'summary': result.summary,
                        'pdf_url': result.pdf_url,
                        'published': result.published.strftime("%Y-%m-%d")
                    }
                    for result in client.results(search)
                ]
                return papers
            except Exception as e:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt * 5
                    print(f"Error searching papers: {e}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"Failed to search papers after {max_retries} attempts: {e}")
                    return []

    def create_vector_store(self, texts: List[str], metadatas: Optional[List[Dict]] = None) -> "Chroma":
        """Embed ``texts`` into a fresh Chroma collection and remember it.

        Each call uses its own uniquely named collection. Without an explicit
        ``collection_name`` every call would target the library's default
        collection, so chunks stored by an earlier analysis in the same
        session could be returned for a later, unrelated one.

        Args:
            texts: Text chunks to embed.
            metadatas: Optional per-chunk metadata; defaults to one empty
                dict per chunk.

        Returns:
            The new Chroma store (also assigned to ``self.vector_store``).
        """
        if metadatas is None:
            metadatas = [{} for _ in texts]
        self.vector_store = Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=self.embeddings,
            collection_name=f"analysis_{uuid4().hex}"
        )
        return self.vector_store

    async def analyze_content(self, query: str, content: str) -> Dict:
        """Run all agents plus the NLP helpers over ``content``.

        The four retrieval agents run concurrently; the summary agent runs
        after them on their combined results.

        Args:
            query: The user's topic or question.
            content: Raw text to analyse.

        Returns:
            On success a dict with ``content``, ``agent_analyses``,
            ``summary`` and ``nlp_details``. On any failure, including empty
            content, a dict holding only an ``error`` message.
        """
        try:
            has_text = bool(content and content.strip())
            chunks = self.text_splitter.split_text(content) if has_text else []
            if not chunks:
                # Nothing to embed: skip the vector store and the LLM calls.
                message = "No text content available to analyze."
                print(f"Error during analysis: {message}")
                return {"error": message}

            vector_store = self.create_vector_store(chunks)
            retriever = vector_store.as_retriever(search_kwargs={"k": 5})

            agents = [
                GapAgent(self.llm, retriever),
                ChallengeAgent(self.llm, retriever),
                FutureWorkAgent(self.llm, retriever),
                LimitationAgent(self.llm, retriever)
            ]

            agent_tasks = [agent.analyze(query) for agent in agents]
            agent_results = await asyncio.gather(*agent_tasks)

            summary_agent = SummaryAgent(self.llm)
            summary_result = await summary_agent.analyze(query, agent_results)

            cleaned_text = clean_text(content)
            pos_data = get_pos_tags(cleaned_text)
            entities = extract_named_entities(cleaned_text)

            return {
                "content": content,
                "agent_analyses": agent_results,
                "summary": summary_result,
                "nlp_details": {
                    "pos_tags": pos_data,
                    "named_entities": entities
                }
            }
        except Exception as e:
            print(f"Error during analysis: {e}")
            return {"error": str(e)}

    async def analyze_papers(self, query: str, papers: List[Dict]) -> Dict:
        """Analyse the concatenated abstracts of ``papers``.

        Records without a ``summary`` contribute nothing instead of raising
        ``KeyError``.

        Args:
            query: The user's topic or question.
            papers: Paper dicts as returned by ``search_papers``.

        Returns:
            The result of ``analyze_content`` on the joined summaries.
        """
        summaries = [paper.get('summary') or '' for paper in papers]
        combined_text = " ".join(summaries)
        return await self.analyze_content(query, combined_text)

def upload_pdf():
    """Prompt for an upload in Colab and return a path to the uploaded PDF.

    The mapping returned by ``files.upload()`` is keyed by the name the user
    picked, while Colab may store the file on disk under a different,
    de-duplicated name (e.g. ``"sample_paper (1).pdf"``). Returning the key
    could therefore point at an older file of the same name. The uploaded
    bytes are written to a dedicated temporary ``.pdf`` file instead, and that
    path is returned.

    A failed upload call is retried up to three attempts, five seconds apart.
    An upload that contains no ``.pdf`` file is not retried.

    Returns:
        Path (str) of a temporary file holding the first uploaded PDF, or
        ``None`` when no PDF was uploaded or every attempt failed. The
        temporary file is not deleted automatically.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        print("Please upload a PDF of a research paper.")
        sys.stdout.flush()
        try:
            uploaded = files.upload()
            for filename, payload in uploaded.items():
                if filename.lower().endswith('.pdf'):
                    # delete=False: the caller reads the file after this returns.
                    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
                        pdf_file.write(payload)
                    return pdf_file.name
            print("No valid PDF file uploaded.")
            sys.stdout.flush()
            return None
        except Exception as e:
            if attempt < max_attempts - 1:
                print(f"Upload failed: {e}. Retrying in 5 seconds...")
                sys.stdout.flush()
                time.sleep(5)
            else:
                print(f"Failed to upload PDF after {max_attempts} attempts: {e}")
                sys.stdout.flush()
                return None

def extract_text_from_pdf(pdf_path):
    """OCR every page of the PDF at ``pdf_path`` and return the joined text.

    Pages are rendered with ``convert_from_path`` and read with
    ``pytesseract.image_to_string``. Pages whose OCR output is blank are
    skipped; the rest are passed through ``clean_text`` and joined with single
    spaces.

    Returns:
        The combined text, or ``None`` when nothing usable was extracted or
        any step raised an exception (a message is printed in both cases).
    """
    try:
        ocr_pages = map(pytesseract.image_to_string, convert_from_path(pdf_path))
        cleaned_pages = [clean_text(raw) for raw in ocr_pages if raw.strip()]
        document_text = " ".join(cleaned_pages)
        if document_text.strip():
            return document_text
        print("No text could be extracted from the PDF.")
        sys.stdout.flush()
        return None
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        sys.stdout.flush()
        return None

def get_user_input():
    """Ask the user for a text query or a PDF and return ``(kind, data)``.

    Returns:
        ``('text', query)`` when the user chose 1, ``('pdf', extracted_text)``
        when the user chose 2 and both upload and extraction succeeded, and
        ``(None, None)`` for an invalid choice, a failed upload/extraction, or
        any exception raised while prompting.
    """
    nothing = (None, None)
    try:
        print("\nSelect input type: 1. Text, 2. PDF")
        sys.stdout.flush()
        selection = input("Enter 1 or 2: ")
        sys.stdout.flush()

        if selection == '1':
            print("Enter your query: ")
            sys.stdout.flush()
            typed_query = input()
            sys.stdout.flush()
            return 'text', typed_query

        if selection != '2':
            print("Invalid choice")
            sys.stdout.flush()
            return nothing

        # PDF branch: extraction only runs when the upload produced a path.
        uploaded_path = upload_pdf()
        extracted = extract_text_from_pdf(uploaded_path) if uploaded_path else None
        return ('pdf', extracted) if extracted else nothing
    except Exception as e:
        print(f"Error getting user input: {e}")
        sys.stdout.flush()
        return nothing

async def main():
    """Interactive loop: read input, analyse it, print and save the results.

    Each round asks for a text query (searched on arXiv) or a PDF (OCR'd),
    runs the analysis, prints the summary, per-agent analyses and a preview of
    the NLP details, and writes the full result to ``analysis_<id>.json``.

    The loop stops after three consecutive rounds without usable input, so a
    session whose input source keeps failing cannot spin forever. Any
    unexpected exception ends the loop with a printed message.
    """
    try:
        try:
            # Imported only to verify that the Colab runtime is available.
            from google.colab import files
        except ImportError:
            print("This script must be run in Google Colab.")
            sys.stdout.flush()
            return

        analyzer = ResearchPaperAnalyzer()
        print("Welcome to the Multimodal Research Paper Analyzer!")
        print("This tool supports text and PDF inputs.\n")
        sys.stdout.flush()

        max_consecutive_failures = 3
        consecutive_failures = 0

        while True:
            input_type, input_data = get_user_input()
            if not input_type:
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    print(f"No valid input after {max_consecutive_failures} attempts. Exiting.")
                    sys.stdout.flush()
                    break
                continue
            consecutive_failures = 0

            if input_type == 'text':
                print("\nSearching for relevant papers...")
                sys.stdout.flush()
                papers = analyzer.search_papers(input_data)
                if not papers:
                    print("No papers found for your query.")
                    sys.stdout.flush()
                    continue
                print(f"Found {len(papers)} papers. Analyzing...")
                sys.stdout.flush()
                results = await analyzer.analyze_papers(input_data, papers)
            elif input_type == 'pdf':
                print("\nAnalyzing the uploaded research paper PDF...")
                sys.stdout.flush()
                results = await analyzer.analyze_content("Analyze this paper", input_data)
            else:
                # Never fall through with a stale or undefined `results`.
                print(f"Unsupported input type: {input_type}")
                sys.stdout.flush()
                continue

            if "error" in results:
                print(f"Analysis failed: {results['error']}")
                sys.stdout.flush()
                continue

            print("\n## Analysis Results")
            print("### Summary")
            sys.stdout.flush()
            if results["summary"]["error"]:
                print(f"Error: {results['summary']['error']}")
            else:
                print(results["summary"]["analysis"])
            print()

            print("### Agent Analyses")
            for analysis in results["agent_analyses"]:
                print(f"#### {analysis['focus_area'].title()}")
                if analysis["error"]:
                    print(f"Error: {analysis['error']}")
                else:
                    print(analysis["analysis"])
                print()
            sys.stdout.flush()

            print("### NLP Details")
            print(f"**POS Tags**: {results['nlp_details']['pos_tags'][:10]}...")
            print(f"**Named Entities**: {results['nlp_details']['named_entities'][:10]}...")
            sys.stdout.flush()

            output_file = f"analysis_{uuid4().hex[:8]}.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                # default=str keeps one non-JSON-native value in the NLP
                # details from aborting the whole session at save time.
                json.dump(results, f, indent=2, ensure_ascii=False, default=str)
            print(f"\nResults saved to {output_file}\n")
            sys.stdout.flush()

    except Exception as e:
        print(f"Error in main loop: {e}")
        sys.stdout.flush()

# Run the main function in Colab
# The imports inside the try block double as an environment probe: if any of
# them raises ImportError, the handler below runs main() via asyncio.run().
try:
    # NOTE(review): IPython and display are not referenced anywhere else in the
    # visible code; they appear to serve only as part of the probe.
    import IPython
    from IPython.display import display
    import nest_asyncio
    # NOTE(review): presumably required because the notebook kernel already has
    # a running event loop, which run_until_complete() would otherwise reject — confirm.
    nest_asyncio.apply()  
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
except ImportError:
    # Fallback for non-Colab environments
    asyncio.run(main())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  memory = ConversationBufferMemory(memory_key="chat_history")
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Error initializing model: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 72.12 MiB is free. Process 222348 has 14.67 GiB memory in use. Of the allocated memory 14.17 GiB is allocated by PyTorch, and 396.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Falling back to a smaller model...
Welcome to the Multimodal Research Paper Analyzer!
This tool supports text and PDF inputs.


Select input type: 1. Text, 2. PDF
Please upload a PDF of a research paper.


Saving sample_paper.pdf to sample_paper (1).pdf

Analyzing the uploaded research paper PDF...


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-pro"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 2
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 26
}
].


Quota exceeded, retrying in 30 seconds...


  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-pro"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 2
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 53
}
].


Quota exceeded, retrying in 60 seconds...

## Analysis Results
### Summary
## Comprehensive Analysis of Research on Cyber Attack Detection in UAVs

This analysis synthesizes the provided information on research gaps, challenges, future work, and limitations in cyber attack detection for UAVs.

**1. Synthesizing Key Points:**

* **Gaps:** Existing methods, both residue-based and learning-based, have limitations. Residue-based methods struggle with high-dimensional, non-Gaussian data. Learning-based methods are susceptible to noise, require large datasets, and often lack contextual awareness. There's a need for integrated approaches, broader attack detection capabilities (beyond FDI), and advanced residue analysis techniques.
* **Challenges:** Developing robust and effective detection systems, particularly for FDI attacks, is challenging.  Overcoming the limitations of individual methods (residue-based and learning-based) and creating effective hybrid approaches is crucial.  Expanding de