In [None]:
!pip install -q \
    langchain \
    langchain-community \
    sentence-transformers \
    faiss-cpu \
    pymupdf \
    pydantic \
    streamlit \
    networkx \
    scikit-learn


In [None]:
import os
import re
import uuid
from collections import Counter
from getpass import getpass
from typing import List, Dict

import fitz
import networkx as nx
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
class PaperSection(BaseModel):
    """One named section of a paper together with its text."""
    name: str  # section label; build_research_paper passes the matched marker keyword
    text: str  # section body, later split into chunks by create_chunks

class ResearchPaper(BaseModel):
    """Structured record for a single paper, as assembled by build_research_paper."""
    paper_id: str  # unique identifier (build_research_paper uses a random UUID4 string)
    title: str
    authors: List[str]
    year: int  # also the grouping key used by trend_analysis
    abstract: str
    sections: List[PaperSection]  # sections that create_chunks turns into documents
    keywords: List[str]  # counted per year by trend_analysis
    references: List[str]  # create_citation_graph adds one edge per entry


In [None]:
# Lower-case keywords that extract_sections_from_pdf searches for, as plain
# substrings, to locate section boundaries. Order matters: the end of a section
# is only looked for among the markers listed after it.
# NOTE(review): "method" is a prefix of "methodology", so a substring search for
# "method" also matches inside the word "methodology".
SECTION_MARKERS = [
    "abstract", "introduction", "method", "methodology",
    "experiment", "results", "discussion", "conclusion", "references"
]

def extract_sections_from_pdf(pdf_path: str) -> Dict[str, str]:
    """Split a PDF's text into sections using keyword matching.

    The text of all pages is joined, runs of newlines are collapsed, and the
    result is lower-cased. For every keyword in ``SECTION_MARKERS`` the first
    occurrence is taken as the section start; the section ends at the first
    later-listed marker found at least 20 characters after that start, or at
    the end of the text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Mapping of marker keyword to the (lower-cased, stripped) section text.
        Markers that do not occur in the document are omitted.
    """
    # The context manager releases the file handle even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        raw_text = " ".join(page.get_text() for page in doc)
    text = re.sub(r"\n+", "\n", raw_text).lower()

    extracted = {}
    for i, sec in enumerate(SECTION_MARKERS):
        start = text.find(sec)
        if start == -1:
            continue
        end = len(text)
        for nxt in SECTION_MARKERS[i+1:]:
            # Skip the first 20 characters so the heading itself is not
            # mistaken for the start of the next section.
            idx = text.find(nxt, start + 20)
            if idx != -1:
                end = idx
                break
        extracted[sec] = text[start:end].strip()

    return extracted


In [None]:
def build_research_paper(pdf_path: str) -> ResearchPaper:
    """Build a ``ResearchPaper`` record from a PDF on disk.

    The title is the file name without its ``.pdf`` extension. Authors, year
    and keywords are fixed placeholder values; nothing is extracted from the
    document for them.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        A ``ResearchPaper`` whose abstract is capped at 1500 characters and
        whose references hold at most 20 non-empty lines.
    """
    sections = extract_sections_from_pdf(pdf_path)

    # Remove only a trailing ".pdf" (any case); str.replace would also delete
    # ".pdf" from the middle of a name and would miss ".PDF".
    file_name = os.path.basename(pdf_path)
    stem, extension = os.path.splitext(file_name)
    title = stem if extension.lower() == ".pdf" else file_name

    # "".split("\n") yields [""], so blank lines must be filtered out or a paper
    # without a references section would get one empty reference. The section
    # text starts at the matched keyword, so a bare "references" heading line
    # is dropped as well.
    reference_lines = (
        line.strip() for line in sections.get("references", "").split("\n")
    )
    references = [
        line for line in reference_lines if line and line != "references"
    ][:20]

    return ResearchPaper(
        paper_id=str(uuid.uuid4()),
        title=title,
        authors=["Unknown"],
        year=2023,
        abstract=sections.get("abstract", "")[:1500],
        sections=[
            PaperSection(name=k, text=v)
            for k, v in sections.items()
            if k not in ["abstract", "references"]
        ],
        keywords=["AI", "Deep Learning"],
        references=references,
    )


In [None]:
# Shared text splitter used by create_chunks: pieces of up to 600 characters,
# with 100 characters of overlap between neighbouring pieces.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=100
)

def create_chunks(paper: ResearchPaper) -> List[Document]:
    """Turn every section of ``paper`` into chunk-sized ``Document`` objects.

    Each section's text is passed through the module-level ``splitter``; every
    resulting piece becomes one ``Document`` tagged with the paper id, section
    name, year and title.

    Args:
        paper: The paper whose sections should be chunked.

    Returns:
        Documents in section order, then chunk order within each section.
    """
    return [
        Document(
            page_content=piece,
            metadata={
                "paper_id": paper.paper_id,
                "section": section.name,
                "year": paper.year,
                "title": paper.title,
            },
        )
        for section in paper.sections
        for piece in splitter.split_text(section.text)
    ]


In [None]:
# Sentence-embedding model shared by the helpers in this notebook.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def embed_texts(texts):
    """Encode ``texts`` with the shared ``embedding_model`` and return the result of ``encode``."""
    vectors = embedding_model.encode(texts)
    return vectors

class _SentenceTransformerEmbeddings(Embeddings):
    """Expose a SentenceTransformer through LangChain's ``Embeddings`` interface.

    ``FAISS.from_texts`` calls ``embed_documents`` on the object it receives and
    the resulting store calls ``embed_query`` at search time; a bare
    SentenceTransformer provides neither method.
    """

    def __init__(self, model):
        self._model = model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts, one list of floats per text."""
        return self._model.encode(list(texts)).tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self._model.encode([text])[0].tolist()


def build_faiss_index(documents: List[Document]):
    """Build a FAISS vector store from ``documents`` using ``embedding_model``.

    Args:
        documents: Chunks to index; their ``page_content`` is embedded and
            their ``metadata`` is stored alongside.

    Returns:
        The FAISS vector store created by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``documents`` is empty, since there is nothing to index.
    """
    if not documents:
        raise ValueError("build_faiss_index needs at least one document to index")

    texts = [d.page_content for d in documents]
    metadatas = [d.metadata for d in documents]
    return FAISS.from_texts(
        texts,
        _SentenceTransformerEmbeddings(embedding_model),
        metadatas=metadatas,
    )


In [None]:
def semantic_search(query, vectorstore, top_k=5):
    """Return the ``top_k`` results of ``vectorstore.similarity_search`` for ``query``."""
    hits = vectorstore.similarity_search(query, k=top_k)
    return hits


In [None]:
import os
from langchain_community.llms import OpenAI

# Keep a key that is already exported in the environment; otherwise ask for it
# at run time so the secret is never written into the notebook. Assigning a
# placeholder unconditionally would overwrite a real key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
llm = OpenAI(temperature=0)

def rag_answer(query, vectorstore):
    """Answer ``query`` from chunks retrieved out of ``vectorstore``.

    The chunks returned by ``semantic_search`` (default ``top_k``) are joined
    into one context block and sent to ``llm`` with an instruction to answer
    only from that context.

    Args:
        query: The user's question.
        vectorstore: Store passed through to ``semantic_search``.

    Returns:
        The value returned by ``llm.invoke`` for the assembled prompt.
    """
    docs = semantic_search(query, vectorstore)
    context = "\n\n".join(d.page_content for d in docs)

    prompt = f"""
Answer strictly using the context below.
If not found, say "Not mentioned in papers".

Context:
{context}

Question:
{query}
"""
    # invoke() is the supported entry point; calling the LLM object directly
    # is deprecated in LangChain.
    return llm.invoke(prompt)

In [None]:
def compare_papers(query, vectorstore):
    """Ask the LLM to contrast the excerpts most relevant to ``query``.

    The top 8 chunks from ``semantic_search`` are joined and sent to ``llm``
    with a request to describe differences in methods and contributions.

    Args:
        query: Topic used to retrieve the excerpts.
        vectorstore: Store passed through to ``semantic_search``.

    Returns:
        The value returned by ``llm.invoke`` for the assembled prompt.
    """
    docs = semantic_search(query, vectorstore, top_k=8)
    combined = "\n\n".join(d.page_content for d in docs)

    prompt = f"""
Compare the following research excerpts:

{combined}

Provide differences in methods and contributions.
"""
    # invoke() is the supported entry point; calling the LLM object directly
    # is deprecated in LangChain.
    return llm.invoke(prompt)


In [None]:
def create_citation_graph(papers: List[ResearchPaper]):
    """Build a directed graph with an edge from each paper to each of its references.

    Nodes are paper titles and reference strings; reference labels are
    whitespace-trimmed and cut to 80 characters.

    Args:
        papers: Papers whose ``title`` and ``references`` populate the graph.

    Returns:
        An ``nx.DiGraph`` containing every paper title as a node, including
        papers without usable references.
    """
    graph = nx.DiGraph()
    for paper in papers:
        # Register the paper itself so one that cites nothing is still present.
        graph.add_node(paper.title)
        for ref in paper.references:
            label = ref.strip()[:80]
            if not label:
                # A blank reference line would otherwise become an edge to a
                # shared empty-string node.
                continue
            graph.add_edge(paper.title, label)
    return graph


In [None]:
def trend_analysis(papers: List[ResearchPaper]):
    """Return the five most frequent keywords for every publication year.

    Args:
        papers: Papers providing ``year`` and ``keywords``.

    Returns:
        Dict mapping each year (in order of first appearance) to the
        ``Counter.most_common(5)`` list of ``(keyword, count)`` pairs built
        from all keywords of that year's papers.
    """
    keywords_by_year = {}
    for paper in papers:
        if paper.year not in keywords_by_year:
            keywords_by_year[paper.year] = []
        keywords_by_year[paper.year].extend(paper.keywords)

    top_keywords = {}
    for year, collected in keywords_by_year.items():
        top_keywords[year] = Counter(collected).most_common(5)
    return top_keywords


In [None]:
def paper_metadata_tool(title):
    """Return a metadata record for ``title``.

    Only ``title`` comes from the caller; venue, year and citation count are
    fixed values.
    """
    record = {"title": title}
    record.update({"venue": "arXiv", "year": 2023, "citations": 100})
    return record

def related_work_tool(query, vectorstore):
    """Return the three closest matches for ``query`` via ``semantic_search``."""
    closest = semantic_search(query, vectorstore, top_k=3)
    return closest


In [None]:
%%writefile app.py
# Minimal placeholder app: it shows a title and a question box and prints a
# fixed message once something is typed. It does not load, index or query
# any papers.
import streamlit as st

st.title(" Research Intelligence Assistant")

query = st.text_input("Ask a research question")

if query:
    st.write("Answer will be generated from indexed papers.")


In [None]:
import streamlit as st

# ---------------- CONFIG ----------------
# Wide layout leaves more horizontal room for the two result columns below.
st.set_page_config(page_title="Research Intelligence System", layout="wide")

# ---------------- EMBEDDING ----------------
# Separate model instance for this cell; it loads the same "all-MiniLM-L6-v2"
# checkpoint name as `embedding_model` earlier in the notebook.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ---------------- PDF PARSER ----------------
# Section keywords that parse_pdf searches for, in expected document order.
# Shorter than SECTION_MARKERS: "methodology", "experiment" and "discussion"
# are not listed here.
SECTIONS = ["abstract", "introduction", "method", "results", "conclusion", "references"]

def parse_pdf(pdf):
    """Split an uploaded PDF into sections using keyword matching.

    Args:
        pdf: Object with a ``read()`` method returning the PDF's bytes
            (the Streamlit upload is passed in below).

    Returns:
        Dict mapping each keyword of ``SECTIONS`` found in the lower-cased
        text to the text from that keyword up to the next later-listed
        keyword (searched from 50 characters after the start), or to the end
        of the document.
    """
    # The context manager releases the in-memory document once its text is read.
    with fitz.open(stream=pdf.read(), filetype="pdf") as doc:
        text = " ".join(p.get_text() for p in doc).lower()
    parsed = {}

    for i, sec in enumerate(SECTIONS):
        start = text.find(sec)
        if start == -1:
            continue
        end = len(text)
        for nxt in SECTIONS[i+1:]:
            # Skip ahead so text right after the heading is not taken as the
            # start of the next section.
            idx = text.find(nxt, start+50)
            if idx != -1:
                end = idx
                break
        parsed[sec] = text[start:end]
    return parsed

# ---------------- CHUNKING ----------------
# NOTE: this rebinds the module-level name `splitter` that an earlier cell
# created with chunk_overlap=100. create_chunks looks `splitter` up at call
# time, so once this cell has run it also splits with an overlap of 80.
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=80)

def create_docs(parsed, title):
    """Chunk every parsed section into ``Document`` objects.

    Args:
        parsed: Mapping of section name to section text.
        title: Title stored in each document's metadata.

    Returns:
        List of documents, one per chunk produced by the module-level
        ``splitter``, each tagged with its section name and ``title``.
    """
    return [
        Document(
            page_content=piece,
            metadata={"section": section_name, "title": title},
        )
        for section_name, section_text in parsed.items()
        for piece in splitter.split_text(section_text)
    ]

# ---------------- UI ----------------
st.title("📚 Research Paper Intelligence Assistant")

uploaded = st.file_uploader("Upload Research Paper (PDF)", type=["pdf"])

if uploaded:
    with st.spinner("Processing PDF..."):
        parsed = parse_pdf(uploaded)
        docs = create_docs(parsed, uploaded.name)
        # Build the index through the notebook's shared helper so there is a
        # single place that turns documents into a FAISS store. With zero
        # chunks there is nothing to index, so skip the call.
        vectorstore = build_faiss_index(docs) if docs else None

    if vectorstore is None:
        # Happens when no section keyword matched, e.g. a PDF without a text layer.
        st.error("No recognizable sections were found in this PDF, so nothing could be indexed.")
    else:
        st.success("Paper Indexed Successfully!")

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("🔍 Semantic Search")
            q = st.text_input("Ask a question")
            if q:
                results = vectorstore.similarity_search(q, k=4)
                for r in results:
                    st.markdown(f"**Section:** {r.metadata['section']}")
                    st.write(r.page_content[:600])

        with col2:
            st.subheader("📈 Keyword Trends")
            # Raw whitespace-separated tokens across all chunks, ten most frequent.
            words = []
            for d in docs:
                words.extend(d.page_content.split())
            trends = Counter(words).most_common(10)
            st.table(trends)

else:
    st.info("Upload a PDF to start")

In [None]:
# Install the localtunnel npm package used by the npx commands in the next cells.
!npm install localtunnel

In [None]:
# Open a public localtunnel URL that forwards to local port 8501.
# NOTE(review): no cell shown before this one starts a server on 8501; the last
# cell launches Streamlit together with the tunnel — confirm this cell is still needed.
!npx localtunnel --port 8501

[1G[0K‚†ô[1G[0K‚†π[1G[0Kyour url is: https://vast-meals-own.loca.lt


In [None]:
# Start the Streamlit app in the background (`&`), then open a localtunnel to
# port 8501 in the foreground (presumably the port Streamlit serves on — confirm).
# app.py is the file written by the %%writefile cell earlier in the notebook.
! streamlit run app.py & npx localtunnel --port 8501