# Research Paper Analysis System

This notebook implements a system for analyzing research papers, specifically extracting hypotheses and identifying research gaps.

## Installation of Required Libraries

First, let's install the necessary dependencies:

In [None]:
# Install with the %pip magic so the packages land in the environment of the running kernel
# (a bare `!pip` uses whichever pip the shell finds first, which may be a different one).
# Versions resolved in the recorded run: streamlit 1.50.0, langchain-community 0.3.31,
# faiss-cpu 1.12.0, pypdf2 3.0.1, bitsandbytes 0.48.1.
%pip install streamlit torch transformers peft langchain langchain-community faiss-cpu pypdf2 accelerate bitsandbytes

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting requests<3,>=2.27 (from streamlit)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow

## Import Libraries

In [None]:
import streamlit as st
import torch
import json
import tempfile
import os
import PyPDF2
import getpass
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from peft import PeftModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import files
import ipywidgets as widgets
from IPython.display import display, HTML

## Set Up HuggingFace Token

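Enter your HuggingFace token when prompted. The input is hidden while you type, and the token is stored only in the `HF_TOKEN` environment variable of this session.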

In [None]:
# Securely get HuggingFace token using getpass

import getpass
import os

# Pasted tokens frequently carry a trailing space or newline; strip it so authentication
# does not fail on an otherwise valid token.
hf_token = getpass.getpass("Enter your HuggingFace token: ").strip()

# Only export a non-empty value: a blank entry must not masquerade as a configured token.
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
else:
    print("No token entered; continuing without HuggingFace authentication.")


Enter your HuggingFace token: ··········


## Initialize Session Variables

In [None]:
# Initialize variables to store state (replacing streamlit session state)
class SessionState:
    """Mutable container for the objects shared between notebook cells.

    Plays the role Streamlit's ``st.session_state`` would play in an app: every
    attribute starts out empty and is filled in by later cells.
    """

    def __init__(self):
        # Language model and the tokenizer that goes with it.
        self.model = self.tokenizer = None
        # Embedding model and the vector index built with it.
        self.embeddings = self.vector_store = None
        # Text extracted from the uploaded PDF.
        self.pdf_text = ""
        # Model repository on HuggingFace.
        self.hf_model_repo = None
        # Outputs of the two analysis steps.
        self.hypotheses = self.limitations = None


# Single shared instance used by every cell below.
session_state = SessionState()

## Define Functions

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

def load_model_and_tokenizer(model_repo):
    """Load a 4-bit quantized causal language model and its tokenizer.

    The function tries progressively weaker options so that something usable is
    returned whenever possible:

    1. tokenizer from ``model_repo``, else the tokenizer of the public base model;
    2. full model from ``model_repo``;
    3. if that fails, the public base model with ``model_repo`` applied as a PEFT adapter;
    4. if the adapter cannot be loaded either, the plain base model.

    Args:
        model_repo: HuggingFace Hub repository id (or local path) of the model or adapter.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Exception: whatever the underlying loaders raise when even the base model
            (or the base tokenizer) cannot be loaded; those calls are not guarded.
    """

    # Set up the configuration for quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Use HF_TOKEN from environment variables; an empty value counts as "not set".
    hf_token = os.environ.get("HF_TOKEN") or None

    if not hf_token:
        print("Warning: HuggingFace token not found. Some models may not load correctly.")

    # Base model fallback (public model)
    base_model_name = "tiiuae/falcon-7b-instruct"

    print("Loading model and tokenizer from HuggingFace repo...")

    # NOTE: the credential is passed as `token`; `use_auth_token` is the deprecated spelling.
    try:
        # Try to load tokenizer from the fine-tuned model repo first
        tokenizer = AutoTokenizer.from_pretrained(model_repo, token=hf_token)
        print(f"Successfully loaded tokenizer from {model_repo}")
    except Exception as e:
        # Deliberate best-effort: any failure here falls back to the base tokenizer.
        print(f"Could not load tokenizer from {model_repo}. Using base model tokenizer instead. Error: {e}")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)

    # Make sure pad token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"Loading model from {model_repo}... This may take a few minutes...")
    try:
        # Try to load the fine-tuned model directly from the repo
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            device_map="auto",
            quantization_config=bnb_config,
            token=hf_token
        )
        print(f"Successfully loaded model from {model_repo}")
    except Exception as e:
        print(f"Could not load complete model from {model_repo}. Error: {e}")
        print("Loading base model and then attempting to load adapters...")

        # If loading the whole model failed, try loading base model + adapters
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map="auto",
            quantization_config=bnb_config,
            token=hf_token
        )

        try:
            model = PeftModel.from_pretrained(base_model, model_repo, token=hf_token)
            print(f"Successfully loaded adapter from {model_repo}")
        except Exception as adapter_error:
            print(f"Failed to load adapter from {model_repo}. Using base model. Error: {adapter_error}")
            model = base_model

    return model, tokenizer


In [None]:
def extract_text_from_pdf(pdf_file_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with a newline between consecutive pages, so the
        last word of one page never fuses with the first word of the next.
    """
    with open(pdf_file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Guard against a falsy result so a page without extractable text cannot break
        # the concatenation; collecting into a list avoids repeated string copying.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    return "\n".join(page_texts)

In [None]:
def create_vector_store(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks and index them in a FAISS vector store.

    The embeddings model is created on first use and cached on
    ``session_state.embeddings`` so later calls reuse it.

    Args:
        text: Full document text to index.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A ``(vector_store, chunks)`` tuple: the FAISS store and the list of text chunks
        it was built from.

    Raises:
        ValueError: If ``text`` is empty/blank or produces no chunks (for example a
            scanned PDF without a text layer), since there is nothing to index.
    """
    # Fail early with a clear message instead of an obscure error inside the indexer.
    if not text or not text.strip():
        raise ValueError(
            "No text to index: the document is empty or contains no extractable text."
        )

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        raise ValueError("No text to index: splitting the document produced no chunks.")

    # Load embeddings model
    if session_state.embeddings is None:
        print("Loading embeddings model...")
        session_state.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    # Create vector store
    print("Creating vector store...")
    vector_store = FAISS.from_texts(chunks, session_state.embeddings)

    return vector_store, chunks

In [None]:
def generate_hypothesis(abstract_text):
    """Agent 1: Generate null and alternate hypotheses from abstract.

    Builds an instruction prompt around ``abstract_text``, samples a completion from
    ``session_state.model`` and returns only the newly generated answer.

    Args:
        abstract_text: Abstract (or opening part) of the paper.

    Returns:
        The generated answer with surrounding whitespace removed.
    """

    instruction = "Extract the null and alternate hypotheses from the given abstract."
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{abstract_text}\n\n### Response:\n"

    tokenizer = session_state.tokenizer
    model = session_state.model

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; everything after them in the output is the answer.
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        top_k=50,
        temperature=0.5,
        # Explicit pad id avoids the "Setting `pad_token_id` to `eos_token_id`" notice.
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode only the new tokens. Splitting the full decoded text on "### Response:"
    # picks the wrong segment whenever the input itself contains that marker.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # If the model starts another "### Response:" section, keep only the first answer.
    return generated.split("### Response:")[0].strip()

In [None]:
def identify_limitations(paper_text, max_chars=4000):
    """Agent 2: Identify limitations in the research paper.

    Builds an instruction prompt asking for research gaps, samples a completion from
    ``session_state.model`` and returns only the newly generated answer.

    Args:
        paper_text: Paper text (or retrieved excerpts) to analyse.
        max_chars: Upper bound on the number of characters of ``paper_text`` placed
            in the prompt. Longer input is reduced to the abstract section when one
            can be located, otherwise to its first ``max_chars`` characters.

    Returns:
        The generated answer with surrounding whitespace removed.
    """

    instruction = """
You are an academic research analyst.

Task: From the following paper text, identify 3 to 6 specific research gaps and express them as clear, factual statements.

Requirements:
- Each point should be concise (1-2 lines).
- Avoid phrasing as questions. Use declarative statements.
- Each gap should highlight what is missing, underexplored, or limited in current research.
- Avoid repetition and generic phrases.
- Use bullet points starting with "-".
- Focus on clarity and insightfulness.
"""

    # If text is too long, use a summarized version or the abstract
    if len(paper_text) > max_chars:
        # Try to find abstract section
        lower_text = paper_text.lower()
        start_idx = lower_text.find("abstract")
        if start_idx != -1:
            end_idx = lower_text.find("introduction", start_idx)
            if end_idx == -1:  # If no "introduction" found
                end_idx = start_idx + 2000  # Use 2000 chars after abstract
            # "introduction" may only occur far into the text; cap the slice so this
            # branch really shortens the input.
            end_idx = min(end_idx, start_idx + max_chars)
            paper_text = paper_text[start_idx:end_idx]
        else:
            # Just use the first max_chars characters if no abstract found
            paper_text = paper_text[:max_chars]

    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{paper_text}\n\n### Response:\n"

    tokenizer = session_state.tokenizer
    model = session_state.model

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; everything after them in the output is the answer.
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        top_p=0.9,
        top_k=50,
        temperature=0.5,  # moderate sampling temperature
        repetition_penalty=1.25,  # key fix for repetition
        num_return_sequences=1,
        # Explicit pad id avoids the "Setting `pad_token_id` to `eos_token_id`" notice.
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode only the new tokens. Splitting the full decoded text on "### Response:"
    # picks the wrong segment whenever the input itself contains that marker.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # If the model starts another "### Response:" section, keep only the first answer.
    return generated.split("### Response:")[0].strip()

In [None]:
def perform_rag_query(query, top_k=3):
    """Look up the indexed chunks that are most similar to ``query``.

    Args:
        query: Free-text search string.
        top_k: Number of chunks to retrieve.

    Returns:
        A list with the text of each retrieved chunk, or an empty list (after printing
        a hint) when no vector store has been built yet.
    """
    store = session_state.vector_store

    if store is not None:
        # Keep only the text of each match.
        contents = []
        for match in store.similarity_search(query, k=top_k):
            contents.append(match.page_content)
        return contents

    print("No vector store available. Please upload a PDF first.")
    return []

## Specify Hugging Face Model Repository

In [None]:
# HuggingFace repository id of the model to use; replace it with your own repository if needed.
model_repo = "tiiuae/falcon-7b-instruct"

# Remember the choice on the shared state so the loading cell can pick it up.
session_state.hf_model_repo = model_repo
print(f"Using model from: {session_state.hf_model_repo}")

Using model from: tiiuae/falcon-7b-instruct


## Load Model from HuggingFace Repository

In [None]:
# Download (or reuse the cached copy of) the configured repository and keep the resulting
# model and tokenizer on the shared state.
(session_state.model,
 session_state.tokenizer) = load_model_and_tokenizer(session_state.hf_model_repo)

Loading model and tokenizer from HuggingFace repo...




tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

Successfully loaded tokenizer from tiiuae/falcon-7b-instruct
Loading model from tiiuae/falcon-7b-instruct... This may take a few minutes...




config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Successfully loaded model from tiiuae/falcon-7b-instruct


## Upload and Process PDF

In [None]:
print("Please upload a research paper (PDF):")
uploaded = files.upload()

# Only the first uploaded file is analysed; nothing happens when no file was uploaded.
if uploaded:
    file_name = next(iter(uploaded))
    print(f"Processing {file_name}...")

    # Pull the text out of the PDF, then index it for similarity search.
    session_state.pdf_text = extract_text_from_pdf(file_name)
    session_state.vector_store, chunks = create_vector_store(session_state.pdf_text)

    # Short report plus a preview of the extracted text.
    print(
        f"Successfully processed PDF: {file_name}",
        f"Extracted {len(chunks)} text chunks",
        "\nFirst 1000 characters of the extracted text:",
        session_state.pdf_text[:1000],
        sep="\n",
    )

Please upload a research paper (PDF):


Saving d4va00136b.pdf to d4va00136b.pdf
Processing d4va00136b.pdf...
Loading embeddings model...


  session_state.embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating vector store...
Successfully processed PDF: d4va00136b.pdf
Extracted 181 text chunks

First 1000 characters of the extracted text:
Innovative approaches to sustainable wastewater
treatment: a comprehensive exploration of
conventional and emerging technologies
Jaweria Shamshad *aand Rashid Ur Rehman *b
Access to clean water is under threat due to population growth, climate change, and pollution, emphasizing
the need for e ﬀective wastewater treatment. Wastewater pollutants pose risks to public health and
ecosystems, necessitating proper treatment methods. This paper outlines both conventional and emergingtechnologies for wastewater treatment. Established techniques, such as activated sludge processing,
chlorination, and constructed wetlands, are discussed alongside newer methods, such as advanced
oxidation, ultraviolet disinfection, membrane bioreactors, reverse osmosis, arti ﬁcial intelligence
optimization, and nano ﬁltration, which enhance contaminant removal but may in

## Generate Hypotheses

In [None]:
if session_state.model is None or not session_state.pdf_text:
    print("Please make sure the model is loaded and a PDF has been processed before generating hypotheses.")
else:
    print("Generating hypotheses...")
    # The opening 3000 characters stand in for the abstract, which is likely found there.
    abstract_text = session_state.pdf_text[:3000]
    session_state.hypotheses = generate_hypothesis(abstract_text)

    print("\nHypotheses:")
    print(session_state.hypotheses)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Generating hypotheses...

Hypotheses:
The null hypothesis is that there are no significant differences in treatment efficiency between conventional and emerging technologies. The alternate hypothesis is that there are significant differences in treatment efficiency between these technologies.


## Identify Research Gaps

In [None]:
if session_state.model is None or not session_state.pdf_text:
    print("Please make sure the model is loaded and a PDF has been processed before identifying research gaps.")
else:
    print("Identifying research gaps...")
    # Retrieve the five chunks closest to limitation-related keywords and analyse them together.
    relevant_chunks = perform_rag_query(
        "limitations methodology weaknesses future work future scope unexplored",
        top_k=5,
    )
    combined_text = "\n".join(relevant_chunks)
    session_state.limitations = identify_limitations(combined_text)

    print("\nResearch Gaps:")
    print(session_state.limitations)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Identifying research gaps...

Research Gaps:
1. Research gap: The paper does not clearly state the purpose of the research. It should provide a clear statement of the research question or objective.

2. Research gap: The paper does not provide a clear statement of the research methods or the data sources used. It should clearly describe the methods and data sources used in the research.

3. Research gap: The paper does not provide a clear statement of the research limitations or challenges. It should clearly describe the limitations and challenges faced in the research.

4. Research gap: The paper does not provide a clear statement of the research implications. It should clearly describe the implications of the research.

5. Research gap: The paper does not provide a clear statement of the research limitations or challenges faced in the research. It should clearly describe the limitations and challenges faced in the research.

6. Research gap: The paper does not provide a clear stateme

In [None]:
# %pip installs into the environment of the running kernel.
# The "pdf" extra of unstructured provides the PDF parsing dependencies that
# UnstructuredPDFLoader (used by app.py) relies on; langchain-community hosts the
# loader, embeddings and vector store integrations.
%pip install streamlit pyngrok langchain langchain-community openai tiktoken "unstructured[pdf]" faiss-cpu


Collecting unstructured
  Downloading unstructured-0.18.15-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 981.5/981.5 kB 19.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting rapidfuzz (from unstructured)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_

In [None]:
%%writefile app.py
import streamlit as st
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader

# Session state
if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None

st.title("üìÑ RAG PDF Research Assistant")

# Upload PDF
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.read())

    loader = UnstructuredPDFLoader("temp.pdf")
    docs = loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)

    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_documents(chunks, embeddings)
    st.session_state.vector_store = vector_store

    st.success("PDF uploaded and processed successfully!")

# Query
query = st.text_input("Enter your query:")
if st.button("Search") and query:
    if st.session_state.vector_store is None:
        st.warning("Please upload a PDF first!")
    else:
        results = st.session_state.vector_store.similarity_search(query, k=3)
        st.write("**Top Results:**")
        for i, doc in enumerate(results):
            st.write(f"{i+1}. {doc.page_content}")



Overwriting app.py


In [None]:
# Ask for the ngrok authtoken at run time instead of writing it into the notebook.
# The literal placeholder is rejected by ngrok with ERR_NGROK_105.
import getpass

from pyngrok import ngrok

ngrok.set_auth_token(getpass.getpass("Enter your ngrok authtoken: ").strip())

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok
import os

# Run Streamlit in the background
get_ipython().system_raw("streamlit run app.py --server.port 8501 &")

# Create a public URL for the Streamlit port. Current pyngrok releases take the local
# address through `addr`; `port=` is not a parameter of connect() there, so the tunnel
# would not be pointed at 8501.
public_url = ngrok.connect(addr="8501")
public_url


ERROR:pyngrok.process.ngrok:t=2025-10-14T19:20:55+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: YOUR_AUTHTOKEN_HERE\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n"


PyngrokNgrokError: The ngrok process errored on start: authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: YOUR_AUTHTOKEN_HERE\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n.

In [None]:
# %pip installs into the environment of the running kernel.
%pip install nbconvert


In [None]:
# Convert nlp_paper.ipynb to a notebook with the ClearOutputPreprocessor enabled (cell
# outputs removed), writing the result under the name cleaned_notebook.ipynb.
!jupyter nbconvert nlp_paper.ipynb --to notebook --ClearOutputPreprocessor.enabled=True --output cleaned_notebook.ipynb
