# Research Paper Analysis System

This notebook implements a system for analyzing research papers, specifically extracting hypotheses and identifying research gaps.

## Installation of Required Libraries

First, let's install the necessary dependencies:

In [3]:
!pip install streamlit torch transformers peft langchain langchain-community faiss-cpu pypdf2 accelerate bitsandbytes langchain-text-splitters



## Import Libraries

In [5]:
import streamlit as st
import torch
import json
import tempfile
import os
import PyPDF2
import getpass
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from peft import PeftModel
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import files
import ipywidgets as widgets
from IPython.display import display, HTML

## Set Up HuggingFace Token

Securely enter your HuggingFace token when prompted

In [6]:
# Securely get HuggingFace token using getpass
'''
hf_token = getpass.getpass("Enter your HuggingFace token: ")
os.environ["HF_TOKEN"] = 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
'''
import getpass
import os

hf_token = getpass.getpass("Enter your HuggingFace token: ")
os.environ["HF_TOKEN"] = hf_token


Enter your HuggingFace token: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


## Initialize Session Variables

In [7]:
# Initialize variables to store state (replacing streamlit session state)
class SessionState:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.embeddings = None
        self.vector_store = None

        self.pdf_text = ""
        self.hf_model_repo = None  # Model repository on HuggingFace
        self.hypotheses = None
        self.limitations = None

session_state = SessionState()

## Define Functions

In [16]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

def load_model_and_tokenizer(model_repo):
    """Load the model and tokenizer from a Hugging Face repository"""

    # Set up the configuration for quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Use HF_TOKEN from environment variables
    hf_token = os.environ.get("HF_TOKEN", None)

    if not hf_token:
        print("Warning: HuggingFace token not found. Some models may not load correctly.")

    # Base model fallback (public model)
    base_model_name = "tiiuae/falcon-7b-instruct"

    print("Loading model and tokenizer from HuggingFace repo...")

    try:
        # Try to load tokenizer from the fine-tuned model repo first
        tokenizer = AutoTokenizer.from_pretrained(model_repo, token=hf_token)
        print(f"Successfully loaded tokenizer from {model_repo}")
    except Exception as e:
        print(f"Could not load tokenizer from {model_repo}. Using base model tokenizer instead. Error: {e}")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)

    # Make sure pad token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"Loading model from {model_repo}... This may take a few minutes...")
    try:
        # Try to load the fine-tuned model directly from the repo
        model = AutoModelForCausalLM.from_pretrained(
            model_repo,
            device_map="auto",
            quantization_config=bnb_config,
            token=hf_token
        )
        print(f"Successfully loaded model from {model_repo}")
    except Exception as e:
        print(f"Could not load complete model from {model_repo}. Error: {e}")
        print("Loading base model and then attempting to load adapters...")

        # If loading the whole model failed, try loading base model + adapters
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map="auto",
            quantization_config=bnb_config,
            token=hf_token
        )

        try:
            model = PeftModel.from_pretrained(base_model, model_repo, token=hf_token)
            print(f"Successfully loaded adapter from {model_repo}")
        except Exception as adapter_error:
            print(f"Failed to load adapter from {model_repo}. Using base model. Error: {adapter_error}")
            model = base_model

    return model, tokenizer

In [9]:
def extract_text_from_pdf(pdf_file_path):
    """Extract text from a PDF file"""
    text = ""
    with open(pdf_file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()

    return text

In [10]:
def create_vector_store(text):
    """Create a vector store from the text"""
    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)

    # Load embeddings model
    if session_state.embeddings is None:
        print("Loading embeddings model...")
        session_state.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    # Create vector store
    print("Creating vector store...")
    vector_store = FAISS.from_texts(chunks, session_state.embeddings)

    return vector_store, chunks

In [11]:
def generate_hypothesis(abstract_text):
    """Agent 1: Generate null and alternate hypotheses from abstract"""

    instruction = "Extract the null and alternate hypotheses from the given abstract."
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{abstract_text}\n\n### Response:\n"

    inputs = session_state.tokenizer(prompt, return_tensors="pt").to(session_state.model.device)

    outputs = session_state.model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        top_k=50,
        temperature=0.5
    )

    response = session_state.tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Try to extract just the response part
    if "### Response:" in response:
        response = response.split("### Response:")[1].strip()

    return response

In [12]:
def identify_limitations(paper_text):
    """Agent 2: Identify limitations in the research paper"""

    instruction = """
You are an academic research analyst.

Task: From the following paper text, identify 3 to 6 specific research gaps and express them as clear, factual statements.

Requirements:
- Each point should be concise (1‚Äì2 lines).
- Avoid phrasing as questions. Use declarative statements.
- Each gap should highlight what is missing, underexplored, or limited in current research.
- Avoid repetition and generic phrases.
- Use bullet points starting with "-".
- Focus on clarity and insightfulness.
"""

    # If text is too long, use a summarized version or the abstract
    if len(paper_text) > 4000:
        # Try to find abstract section
        lower_text = paper_text.lower()
        if "abstract" in lower_text:
            start_idx = lower_text.find("abstract")
            end_idx = lower_text.find("introduction", start_idx)
            if end_idx == -1:  # If no "introduction" found
                end_idx = start_idx + 2000  # Use 2000 chars after abstract
            paper_text = paper_text[start_idx:end_idx]
        else:
            # Just use the first 4000 characters if no abstract found
            paper_text = paper_text[:4000]

    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{paper_text}\n\n### Response:\n"

    inputs = session_state.tokenizer(prompt, return_tensors="pt").to(session_state.model.device)

    outputs = session_state.model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        top_p=0.9,
        top_k=50,
        temperature=0.5,  # Slightly higher temperature for more creative analysis
        repetition_penalty=1.25,  # key fix for repetition
        num_return_sequences=1
        )

    response = session_state.tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Try to extract just the response part
    if "### Response:" in response:
        response = response.split("### Response:")[1].strip()

    return response

In [13]:
def perform_rag_query(query, top_k=3):
    """Perform RAG query to find most relevant chunks for a query"""
    if session_state.vector_store is None:
        print("No vector store available. Please upload a PDF first.")
        return []

    # Get relevant documents
    results = session_state.vector_store.similarity_search(query, k=top_k)
    return [doc.page_content for doc in results]

## Specify Hugging Face Model Repository

In [14]:
# Enter your HuggingFace model repository
model_repo = "tiiuae/falcon-7b-instruct"
session_state.hf_model_repo = model_repo

print(f"Using model from: {model_repo}")

Using model from: tiiuae/falcon-7b-instruct


## Load Model from HuggingFace Repository

In [17]:
# Load model from Hugging Face repository
session_state.model, session_state.tokenizer = load_model_and_tokenizer(session_state.hf_model_repo)

Loading model and tokenizer from HuggingFace repo...
Successfully loaded tokenizer from tiiuae/falcon-7b-instruct
Loading model from tiiuae/falcon-7b-instruct... This may take a few minutes...


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Successfully loaded model from tiiuae/falcon-7b-instruct


## Upload and Process PDF

In [18]:
print("Please upload a research paper (PDF):")
uploaded = files.upload()

# Get the first file name from the uploaded files
if uploaded:
    file_name = list(uploaded.keys())[0]
    print(f"Processing {file_name}...")

    # Extract text from the PDF
    session_state.pdf_text = extract_text_from_pdf(file_name)

    # Create vector store
    session_state.vector_store, chunks = create_vector_store(session_state.pdf_text)

    print(f"Successfully processed PDF: {file_name}")
    print(f"Extracted {len(chunks)} text chunks")
    print("\nFirst 1000 characters of the extracted text:")
    print(session_state.pdf_text[:1000])

Please upload a research paper (PDF):


Saving 2406.11657v1.pdf to 2406.11657v1 (1).pdf
Processing 2406.11657v1 (1).pdf...
Loading embeddings model...


  session_state.embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating vector store...
Successfully processed PDF: 2406.11657v1 (1).pdf
Extracted 75 text chunks

First 1000 characters of the extracted text:
Can LLM be a Personalized Judge?
Yijiang River Dong‚àóand Tiancheng Hu‚àóand Nigel Collier
{yd358, th656, nhc30}@cam.ac.uk
University of Cambridge
Abstract
Ensuring that large language models (LLMs)
reflect diverse user values and preferences is
crucial as their user bases expand globally.
It is therefore encouraging to see the grow-
ing interest in LLM personalization within
the research community. However, current
works often rely on the LLM-as-a-Judge ap-
proach for evaluation without thoroughly ex-
amining its validity. In this paper, we investi-
gate the reliability of LLM-as-a- Personalized -
Judge‚Äîasking LLMs to judge user preferences
based on personas. Our findings suggest that di-
rectly applying LLM-as-a-Personalized-Judge
is less reliable than previously assumed, show-
ing low and inconsistent agreement with human
ground truth. Th

## Generate Hypotheses

In [19]:
if session_state.model is not None and session_state.pdf_text:
    print("Generating hypotheses...")
    # For hypotheses, we'll use the first part of the paper which likely contains the abstract
    abstract_text = session_state.pdf_text[:3000]
    session_state.hypotheses = generate_hypothesis(abstract_text)

    print("\nHypotheses:")
    print(session_state.hypotheses)
else:
    print("Please make sure the model is loaded and a PDF has been processed before generating hypotheses.")

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Generating hypotheses...

Hypotheses:
The null hypothesis is that LLM-as-a-Personalized-Judge is not a reliable and valid method for evaluating LLM personalization.
The alternative hypothesis is that LLM-as-a-Personalized-Judge is a valid and reliable method for evaluating LLM personalization.


## Identify Research Gaps

In [20]:
if session_state.model is not None and session_state.pdf_text:
    print("Identifying research gaps...")
    # For limitations, use the RAG system to find relevant sections
    relevant_chunks = perform_rag_query("limitations methodology weaknesses future work future scope unexplored", top_k=5)
    combined_text = "\n".join(relevant_chunks)
    session_state.limitations = identify_limitations(combined_text)

    print("\nResearch Gaps:")
    print(session_state.limitations)
else:
    print("Please make sure the model is loaded and a PDF has been processed before identifying research gaps.")

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Identifying research gaps...

Research Gaps:
The paper is well written and provides a clear
and concise overview of the research topic. It
provides a detailed discussion of the challenges
and limitations of current research and
methods, and presents a well-defined
framework for future research. The
paper also clearly identifies the main
challenges of the current state of
the field, and provides a clear
vision of how the research will
evolve in the future. The paper
also provides a detailed account
of the experiments and their
results, as well as a clear
discussion of the limitations and
future directions. Overall,
the paper provides a
thorough and in-depth
review of the state of the
art in the field and
provides a clear path for
further research.

The paper is well written and provides a clear
and concise overview of the research topic. It
provides a detailed discussion of the challenges
and limitations of current research and methods,
and presents a well-defined framework for future
r

In [21]:
import gradio as gr
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load Falcon model once globally (do this outside the function)
MODEL_NAME = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto")
# --- STEP 1: Initialize globals ---
session_state = {"vector_store": None}


# --- STEP 2: Process PDF ---
def process_pdf(file):
    if file is None:
        return "Please upload a PDF first."
    reader = PdfReader(file.name)
    text = ""
    for page in reader.pages:
        text += page.extract_text()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)

    embeddings = HuggingFaceEmbeddings()
    session_state["vector_store"] = FAISS.from_texts(chunks, embeddings)
    return "‚úÖ PDF uploaded and vector store created successfully!"

# --- STEP 3: Ask questions ---

def ask_question(query):
    if session_state["vector_store"] is None:
        return "‚ö†Ô∏è Please upload and process a PDF first."

    # Step 1: Retrieve relevant chunks
    results = session_state["vector_store"].similarity_search(query, k=3)
    context = "\n\n".join([doc.page_content for doc in results])

    # Step 2: Build prompt for Falcon
    prompt = f"""
You are an academic research analyst.

Task: From the following paper text, identify 3 to 6 specific research gaps and express them as clear, factual statements.

Requirements:
- Each point should be concise (1‚Äì2 lines).
- Avoid phrasing as questions. Use declarative statements.
- Each gap should highlight what is missing, underexplored, or limited in current research.
- Avoid repetition and generic phrases.
- Use bullet points starting with "-".
- Focus on clarity and insightfulness.

Context:
{context}
"""

    # Step 3: Generate response using Falcon
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Post-process to remove the repeated prompt part if Falcon echoes it
    if "Answer:" in answer:
        answer = answer.split("Answer:")[-1].strip()

    return answer
# --- STEP 4: Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# üìÑ RAG PDF Assistant")
    gr.Markdown("Upload a PDF and ask questions based on its content.")

    with gr.Row():
        pdf_input = gr.File(label="Upload your PDF")
        upload_btn = gr.Button("Process PDF")

    output_text = gr.Textbox(label="Status", interactive=False)
    upload_btn.click(process_pdf, inputs=pdf_input, outputs=output_text)

    query_box = gr.Textbox(label="Enter your question")
    ask_btn = gr.Button("Ask")
    answer_box = gr.Textbox(label="Answer", lines=10)
    ask_btn.click(ask_question, inputs=query_box, outputs=answer_box)

# --- STEP 5: Launch ---
demo.launch(share=True, inline=True, debug=True)

ModuleNotFoundError: No module named 'langchain.vectorstores'

In [None]:
!pip install nbconvert


In [None]:
!jupyter nbconvert nlp_paper.ipynb --to notebook --ClearOutputPreprocessor.enabled=True --output cleaned_notebook.ipynb
