In [1]:
!pip install -q kaggle

!kaggle datasets download -d nechbamohammed/research-papers-dataset -p /content

!unzip /content/research-papers-dataset.zip -d /content

!ls /content


Dataset URL: https://www.kaggle.com/datasets/nechbamohammed/research-papers-dataset
License(s): unknown
Downloading research-papers-dataset.zip to /content
 99% 587M/590M [00:03<00:00, 141MB/s]
100% 590M/590M [00:03<00:00, 165MB/s]
Archive:  /content/research-papers-dataset.zip
  inflating: /content/dblp-v10.csv   
dblp-v10.csv  research-papers-dataset.zip  sample_data


In [2]:
!pip install -U langchain-community psycopg2-binary transformers keybert spacy tabulate
!pip install keybert
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting spacy
  Downloading spacy-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain-community)
  Downloading langchain_core-0.3.34-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.18 (from langchain-community)
  Downloading langchain-0.3.18-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)

In [3]:
!pip uninstall -y community
!pip install python-louvain

Found existing installation: community 1.0.0b1
Uninstalling community-1.0.0b1:
  Successfully uninstalled community-1.0.0b1


In [4]:
!pip install keybert



In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from wordcloud import WordCloud
import re
from nltk.corpus import stopwords
import nltk
from scipy.stats import pearsonr, spearmanr
from collections import Counter
import ast
from tabulate import tabulate
import networkx as nx
import os
import spacy
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.docstore.document import Document
import random
from collections import defaultdict
from sklearn.cluster import SpectralClustering, AgglomerativeClustering, KMeans, MiniBatchKMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import jaccard_score
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from transformers import BertTokenizer, BertModel
import json
from keybert import KeyBERT

In [None]:
import psycopg2
from pgvector.psycopg2 import register_vector
from transformers import pipeline
import faiss.contrib.torch_utils
import faiss

from tabulate import tabulate
import requests
import time
from transformers import AutoConfig, pipeline


In [6]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [3]:
file_path = "dblp-v10.csv"
df = pd.read_csv(file_path)

In [4]:
# Import necessary libraries
import os
import shutil
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
print("\n🔹 Loading Dataset...")

csv_path = "/content/drive/MyDrive/FDS/papers_data.csv"

try:
    df = pd.read_csv(csv_path)

    required_columns = {"title", "abstract", "authors", "citations"}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"Missing columns: {required_columns - set(df.columns)}")

    df.fillna({"title": "", "abstract": "", "authors": "", "citations": 0}, inplace=True)

    df["authors"] = df["authors"].apply(lambda x: x.split(", ") if isinstance(x, str) else [])

    print(f"✅ Loaded {len(df)} research papers successfully!")

except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    exit()


print("\n🔹 Setting up FAISS Vector Store...")

d = 384
n = len(df)
vectors = np.random.random((n, d)).astype("float32")
vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

index = faiss.IndexFlatIP(d)
index.add(torch.tensor(vectors))

print(f"✅ FAISS index built with {index.ntotal} vectors.\n")


print("🔹 Configuring LLM (Flan-T5)...")

config = AutoConfig.from_pretrained("google/flan-t5-large")
config.n_positions = 8192

llm_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0 if torch.cuda.is_available() else -1,
    config=config
)
llm_generator.model.config.n_positions = 8192

print("✅ LLM Model Ready!")


API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"
API_KEY = "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"


def query_llm_api(prompt, max_length=8192, max_retries=5, retry_delay=10):
    """
    Query the Hugging Face Inference API with retry logic.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_length": max_length,
            "min_length": 100,
            "num_beams": 2,
            "no_repeat_ngram_size": 2,
            "early_stopping": True
        }
    }

    for attempt in range(max_retries):
        response = requests.post(API_URL, headers=headers, json=payload)
        if response.status_code == 200:
            return response.json()[0]['generated_text']
        elif response.status_code == 503:
            estimated_time = response.json().get("estimated_time", retry_delay)
            print(f"⏳ Model is loading. Retrying in {estimated_time} seconds...")
            time.sleep(estimated_time)
        else:
            print(f"❌ API Error {response.status_code}: {response.text}")
            return None

    print("❌ Max retries reached. No response received.")
    return None

sample_prompt = "Summarize the importance of diffusion models in AI."
generated_text = query_llm_api(sample_prompt)

if generated_text:
    print("\n🔹 **LLM Generated Response:**\n")
    print(f"📜 {generated_text}")
else:
    print("\n❌ LLM Failed to generate a response.")



🔹 Loading Dataset...
✅ Loaded 30164 research papers successfully!

🔹 Setting up FAISS Vector Store...
✅ FAISS index built with 30164 vectors.

🔹 Configuring LLM (Flan-T5)...


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu


✅ LLM Model Ready!

🔹 **LLM Generated Response:**

📜 Understand the importance of diffusion models in AI. Understand how diffusion is used to model the dynamics of a system. Learn about the role of the diffusion model in the development of artificial intelligence. Identify the main components of an artificial intelligent system and their relationship to each other. Analyze the relationship between the model components and the interaction between them. Apply the knowledge of these components to the design of AI systems. Use the information from the models to develop new AI algorithms and systems that are more efficient and effective than existing algorithms.


In [6]:
query_llm_api("I have Data science project")

'I have a data science project. I need to know how to calculate the mean and standard deviation of the data. Can you help me with this? Thanks! :)) ) I will send you the link to the project after I get back from the meeting. Thanks. ;)( ) ..()..(.)>.>> -/.- /-/--(-)/>'

In [None]:
print("\n🔹 Loading Dataset...")

csv_path = "/content/drive/MyDrive/FDS/papers_data.csv"

try:
    df = pd.read_csv(csv_path)

    required_columns = {"id", "title", "abstract", "authors"}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"Missing columns: {required_columns - set(df.columns)}")

    df.fillna({"title": "", "abstract": "", "authors": ""}, inplace=True)

    df["authors"] = df["authors"].apply(lambda x: x.split(", ") if isinstance(x, str) else [])

    df["text"] = df["title"] + " " + df["abstract"]

    print(f"✅ Loaded {len(df)} papers successfully!")

except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    exit()


print("\n🔹 Initializing FAISS Index...")

device = "cuda" if torch.cuda.is_available() else "cpu"
embedder = SentenceTransformer("all-mpnet-base-v2", device=device)

paper_texts = df["text"].tolist()

print("🔄 Computing embeddings (this may take some time)...")
paper_embeddings = embedder.encode(paper_texts, convert_to_numpy=True)
paper_embeddings = paper_embeddings.astype("float32")

paper_embeddings /= np.linalg.norm(paper_embeddings, axis=1, keepdims=True)
paper_embeddings = np.ascontiguousarray(paper_embeddings)

d = paper_embeddings.shape[1]
nlist = 100
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)

index.train(paper_embeddings)
index.add(paper_embeddings)

print(f"✅ FAISS Index built with {index.ntotal} papers.")

print("\n🔹 Loading NLP Models...")

kw_model = KeyBERT(model="all-mpnet-base-v2")
nlp = spacy.load("en_core_web_sm")


print("\n🔹 Configuring LLM Model...")

llm_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0 if torch.cuda.is_available() else -1,
)
llm_generator.model.config.n_positions = 8192

print("✅ LLM Ready!")


def retrieve_relevant_papers(query, top_k=3):
    """
    Extracts keywords, detects author names, and retrieves relevant papers from FAISS.
    """
    print("\n🔹 Processing Query...")

    keywords = kw_model.extract_keywords(query, keyphrase_ngram_range=(1, 3), stop_words="english", top_n=10)
    keyword_text = " ".join([kw for kw, _ in keywords])
    print(f"📌 Extracted Keywords: {keyword_text}")

    doc = nlp(query)
    detected_authors = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    print(f"📌 Detected Authors: {', '.join(detected_authors) if detected_authors else 'None'}")

    query_embedding = embedder.encode([query + " " + keyword_text], convert_to_numpy=True)
    query_embedding /= np.linalg.norm(query_embedding, axis=1, keepdims=True)

    D, I = index.search(query_embedding, top_k)
    retrieved_indices = I[0].tolist()
    retrieved_papers = df.iloc[retrieved_indices].copy()

    if detected_authors:
        author_mask = df["authors"].apply(
            lambda authors: any(any(auth.lower() in author.lower() for author in authors) for auth in detected_authors)
        )
        author_papers = df[author_mask]
        retrieved_papers = pd.concat([author_papers, retrieved_papers]).drop_duplicates(subset="id").head(top_k)

    return retrieved_papers


def generate_report(retrieved_papers):
    """
    Generates a concise summary report based on retrieved research papers.
    """
    if retrieved_papers.empty:
        return "❌ No relevant papers found."

    print("\n🔹 Generating Report...")

    report_input = "📚 Found Research Papers:\n"
    for _, row in retrieved_papers.iterrows():
        report_input += f"\n📝 **Title:** {row['title']}\n📖 **Abstract:** {row['abstract']}\n👥 **Authors:** {', '.join(row['authors'])}\n"

    prompt = (
        "You are a research assistant. Summarize the following research papers, highlighting:\n"
        "1️⃣ Main research topics\n"
        "2️⃣ Key findings\n"
        "3️⃣ Trends, methodologies, or notable authors\n"
        "4️⃣ Potential applications or future directions\n\n"
        f"{report_input}\n\n📌 **Summary Report:**"
    )

    output = llm_generator(prompt, max_length=8192, do_sample=False)
    summary_report = output[0]["generated_text"].strip()

    return summary_report


def research_assistant(query, top_k=3):
    """
    Full pipeline: retrieve relevant papers and generate a summary report.
    """
    print(f"\n🔹 **User Query:** {query}")

    retrieved_papers = retrieve_relevant_papers(query, top_k=top_k)

    print("\n📌 **Retrieved Papers:**")
    print(tabulate(retrieved_papers[["id", "title", "authors"]], headers="keys", tablefmt="grid"))

    report = generate_report(retrieved_papers)

    print("\n🔹 **Research Assistant Report:**\n")
    print(report)


if __name__ == "__main__":
    user_query = input("\n💡 Enter your research query: ")
    research_assistant(user_query, top_k=3)



🔹 Loading Dataset...
✅ Loaded 30164 papers successfully!

🔹 Initializing FAISS Index...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔄 Computing embeddings (this may take some time)...


In [None]:
df_sampled['authors'] = df_sampled['authors'].apply(lambda x: eval(x) if isinstance(x, str) else x)
df_sampled["text"] = df_sampled["title"].fillna("") + " " + df_sampled["abstract"].fillna("")


print("Initializing embedder and computing paper embeddings...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedder = SentenceTransformer('all-mpnet-base-v2', device=device)

paper_texts = df_sampled["text"].tolist()


d = paper_embeddings.shape[1]
nlist = 100
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
paper_embeddings = torch.tensor(paper_embeddings)
index.train(paper_embeddings)
index.add(paper_embeddings)
print(f"FAISS index built with {index.ntotal} papers.")


print("Loading KeyBERT and spaCy models...")
kw_model = KeyBERT(model='all-mpnet-base-v2')
nlp = spacy.load("en_core_web_sm")


llm_generator = pipeline("text2text-generation", model="google/flan-t5-large",
                         device=0 if torch.cuda.is_available() else -1, config=config)
llm_generator.model.config.n_positions = 8192

def retrieve_relevant_papers(query, top_k=3):
    """
    Given a user query, extract keywords, detect author names, and use the LLM to clarify author names if necessary.
    Then retrieve relevant papers from the FAISS index.
    """
    keywords = kw_model.extract_keywords(query, keyphrase_ngram_range=(1, 3), stop_words='english', top_n=10, use_maxsum=True)
    keyword_text = " ".join([kw for kw, score in keywords])
    print("Extracted Keywords:", keyword_text)

    doc = nlp(query)
    detected_authors = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    if detected_authors:
        print("Detected author names:", detected_authors)
        prompt = (
            "You are a research assistant. The user has mentioned the following author names in their query: "
            f"{', '.join(detected_authors)}. Please clarify or infer the full names of these authors based on the context. "
            "If the names are already clear, just repeat them. Here is the query for context:\n\n" + query
        )
        output = llm_generator(prompt, max_length=8192, do_sample=False)
        clarified_authors = output[0]['generated_text'].strip().split(", ")
        print("Clarified author names:", clarified_authors)
        detected_authors = clarified_authors

    query_embedding = embedder.encode([query + " " + keyword_text], convert_to_numpy=True)
    query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)


    D, I = index.search(torch.tensor(query_embedding), top_k)
    retrieved_indices = I[0].tolist()
    retrieved_papers = df.iloc[retrieved_indices].copy()

    if detected_authors:
        author_mask = df["authors"].apply(
            lambda authors: any(
                any(detected.lower() in author.lower() for author in authors)
                for detected in detected_authors
            )
        )
        author_papers = df[author_mask]
        combined = pd.concat([author_papers, retrieved_papers]).drop_duplicates(subset="id")
        retrieved_papers = combined.head(top_k)

    return retrieved_papers


def generate_report(retrieved_papers):
    """
    Given retrieved papers, generate a concise summary report using a free LLM.
    The prompt instructs the model to highlight the main research topics,
    key contributions, and notable authors.
    """
    if retrieved_papers.empty:
        return "No relevant papers found."

    report_input = "I have found the following research papers:\n"
    for idx, row in retrieved_papers.iterrows():
        report_input += (
            f"\nTitle: {row['title']}\n"
            f"Abstract: {row['abstract']}\n"
            f"Authors: {', '.join(row['authors'])}\n"
        )

    prompt = (
        "You are a research assistant. Based on the following research papers, generate a concise and informative summary report. "
        "The report should include:\n"
        "1. A brief overview of the main research topics.\n"
        "2. Key contributions or findings from the papers.\n"
        "3. Notable trends, methodologies, or authors.\n"
        "4. Potential applications or future directions.\n\n"
        "Here are the details of the retrieved papers:\n\n" + report_input + "\n\nSummary Report:"
    )
    print("\nPrompt to LLM:\n", prompt)

    output = query_llm_api(prompt)
    summary_report = output
    summary_report = summary_report.replace("Here is the summary report:", "").strip()
    summary_report = summary_report.replace("Summary Report:", "").strip()

    return summary_report


def research_assistant(query, top_k=1):
    """
    Full pipeline: retrieve relevant papers and generate a summary report.
    """
    print("\nUser Query:", query)
    retrieved_papers = retrieve_relevant_papers(query, top_k=top_k)
    print("\nRetrieved Papers:")
    print(retrieved_papers[["id", "title", "authors"]])

    report = generate_report(retrieved_papers)
    return report

if __name__ == '__main__':
    user_query = input("Enter your research query (e.g., 'I need papers on deep learning and NLP by Alice'): ")
    final_report = research_assistant(user_query, top_k=1)
    print("\n=== Research Assistant Report ===")
    print(final_report)

Initializing embedder and computing paper embeddings...
FAISS index built with 30000 papers.
Loading KeyBERT and spaCy models...


Device set to use cuda:0


Enter your research query (e.g., 'I need papers on deep learning and NLP by Alice'):  Computer vision paper



User Query: Computer vision paper
Extracted Keywords: 

Retrieved Papers:
                                        id  \
2794  4b9fbd07-b073-4f63-9515-b9a08faef029   

                                                  title  \
2794  Structure synthesis and singularity analysis o...   

                                  authors  
2794  [Yan Jin, I-Ming Chen, Guilin Yang]  

Prompt to LLM:
 You are a research assistant. Based on the following research papers, generate a concise and informative summary report. The report should include:
1. A brief overview of the main research topics.
2. Key contributions or findings from the papers.
3. Notable trends, methodologies, or authors.
4. Potential applications or future directions.

Here are the details of the retrieved papers:

I have found the following research papers:

Title: Structure synthesis and singularity analysis of a parallel manipulator based on selective actuation
Abstract: A parallel manipulator (PM) based on 3-limb design termed

In [None]:
def get_db_connection():
    conn = psycopg2.connect(
        dbname="research_db",
        user="postgres",
        password="postgres",
        host="localhost",
        port="5432"
    )
    register_vector(conn)
    return conn

def initialize_database():
    conn = psycopg2.connect(
        dbname="postgres",
        user="postgres",
        password="postgres",
        host="localhost",
        port="5432"
    )
    conn.autocommit = True
    cur = conn.cursor()

    cur.execute("SELECT 1 FROM pg_database WHERE datname = 'research_db';")
    if not cur.fetchone():
        print("Creating database 'research_db'...")
        cur.execute("CREATE DATABASE research_db;")

    cur.close()
    conn.close()

    conn = get_db_connection()
    cur = conn.cursor()
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    cur.execute("""
        CREATE TABLE IF NOT EXISTS papers (
            id TEXT PRIMARY KEY,
            title TEXT,
            abstract TEXT,
            authors TEXT[],
            text TEXT,
            embedding vector(768)
        );
    """)
    conn.commit()
    cur.close()
    conn.close()
    print("Database initialized successfully.")

print("Initializing embedder and computing paper embeddings...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedder = SentenceTransformer('all-mpnet-base-v2', device=device)

df_sampled["text"] = df_sampled["title"].fillna("") + " " + df["abstract"].fillna("")
paper_texts = df_sampled["text"].tolist()

def clean_null_characters(text):
    if isinstance(text, str):
        return text.replace('\x00', '')
    return text


def clean_text(text):
    if isinstance(text, str):
        
        text = text.replace('\x00', '')
        text = text.replace('\x1a', '')
        text = text.strip()
    return text

df_sampled["text"] = df_sampled["title"].fillna("") + " " + df_sampled["abstract"].fillna("")
paper_texts = df_sampled["text"].tolist()

df_sampled = df_sampled.applymap(clean_text)
df_sampled = df_sampled.applymap(clean_null_characters)
df_sampled = df_sampled.reset_index(drop=True)

print(f"Inserted {len(df_sampled)} papers into database.")
print(f"Inserted {len(df)} papers into database.")

print("Loading NLP models...")
kw_model = KeyBERT(model='all-mpnet-base-v2')
nlp = spacy.load("en_core_web_sm")
llm_generator = pipeline("text2text-generation", model="google/flan-t5-large",
                         device=0 if torch.cuda.is_available() else -1,max_length=8192)

def retrieve_relevant_papers(query, top_k=3):
    """Enhanced retrieval with author disambiguation and pgvector search"""
    keywords = kw_model.extract_keywords(query, keyphrase_ngram_range=(1, 3), top_n=10)
    keyword_text = " ".join([kw[0] for kw in keywords])

    doc = nlp(query)
    detected_authors = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    author_boost = []

    if detected_authors:
        print(f"Detected potential authors: {detected_authors}")
        conn = get_db_connection()
        cur = conn.cursor()

        possible_authors = set()
        for author in detected_authors:
            cur.execute("""
                SELECT authors FROM papers
                WHERE EXISTS (
                    SELECT 1 FROM unnest(authors) AS a
                    WHERE a ILIKE %s
                )
            """, (f"%{author}%",))
            possible_authors.update([a for res in cur.fetchall() for a in res[0]])

        if possible_authors:
            prompt = f"""Given the research query: "{query}", which of these authors are most relevant?
            Authors: {', '.join(possible_authors)}. Respond with 1-3 most relevant names, comma-separated."""

            clarified = llm_generator(prompt, max_new_tokens=50)[0]['generated_text']
            clarified_authors = [a.strip() for a in clarified.split(",") if a.strip() in possible_authors]
            print(f"LLM-clarified authors: {clarified_authors}")

            cur.execute("""
                SELECT id FROM papers
                WHERE authors && %s
                ORDER BY RANDOM()
                LIMIT %s
            """, (clarified_authors, top_k*2))
            author_boost = [res[0] for res in cur.fetchall()]

        conn.close()

    query_embedding = embedder.encode([query + " " + keyword_text])[0]
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    conn = get_db_connection()
    cur = conn.cursor()
    cur.execute("""
        SELECT id, title, authors, abstract,
               embedding <-> %s::vector AS distance
        FROM papers
        ORDER BY distance
        LIMIT %s
    """, (query_embedding.tolist(), top_k*3))

    results = pd.DataFrame(cur.fetchall(),
                          columns=['id', 'title', 'authors', 'abstract', 'distance'])
    conn.close()

    if author_boost:
        boosted = results[results['id'].isin(author_boost)]
        remaining = results[~results['id'].isin(author_boost)]
        results = pd.concat([boosted, remaining]).drop_duplicates('id')

    return results.head(top_k)


def generate_report(retrieved_papers):
    """
    Given retrieved papers, generate a concise summary report using a free LLM.
    The prompt instructs the model to highlight the main research topics,
    key contributions, and notable authors.
    """
    if retrieved_papers.empty:
        return "No relevant papers found."

    report_input = "I have found the following research papers:\n"
    for idx, row in retrieved_papers.iterrows():
        report_input += (
            f"\nTitle: {row['title']}\n"
            f"Abstract: {row['abstract']}\n"
            f"Authors: {', '.join(row['authors'])}\n"
        )

    prompt = (
        "You are a research assistant. Based on the following research papers, generate a concise and informative summary report. "
        "The report should include:\n"
        "1. A brief overview of the main research topics.\n"
        "2. Key contributions or findings from the papers.\n"
        "3. Notable trends, methodologies, or authors.\n"
        "4. Potential applications or future directions.\n\n"
        "Here are the details of the retrieved papers:\n\n" + report_input + "\n\nSummary Report:"
    )
    print("\nPrompt to LLM:\n", prompt)

    output = query_llm_api(prompt)
    summary_report = output
    summary_report = summary_report.replace("Here is the summary report:", "").strip()
    summary_report = summary_report.replace("Summary Report:", "").strip()

    return summary_report


def research_assistant(query, top_k=5):
    print(f"\nProcessing query: {query}")
    papers = retrieve_relevant_papers(query, top_k)
    print("\nTop matches:")
    print(papers[['title', 'authors']])
    return generate_report(papers)

if __name__ == '__main__':
    initialize_database()
    query = input("Enter your research query: ")
    print("\n=== Research Report ===")
    print(research_assistant(query))

Initializing embedder and computing paper embeddings...
Inserted 30000 papers into database.
Inserted 999998 papers into database.
Loading NLP models...


Device set to use cuda:0


Database initialized successfully.


Enter your research query:  Robotics



=== Research Report ===

Processing query: Robotics

Top matches:
                                               title  \
0                      Advanced software in robotics   
1  Education and training in Robotics edited by P...   
2         The Vision System of the ACROBOTER Project   
3   Sensorimotor models of space and object geometry   
4  Roboethics: Social and Ethical Implications of...   

                                             authors  
0                               [Barry Irvin Soroka]  
1                                     [K. A. Pocock]  
2  [Rigas Kouskouridas, Nikolaos Kyriakoulis, Dim...  
3                                    [Jeremy Stober]  
4              [Gianmarco Veruggio, Fiorella Operto]  

Prompt to LLM:
 You are a research assistant. Based on the following research papers, generate a concise and informative summary report. The report should include:
1. A brief overview of the main research topics.
2. Key contributions or findings from the papers.
3.