In [5]:

!pip install lxml



In [1]:
# 01_paper_ingestion.ipynb

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import json

# Function to fetch papers from arXiv
def fetch_arxiv_papers(query, max_results=5):
    """
    Fetch papers from the arXiv export API that match a search query.

    Args:
        query (str): Search keyword(s); searched across all fields
            (sent as ``all:<query>``).
        max_results (int): Maximum number of papers to fetch.

    Returns:
        list[dict]: One dict per ``<entry>`` of the returned feed with the
        keys "title", "abstract" and "link". A field whose element is
        missing from an entry is returned as an empty string.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the
            request times out.
    """
    # Let requests build the query string so characters such as "&", "#"
    # or "+" inside `query` are escaped instead of corrupting the URL.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={
            "search_query": f"all:{query}",
            "start": 0,
            "max_results": max_results,
        },
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    # Fail loudly on an error response instead of parsing an error page
    # and silently returning an empty list.
    response.raise_for_status()

    # Parse XML response using lxml
    soup = BeautifulSoup(response.text, "lxml-xml")

    def _text(element):
        # Tolerate entries that lack one of the expected child elements.
        return element.text.strip() if element is not None else ""

    papers = []
    for entry in soup.find_all("entry"):
        papers.append({
            "title": _text(entry.find("title")),
            "abstract": _text(entry.find("summary")),
            "link": _text(entry.find("id")),
        })

    return papers

# Fetch up to three papers for the query; the resulting list is kept in the
# notebook-level name `papers`, which the following cells read and extend.
papers = fetch_arxiv_papers("AI hypothesis generation", max_results=3)

# Print results in JSON format (2-space indent) for a quick visual check
print(json.dumps(papers, indent=2))


[
  {
    "title": "Towards the New XAI: A Hypothesis-Driven Approach to Decision Support\n  Using Evidence",
    "abstract": "Prior research on AI-assisted human decision-making has explored several\ndifferent explainable AI (XAI) approaches. A recent paper has proposed a\nparadigm shift calling for hypothesis-driven XAI through a conceptual framework\ncalled evaluative AI that gives people evidence that supports or refutes\nhypotheses without necessarily giving a decision-aid recommendation. In this\npaper, we describe and evaluate an approach for hypothesis-driven XAI based on\nthe Weight of Evidence (WoE) framework, which generates both positive and\nnegative evidence for a given hypothesis. Through human behavioural\nexperiments, we show that our hypothesis-driven approach increases decision\naccuracy and reduces reliance compared to a recommendation-driven approach and\nan AI-explanation-only baseline, but with a small increase in under-reliance\ncompared to the recommendation-dr

In [2]:
import os
import json

# Create folder if it doesn't exist (exist_ok=True makes re-running the cell safe)
os.makedirs("data", exist_ok=True)

# Save papers as UTF-8 JSON; ensure_ascii=False writes non-ASCII characters
# as-is instead of \uXXXX escapes
with open("data/arxiv_papers.json", "w", encoding="utf-8") as f:
    json.dump(papers, f, indent=2, ensure_ascii=False)

print("Saved papers to data/arxiv_papers.json")


Saved papers to data/arxiv_papers.json


In [3]:
# Add a "clean_abstract" field to every paper: the abstract with each run of
# whitespace (including embedded line breaks) collapsed to a single space.
# str.split() with no argument also drops leading/trailing whitespace.
for paper in papers:
    paper["clean_abstract"] = " ".join(paper["abstract"].split())

print("Preprocessed abstracts for NLP tasks.")


Preprocessed abstracts for NLP tasks.


In [5]:
import nltk
# Fetch the 'punkt' tokenizer data into the local nltk_data directory
# (see the download log printed below this cell).
nltk.download('punkt')
from nltk.tokenize import sent_tokenize



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [7]:
import os, json, re

# Ensure folder exists (no error if data/raw is already there)
os.makedirs("data/raw", exist_ok=True)

# Load papers either from memory or from saved file:
# if an earlier cell already defined `papers` in this kernel it is used as-is;
# otherwise (e.g. after a kernel restart) fall back to the JSON written earlier.
try:
    papers  # bare name lookup: raises NameError when the variable is undefined
except NameError:
    with open("data/arxiv_papers.json", "r", encoding="utf-8") as f:
        papers = json.load(f)

# Write each paper to its own JSON in data/raw/
def safe_name(text, maxlen=60):
    """
    Turn arbitrary text into a string usable as a file-name stem.

    Every run of characters that is neither a word character nor "-" is
    replaced by a single "_"; the result is cut to ``maxlen`` characters and
    leading/trailing underscores are removed afterwards.

    Args:
        text (str): Text to sanitise (e.g. a paper title).
        maxlen (int): Maximum length applied before underscores are stripped.

    Returns:
        str: The sanitised name, or "paper" if nothing is left.
    """
    underscored = re.sub(r"[^\w\-]+", "_", text)
    trimmed = underscored[:maxlen].strip("_")
    if not trimmed:
        return "paper"
    return trimmed

# One file per paper: data/raw/<NN>_<sanitised title>.json, numbered from 01,
# holding only the title and the abstract.
for idx, paper in enumerate(papers, start=1):
    record = {key: paper.get(key, "") for key in ("title", "abstract")}
    file_name = f"{idx:02d}_" + safe_name(record["title"]) + ".json"
    out_path = os.path.join("data", "raw", file_name)
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(record, out_file, ensure_ascii=False, indent=2)

# Cell result: number of entries currently present in data/raw
len(os.listdir("data/raw"))


3

In [10]:
import os
import json
import re
import nltk
from nltk.tokenize import sent_tokenize

# Ensure NLTK resources are downloaded; both tokenizer data packages are
# requested here before sent_tokenize is called further down. The log under
# this cell shows nltk reporting packages that are already up to date.
nltk.download("punkt")
nltk.download("punkt_tab")

# Define paths
RAW_DIR = "data/raw"  # directory of per-paper JSON files read by preprocess_abstracts
PROCESSED_FILE = "data/processed/abstracts.json"  # output file for the sentence-split abstracts

# Preprocessing function
def clean_text(text):
    """
    Normalise whitespace in ``text``.

    Leading and trailing whitespace is removed, then every remaining run of
    whitespace characters is replaced by one space.

    Args:
        text (str): Text to clean.

    Returns:
        str: The whitespace-normalised text.
    """
    return re.sub(r"\s+", " ", text.strip())

def preprocess_abstracts(raw_dir):
    """
    Split the abstract of every paper JSON file in ``raw_dir`` into sentences.

    Args:
        raw_dir (str): Directory to scan. Only entries whose name ends in
            ".json" are read; each is loaded as JSON and its "abstract"
            value is used.

    Returns:
        dict: Maps file name -> list of sentence strings produced by
        ``sent_tokenize`` on the whitespace-normalised abstract. Files with
        a missing, empty or whitespace-only abstract are left out. Keys are
        inserted in sorted file-name order.
    """
    processed = {}

    # os.listdir() returns names in arbitrary order; sorting keeps the key
    # order of the result (and of the JSON saved from it) reproducible.
    for fname in sorted(os.listdir(raw_dir)):
        if not fname.endswith(".json"):
            continue

        path = os.path.join(raw_dir, fname)
        with open(path, "r", encoding="utf-8") as f:
            paper = json.load(f)

        # Clean before the emptiness check so a whitespace-only abstract is
        # skipped instead of being stored as an empty sentence list.
        # `or ""` keeps a JSON null abstract from reaching clean_text().
        abstract = clean_text(paper.get("abstract") or "")
        if not abstract:
            continue

        processed[fname] = sent_tokenize(abstract)

    return processed

# Run preprocessing
processed_abstracts = preprocess_abstracts(RAW_DIR)

# Ensure output directory exists. It is derived from PROCESSED_FILE so the
# directory created and the file written cannot drift apart when the
# constant is changed; the guard covers a bare file name with no directory.
processed_dir = os.path.dirname(PROCESSED_FILE)
if processed_dir:
    os.makedirs(processed_dir, exist_ok=True)

# Save results. ensure_ascii=False matches the other JSON files written by
# this notebook and keeps non-ASCII characters readable in the UTF-8 output.
with open(PROCESSED_FILE, "w", encoding="utf-8") as f:
    json.dump(processed_abstracts, f, indent=2, ensure_ascii=False)

print(f"✅ Preprocessing complete! Saved to {PROCESSED_FILE}")


✅ Preprocessing complete! Saved to data/processed/abstracts.json


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
