In [2]:
# Smoke test for the environment: the import below raises ImportError when
# feedparser is not installed, so the message is only printed on success.
import feedparser
print("✅ feedparser is installed!")


✅ feedparser is installed!


In [3]:
import os
import feedparser
import json

# Define raw data directory
# The path is relative, so it is resolved against the kernel's current working directory.
RAW_DIR = "data/raw"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(RAW_DIR, exist_ok=True)

def fetch_arxiv(category="cs.AI", max_results=5):
    """
    Fetch papers from the arXiv API and save each one as JSON in RAW_DIR.

    Every feed entry is written to ``paper_<n>.json`` (n starts at 1), so an
    existing file with the same index is overwritten.

    category: arXiv category (e.g., cs.AI, cs.CL, cs.LG, stat.ML)
    max_results: number of papers to request

    Returns None; a one-line summary is printed once all files are written.
    """
    # Local import: the import block of this cell does not provide urllib.parse.
    from urllib.parse import quote

    def _collapse(text):
        # The feed text can contain line breaks followed by indentation;
        # fold every whitespace run into a single space.
        return " ".join(text.split())

    # Percent-encode the category so spaces or reserved characters cannot
    # corrupt the query string. Plain values such as "cs.AI" are unchanged.
    encoded_category = quote(str(category))
    url = (
        "http://export.arxiv.org/api/query"
        f"?search_query=cat:{encoded_category}&start=0&max_results={max_results}"
    )
    feed = feedparser.parse(url)

    for number, entry in enumerate(feed.entries, start=1):
        paper = {
            "id": entry.id,
            "title": _collapse(entry.title),
            "abstract": _collapse(entry.summary),
            # An entry without an author list yields [] instead of raising.
            "authors": [author.name for author in getattr(entry, "authors", [])],
            "published": entry.published,
        }
        target = os.path.join(RAW_DIR, f"paper_{number}.json")
        with open(target, "w", encoding="utf-8") as f:
            json.dump(paper, f, indent=2)

    print(f"✅ Downloaded {len(feed.entries)} papers into {RAW_DIR}")

# Example: fetch 5 papers from the cs.AI category.
# NOTE(review): the request URL built by fetch_arxiv sets no sort parameter, so
# the results are not guaranteed to be the most recent — confirm the API's default ordering.
fetch_arxiv("cs.AI", max_results=5)


✅ Downloaded 5 papers into data/raw


In [5]:
import os
import json
import feedparser
import urllib.parse  # <-- for URL encoding

# Define raw data directory
# Same value as the RAW_DIR defined in the earlier download cell; repeated
# here so this cell does not depend on that one having been run.
RAW_DIR = "data/raw"
# exist_ok=True: no error if the directory is already present.
os.makedirs(RAW_DIR, exist_ok=True)

# Function to fetch papers from arXiv
def fetch_arxiv_papers(query="machine learning", max_results=5):
    """
    Fetch paper metadata from the arXiv API.

    query: search term, sent with the ``all:`` field prefix
           (default "machine learning")
    max_results: number of papers to request

    Returns a list with one dict per feed entry, holding the keys
    "id", "title", "abstract", "authors" (list of names) and "published".
    """
    def _collapse(text):
        # The feed text can contain line breaks followed by indentation;
        # replacing only "\n" would leave runs of spaces, so fold every
        # whitespace run into a single space instead.
        return " ".join(text.split())

    encoded_query = urllib.parse.quote(query)  # <-- encode query safely
    url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results={max_results}"
    feed = feedparser.parse(url)

    return [
        {
            "id": entry.id,
            "title": _collapse(entry.title),
            "abstract": _collapse(entry.summary),
            # An entry without an author list yields [] instead of raising.
            "authors": [author.name for author in getattr(entry, "authors", [])],
            "published": entry.published,
        }
        for entry in feed.entries
    ]

# Fetch papers (now works with spaces in query)
papers = fetch_arxiv_papers(query="artificial intelligence", max_results=10)

# Save each paper as a JSON file inside data/raw
# Files are named by 1-based position (paper_1.json, paper_2.json, ...), the
# same scheme fetch_arxiv uses, so running this cell overwrites any files that
# function wrote at the same indices in RAW_DIR.
for i, paper in enumerate(papers):
    fname = os.path.join(RAW_DIR, f"paper_{i+1}.json")
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(paper, f, indent=2)

print(f"✅ Download complete! Saved {len(papers)} papers to {RAW_DIR}")


✅ Download complete! Saved 10 papers to data/raw


In [None]:
import os
import json

# NOTE(review): this cell appears unexecuted and its code is identical to the
# cell that follows it; consider deleting one of the two.
RAW_DIR = "data/raw"
PROCESSED_DIR = "data/processed"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(PROCESSED_DIR, exist_ok=True)

all_papers = []

# Load all JSON files from data/raw.
# os.listdir returns names in arbitrary order, so the listing is sorted to make
# the order of the consolidated file reproducible. The sort is lexicographic,
# i.e. paper_10.json comes before paper_2.json.
for fname in sorted(os.listdir(RAW_DIR)):
    if fname.endswith(".json"):
        with open(os.path.join(RAW_DIR, fname), "r", encoding="utf-8") as f:
            paper = json.load(f)
        # Only these four fields are carried over; the raw "id" is not.
        all_papers.append({
            "title": paper["title"],
            "abstract": paper["abstract"],
            "authors": paper["authors"],
            "published": paper["published"]
        })

# Save consolidated file
out_path = os.path.join(PROCESSED_DIR, "abstracts.json")
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(all_papers, f, indent=2)

print(f"✅ Processed {len(all_papers)} papers into {out_path}")


In [6]:
import os
import json

# NOTE(review): the preceding cell contains the same code and appears
# unexecuted; consider deleting one of the two.
RAW_DIR = "data/raw"
PROCESSED_DIR = "data/processed"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(PROCESSED_DIR, exist_ok=True)

all_papers = []

# Load all JSON files from data/raw.
# os.listdir returns names in arbitrary order, so the listing is sorted to make
# the order of the consolidated file reproducible. The sort is lexicographic,
# i.e. paper_10.json comes before paper_2.json.
for fname in sorted(os.listdir(RAW_DIR)):
    if fname.endswith(".json"):
        with open(os.path.join(RAW_DIR, fname), "r", encoding="utf-8") as f:
            paper = json.load(f)
        # Only these four fields are carried over; the raw "id" is not.
        all_papers.append({
            "title": paper["title"],
            "abstract": paper["abstract"],
            "authors": paper["authors"],
            "published": paper["published"]
        })

# Save consolidated file
out_path = os.path.join(PROCESSED_DIR, "abstracts.json")
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(all_papers, f, indent=2)

print(f"✅ Processed {len(all_papers)} papers into {out_path}")


✅ Processed 10 papers into data/processed\abstracts.json


In [7]:
import json

# Read back the consolidated file written by the processing cell.
with open("data/processed/abstracts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Total papers: {len(data)}\n")

# Guard the preview: with an empty list, data[0] would raise IndexError
# right after the count above has already reported zero papers.
if data:
    sample = data[0]
    print("Sample paper:\n")
    print("Title:", sample["title"])
    # Only the first 500 characters of the abstract are shown.
    print("Abstract:", sample["abstract"][:500], "...")
    print("Authors:", ", ".join(sample["authors"]))
    print("Published:", sample["published"])
else:
    print("No papers available to preview.")


Total papers: 10

Sample paper:

Title: The Governance of Physical Artificial Intelligence
Abstract: Physical artificial intelligence can prove to be one of the most important challenges of the artificial intelligence. The governance of physical artificial intelligence would define its responsible intelligent application in the society. ...
Authors: Yingbo Li, Anamaria-Beatrice Spulber, Yucong Duan
Published: 2023-04-06T08:26:38Z
