In [14]:
import arxiv
import os
import json
import PyPDF2
import spacy
import requests
from bs4 import BeautifulSoup
import wikipediaapi
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def setup_directories():
    """Make sure every data folder used by the project exists.

    Folders that are already present are left untouched, so the function
    can be called repeatedly. A confirmation line is printed at the end.
    """
    required_paths = (
        "data/papers/arxiv",
        "data/explanations/wikipedia",
        "data/explanations/simple_wikipedia",
        "data/processed",
    )
    # makedirs also creates any missing parent folders along the way.
    for path in required_paths:
        os.makedirs(path, exist_ok=True)
    print("Directory structure created.")

In [7]:
import arxiv
import os

def download_arxiv_papers(search_query, max_results=100, output_dir="data/papers/arxiv"):
    """Search arXiv and save the PDF of every hit into ``output_dir``.

    Args:
        search_query: Query string passed to the arXiv search.
        max_results: Upper bound on the number of results requested.
        output_dir: Folder that receives the PDFs; created if missing.

    Each file is named after the paper's short id with ``/`` replaced by
    ``_``. Results are requested in relevance order. A failure on one paper
    is printed and the loop continues with the next one.
    """
    os.makedirs(output_dir, exist_ok=True)

    api_client = arxiv.Client()
    relevance_search = arxiv.Search(
        query=search_query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    for paper in api_client.results(relevance_search):
        try:
            # Short ids can contain '/', which is not usable in a file name.
            safe_id = paper.get_short_id().replace('/', '_')
            pdf_path = f"{output_dir}/{safe_id}.pdf"
            paper.download_pdf(filename=pdf_path)
            print(f"Downloaded: {paper.title}")
        except Exception as err:
            print(f"Error downloading {paper.get_short_id()}: {err}")

In [8]:
# Manual check of download_arxiv_papers: request 3 results for this query
# and save their PDFs into the "papers_test" folder.
download_arxiv_papers("Attention is all you need", 3, "papers_test")

Downloaded: Attention Is All You Need But You Don't Need All Of It For Inference of Large Language Models
Downloaded: All the attention you need: Global-local, spatial-channel attention for image retrieval
Downloaded: RITA: Group Attention is All You Need for Timeseries Analytics


In [None]:
def download_cs_papers(max_results_per_category=20):
    """Download papers for a fixed list of arXiv CS categories.

    Args:
        max_results_per_category: Upper bound on the number of results
            requested for each category.

    For every category the PDFs are stored under
    ``data/papers/arxiv/<category>/`` and one metadata record per
    successfully downloaded paper is collected. All records are written to
    ``data/papers/arxiv_cs_papers_metadata.json`` at the end. Errors for a
    single paper or for a whole category search are printed and skipped.
    """
    categories = (
        "cs.AI",    # Artificial Intelligence
        "cs.CL",    # Computation and Language (NLP)
        "cs.CV",    # Computer Vision
        "cs.LG",    # Machine Learning
        "cs.SE",    # Software Engineering
        "cs.HC",    # Human-Computer Interaction
        "cs.DB",    # Databases
        "cs.NE",    # Neural and Evolutionary Computing
        "cs.SD",    # Sound (Audio processing)
        "cs.IR",    # Information Retrieval
    )

    api_client = arxiv.Client(page_size=100, delay_seconds=3)
    metadata_records = []

    for cat_code in categories:
        print(f"Downloading papers for category: {cat_code}")
        category_dir = f"data/papers/arxiv/{cat_code}"
        os.makedirs(category_dir, exist_ok=True)

        category_search = arxiv.Search(
            query=f"cat:{cat_code}",
            max_results=max_results_per_category,
            sort_by=arxiv.SortCriterion.Relevance,
        )

        try:
            # Materialise the results first so the count can be reported.
            papers = list(api_client.results(category_search))
            print(f"Found {len(papers)} papers in {cat_code}")

            for paper in papers:
                try:
                    short_id = paper.get_short_id()
                    # Short ids can contain '/', which is not usable in a file name.
                    pdf_path = f"{category_dir}/{short_id.replace('/', '_')}.pdf"
                    paper.download_pdf(filename=pdf_path)

                    # A record is only kept once the download has succeeded.
                    record = {
                        "id": short_id,
                        "title": paper.title,
                        "authors": [person.name for person in paper.authors],
                        "abstract": paper.summary,
                        "category": cat_code,
                        "published": str(paper.published),
                        "pdf_path": pdf_path,
                    }
                    metadata_records.append(record)

                    print(f"Downloaded: {record['id']} - {record['title']}")

                except Exception as err:
                    print(f"Error downloading {paper.get_short_id()}: {err}")
        except Exception as err:
            print(f"Error searching category {cat_code}: {err}")

    with open("data/papers/arxiv_cs_papers_metadata.json", "w") as metadata_file:
        json.dump(metadata_records, metadata_file, indent=2)

    print(f"Downloaded {len(metadata_records)} papers in total")

In [None]:
def extract_text_from_pdf(pdf_path, max_pages=20):
    """Extract plain text from the leading pages of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: Maximum number of pages to read, counted from the first
            page. Defaults to 20. Pass ``None`` to read every page; a value
            of 0 or less reads no pages.

    Returns:
        The text of the processed pages concatenated in page order with no
        separator. A page whose ``extract_text()`` yields nothing
        contributes an empty string. If the file cannot be opened or
        parsed, the error is printed and "" is returned instead of raising.
    """
    try:
        with open(pdf_path, "rb") as f:
            pdf_reader = PyPDF2.PdfReader(f)
            total_pages = len(pdf_reader.pages)
            page_count = total_pages if max_pages is None else min(max_pages, total_pages)
            # Collect the per-page text and join once rather than growing a
            # string page by page. Extraction must happen while the file is open.
            page_texts = [
                pdf_reader.pages[page_num].extract_text() or ""
                for page_num in range(page_count)
            ]
        return "".join(page_texts)
    except Exception as e:
        # Best effort: a broken or missing PDF must not abort a batch run.
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

In [None]:
# Loaded spaCy pipelines keyed by model name, so repeated calls reuse one instance.
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(model_name="en_core_web_sm"):
    """Return the named spaCy pipeline, loading it only on first use."""
    if model_name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[model_name] = spacy.load(model_name)
    return _SPACY_MODEL_CACHE[model_name]


def extract_key_concepts(text, top_n=15):
    """Return the most frequent noun-phrase terms found in ``text``.

    Args:
        text: Document text. Empty/``None`` input or text shorter than 100
            characters yields an empty list. Only the first 25,000
            characters are analysed.
        top_n: Maximum number of terms to return.

    Returns:
        Up to ``top_n`` terms (1- to 3-word n-grams taken from multi-word
        noun phrases shorter than 50 characters), ordered by descending
        frequency. Terms of 5 characters or fewer and terms containing one
        of the generic words listed below are dropped. Any error during
        processing is printed and an empty list is returned.
    """
    if not text or len(text) < 100:
        return []

    try:
        # Local import: only needed to flatten the summed count matrix.
        import numpy as np

        # Loading the model is expensive, so it is cached across calls.
        nlp = _get_spacy_model()
        doc = nlp(text[:25000])

        noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks
                        if len(chunk.text.split()) > 1 and len(chunk.text) < 50]
        if not noun_phrases:
            return []

        # Each phrase is its own document. Joining them into one string would
        # create n-grams that straddle two unrelated phrases.
        vectorizer = CountVectorizer(ngram_range=(1, 3))
        term_matrix = vectorizer.fit_transform(noun_phrases)
        feature_names = vectorizer.get_feature_names_out()
        # Column sums give the total occurrences of each term over all phrases.
        counts = np.asarray(term_matrix.sum(axis=0)).ravel()

        # Generic, non-technical words. Matching is by substring, so e.g.
        # "results" and "methods" are filtered as well.
        stopwords = ["et al", "paper", "section", "figure", "table", "result", "method", "approach"]
        key_concepts = []

        # Walk the terms from most to least frequent.
        for i in counts.argsort()[::-1]:
            # Check the limit first so that top_n <= 0 returns nothing.
            if len(key_concepts) >= top_n:
                break
            term = feature_names[i]
            if len(term) > 5 and not any(stop in term for stop in stopwords):
                key_concepts.append(term)

        return key_concepts
    except Exception as e:
        print(f"Error extracting concepts: {e}")
        return []
