In [1]:
import os

def tree(dir_path: str, prefix: str = ""):
    entries = sorted(os.listdir(dir_path))
    entries = [e for e in entries if not e.startswith('.')]  # Skip hidden files
    entries_count = len(entries)

    for index, entry in enumerate(entries):
        path = os.path.join(dir_path, entry)
        connector = "└── " if index == entries_count - 1 else "├── "
        print(prefix + connector + entry)
        if os.path.isdir(path):
            extension = "    " if index == entries_count - 1 else "│   "
            tree(path, prefix + extension)

# Example usage
cwd = os.getcwd()
print(os.path.basename(cwd))
tree(cwd)

research_finder
├── __pycache__
│   └── config.cpython-311.pyc
├── cache
│   └── 817de8751c24a3553f9a3f250d447a20.json
├── config.py
├── main.py
├── requirements.txt
├── research_finder
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-311.pyc
│   │   ├── aggregator.cpython-311.pyc
│   │   ├── cache.cpython-311.pyc
│   │   ├── exporter.cpython-311.pyc
│   │   └── utils.cpython-311.pyc
│   ├── aggregator.py
│   ├── cache.py
│   ├── exporter.py
│   ├── searchers
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-311.pyc
│   │   │   ├── arxiv.cpython-311.pyc
│   │   │   ├── base_searcher.cpython-311.pyc
│   │   │   ├── crossref.cpython-311.pyc
│   │   │   ├── google_scholar.cpython-311.pyc
│   │   │   ├── openalex.cpython-311.pyc
│   │   │   ├── pubmed.cpython-311.pyc
│   │   │   └── semantic_scholar.cpython-311.pyc
│   │   ├── arxiv.py
│   │   ├── base_searcher.py
│   │   ├── crossref.py
│   │   ├── google_scholar.py
│   │   ├── openalex.p

research_finder/
├── requirements.txt
├── main.py
└── research_finder/
    ├── __init__.py
    ├── aggregator.py
    ├── exporter.py
    └── searchers/
        ├── __init__.py
        ├── base_searcher.py
        ├── semantic_scholar.py
        ├── arxiv.py
        └── google_scholar.py

Setup and Installation

In [1]:
# Install required libraries if you haven't already
# Note: In a Jupyter environment, you can run shell commands with '!'
!pip install requests pandas feedparser scholarly

# Standard library imports
import logging
from typing import List, Dict, Any

# Setup basic logging to see output from our classes
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

print("Libraries installed and logging configured.")

Libraries installed and logging configured.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


The Base Class (BaseSearcher)

In [2]:
# research_finder/searchers/base_searcher.py

from abc import ABC, abstractmethod

class BaseSearcher(ABC):
    """Abstract base class for all article searchers."""
    
    def __init__(self, name: str):
        self.name = name
        self.results: List[Dict[str, Any]] = []

    @abstractmethod
    def search(self, query: str, limit: int) -> None:
        """
        Performs a search and populates the self.results list.
        Each result should be a dictionary.
        """
        pass

    def get_results(self) -> List[Dict[str, Any]]:
        """Returns the list of standardized results."""
        return self.results

    def clear_results(self) -> None:
        """Clears the stored results."""
        self.results = []
        
print("BaseSearcher class defined.")

BaseSearcher class defined.


Semantic Scholar Searcher

In [3]:
# research_finder/searchers/semantic_scholar.py

import requests
from IPython.display import display, Markdown # For nice output in notebooks

class SemanticScholarSearcher(BaseSearcher):
    """Searcher for the Semantic Scholar API."""
    
    BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

    def __init__(self):
        super().__init__("Semantic Scholar")
        self.logger = logging.getLogger(self.name)

    def search(self, query: str, limit: int = 10) -> None:
        self.logger.info(f"Searching for: '{query}' with limit {limit}")
        self.clear_results()
        params = {
            'query': query,
            'limit': limit,
            'fields': 'title,authors,year,abstract,url,citationCount,tldr'
        }
        try:
            response = requests.get(self.BASE_URL, params=params)
            response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)
            data = response.json()
            
            for item in data.get('data', []):
                authors = [author.get('name') for author in item.get('authors', [])]
                abstract = item.get('tldr', {}).get('text') or item.get('abstract')

                paper = {
                    'Title': item.get('title'),
                    'Authors': ', '.join(authors),
                    'Year': item.get('year'),
                    'Abstract': abstract,
                    'URL': item.get('url'),
                    'Source': self.name
                }
                self.results.append(paper)
            self.logger.info(f"Found {len(self.results)} papers.")
        except requests.exceptions.RequestException as e:
            self.logger.error(f"API request failed: {e}")

print("SemanticScholarSearcher class defined.")

SemanticScholarSearcher class defined.


arXiv Searcher

In [4]:
# research_finder/searchers/arxiv.py

import requests
import feedparser

class ArxivSearcher(BaseSearcher):
    """Searcher for the arXiv API."""
    
    BASE_URL = "http://export.arxiv.org/api/query"

    def __init__(self):
        super().__init__("arXiv")
        self.logger = logging.getLogger(self.name)

    def search(self, query: str, limit: int = 10) -> None:
        self.logger.info(f"Searching for: '{query}' with limit {limit}")
        self.clear_results()
        params = {
            'search_query': f'all:"{query}"',
            'start': 0,
            'max_results': limit
        }
        try:
            response = requests.get(self.BASE_URL, params=params)
            response.raise_for_status()
            feed = feedparser.parse(response.content)

            for entry in feed.entries:
                authors = [author.name for author in entry.authors]
                paper = {
                    'Title': entry.title,
                    'Authors': ', '.join(authors),
                    'Year': entry.published.split('-')[0],
                    'Abstract': entry.summary,
                    'URL': entry.link,
                    'Source': self.name
                }
                self.results.append(paper)
            self.logger.info(f"Found {len(self.results)} papers.")
        except requests.exceptions.RequestException as e:
            self.logger.error(f"API request failed: {e}")
        except Exception as e:
            self.logger.error(f"Failed to parse arXiv response: {e}")

print("ArxivSearcher class defined.")

ArxivSearcher class defined.


The Aggregator

In [5]:
# research_finder/aggregator.py

class Aggregator:
    """Aggregates results from multiple searchers."""
    
    def __init__(self):
        self.searchers: List[BaseSearcher] = []
        self.logger = logging.getLogger("Aggregator")

    def add_searcher(self, searcher: BaseSearcher) -> None:
        """Adds a searcher instance to the list."""
        if isinstance(searcher, BaseSearcher):
            self.searchers.append(searcher)
            self.logger.info(f"Added searcher: {searcher.name}")
        else:
            self.logger.error(f"Failed to add searcher: {searcher} is not a valid BaseSearcher instance.")

    def run_all_searches(self, query: str, limit: int) -> List[dict]:
        """
        Runs the search query on all added searchers and returns combined results.
        """
        self.logger.info(f"--- Starting search for '{query}' ---")
        all_results = []
        for searcher in self.searchers:
            try:
                searcher.search(query, limit)
                all_results.extend(searcher.get_results())
            except Exception as e:
                self.logger.error(f"An error occurred with searcher '{searcher.name}': {e}")
        
        self.logger.info(f"--- Search complete. Total results found: {len(all_results)} ---")
        return all_results

print("Aggregator class defined.")

Aggregator class defined.


The Exporter

In [6]:
# research_finder/exporter.py

import pandas as pd

class Exporter:
    """Handles exporting data to various formats."""

    def __init__(self):
        self.logger = logging.getLogger("Exporter")

    def to_csv(self, data: List[Dict[str, Any]], filename: str) -> None:
        """Exports a list of dictionaries to a CSV file."""
        if not data:
            self.logger.warning("No data provided to export.")
            return
        
        try:
            df = pd.DataFrame(data)
            # Ensure a consistent column order
            df = df[['Title', 'Authors', 'Year', 'Source', 'URL', 'Abstract']]
            df.to_csv(filename, index=False, encoding='utf-8')
            self.logger.info(f"Successfully exported {len(data)} results to {filename}")
        except Exception as e:
            self.logger.error(f"Failed to export to CSV: {e}")

print("Exporter class defined.")

Exporter class defined.


Main Execution and Testing

In [7]:
# --- Main Execution Logic ---

# 1. Define your search parameters
SEARCH_QUERY = "quantum computing"
OUTPUT_FILE = "notebook_test_results.csv"
RESULTS_PER_SOURCE = 5

# 2. Initialize the components
aggregator = Aggregator()
exporter = Exporter()

# 3. Add the searchers you want to use
# To debug, you can comment out one of these lines.
aggregator.add_searcher(SemanticScholarSearcher())
aggregator.add_searcher(ArxivSearcher())

# 4. Run the searches
all_articles = aggregator.run_all_searches(SEARCH_QUERY, RESULTS_PER_SOURCE)

# 5. Display a sample of the results in the notebook
if all_articles:
    print("\n--- Sample of Results Found ---")
    df_display = pd.DataFrame(all_articles)
    display(df_display.head()) # Shows first 5 rows
    
    # 6. Export all results to the CSV file
    exporter.to_csv(all_articles, OUTPUT_FILE)
else:
    print("No articles found to export.")


2025-09-30 16:23:27,907 - Aggregator - INFO - Added searcher: Semantic Scholar
2025-09-30 16:23:27,907 - Aggregator - INFO - Added searcher: arXiv
2025-09-30 16:23:27,908 - Aggregator - INFO - --- Starting search for 'quantum computing' ---
2025-09-30 16:23:27,908 - Semantic Scholar - INFO - Searching for: 'quantum computing' with limit 5
2025-09-30 16:23:29,690 - Aggregator - ERROR - An error occurred with searcher 'Semantic Scholar': 'NoneType' object has no attribute 'get'
2025-09-30 16:23:29,690 - arXiv - INFO - Searching for: 'quantum computing' with limit 5
2025-09-30 16:23:30,444 - arXiv - INFO - Found 5 papers.
2025-09-30 16:23:30,445 - Aggregator - INFO - --- Search complete. Total results found: 5 ---



--- Sample of Results Found ---


Unnamed: 0,Title,Authors,Year,Abstract,URL,Source
0,Pulse controlled noise suppressed quantum comp...,"Lu-Ming Duan, Guang-Can Guo",1998,To make arbitrarily accurate quantum computati...,http://arxiv.org/abs/quant-ph/9807072v1,arXiv
1,Unconventional Quantum Computing Devices,Seth Lloyd,2000,This paper investigates a variety of unconvent...,http://arxiv.org/abs/quant-ph/0003151v1,arXiv
2,Photonic Quantum Computers,M. AbuGhanem,2024,In the pursuit of scalable and fault-tolerant ...,http://arxiv.org/abs/2409.08229v1,arXiv
3,Quantum computer and its quasiclassical model,Timur F. Kamalov,2001,Could the theories with hidden variables be em...,http://arxiv.org/abs/quant-ph/0109152v1,arXiv
4,Quantum Computing for Multi Period Asset Alloc...,"Queenie Sun, Nicholas Grablevsky, Huaizhang De...",2024,Portfolio construction has been a long-standin...,http://arxiv.org/abs/2410.11997v1,arXiv


2025-09-30 16:23:30,466 - Exporter - INFO - Successfully exported 5 results to notebook_test_results.csv


I am building a tool in Python that will search for research articles using key words in various vendors and websites. In return, it will provide 'Title', 'Authors', 'Year', 'Venue', 'Source', 'Citation Count', 'DOI', 'License Type', 'URL', 'APA 7 Reference' of the matched articles. The tool is capable to export this in a csv file upon request. The user can manually enter the key words and also can manually select a vendor from available vendors list. Here is the directory tree for your better understanding.

research_finder
├── config.py
├── main.py
├── requirements.txt
├── research_finder
│   ├── __init__.py
│   ├── aggregator.py
│   ├── cache.py
│   ├── exporter.py
│   ├── searchers
│   │   ├── __init__.py
│   │   ├── arxiv.py
│   │   ├── base_searcher.py
│   │   ├── crossref.py
│   │   ├── google_scholar.py
│   │   ├── openalex.py
│   │   ├── pubmed.py
│   │   └── semantic_scholar.py
│   └── utils.py
└── search_tool.ipynb



i am providing you all the codes one by one:

requirements.txt:
requests
pandas
feedparser
scholarly


main.py:
import logging
from research_finder.aggregator import Aggregator
from research_finder.exporter import Exporter
from research_finder.searchers.semantic_scholar import SemanticScholarSearcher
from research_finder.searchers.arxiv import ArxivSearcher

# We will handle the optional Google Scholar import here
try:
    from research_finder.searchers.google_scholar import GoogleScholarSearcher
    GOOGLE_SCHOLAR_AVAILABLE = True
except ImportError:
    GOOGLE_SCHOLAR_AVAILABLE = False
    print("Warning: 'scholarly' library not found. Google Scholar will not be an option.")
    print("To enable it, run: pip install scholarly")


def setup_logging():
    """Configure basic logging for the application."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler()]
    )

def get_user_input():
    """Gets search parameters from the user via command-line prompts."""
    print("\n--- Research Article Finder ---")
    query = input("Enter keywords to search for: ").strip()
    if not query:
        print("Search query cannot be empty. Exiting.")
        exit()

    output_file = input("Enter output CSV filename (e.g., results.csv): ").strip()
    if not output_file:
        output_file = "search_results.csv"
    if not output_file.endswith('.csv'):
        output_file += '.csv'

    while True:
        try:
            limit_str = input("Enter max results per source (e.g., 10): ").strip()
            limit = int(limit_str)
            if limit > 0:
                break
            else:
                print("Please enter a positive number.")
        except ValueError:
            print("Invalid input. Please enter a number.")
            
    return query, output_file, limit

def get_searcher_selection():
    """
    Displays a menu of available searchers and gets the user's selection.
    """
    # Define the list of available searchers
    # Each item is a tuple: (Display Name, Searcher Class)
    available_searchers = [
        ("Semantic Scholar", SemanticScholarSearcher),
        ("arXiv", ArxivSearcher)
    ]
    if GOOGLE_SCHOLAR_AVAILABLE:
        available_searchers.append(("Google Scholar (Unreliable)", GoogleScholarSearcher))

    print("\n--- Select Search Vendors ---")
    for i, (name, _) in enumerate(available_searchers, 1):
        print(f"  {i}. {name}")
    
    while True:
        choice_str = input(f"Enter vendor numbers to use (e.g., 1,2) or press Enter for all: ").strip()
        
        # If user presses Enter, select all
        if not choice_str:
            return [searcher_class for (_, searcher_class) in available_searchers]

        try:
            # Parse comma-separated numbers
            chosen_indices = [int(num.strip()) for num in choice_str.split(',')]
            selected_searchers = []
            
            # Validate choices
            for index in chosen_indices:
                if 1 <= index <= len(available_searchers):
                    selected_searchers.append(available_searchers[index - 1][1])
                else:
                    raise ValueError(f"Invalid number: {index}")
            
            if not selected_searchers:
                print("No valid vendors selected. Please try again.")
                continue

            return selected_searchers

        except (ValueError, IndexError):
            print("Invalid input. Please enter numbers separated by commas (e.g., 1,3).")


def main():
    """Main function to run the research finder tool."""
    setup_logging()
    logger = logging.getLogger("Main")

    # 1. Get user input for the search
    query, output_file, limit = get_user_input()

    # 2. Get user's choice of search vendors
    selected_searcher_classes = get_searcher_selection()

    # 3. Initialize Components
    aggregator = Aggregator()
    exporter = Exporter()

    # 4. Instantiate and add the selected searchers to the Aggregator
    for searcher_class in selected_searcher_classes:
        try:
            aggregator.add_searcher(searcher_class())
        except Exception as e:
            logger.error(f"Could not initialize searcher {searcher_class.__name__}: {e}")

    # 5. Run Searches and Get Results
    all_articles = aggregator.run_all_searches(query, limit)

    # 6. Export Results
    if all_articles:
        exporter.to_csv(all_articles, output_file)
    else:
        logger.info("No articles found to export.")

if __name__ == "__main__":
    main()

.gitignore:
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# IDEs
.vscode/
.idea/
*.swp
*.swo

# Project specific output files
*.csv
*.json
!requirements.txt

.env:
S2_API_KEY="your_actual_api_key_goes_here"


utils.py:
def format_apa7(paper: dict) -> str:
    """
    Formats a paper dictionary into a basic APA-7 style reference string.
    Note: This is an approximation and may not cover all edge cases.
    """
    # 1. Format Authors
    authors_list = [a.strip() for a in paper.get('Authors', '').split(',') if a.strip()]
    if not authors_list:
        author_str = "n.a."
    else:
        formatted_authors = []
        for author in authors_list:
            parts = author.split()
            if len(parts) == 0:
                continue
            # Assumes "Lastname Firstname" or "Lastname, F."
            last_name = parts[-1]
            initials = "".join([p[0] + "." for p in parts[:-1]])
            formatted_authors.append(f"{last_name}, {initials}")
        
        if len(formatted_authors) == 1:
            author_str = formatted_authors[0]
        elif len(formatted_authors) <= 20:
            author_str = ", ".join(formatted_authors[:-1]) + ", & " + formatted_authors[-1]
        else: # APA 7 rule for >20 authors
            author_str = formatted_authors[0] + ", et al."

    # 2. Get Year
    year = paper.get('Year', 'n.d.')
    year_str = f"({year})."

    # 3. Get Title
    title = paper.get('Title', '')
    
    # 4. Get Source/Venue and construct reference
    source = paper.get('Source')
    venue = paper.get('Venue', '')
    doi = paper.get('DOI', '')
    url = paper.get('URL', '')

    if source == 'arXiv':
        # Preprint format
        ref = f"{author_str} {year_str} *{title}* [Preprint]. arXiv."
        if url:
            ref += f" {url}"
    else:
        # Journal article format
        ref = f"{author_str} {year_str} {title}."
        if venue:
            ref += f" *{venue}*."
        
        # Add DOI or URL
        if doi:
            ref += f" https://doi.org/{doi}"
        elif url:
            ref += f" {url}"
            
    return ref.strip()



exporter.py:
import pandas as pd
import logging
from typing import List, Dict, Any
from .utils import format_apa7

class Exporter:
    """Handles exporting data to various formats."""

    def __init__(self):
        self.logger = logging.getLogger("Exporter")

    def to_csv(self, data: List[Dict[str, Any]], filename: str) -> None:
        """Exports a list of dictionaries to a CSV file with a fixed set of columns."""
        if not data:
            self.logger.warning("No data provided to export.")
            return
        
        try:
            # 1. Generate APA 7 reference for each paper
            for paper in data:
                paper['APA 7 Reference'] = format_apa7(paper)

            # 2. Define the fixed, final order of columns for the output CSV
            final_columns = [
                'Title', 'Authors', 'Year', 'Venue', 'Source', 'Citation Count', 'DOI', 'License Type',
                'URL', 'APA 7 Reference'
            ]

            # 3. Create the DataFrame
            df = pd.DataFrame(data)
            
            # 4. Ensure all desired columns exist in the DataFrame before reordering
            # This prevents errors if a searcher doesn't provide a specific field
            for col in final_columns:
                if col not in df.columns:
                    df[col] = '' # Add missing columns as empty strings

            # 5. Reorder the DataFrame and save to CSV
            final_df = df[final_columns]
            final_df.to_csv(filename, index=False, encoding='utf-8')
            
            self.logger.info(f"Successfully exported {len(data)} results to {filename}")

        except Exception as e:
            self.logger.error(f"Failed to export to CSV: {e}")


aggregator.py:
import logging
from typing import List
from .searchers.base_searcher import BaseSearcher

class Aggregator:
    """Aggregates results from multiple searchers."""
    
    def __init__(self):
        self.searchers: List[BaseSearcher] = []
        self.logger = logging.getLogger("Aggregator")

    def add_searcher(self, searcher: BaseSearcher) -> None:
        """Adds a searcher instance to the list."""
        if isinstance(searcher, BaseSearcher):
            self.searchers.append(searcher)
            self.logger.info(f"Added searcher: {searcher.name}")
        else:
            self.logger.error(f"Failed to add searcher: {searcher} is not a valid BaseSearcher instance.")

    def run_all_searches(self, query: str, limit: int) -> List[dict]:
        """
        Runs the search query on all added searchers and returns combined results.
        """
        self.logger.info(f"--- Starting search for '{query}' ---")
        all_results = []
        for searcher in self.searchers:
            try:
                searcher.search(query, limit)
                all_results.extend(searcher.get_results())
            except Exception as e:
                self.logger.error(f"An error occurred with searcher '{searcher.name}': {e}")
        
        self.logger.info(f"--- Search complete. Total results found: {len(all_results)} ---")
        return all_results


semantic_scholar.py:
import requests
import logging
from .base_searcher import BaseSearcher

class SemanticScholarSearcher(BaseSearcher):
    """Searcher for the Semantic Scholar API."""
    
    BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

    def __init__(self):
        super().__init__("Semantic Scholar")
        self.logger = logging.getLogger(self.name)

    def search(self, query: str, limit: int = 10) -> None:
        self.logger.info(f"Searching for: '{query}' with limit {limit}")
        self.clear_results()
        # UPDATED: Added 'openAccessPdf.license' to the fields to retrieve
        params = {
            'query': query,
            'limit': limit,
            'fields': 'title,authors,year,abstract,url,citationCount,tldr,doi,venue,openAccessPdf.license'
        }
        try:
            response = requests.get(self.BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()
            
            for item in data.get('data', []):
                authors = [author.get('name') for author in item.get('authors', [])]
                abstract = item.get('tldr', {}).get('text') or item.get('abstract')
                
                # UPDATED: Extract license information
                license_info = item.get('openAccessPdf', {}).get('license') or 'N/A'

                paper = {
                    'Title': item.get('title'),
                    'Authors': ', '.join(authors),
                    'Year': item.get('year'),
                    # 'Abstract': abstract,
                    'URL': item.get('url'),
                    'Source': self.name,
                    'Citation': item.get('citationCount', 0),
                    'DOI': item.get('doi'),
                    'Venue': item.get('venue'),
                    'License Type': license_info
                }
                self.results.append(paper)
            self.logger.info(f"Found {len(self.results)} papers.")
        except requests.exceptions.RequestException as e:
            self.logger.error(f"API request failed: {e}")


google_scholar.py:
# research_finder/searchers/google_scholar.py

import time
import logging
import re
from .base_searcher import BaseSearcher

try:
    from scholarly import scholarly
except ImportError:
    scholarly = None

class GoogleScholarSearcher(BaseSearcher):
    """Searcher for Google Scholar using the unofficial 'scholarly' library."""
    
    def __init__(self):
        if not scholarly:
            raise ImportError("scholarly library not found. Install with 'pip install scholarly'")
        super().__init__("Google Scholar")
        self.logger = logging.getLogger(self.name)

    def search(self, query: str, limit: int = 5) -> None:
        self.logger.info(f"Searching for: '{query}' with limit {limit}. (Caution: Unreliable)")
        self.clear_results()
        try:
            search_query = scholarly.search_pubs(query)
            for i, pub in enumerate(search_query):
                if i >= limit:
                    break
                
                doi = None
                url = pub.get('pub_url', '')
                if 'doi.org/' in url:
                    doi = url.split('doi.org/')[-1]

                paper = {
                    'Title': pub.get('bib', {}).get('title'),
                    'Authors': pub.get('bib', {}).get('author', ''),
                    'Year': pub.get('bib', {}).get('pub_year'),
                    # 'Abstract': pub.get('bib', {}).get('abstract'),
                    'URL': url,
                    'Source': self.name,
                    'Citation': pub.get('bib', {}).get('num_citations', 'N/A'),
                    'DOI': doi,
                    'Venue': pub.get('bib', {}).get('journal', ''),
                    'License Type': 'N/A'
                }
                self.results.append(paper)
                time.sleep(1)
            self.logger.info(f"Found {len(self.results)} papers.")
        except Exception as e:
            self.logger.error(f"Search failed: {e}. This is common with Google Scholar.")


base_searchers.py:
from abc import ABC, abstractmethod
from typing import List, Dict, Any

class BaseSearcher(ABC):
    """Abstract base class for all article searchers."""
    
    def __init__(self, name: str):
        self.name = name
        self.results: List[Dict[str, Any]] = []

    @abstractmethod
    def search(self, query: str, limit: int) -> None:
        """
        Performs a search and populates the self.results list.
        Each result should be a dictionary.
        """
        pass

    def get_results(self) -> List[Dict[str, Any]]:
        """Returns the list of standardized results."""
        return self.results

    def clear_results(self) -> None:
        """Clears the stored results."""
        self.results = []


arxiv.py:
import requests
import feedparser
import logging
from .base_searcher import BaseSearcher

class ArxivSearcher(BaseSearcher):
    """Searcher for the arXiv API."""
    
    BASE_URL = "http://export.arxiv.org/api/query"

    def __init__(self):
        super().__init__("arXiv")
        self.logger = logging.getLogger(self.name)

    def search(self, query: str, limit: int = 10) -> None:
        self.logger.info(f"Searching for: '{query}' with limit {limit}")
        self.clear_results()
        params = {
            'search_query': f'all:"{query}"',
            'start': 0,
            'max_results': limit
        }
        try:
            response = requests.get(self.BASE_URL, params=params)
            response.raise_for_status()
            feed = feedparser.parse(response.content)

            for entry in feed.entries:
                authors = [author.name for author in entry.authors]
                arxiv_id = entry.id.split('/')[-1]
                
                # UPDATED: Extract license information
                license_info = entry.get('rights', 'N/A')

                paper = {
                    'Title': entry.title,
                    'Authors': ', '.join(authors),
                    'Year': entry.published.split('-')[0],
                    # 'Abstract': entry.summary,
                    'URL': entry.link,
                    'Source': self.name,
                    'Citation': 'N/A',
                    'DOI': arxiv_id,
                    'Venue': 'arXiv',
                    # ADDED: New fields
                    'License Type': license_info
                }
                self.results.append(paper)
            self.logger.info(f"Found {len(self.results)} papers.")
        except requests.exceptions.RequestException as e:
            self.logger.error(f"API request failed: {e}")
        except Exception as e:
            self.logger.error(f"Failed to parse arXiv response: {e}")

