In [4]:
!pip install requests pandas



In [5]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m419.8/981.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=7622ccb1bd5909fbafd9483c6df790e0400d32148f177bf9ba021d6b9ec3dd20
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [7]:
import requests  # For making API requests
import pandas as pd  #
import time  # For handling API rate limits
import os  # For accessing environment variables
from google.colab import files  # For downloading files in Google Colab
from datetime import datetime  # For handling date operations
from langdetect import detect, LangDetectException # detect = language identifier, LangDetectException = error if detection fails

# Semantic Scholar API key, read from the environment so it never has to be
# committed with the notebook. The fallback string is only a placeholder.
SEMANTIC_SCHOLAR_API_KEY = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "Inset API Key")

# Publication-year window used for searching: the last 20 years, ending with
# the year in which the notebook is executed.
CURRENT_YEAR = datetime.today().year
START_YEAR = CURRENT_YEAR - 20

# Language filter used by fetch_research_papers
def _is_english(title, abstract):
    """Return True if the title and, when present, the abstract are detected as English.

    A LangDetectException raised by the detector is treated as "not English",
    so papers whose language cannot be determined are skipped.
    """
    try:
        if detect(title) != "en":
            return False
        # A missing abstract does not disqualify a paper; only a non-English one does.
        return not abstract or detect(abstract) == "en"
    except LangDetectException:
        return False


# Function to fetch AI/ML research papers from Semantic Scholar with pagination
def fetch_research_papers(query, max_results=100, max_retries=6):
    """Search Semantic Scholar for `query` and return the English papers found.

    Args:
        query: Search term sent to the paper search endpoint.
        max_results: Upper bound on the number of search results requested from
            the API. Results that are filtered out (year outside
            START_YEAR..CURRENT_YEAR, or not detected as English) still count
            towards this bound, so fewer papers may be returned.
        max_retries: Maximum number of retries for a single page after an
            HTTP 429 (rate limit) response before giving up on the query.

    Returns:
        A list of dicts with the keys "Title", "Abstract", "Authors", "Year",
        "URL", "Citations", "Journal", "Venue" and "Publication Types".
        On a non-200 response the error is printed and the papers collected
        so far are returned.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"  # API endpoint for research papers
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY}  # API authentication header

    papers = []  # Papers that passed the year and language filters
    total_fetched = 0  # Raw results received so far; doubles as the pagination offset

    # Loop until we have requested the required number of results
    while total_fetched < max_results:
        params = {
            "query": query,  # Search query term
            "limit": min(100, max_results - total_fetched),  # Request at most 100 results per page
            "offset": total_fetched,  # Pagination offset to fetch the next batch of results
            "fields": "title,abstract,authors,year,url,citationCount,journal,venue,publicationTypes",  # Fields to return
            "year": f"{START_YEAR}-{CURRENT_YEAR}"  # Fetch only from the last 20 years
        }

        response = requests.get(url, headers=headers, params=params)  # Make API request

        # Handle API rate limits with bounded exponential backoff
        wait_time = 2  # Initial wait time in seconds
        retries = 0
        while response.status_code == 429 and retries < max_retries:
            print(f"Rate limit exceeded. Waiting for {wait_time} seconds...")
            time.sleep(wait_time)  # Wait before retrying
            wait_time *= 2  # Increase wait time (exponential backoff)
            retries += 1
            response = requests.get(url, headers=headers, params=params)  # Retry API request

        # Checked after the retries as well, so an error returned by a retried
        # request (or a rate limit that never clears) is reported instead of parsed.
        if response.status_code != 200:
            print(f"Error fetching data: {response.status_code} - {response.text}")
            break  # Stop this query on API error

        data = response.json()  # Convert API response to JSON format
        papers_fetched = data.get("data") or []  # Extract paper data from response

        # If no more results, stop fetching
        if not papers_fetched:
            print(f"No more results for query: {query}")
            break

        for paper in papers_fetched:
            # Skip papers with a missing year or one outside the requested window.
            year = paper.get("year")
            if not isinstance(year, int) or not (START_YEAR <= year <= CURRENT_YEAR):
                continue

            # The API may return explicit nulls; normalise them to empty strings.
            title = paper.get("title") or ""
            abstract = paper.get("abstract") or ""

            if not _is_english(title, abstract):
                continue

            authors = paper.get("authors") or []
            citations = paper.get("citationCount")

            papers.append({
                "Title": title or "N/A",
                "Abstract": abstract or "N/A",
                "Authors": ", ".join((author or {}).get("name") or "N/A" for author in authors),
                "Year": year,
                "URL": paper.get("url") or "N/A",
                # 0 is a valid citation count, so only a missing value becomes "N/A".
                "Citations": "N/A" if citations is None else citations,
                "Journal": (paper.get("journal") or {}).get("name") or "N/A",
                "Venue": paper.get("venue") or "N/A",
                "Publication Types": ", ".join(paper.get("publicationTypes") or [])
            })

        total_fetched += len(papers_fetched)  # Advance the count / pagination offset

        # Respect API rate limits by adding a small delay
        time.sleep(1)

    return papers  # Return the list of fetched papers

# Search queries for AI & ML research papers, grouped by topic area.
# The groups only organise the source; `queries` below is the flat list,
# in the order written here, that the rest of the notebook iterates over.
_QUERIES_BY_TOPIC = {
    "Machine Learning Variants": [
        "Machine Learning",
        "Machine Learning in Healthcare",
        "Machine Learning in Finance",
        "Machine Learning in Cybersecurity",
        "Automated Machine Learning (AutoML)",
        "Bayesian Machine Learning",
    ],
    "Deep Learning": [
        "Deep Learning",
        "Deep Learning Architectures",
        "Deep Learning for Natural Language Processing",
        "Deep Learning for Computer Vision",
        "Neural Network Optimization",
        "Deep Reinforcement Learning",
    ],
    "Reinforcement Learning": [
        "Reinforcement Learning",
        "Multi-Agent Reinforcement Learning",
        "Q-Learning",
        "Policy Gradient Methods",
        "Model-Based Reinforcement Learning",
        "Self-Play in Reinforcement Learning",
    ],
    "Natural Language Processing (NLP)": [
        "Natural Language Processing",
        "Transformer Models for NLP",
        "BERT Model",
        "GPT Models",
        "Language Modeling",
        "Sentiment Analysis",
    ],
    "Computer Vision": [
        "Computer Vision",
        "Image Recognition",
        "Object Detection",
        "Face Recognition",
        "3D Computer Vision",
        "Medical Image Analysis",
    ],
    "AI Architectures & Networks": [
        "Neural Networks",
        "Graph Neural Networks",
        "Recurrent Neural Networks (RNN)",
        "Convolutional Neural Networks (CNN)",
        "Transformer Networks",
    ],
    "Unsupervised & Self-Supervised Learning": [
        "Unsupervised Learning",
        "Self-Supervised Learning",
        "Contrastive Learning",
        "Representation Learning",
        "Few-Shot Learning",
        "Zero-Shot Learning",
    ],
    "AI Applications": [
        "AI in Drug Discovery",
        "AI for Autonomous Vehicles",
        "AI in Agriculture",
        "AI in Finance",
        "AI in Robotics",
        "AI for Edge Computing",
    ],
    "Ethical AI": [
        "Explainable AI",
        "AI Ethics",
        "Fairness in AI",
        "Bias in AI Models",
    ],
    "Generative AI": [
        "Generative Adversarial Networks (GANs)",
        "Stable Diffusion Models",
        "Text-to-Image AI",
        "AI Art Generation",
    ],
    "Big Data & AI": [
        "Big Data Analytics",
        "AI for Big Data",
        "Federated Learning",
    ],
    "AI in Forecasting & Predictions": [
        "Time Series Forecasting",
        "Predictive Analytics",
        "Anomaly Detection",
        "Clustering Algorithms",
    ],
    "Emerging AI Fields": [
        "Neuro-Symbolic AI",
        "Multi-Modal AI",
        "AI for Scientific Research",
        "AI in Quantum Computing",
        "AI for IoT",
        "AI Hardware Acceleration",
    ],
}

# Flatten the groups, preserving the order in which they are written above.
queries = [
    search_term
    for topic_terms in _QUERIES_BY_TOPIC.values()
    for search_term in topic_terms
]

# Number of search results to request per query. Kept in one place so the
# progress message and the actual request limit cannot drift apart.
MAX_RESULTS_PER_QUERY = 100

all_papers = []  # List to store papers from all queries

# Loop through each query and fetch research papers.
# NOTE(review): the queries overlap in topic, so the same paper may be returned
# by more than one query; rows are not de-duplicated here.
for query in queries:
    print(f"Fetching up to {MAX_RESULTS_PER_QUERY} papers for: {query}")
    papers = fetch_research_papers(query, max_results=MAX_RESULTS_PER_QUERY)  # Fetch papers for the given query
    all_papers.extend(papers)  # Append fetched papers to the main list

# Convert fetched data to a Pandas DataFrame (one row per kept paper)
df = pd.DataFrame(all_papers)

# Save data as a CSV file.
# NOTE(review): the file name says "Google Scholar" although the data is fetched
# from the Semantic Scholar API; the name is left unchanged so existing
# consumers of the file keep working.
csv_filename = "Google Scholar AI&ML Papers.csv"
df.to_csv(csv_filename, index=False)

# Download the file (for Google Colab)
files.download(csv_filename)


Fetching up to 100 papers for: Machine Learning
Fetching up to 100 papers for: Machine Learning in Healthcare
Fetching up to 100 papers for: Machine Learning in Finance
Fetching up to 100 papers for: Machine Learning in Cybersecurity
Fetching up to 100 papers for: Automated Machine Learning (AutoML)
Fetching up to 100 papers for: Bayesian Machine Learning
Fetching up to 100 papers for: Deep Learning
Fetching up to 100 papers for: Deep Learning Architectures
Fetching up to 100 papers for: Deep Learning for Natural Language Processing
Fetching up to 100 papers for: Deep Learning for Computer Vision
Fetching up to 100 papers for: Neural Network Optimization
Fetching up to 100 papers for: Deep Reinforcement Learning
Fetching up to 100 papers for: Reinforcement Learning
Fetching up to 100 papers for: Multi-Agent Reinforcement Learning
Fetching up to 100 papers for: Q-Learning
Fetching up to 100 papers for: Policy Gradient Methods
Fetching up to 100 papers for: Model-Based Reinforcement Lear

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>