<a href="https://colab.research.google.com/github/TechWhizGenius/Teja_INFO5731_Fall2024/blob/main/TERM_PROJECT/Scrapping_Cornell.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook scrapes paper metadata and abstracts from Scopus using the Elsevier APIs.

In [None]:
import requests
import pandas as pd
import time

In [None]:

# API keys and URLs
import os  # needed only to read the optional environment overrides below

# SECURITY NOTE: these keys are hard-coded in a notebook that is shared
# publicly. Supply them through the SCOPUS_API_KEY / ARTICLE_API_KEY
# environment variables instead; the literal fallbacks are kept only so that
# existing runs keep working and should be rotated and removed.
SCOPUS_API_KEY = os.environ.get('SCOPUS_API_KEY') or 'fc8e266f77097a834ac8a98322346ea4'  # Scopus API Key
ARTICLE_API_KEY = os.environ.get('ARTICLE_API_KEY') or 'fc8e266f77097a834ac8a98322346ea4'  # Article retrieval API Key

# Endpoints: Scopus search, and article retrieval by DOI (the DOI is appended).
search_url = "https://api.elsevier.com/content/search/scopus"
article_url = "https://api.elsevier.com/content/article/doi/"

# Search results are requested as JSON.
scopus_headers = {
    'X-ELS-APIKey': SCOPUS_API_KEY,
    'Accept': 'application/json'
}

# Article records are requested as XML.
article_headers = {
    'X-ELS-APIKey': ARTICLE_API_KEY,
    'Accept': 'text/xml'
}

# Define keywords to search (each item is sent as one Scopus query)
keywords = [
    '"Autonomous Vehicles" AND "Deep Learning"'
]

# Publication-year range sent as the `date` search parameter.
start_year = 2010
end_year = 2024

# Function to scrape search results from Scopus API
def scopus_search(query, max_pages=10):
    """Fetch up to ``max_pages`` pages of Scopus search hits for ``query``.

    Each page requests 25 relevance-sorted results from ``search_url``,
    passing the module-level ``start_year``-``end_year`` range as the ``date``
    parameter. For every hit that carries a DOI, ``get_abstract`` is called to
    look up its abstract.

    Args:
        query: Scopus search expression.
        max_pages: Maximum number of result pages to request. Values <= 0
            return an empty list without sending any request.

    Returns:
        A list of dicts with the keys ``Title``, ``Authors``, ``Abstract``,
        ``Year``, ``Journal/Conference``, ``DOI`` and ``URL``; missing values
        are the string ``'N/A'``. If a request or a page fails, the error is
        printed and the papers gathered so far are returned.
    """
    papers = []
    count_per_page = 25  # Number of results requested per page
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    total_results = 0

    for start in range(0, max_pages * count_per_page, count_per_page):
        params = {
            'query': query,
            'count': count_per_page,
            'start': start,
            'date': f'{start_year}-{end_year}',
            'sort': 'relevance'
        }

        try:
            response = requests.get(
                search_url,
                headers=scopus_headers,
                params=params,
                timeout=request_timeout,
            )
            response.raise_for_status()
            data = response.json()

            entries = data.get('search-results', {}).get('entry', [])
            # NOTE(review): an empty result set appears to come back as a single
            # placeholder entry carrying an 'error' key rather than as an empty
            # list - confirm against the Scopus Search API docs. Dropping such
            # entries keeps an all-'N/A' row out of the output.
            entries = [entry for entry in entries if 'error' not in entry]
            total_results += len(entries)

            # Stop if no more entries are available
            if not entries:
                break

            # Process each paper entry
            for entry in entries:
                doi = entry.get('prism:doi', 'N/A')
                links = entry.get('link')
                # `or` also covers a key that is present but null, which would
                # otherwise make .split() raise.
                cover_date = entry.get('prism:coverDate') or 'N/A'

                # Retrieve the abstract using the DOI if available
                abstract = get_abstract(doi) if doi != 'N/A' else 'N/A'

                papers.append({
                    'Title': entry.get('dc:title', 'N/A'),
                    'Authors': entry.get('dc:creator', 'N/A'),
                    'Abstract': abstract,
                    'Year': cover_date.split('-')[0],
                    'Journal/Conference': entry.get('prism:publicationName', 'N/A'),
                    'DOI': doi,
                    'URL': links[0].get('@href', 'N/A') if links else 'N/A'
                })

            print(f"Processed {total_results} results so far for query '{query}'...")

        except Exception as e:
            # Best effort: report the failure and keep what was collected.
            print(f"Error occurred while fetching data: {e}")
            break

        time.sleep(1)  # Delay to avoid overwhelming the API

    return papers

# Function to retrieve abstract using the Article API
def get_abstract(doi):
    """Return the abstract text for ``doi`` from the article retrieval API.

    The XML response is scanned for the first ``<dc:description>`` element
    and its stripped text is returned.

    Args:
        doi: DOI string appended to ``article_url``.

    Returns:
        The text of the element, or the string ``"N/A"`` when the request
        raises, the status code is not 200, or the element is not found.
    """
    open_tag = "<dc:description>"
    close_tag = "</dc:description>"
    try:
        # A timeout keeps one stalled request from hanging the whole scrape.
        response = requests.get(f"{article_url}{doi}", headers=article_headers, timeout=30)
        if response.status_code != 200:
            # NOTE(review): the recorded runs show 404 for DOIs of other
            # publishers; presumably this endpoint only serves Elsevier-hosted
            # articles - confirm against the API docs.
            print(f"Error: {response.status_code} for DOI {doi}")
            return "N/A"

        data = response.text
        # Test find()'s -1 sentinel BEFORE adding the tag length; once the
        # length is added the result can never equal -1, so a missing opening
        # tag would go unnoticed.
        open_pos = data.find(open_tag)
        if open_pos == -1:
            return "N/A"
        body_start = open_pos + len(open_tag)
        body_end = data.find(close_tag, body_start)
        if body_end == -1:
            return "N/A"
        return data[body_start:body_end].strip()
    except Exception as e:
        print(f"Error fetching abstract for DOI {doi}: {e}")
        return "N/A"

# Start scraping process
def main():
    """Run every query in ``keywords`` (3 pages each) and save the hits to CSV.

    The combined results are written to 'scopus_papers_with_abstracts.csv'
    without the index column, and the first rows are printed.
    """
    collected = []
    for term in keywords:
        print(f"Scraping papers for query: {term}")
        collected += scopus_search(term, max_pages=3)

    # One row per paper; columns come from the dict keys of each result.
    frame = pd.DataFrame(collected)
    frame.to_csv('scopus_papers_with_abstracts.csv', index=False)
    print("Scraping completed and data saved to 'scopus_papers_with_abstracts.csv'.")
    print(frame.head())


# Run the scraper
if __name__ == "__main__":
    main()


Scraping papers for query: "Autonomous Vehicles" AND "Deep Learning"
Error: 404 for DOI 10.12928/TELKOMNIKA.v22i6.25519
Processed 25 results so far for query '"Autonomous Vehicles" AND "Deep Learning"'...
Error: 404 for DOI 10.1038/s44172-024-00292-3
Error: 404 for DOI 10.1007/s12652-024-04879-8
Error: 404 for DOI 10.1007/s10462-024-10937-6
Error: 404 for DOI 10.1007/s00530-024-01521-7
Error: 404 for DOI 10.1007/s11554-024-01562-1
Error: 404 for DOI 10.1038/s41598-024-73976-7
Error: 404 for DOI 10.1038/s41598-024-73881-z
Error: 404 for DOI 10.11591/ijece.v14i6.pp7145-7157
Error: 404 for DOI 10.1186/s13634-024-01186-4
Error: 404 for DOI 10.1038/s41598-024-74679-9
Error: 404 for DOI 10.1061/JTEPBS.TEENG-8557
Processed 50 results so far for query '"Autonomous Vehicles" AND "Deep Learning"'...
Error: 404 for DOI 10.1007/s10846-024-02176-2
Error: 404 for DOI 10.1088/1361-6501/ad7bdd
Error: 404 for DOI 10.1088/1361-6501/ad7b66
Error: 404 for DOI 10.1088/1361-6501/ad5862
Error: 404 for DOI 10

In [None]:
# Reload the saved results from disk and preview the first rows.
df = pd.read_csv('scopus_papers_with_abstracts.csv')
df.head()

Unnamed: 0,Title,Authors,Abstract,Year,Journal/Conference,DOI,URL
0,A review of black-box adversarial attacks on i...,Zhu Y.,"In recent years, deep learning-based image cla...",2024,Neurocomputing,10.1016/j.neucom.2024.128512,https://api.elsevier.com/content/abstract/scop...
1,A deep learning-based algorithm for online det...,Ji M.,As a core material in wood structure buildings...,2024,Industrial Crops and Products,10.1016/j.indcrop.2024.119671,https://api.elsevier.com/content/abstract/scop...
2,High-speed railway express delivery volume for...,Huang W.,Current researches on logistics delivery volum...,2024,Expert Systems with Applications,10.1016/j.eswa.2024.125196,https://api.elsevier.com/content/abstract/scop...
3,Biomimetic model of photovoltaic cell defect d...,Qu Z.,Solar energy plays an important role in new po...,2024,Applied Energy,10.1016/j.apenergy.2024.124033,https://api.elsevier.com/content/abstract/scop...
4,Revisiting class-incremental object detection:...,Bai L.,"In real-world settings, object detectors frequ...",2024,Expert Systems with Applications,10.1016/j.eswa.2024.125057,https://api.elsevier.com/content/abstract/scop...


In [None]:
# (rows, columns) of the loaded results.
df.shape

(75, 7)

In [None]:
# Number of rows whose Abstract was read back as missing (NaN).
# NOTE(review): get_abstract's "N/A" placeholder is presumably converted to NaN
# by read_csv's default NA markers - confirm against the pandas docs.
df['Abstract'].isna().sum()

21

In [None]:
# Show the rows that have no abstract.
df[df['Abstract'].isnull()]

Unnamed: 0,Title,Authors,Abstract,Year,Journal/Conference,DOI,URL
16,Tomato leaf disease recognition system using F...,Bachri K.O.,,2024,Telkomnika (Telecommunication Computing Electr...,10.12928/TELKOMNIKA.v22i6.25519,https://api.elsevier.com/content/abstract/scop...
25,A platform-agnostic deep reinforcement learnin...,Li D.,,2024,Communications Engineering,10.1038/s44172-024-00292-3,https://api.elsevier.com/content/abstract/scop...
28,Enhancing urban landscape analysis through com...,Saravanarajan V.S.,,2024,Journal of Ambient Intelligence and Humanized ...,10.1007/s12652-024-04879-8,https://api.elsevier.com/content/abstract/scop...
31,Advances in text-guided 3D editing: a survey,Lu L.,,2024,Artificial Intelligence Review,10.1007/s10462-024-10937-6,https://api.elsevier.com/content/abstract/scop...
32,PillarVTP: vehicle trajectory prediction metho...,Liao Z.,,2024,Multimedia Systems,10.1007/s00530-024-01521-7,https://api.elsevier.com/content/abstract/scop...
33,A small object detection architecture with con...,Mu J.,,2024,Journal of Real-Time Image Processing,10.1007/s11554-024-01562-1,https://api.elsevier.com/content/abstract/scop...
38,A novel 8-connected Pixel Identity GAN with Ne...,Mahmoud G.M.,,2024,Scientific Reports,10.1038/s41598-024-73976-7,https://api.elsevier.com/content/abstract/scop...
39,RRT-guided experience generation for reinforce...,Bécsi T.,,2024,Scientific Reports,10.1038/s41598-024-73881-z,https://api.elsevier.com/content/abstract/scop...
45,Efficient smart distributed face identificatio...,Ahmadi S.M.,,2024,International Journal of Electrical and Comput...,10.11591/ijece.v14i6.pp7145-7157,https://api.elsevier.com/content/abstract/scop...
46,EFMF-pillars: 3D object detection based on enh...,Zhang W.,,2024,Eurasip Journal on Advances in Signal Processing,10.1186/s13634-024-01186-4,https://api.elsevier.com/content/abstract/scop...


In [None]:
import requests
import pandas as pd
import time

# API keys and URLs
import os  # needed only to read the optional environment overrides below

# SECURITY NOTE: these keys are hard-coded in a notebook that is shared
# publicly. Supply them through the SCOPUS_API_KEY / ARTICLE_API_KEY
# environment variables instead; the literal fallbacks are kept only so that
# existing runs keep working and should be rotated and removed.
SCOPUS_API_KEY = os.environ.get('SCOPUS_API_KEY') or 'fc8e266f77097a834ac8a98322346ea4'  # Scopus API Key
ARTICLE_API_KEY = os.environ.get('ARTICLE_API_KEY') or 'fc8e266f77097a834ac8a98322346ea4'  # Article retrieval API Key

# Endpoints: Scopus search, and article retrieval by DOI (the DOI is appended).
search_url = "https://api.elsevier.com/content/search/scopus"
article_url = "https://api.elsevier.com/content/article/doi/"

# Search results are requested as JSON.
scopus_headers = {
    'X-ELS-APIKey': SCOPUS_API_KEY,
    'Accept': 'application/json'
}

# Article records are requested as XML.
article_headers = {
    'X-ELS-APIKey': ARTICLE_API_KEY,
    'Accept': 'text/xml'
}

# Define keywords to search (each item is sent as one Scopus query)
keywords = [
    '"Autonomous Vehicles" AND "Deep Learning"'
]

# Publication-year range sent as the `date` search parameter.
start_year = 2010
end_year = 2024

# Function to scrape search results from Scopus API
def scopus_search(query):
    """Fetch every Scopus search hit for ``query`` together with its abstract.

    A first request with ``count=1`` reads ``opensearch:totalResults``; the
    function then pages through the results 25 at a time, sorted by relevance,
    passing the module-level ``start_year``-``end_year`` range as the ``date``
    parameter. For every hit that carries a DOI, ``get_abstract`` is called.

    Args:
        query: Scopus search expression.

    Returns:
        A list of dicts with the keys ``Title``, ``Authors``, ``Abstract``,
        ``Year``, ``Journal/Conference``, ``DOI`` and ``URL``; missing values
        are the string ``'N/A'``. Returns ``[]`` if the initial request fails;
        if a later page fails, the papers gathered so far are returned.
    """
    papers = []
    count_per_page = 25  # results requested per page
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    # Retrieve the first page to get the total number of results
    try:
        initial_params = {
            'query': query,
            'count': 1,  # Get just 1 result initially to find total results
            'start': 0,
            'date': f'{start_year}-{end_year}',
            'sort': 'relevance'
        }
        response = requests.get(
            search_url,
            headers=scopus_headers,
            params=initial_params,
            timeout=request_timeout,
        )
        response.raise_for_status()
        initial_data = response.json()
        total_results = int(initial_data['search-results']['opensearch:totalResults'])
        print(f"Total results for query '{query}': {total_results}")

    except Exception as e:
        print(f"Error fetching initial data: {e}")
        return []

    # Now retrieve pages in batches until reaching total_results.
    # NOTE(review): the API may cap how deep `start` can page; a query with
    # tens of thousands of hits could end with an HTTP error long before
    # total_results is reached - confirm against the Scopus Search API docs.
    for start in range(0, total_results, count_per_page):
        params = {
            'query': query,
            'count': count_per_page,
            'start': start,
            'date': f'{start_year}-{end_year}',
            'sort': 'relevance'
        }

        try:
            response = requests.get(
                search_url,
                headers=scopus_headers,
                params=params,
                timeout=request_timeout,
            )
            response.raise_for_status()
            data = response.json()

            entries = data.get('search-results', {}).get('entry', [])
            # NOTE(review): an empty result set appears to come back as a single
            # placeholder entry carrying an 'error' key - confirm against the
            # API docs. Dropping it keeps an all-'N/A' row out of the output.
            entries = [entry for entry in entries if 'error' not in entry]

            # Process each paper entry
            for entry in entries:
                doi = entry.get('prism:doi', 'N/A')
                links = entry.get('link')
                # `or` also covers a key that is present but null, which would
                # otherwise make .split() raise.
                cover_date = entry.get('prism:coverDate') or 'N/A'

                # Retrieve the abstract using the DOI if available
                abstract = get_abstract(doi) if doi != 'N/A' else 'N/A'

                papers.append({
                    'Title': entry.get('dc:title', 'N/A'),
                    'Authors': entry.get('dc:creator', 'N/A'),
                    'Abstract': abstract,
                    'Year': cover_date.split('-')[0],
                    'Journal/Conference': entry.get('prism:publicationName', 'N/A'),
                    'DOI': doi,
                    'URL': links[0].get('@href', 'N/A') if links else 'N/A'
                })

            print(f"Processed {start + len(entries)} of {total_results} results for query '{query}'...")

            # Stop if no more entries are available
            if not entries:
                break

        except Exception as e:
            # Best effort: report the failure and keep what was collected.
            print(f"Error occurred while fetching data: {e}")
            break

        time.sleep(1)  # Delay to avoid overwhelming the API

    return papers

# Function to retrieve abstract using the Article API
def get_abstract(doi):
    """Return the abstract text for ``doi`` from the article retrieval API.

    The XML response is scanned for the first ``<dc:description>`` element
    and its stripped text is returned.

    Args:
        doi: DOI string appended to ``article_url``.

    Returns:
        The text of the element, or the string ``"N/A"`` when the request
        raises, the status code is not 200, or the element is not found.
    """
    open_tag = "<dc:description>"
    close_tag = "</dc:description>"
    try:
        # A timeout keeps one stalled request from hanging the whole scrape.
        response = requests.get(f"{article_url}{doi}", headers=article_headers, timeout=30)
        if response.status_code != 200:
            # NOTE(review): the recorded runs show 404 for DOIs of other
            # publishers; presumably this endpoint only serves Elsevier-hosted
            # articles - confirm against the API docs.
            print(f"Error: {response.status_code} for DOI {doi}")
            return "N/A"

        data = response.text
        # Test find()'s -1 sentinel BEFORE adding the tag length; once the
        # length is added the result can never equal -1, so a missing opening
        # tag would go unnoticed.
        open_pos = data.find(open_tag)
        if open_pos == -1:
            return "N/A"
        body_start = open_pos + len(open_tag)
        body_end = data.find(close_tag, body_start)
        if body_end == -1:
            return "N/A"
        return data[body_start:body_end].strip()
    except Exception as e:
        print(f"Error fetching abstract for DOI {doi}: {e}")
        return "N/A"

# Start scraping process
def main():
    """Run every query in ``keywords`` through ``scopus_search`` and save the hits.

    The combined results go to 'scopus_papers_with_abstracts.csv' (index column
    omitted); a preview of the first rows is printed afterwards.
    """
    rows = []
    for search_term in keywords:
        print(f"Scraping papers for query: {search_term}")
        rows.extend(scopus_search(search_term))

    # Build the table once all queries are done, then persist and preview it.
    output_path = 'scopus_papers_with_abstracts.csv'
    table = pd.DataFrame(rows)
    table.to_csv(output_path, index=False)
    print("Scraping completed and data saved to 'scopus_papers_with_abstracts.csv'.")
    print(table.head())


# Run the scraper
if __name__ == "__main__":
    main()


Scraping papers for query: "Autonomous Vehicles" AND "Deep Learning"
Total results for query '"Autonomous Vehicles" AND "Deep Learning"': 34073
Error: 404 for DOI 10.12928/TELKOMNIKA.v22i6.25519
Processed 25 of 34073 results for query '"Autonomous Vehicles" AND "Deep Learning"'...
Error: 404 for DOI 10.1038/s44172-024-00292-3
Error: 404 for DOI 10.1007/s12652-024-04879-8
Error: 404 for DOI 10.1007/s10462-024-10937-6
Error: 404 for DOI 10.1007/s00530-024-01521-7
Error: 404 for DOI 10.1007/s11554-024-01562-1
Error: 404 for DOI 10.1038/s41598-024-73976-7
Error: 404 for DOI 10.1038/s41598-024-73881-z
Error: 404 for DOI 10.11591/ijece.v14i6.pp7145-7157
Error: 404 for DOI 10.1186/s13634-024-01186-4
Error: 404 for DOI 10.1038/s41598-024-74679-9
Error: 404 for DOI 10.1061/JTEPBS.TEENG-8557
Processed 50 of 34073 results for query '"Autonomous Vehicles" AND "Deep Learning"'...
Error: 404 for DOI 10.1007/s10846-024-02176-2
Error: 404 for DOI 10.1088/1361-6501/ad7bdd
Error: 404 for DOI 10.1088/136

KeyboardInterrupt: 

In [None]:
# Reload the saved results from disk and preview the first rows.
# NOTE(review): the run recorded just above ended with KeyboardInterrupt, and
# main() writes the CSV only after all queries finish, so this presumably
# still reads the file left by an earlier run - verify.
df = pd.read_csv('scopus_papers_with_abstracts.csv')
df.head()

In [None]:
# (rows, columns) of the loaded results.
df.shape

In [None]:
# Number of rows whose Abstract was read back as missing (NaN).
df['Abstract'].isna().sum()

In [None]:
import requests
import time
import pandas as pd

# API keys and URLs
import os  # needed only to read the optional environment overrides below

# SECURITY NOTE: these keys are hard-coded in a notebook that is shared
# publicly. Supply them through the SCOPUS_API_KEY / ARTICLE_API_KEY
# environment variables instead; the literal fallbacks are kept only so that
# existing runs keep working and should be rotated and removed.
SCOPUS_API_KEY = os.environ.get('SCOPUS_API_KEY') or 'fc8e266f77097a834ac8a98322346ea4'  # Scopus API Key
ARTICLE_API_KEY = os.environ.get('ARTICLE_API_KEY') or 'fc8e266f77097a834ac8a98322346ea4'  # Article retrieval API Key

# Endpoints: Scopus search, and article retrieval by DOI (the DOI is appended).
search_url = "https://api.elsevier.com/content/search/scopus"
article_url = "https://api.elsevier.com/content/article/doi/"

# Search results are requested as JSON.
scopus_headers = {
    'X-ELS-APIKey': SCOPUS_API_KEY,
    'Accept': 'application/json'
}

# Article records are requested as XML.
article_headers = {
    'X-ELS-APIKey': ARTICLE_API_KEY,
    'Accept': 'text/xml'
}

# Define keywords to search (each item is sent as one Scopus query)
keywords = [
    '"Autonomous Vehicles" AND "Deep Learning"'
]

# Publication-year range sent as the `date` search parameter.
start_year = 2010
end_year = 2024

# Function to scrape search results from Scopus API
def scopus_search(query, max_results=None, check_total_only=False):
    """Search Scopus for ``query`` and return the hits, or just the hit count.

    Requests go to ``search_url`` sorted by relevance, passing the
    module-level ``start_year``-``end_year`` range as the ``date`` parameter.
    For every collected hit that carries a DOI, ``get_abstract`` is called.

    Args:
        query: Scopus search expression.
        max_results: Maximum number of papers to return. ``None`` means "as
            many as the API reports for the query"; values <= 0 return an
            empty list without sending any request.
        check_total_only: When true, send a single ``count=1`` request and
            return the reported total as an ``int`` instead of a list.

    Returns:
        ``int`` total when ``check_total_only`` is true; otherwise a list of
        dicts with the keys ``Title``, ``Authors``, ``Abstract``, ``Year``,
        ``Journal/Conference``, ``DOI`` and ``URL`` (missing values are the
        string ``'N/A'``). If a page fails, the error is printed and the
        papers gathered so far are returned.

    Raises:
        Exception: in ``check_total_only`` mode request and parsing errors are
            not caught, and an HTTP error status now raises through
            ``raise_for_status``.
    """
    count_per_page = 25  # Results per page
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    def _fetch_total():
        # Ask for a single result just to read the reported total hit count.
        params = {
            'query': query,
            'count': 1,
            'start': 0,
            'date': f'{start_year}-{end_year}',
            'sort': 'relevance'
        }
        response = requests.get(
            search_url,
            headers=scopus_headers,
            params=params,
            timeout=request_timeout,
        )
        # Surface HTTP failures directly instead of as a confusing KeyError.
        response.raise_for_status()
        data = response.json()
        return int(data['search-results']['opensearch:totalResults'])

    # Initial call to get total results
    if check_total_only:
        return _fetch_total()  # Return only the total count if check_total_only is True

    if max_results is None:
        # Previously the loop bound fell back to a counter that was still 0,
        # so calling without max_results never fetched anything.
        try:
            max_results = _fetch_total()
        except Exception as e:
            print(f"Error occurred while fetching data: {e}")
            return []

    if max_results <= 0:
        return []

    papers = []

    # Loop through pages to collect results
    for start in range(0, max_results, count_per_page):
        params = {
            'query': query,
            'count': count_per_page,
            'start': start,
            'date': f'{start_year}-{end_year}',
            'sort': 'relevance'
        }

        try:
            response = requests.get(
                search_url,
                headers=scopus_headers,
                params=params,
                timeout=request_timeout,
            )
            response.raise_for_status()
            data = response.json()

            entries = data.get('search-results', {}).get('entry', [])
            # NOTE(review): an empty result set appears to come back as a single
            # placeholder entry carrying an 'error' key - confirm against the
            # API docs. Dropping it keeps an all-'N/A' row out of the output.
            entries = [entry for entry in entries if 'error' not in entry]

            # Nothing more to page through: stop instead of requesting every
            # remaining offset up to max_results.
            if not entries:
                break

            # Trim the last page up front so no abstract is fetched for an
            # entry that would be sliced off the result anyway.
            entries = entries[:max_results - len(papers)]

            # Process each paper entry
            for entry in entries:
                doi = entry.get('prism:doi', 'N/A')
                links = entry.get('link')
                # `or` also covers a key that is present but null, which would
                # otherwise make .split() raise.
                cover_date = entry.get('prism:coverDate') or 'N/A'

                # Retrieve the abstract using the DOI if available
                abstract = get_abstract(doi) if doi != 'N/A' else 'N/A'

                papers.append({
                    'Title': entry.get('dc:title', 'N/A'),
                    'Authors': entry.get('dc:creator', 'N/A'),
                    'Abstract': abstract,
                    'Year': cover_date.split('-')[0],
                    'Journal/Conference': entry.get('prism:publicationName', 'N/A'),
                    'DOI': doi,
                    'URL': links[0].get('@href', 'N/A') if links else 'N/A'
                })

            if len(papers) >= max_results:
                break

            time.sleep(1)  # API rate limit delay

        except Exception as e:
            # Best effort: report the failure and keep what was collected.
            print(f"Error occurred while fetching data: {e}")
            break

    return papers[:max_results]

# Function to retrieve abstract using the Article API
def get_abstract(doi):
    """Return the abstract text for ``doi`` from the article retrieval API.

    The XML response is scanned for the first ``<dc:description>`` element
    and its stripped text is returned.

    Args:
        doi: DOI string appended to ``article_url``.

    Returns:
        The text of the element, or the string ``"N/A"`` when the request
        raises, the status code is not 200, or the element is not found.
    """
    open_tag = "<dc:description>"
    close_tag = "</dc:description>"
    try:
        # A timeout keeps one stalled request from hanging the whole scrape.
        response = requests.get(f"{article_url}{doi}", headers=article_headers, timeout=30)
        if response.status_code != 200:
            # NOTE(review): the recorded runs show 404 for DOIs of other
            # publishers; presumably this endpoint only serves Elsevier-hosted
            # articles - confirm against the API docs.
            print(f"Error: {response.status_code} for DOI {doi}")
            return "N/A"

        data = response.text
        # Test find()'s -1 sentinel BEFORE adding the tag length; once the
        # length is added the result can never equal -1, so a missing opening
        # tag would go unnoticed.
        open_pos = data.find(open_tag)
        if open_pos == -1:
            return "N/A"
        body_start = open_pos + len(open_tag)
        body_end = data.find(close_tag, body_start)
        if body_end == -1:
            return "N/A"
        return data[body_start:body_end].strip()
    except Exception as e:
        print(f"Error fetching abstract for DOI {doi}: {e}")
        return "N/A"

# Main function to run the scraper
def main():
    """Interactively scrape each query in ``keywords`` and save the hits to CSV.

    For every query the total hit count is fetched and printed, the user is
    asked how many results to scrape, and that many papers are collected with
    ``scopus_search``. The combined results are written to
    'scopus_papers_with_abstracts.csv' (index column omitted) and the first
    rows are printed.
    """
    all_papers = []

    for keyword in keywords:
        print(f"Scraping papers for query: {keyword}")

        # Get the total number of results for the query
        total_results = scopus_search(keyword, check_total_only=True)
        print(f"Total results for query '{keyword}': {total_results}")

        # Ask the user for the number of results they want; re-prompt on
        # non-numeric input instead of crashing with ValueError.
        prompt = f"How many results would you like to scrape (up to {total_results})? "
        while True:
            try:
                max_results = int(input(prompt))
                break
            except ValueError:
                print("Please enter a whole number.")

        # Keep the request within [0, total_results]; a negative answer is
        # treated as "scrape nothing".
        max_results = max(0, min(max_results, total_results))

        papers = scopus_search(keyword, max_results=max_results)
        all_papers.extend(papers)

    # Convert the list of papers to a DataFrame and save it to CSV
    df = pd.DataFrame(all_papers)
    df.to_csv('scopus_papers_with_abstracts.csv', index=False)
    print("Scraping completed and data saved to 'scopus_papers_with_abstracts.csv'.")
    print(df.head())


# Run the scraper
if __name__ == "__main__":
    main()


Scraping papers for query: "Autonomous Vehicles" AND "Deep Learning"
Total results for query '"Autonomous Vehicles" AND "Deep Learning"': 34073
How many results would you like to scrape (up to 34073)? 500
Error: 404 for DOI 10.12928/TELKOMNIKA.v22i6.25519
Error: 404 for DOI 10.1038/s44172-024-00292-3
Error: 404 for DOI 10.1007/s12652-024-04879-8
Error: 404 for DOI 10.1007/s10462-024-10937-6
Error: 404 for DOI 10.1007/s00530-024-01521-7
Error: 404 for DOI 10.1007/s11554-024-01562-1
Error: 404 for DOI 10.1038/s41598-024-73976-7
Error: 404 for DOI 10.1038/s41598-024-73881-z
Error: 404 for DOI 10.11591/ijece.v14i6.pp7145-7157
Error: 404 for DOI 10.1186/s13634-024-01186-4
Error: 404 for DOI 10.1038/s41598-024-74679-9
Error: 404 for DOI 10.1061/JTEPBS.TEENG-8557
Error: 404 for DOI 10.1007/s10846-024-02176-2
Error: 404 for DOI 10.1088/1361-6501/ad7bdd
Error: 404 for DOI 10.1088/1361-6501/ad7b66
Error: 404 for DOI 10.1088/1361-6501/ad5862
Error: 404 for DOI 10.1038/s41598-024-74357-w
Error: 40

In [None]:
# Reload the saved results from disk and preview the first rows.
df = pd.read_csv('scopus_papers_with_abstracts.csv')
df.head()

Unnamed: 0,Title,Authors,Abstract,Year,Journal/Conference,DOI,URL
0,A review of black-box adversarial attacks on i...,Zhu Y.,"In recent years, deep learning-based image cla...",2024,Neurocomputing,10.1016/j.neucom.2024.128512,https://api.elsevier.com/content/abstract/scop...
1,A deep learning-based algorithm for online det...,Ji M.,As a core material in wood structure buildings...,2024,Industrial Crops and Products,10.1016/j.indcrop.2024.119671,https://api.elsevier.com/content/abstract/scop...
2,High-speed railway express delivery volume for...,Huang W.,Current researches on logistics delivery volum...,2024,Expert Systems with Applications,10.1016/j.eswa.2024.125196,https://api.elsevier.com/content/abstract/scop...
3,Biomimetic model of photovoltaic cell defect d...,Qu Z.,Solar energy plays an important role in new po...,2024,Applied Energy,10.1016/j.apenergy.2024.124033,https://api.elsevier.com/content/abstract/scop...
4,Revisiting class-incremental object detection:...,Bai L.,"In real-world settings, object detectors frequ...",2024,Expert Systems with Applications,10.1016/j.eswa.2024.125057,https://api.elsevier.com/content/abstract/scop...


In [None]:
# (rows, columns) of the loaded results.
df.shape

(500, 7)

In [None]:
# Number of rows whose Abstract was read back as missing (NaN).
df['Abstract'].isna().sum()

226