<a href="https://colab.research.google.com/github/TechWhizGenius/Teja_INFO5731_Fall2024/blob/main/TERM_PROJECT/Scrapping_Scopus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook scrapes paper metadata from Scopus. We observed that the API used here could not return abstracts for some papers, so those papers are removed once the data has been collected.


In [None]:
# Importing necessary packages

import requests
import pandas as pd
import time

In [None]:
import os
import time

import pandas as pd
import requests

# API keys and URLs
# SECURITY NOTE(review): the key below is committed in a public notebook and
# should be treated as leaked -- rotate it and supply the replacement through
# the SCOPUS_API_KEY / ARTICLE_API_KEY environment variables. The literal is
# kept only as a fallback so the notebook keeps running unchanged.
_FALLBACK_ELSEVIER_API_KEY = 'fc8e266f77097a834ac8a98322346ea4'

# `or` (rather than a .get default) so an empty environment variable also
# falls back instead of sending a blank key.
SCOPUS_API_KEY = os.environ.get('SCOPUS_API_KEY') or _FALLBACK_ELSEVIER_API_KEY  # Scopus API Key
ARTICLE_API_KEY = os.environ.get('ARTICLE_API_KEY') or _FALLBACK_ELSEVIER_API_KEY  # Article retrieval API Key

# Endpoints: the search endpoint is queried with JSON, the article endpoint
# is addressed by DOI (the DOI is appended directly to article_url).
search_url = "https://api.elsevier.com/content/search/scopus"
article_url = "https://api.elsevier.com/content/article/doi/"

# Headers for the search requests (JSON responses).
scopus_headers = {
    'X-ELS-APIKey': SCOPUS_API_KEY,
    'Accept': 'application/json'
}

# Headers for the article requests (XML responses, parsed for the abstract).
article_headers = {
    'X-ELS-APIKey': ARTICLE_API_KEY,
    'Accept': 'text/xml'
}

# Define keywords to search
# The inner double quotes are part of the query string sent to Scopus.
keywords = [
    '"Autonomous Navigation"'
]

# Inclusive publication-year range, sent as the `date` search parameter.
start_year = 2010
end_year = 2024

# Function to scrape search results from Scopus API
def scopus_search(query, max_results=None, check_total_only=False):
    """Search Scopus for ``query`` and collect the matching papers.

    Args:
        query: Search expression sent as the ``query`` request parameter.
        max_results: Maximum number of papers to return. ``None`` means
            "as many as the API reports for this query"; values below zero
            are treated as zero.
        check_total_only: When true, send a single one-result request and
            return only the reported hit count.

    Returns:
        An ``int`` hit count when ``check_total_only`` is true. Otherwise a
        list of dicts with the keys ``Title``, ``Authors``, ``Abstract``,
        ``Year``, ``Journal/Conference``, ``DOI`` and ``URL``; missing
        fields hold the string ``'N/A'``.

    Raises:
        In count-only mode, any ``requests`` exception (including HTTP error
        statuses) and ``KeyError``/``ValueError`` if the response carries no
        usable total. In paging mode errors are printed and the papers
        collected so far are returned instead.
    """
    count_per_page = 25  # Results per page

    if check_total_only:
        data = _fetch_scopus_page(query, start=0, count=1)
        return int(data['search-results']['opensearch:totalResults'])

    if max_results is None:
        # Without an explicit limit the page range must come from the API's
        # own total; otherwise the paging loop below would have no pages.
        try:
            data = _fetch_scopus_page(query, start=0, count=1)
            target = int(data['search-results']['opensearch:totalResults'])
        except Exception as e:
            print(f"Error occurred while fetching data: {e}")
            return []
    else:
        target = max(max_results, 0)

    papers = []
    for start in range(0, target, count_per_page):
        # Best-effort boundary: a failure on one page must not discard the
        # papers already collected, so report it and stop paging.
        try:
            data = _fetch_scopus_page(query, start=start, count=count_per_page)
            entries = data.get('search-results', {}).get('entry', [])
            # NOTE(review): Scopus appears to answer an empty result set
            # with a single placeholder entry carrying an 'error' key --
            # confirm against the API docs. Such entries are not papers.
            entries = [entry for entry in entries if 'error' not in entry]
            if not entries:
                break  # Nothing more to read; avoid requesting further pages.

            # Convert only as many entries as are still needed, so no
            # abstract is fetched for a paper that would be cut off anyway.
            for entry in entries[:target - len(papers)]:
                papers.append(_entry_to_paper(entry))
        except Exception as e:
            print(f"Error occurred while fetching data: {e}")
            break

        if len(papers) >= target:
            break

        time.sleep(1)  # API rate limit delay

    return papers


def _fetch_scopus_page(query, start, count, timeout=30):
    """Request one page of Scopus search results and return the decoded JSON."""
    params = {
        'query': query,
        'count': count,
        'start': start,
        'date': f'{start_year}-{end_year}',
        'sort': 'relevance'
    }
    # A timeout keeps a stalled connection from hanging the whole scrape.
    response = requests.get(search_url, headers=scopus_headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()


def _entry_to_paper(entry):
    """Flatten one Scopus search entry into the record stored per paper."""
    doi = entry.get('prism:doi', 'N/A')
    links = entry.get('link')
    # `or 'N/A'` also covers a cover date that is present but null.
    cover_date = entry.get('prism:coverDate') or 'N/A'
    return {
        'Title': entry.get('dc:title', 'N/A'),
        'Authors': entry.get('dc:creator', 'N/A'),
        # Retrieve the abstract using the DOI if available
        'Abstract': get_abstract(doi) if doi != 'N/A' else 'N/A',
        'Year': cover_date.split('-')[0],
        'Journal/Conference': entry.get('prism:publicationName', 'N/A'),
        'DOI': doi,
        'URL': links[0].get('@href', 'N/A') if links else 'N/A',
    }

# Function to retrieve abstract using the Article API
def get_abstract(doi):
    """Fetch the abstract of the article identified by ``doi``.

    The article endpoint is queried for XML and the text between the first
    ``<dc:description>`` tag and the following ``</dc:description>`` tag is
    returned with surrounding whitespace stripped.

    Args:
        doi: DOI string; it is appended directly to ``article_url``.

    Returns:
        The abstract text, or the string ``"N/A"`` when the request does not
        return status 200, the description element is missing or empty, or
        any exception occurs (the exception is printed, not raised).
    """
    open_tag = "<dc:description>"
    close_tag = "</dc:description>"
    try:
        # A timeout keeps one stalled article request from blocking the run.
        response = requests.get(f"{article_url}{doi}", headers=article_headers, timeout=30)
        if response.status_code != 200:
            return "N/A"

        data = response.text
        # Test find()'s result before adding the tag length; once the length
        # is added a missing tag can no longer be recognised as -1.
        tag_pos = data.find(open_tag)
        if tag_pos == -1:
            return "N/A"
        start = tag_pos + len(open_tag)

        # Search for the closing tag only after the opening one.
        end = data.find(close_tag, start)
        if end == -1:
            return "N/A"

        # An empty element is reported with the same placeholder as a
        # missing one.
        return data[start:end].strip() or "N/A"
    except Exception as e:
        print(f"Error fetching abstract for DOI {doi}: {e}")
        return "N/A"

# Main function to run the scraper
def main():
    """Run the interactive scrape for every keyword and save the results.

    For each entry in ``keywords`` the total hit count is printed, the user
    is asked how many results to scrape, and the papers returned by
    ``scopus_search`` are accumulated. All papers are then written to a CSV
    file (without the index column) and the first rows are printed.
    """
    # NOTE(review): the file name says "Artificial Intelligence in
    # Transportation" although the keyword is "Autonomous Navigation". The
    # later notebook cells read this exact path, so it is kept unchanged.
    output_path = 'scopus_papers_Artificial Intelligence in Transportation.csv'
    all_papers = []

    for keyword in keywords:
        print(f"Scraping papers for query: {keyword}")

        # Get the total number of results for the query
        total_results = scopus_search(keyword, check_total_only=True)
        print(f"Total results for query '{keyword}': {total_results}")

        # Ask the user for the number of results they want; re-prompt on
        # anything that is not a non-negative whole number instead of
        # crashing with ValueError.
        prompt = f"How many results would you like to scrape (up to {total_results})? "
        while True:
            try:
                requested = int(input(prompt))
            except ValueError:
                requested = -1
            if requested >= 0:
                break
            print("Please enter a non-negative whole number.")
        max_results = min(requested, total_results)  # Ensure it doesn't exceed total results

        all_papers.extend(scopus_search(keyword, max_results=max_results))

    # Convert the list of papers to a DataFrame and save it to CSV
    df = pd.DataFrame(all_papers)
    df.to_csv(output_path, index=False)
    print(f"Scraping completed and data saved to '{output_path}'.")
    print(df.head())

# Run the scraper only when this code is executed directly (as a script or a
# notebook cell), not when the file is imported as a module.
if __name__ == "__main__":
    main()

Scraping papers for query: "Autonomous Navigation"
Total results for query '"Autonomous Navigation"': 31426
How many results would you like to scrape (up to 31426)? 5000
Scraping completed and data saved to 'scopus_papers_Artificial Intelligence in Transportation.csv'.
                                               Title        Authors  \
0  BEV perception for autonomous driving: State o...        Zhao J.   
1  USV formation navigation decision-making throu...         Cui Z.   
2  Integrated model of cerebellal supervised lear...          Wu Z.   
3  Robotic destructive and nondestructive testing...  Roudsari S.S.   
4  Ground-based on-line weed control using comput...   Abouzahir S.   

                                            Abstract  Year  \
0  The remarkable performance of Bird’s Eye View ...  2024   
1  To address the challenging of balancing Unmann...  2024   
2  Behavioral decision-making in unknown environm...  2024   
3  Structural elements may develop defects due to...  2

In [None]:
# Reload the CSV written by main() and inspect it before cleaning.
# With pandas' default NA handling, the scraper's 'N/A' placeholders (and
# empty cells) are parsed as NaN, which is what isna() counts below.
df = pd.read_csv('scopus_papers_Artificial Intelligence in Transportation.csv')
print(df.head())

print('----------------------------------------------------------------------------------------------------------')

# df.shape is a (rows, columns) tuple.
print('The Size of the Dataframe is ',df.shape)
# Number of papers for which no abstract was retrieved.
print('Number of null values in abstrct column is ', df['Abstract'].isna().sum())

                                               Title       Authors  \
0  Nighttime fog and low stratus detection under ...      Jiang J.   
1  A machine learning comparison of transportatio...    Banyong C.   
2  The impact of smart city construction on achie...       Yang X.   
3      GLAN: A graph-based linear assignment network        Liu H.   
4  Obvious artificial intelligence-generated anom...  Gulumbe B.H.   

                                            Abstract  Year  \
0  A scheme for satellite remote sensing is propo...  2024   
1  Thailand's collaboration with China to develop...  2024   
2  Amidst the intensification of global warming c...  2024   
3  Differentiable solvers for the linear assignme...  2024   
4                                                NaN  2024   

                                  Journal/Conference  \
0  ISPRS Journal of Photogrammetry and Remote Sen...   
1                             Results in Engineering   
2                                    L

In [None]:
# Remove rows with null values in the 'Abstract' column
# (dropna returns a new DataFrame; `df` itself is left untouched).
df_cleaned = df.dropna(subset=['Abstract'])

# Save the cleaned DataFrame to a CSV file
# NOTE: this is the same path the raw scrape was written to, so the
# unfiltered file on disk is overwritten.
df_cleaned.to_csv('scopus_papers_Artificial Intelligence in Transportation.csv', index=False)

# Prints the full (rows, columns) shape tuple, not just the row count.
print('The number of rows in final scraped dataframe is ', df_cleaned.shape)

# google.colab is only importable inside a Colab runtime, so this cell is
# expected to fail elsewhere.
from google.colab import files

# Hand the cleaned CSV to the browser as a download.
files.download('scopus_papers_Artificial Intelligence in Transportation.csv')

The number of rows in final scraped dataframe is  (59, 7)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>