In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [33]:
def _country_from_affiliations(affiliation_texts):
    """Pull a country-like token out of the first affiliation ending in ')'.

    Args:
        affiliation_texts: Iterable of affiliation strings.

    Returns:
        The text after the last comma of the first affiliation whose stripped
        form ends with ')', with the closing parenthesis and surrounding
        whitespace removed; ``None`` if no affiliation matches or the token
        is empty.
    """
    for text in affiliation_texts:
        text = text.strip()
        if text.endswith(')'):
            # rstrip(')') drops the parenthesis, strip() the space after the comma.
            return text.split(',')[-1].rstrip(')').strip() or None
    return None


def scrape_google_scholar(query, timeout=10):
    """Scrape one page of Google Scholar search results for ``query``.

    For every result the linked article page is fetched as well, in an attempt
    to read a subject area (``div.gs_scl``) and an author country
    (``div.gs_aff``) from it.

    Args:
        query: Search text; it is URL-encoded by ``requests``.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``Title``, ``Abstract``, ``Authors``,
        ``Link``, ``Subject Area`` and ``Country``. Fields that cannot be
        found are ``None``. An empty list is returned when the search request
        itself fails (the error is printed).
    """
    base_url = "https://scholar.google.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    def fetch_soup(url, **kwargs):
        # Shared GET + parse step; raises RequestException on network/HTTP errors.
        response = requests.get(url, headers=headers, timeout=timeout, **kwargs)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def text_of(tag):
        return tag.text.strip() if tag is not None else None

    try:
        # params= makes requests percent-encode the query (spaces, '&', ...).
        soup = fetch_soup(f"{base_url}/scholar", params={"q": query})
    except requests.exceptions.RequestException as e:
        print("Error fetching data:", e)
        return []

    articles = []
    for result in soup.find_all('div', class_='gs_ri'):
        title = text_of(result.find('h3', class_='gs_rt'))
        if title is None:
            # A result block without a title heading carries nothing usable.
            continue
        abstract = text_of(result.find('div', class_='gs_rs'))
        authors = text_of(result.find('div', class_='gs_a'))

        # Some entries have no anchor at all; keep them with Link=None.
        anchor = result.find('a', href=True)
        link = urljoin(base_url, anchor['href']) if anchor is not None else None

        # Now let's scrape the subject area and country of the authors from the article's page.
        # NOTE(review): ``link`` points at an external site, while gs_scl / gs_aff
        # look like Scholar class names; the recorded run shows both columns
        # empty for every row - confirm these selectors ever match.
        subject_area = None
        country = None
        if link is not None:
            try:
                article_soup = fetch_soup(link)
            except requests.exceptions.RequestException as e:
                # Enrichment is best-effort: one unreachable page must not
                # discard the results already collected.
                print(f"Error fetching article page {link}:", e)
            else:
                subject_area = text_of(article_soup.find('div', class_='gs_scl'))
                country = _country_from_affiliations(
                    affiliation.text
                    for affiliation in article_soup.find_all('div', class_='gs_aff')
                )

        articles.append({
            'Title': title,
            'Abstract': abstract,
            'Authors': authors,
            'Link': link,
            'Subject Area': subject_area,
            'Country': country
        })

    return articles

In [34]:
def scrape_sciencedirect(query, timeout=10):
    """Scrape one page of ScienceDirect search results for ``query``.

    Args:
        query: Search text; it is URL-encoded by ``requests``.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``Title``, ``Authors``, ``Date`` and
        ``Link``. ``Authors``, ``Date`` and ``Link`` are ``None`` when the
        corresponding element is missing. An empty list is returned when the
        request fails or is rejected (the error is printed).
    """
    base_url = "https://www.sciencedirect.com"
    # No explicit Accept-Encoding: the original advertised "br", which requests
    # can only decode when a Brotli package is installed; leaving the header
    # out lets requests advertise exactly the encodings it can handle.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.sciencedirect.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    try:
        # params= makes requests percent-encode the query (spaces, '&', ...).
        response = requests.get(
            f"{base_url}/search",
            params={"qs": query},
            headers=headers,
            timeout=timeout,
        )
        # Surface HTTP-level rejections instead of parsing an error page
        # and silently returning zero articles.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Error fetching data:", e)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    def text_of(tag):
        return tag.text.strip() if tag is not None else None

    articles = []
    for result in soup.find_all('a', class_='result-list-title-link'):
        href = result.get('href')
        articles.append({
            'Title': result.text.strip(),
            'Authors': text_of(result.find_next('ul', class_='author-group')),
            'Date': text_of(result.find_next('dd', class_='publication-date')),
            # urljoin leaves absolute hrefs untouched and resolves
            # site-relative ones against base_url.
            'Link': urljoin(base_url, href) if href else None,
        })

    return articles


In [35]:
def scrape_ieee_xplore(query, timeout=10):
    """Scrape one page of IEEE Xplore search results for ``query``.

    Args:
        query: Search text; it is URL-encoded by ``requests``.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``Title``, ``Authors``,
        ``Publication Info`` and ``Link``. Optional fields are ``None`` when
        their element is missing; results without a title are skipped. An
        empty list is returned when the request fails (the error is printed).
    """
    base_url = "https://ieeexplore.ieee.org"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    try:
        # params= makes requests percent-encode the query (spaces, '&', ...).
        response = requests.get(
            f"{base_url}/search/searchresult.jsp",
            params={"newsearch": "true", "queryText": query},
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print("Error fetching data:", e)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    def text_of(tag):
        # A missing element must not raise: the AttributeError it used to cause
        # was not covered by the RequestException handler.
        return tag.text.strip() if tag is not None else None

    # NOTE(review): the recorded run produced an empty frame for this source;
    # the result list may not be present in the static HTML - confirm.
    articles = []
    for result in soup.find_all('div', class_='List-results-items'):
        title = text_of(result.find('h2', class_='title'))
        if title is None:
            continue
        anchor = result.find('a', class_='document-title', href=True)
        articles.append({
            'Title': title,
            'Authors': text_of(result.find('p', class_='authors')),
            'Publication Info': text_of(result.find('p', class_='publisher')),
            'Link': urljoin(base_url, anchor['href']) if anchor is not None else None,
        })

    return articles

In [36]:
# Search term passed unchanged to each of the three scrapers.
query = "engineering"
# Every call below issues live HTTP requests; the returned lists of article
# dicts are turned into DataFrames in the next cell.
google_scholar_results = scrape_google_scholar(query)
sciencedirect_results = scrape_sciencedirect(query)
ieee_xplore_results = scrape_ieee_xplore(query)

In [37]:
# Convert each scraper's list of article dicts into its own pandas DataFrame.
# An empty result list produces an empty frame, which display_results reports
# with its "blocked" message.
google_scholar_df = pd.DataFrame(google_scholar_results)
sciencedirect_df = pd.DataFrame(sciencedirect_results)
ieee_xplore_df = pd.DataFrame(ieee_xplore_results)

In [16]:
def display_results(df: pd.DataFrame):
    """Print ``df``, or a "blocked" notice when the frame holds no data.

    Args:
        df: Frame of scraped articles to show.

    Returns:
        None. All output goes to stdout via ``print``.
    """
    # Guard clause: a frame with content is printed in its plain-text form.
    if not df.empty:
        print(df)
        return None
    # An empty frame is reported as a blocked request.
    print("Got blocked by the website. Try again later.")
    return None

In [38]:
# Print the Google Scholar frame, or the "blocked" notice if it is empty.
display_results(google_scholar_df)

                                               Title  \
0  [HTML][HTML] Engineering solventogenic clostridia   
1                           Biocommodity engineering   
2           [หนังสือ][B] Oceanographical engineering   
3                [หนังสือ][B] Engineering in history   
4  [หนังสือ][B] Micromanufacturing engineering an...   
5             [หนังสือ][B] Aquacultural engineering.   
6  [หนังสือ][B] Bioseparations science and engine...   
7                [หนังสือ][B] Engineering statistics   
8  [HTML][HTML] Reliability engineering: Old prob...   
9         [หนังสือ][B] System engineering management   

                                            Abstract  \
0  … Pathway engineering efforts have resulted in...   
1  The application of biotechnology to the produc...   
2  As is the case with many modern fields of stud...   
3  … engineering:“It is customary to think of eng...   
4  … , and thin film fabrication Outlines system ...   
5  This book is divided into 2 parts which cove

In [43]:
# Bare last expression: the notebook renders the whole frame as the cell output.
google_scholar_df

Unnamed: 0,Title,Abstract,Authors,Link,Subject Area,Country
0,[HTML][HTML] Engineering solventogenic clostridia,… Pathway engineering efforts have resulted in...,ET Papoutsakis - Current opinion in biotechnol...,https://www.sciencedirect.com/science/article/...,,
1,Biocommodity engineering,The application of biotechnology to the produc...,"LR Lynd, CE Wyman, TU Gerngross - Biotechnolog...",https://aiche.onlinelibrary.wiley.com/doi/abs/...,,
2,[หนังสือ][B] Oceanographical engineering,As is the case with many modern fields of stud...,RL Wiegel - 2013 - books.google.com,https://books.google.com/books?hl=th&lr=&id=0A...,,
3,[หนังสือ][B] Engineering in history,… engineering:“It is customary to think of eng...,RS Kirby - 1990 - books.google.com,https://books.google.com/books?hl=th&lr=&id=MX...,,
4,[หนังสือ][B] Micromanufacturing engineering an...,"… , and thin film fabrication Outlines system ...",Y Qin - 2010 - books.google.com,https://books.google.com/books?hl=th&lr=&id=yf...,,
5,[หนังสือ][B] Aquacultural engineering.,This book is divided into 2 parts which cover ...,FW Wheaton - 1993 - cabidigitallibrary.org,https://www.cabidigitallibrary.org/doi/full/10...,,
6,[หนังสือ][B] Bioseparations science and engine...,"Designed for undergraduates, graduate students...","RG Harrison, P Todd, SR Rudge, DP Petrides - 2...",https://books.google.com/books?hl=th&lr=&id=15...,,
7,[หนังสือ][B] Engineering statistics,This Student Solutions Manual is meant to acco...,"DC Montgomery, GC Runger, NF Hubele - 2009 - b...",https://books.google.com/books?hl=th&lr=&id=O-...,,
8,[HTML][HTML] Reliability engineering: Old prob...,… on problems and challenges of current reliab...,E Zio - Reliability engineering & system safet...,https://www.sciencedirect.com/science/article/...,,
9,[หนังสือ][B] System engineering management,"An updated classic covering applications, proc...",BS Blanchard - 2004 - books.google.com,https://books.google.com/books?hl=th&lr=&id=Gf...,,


In [42]:
# Print the ScienceDirect frame, or the "blocked" notice if it is empty.
display_results(sciencedirect_df)

Got blocked by the website. Try again later.


In [10]:
# Print the IEEE Xplore frame, or the "blocked" notice if it is empty.
display_results(ieee_xplore_df)

Got blocked by the website. Try again later.
