# 📝 Scraping Google Scholar

#### I used ScraperAPI, which handles proxies, browsers, and CAPTCHAs, so you can get the HTML of any web page with a simple API call!

In [2]:
import csv
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

# the API key provided by scraperapi.com
my_api_key = 'my_api_key'

# specify the search term
search_term = 'ChatGPT'

# specify the first page for scraping
page_num = 0

# scraping stops before this page number (each request asks for 10 results)
last_page_num = 100

# ScraperAPI endpoint and the Google Scholar search page it fetches on our behalf
scraper_endpoint = 'https://api.scraperapi.com/'
scholar_search_url = 'https://scholar.google.com/scholar'


def _text_or_none(element):
    """Return the text of a parsed HTML element, or None when it is missing."""
    return element.text if element is not None else None


# create a csv file to store the data; the encoding is explicit because titles
# and snippets may contain non-ASCII characters that the platform default
# encoding cannot always represent
with open('chatgpt_articles.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # NOTE: 'Puplication' is a typo, but the header is left unchanged so that
    # files produced by earlier runs still line up column-for-column when merged.
    writer.writerow(['Title', 'Link', 'Snippet', 'Citations', 'Puplication Information'])

    while page_num < last_page_num:

        # build the Google Scholar URL with properly encoded query parameters
        target_url = scholar_search_url + '?' + urlencode({
            'start': page_num * 10,
            'q': search_term,
            'hl': 'en',
            'as_sdt': '0,5',
        })

        # Pass the target URL as a single encoded parameter. Concatenating it
        # raw would make its own '&q=...&hl=...' pairs look like parameters of
        # the ScraperAPI request instead of part of the Scholar URL.
        try:
            response = requests.get(
                scraper_endpoint,
                params={'api_key': my_api_key, 'url': target_url},
                timeout=70,  # generous, but a stalled request cannot hang forever
            )
        except requests.RequestException as error:
            print(f'request for page {page_num} failed: {error}')
            break

        # an error page contains no results; report it rather than stop silently
        if not response.ok:
            print(f'page {page_num} returned HTTP {response.status_code}; stopping')
            break

        # create a soup object from the response body
        soup = BeautifulSoup(response.content, "lxml")

        # find all the search results on the page
        results = soup.find_all('div', {'class': 'gs_ri'})
        if not results:
            break

        # iterate through the search results and extract the data; every field
        # is recomputed per result and falls back to None (written as an empty
        # cell) so a missing element never reuses the previous row's value
        for result in results:
            title_elem = result.find('h3', {'class': 'gs_rt'})
            link_elem = title_elem.find('a') if title_elem is not None else None

            title = _text_or_none(title_elem)
            link = link_elem.get('href') if link_elem is not None else None
            snippet = _text_or_none(result.find('div', {'class': 'gs_rs'}))
            citations = _text_or_none(result.find('div', {'class': 'gs_fl'}))
            pub_info = _text_or_none(result.find('div', {'class': 'gs_a'}))

            # write the data to the csv file
            writer.writerow([title, link, snippet, citations, pub_info])

        page_num += 1

print("done")




done


#### To avoid breakdowns or IP blocks, we ran the code above 10 times, scraping only 10 pages per run. Finally, we concatenated all the CSV files.

In [7]:
# merging all the csv files into one
import pandas as pd
import glob

# Get a list of all CSV files in the current directory; sorted so the order of
# the merged rows does not depend on the filesystem's listing order
files = sorted(glob.glob('chatgpt_articles*.csv'))

# Concatenate all CSV files into a single dataframe
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# The output file name below also matches the glob pattern above, so re-running
# this cell reads the previously merged file back in. Dropping exact duplicate
# rows keeps re-runs from multiplying the data (and also removes articles that
# were scraped more than once).
df = df.drop_duplicates()

# Write the concatenated dataframe to a CSV file
df.to_csv('chatgpt_articles.csv', index=False)
