# Data Collection

<div class="alert alert-block alert-info">
The data consists of items (scientific articles, books and citations) returned by Google Scholar searches on equine colic. There are about 1000 items. For each item, the collected information contains the title, the truncated abstract or description, and the metainfo (the authors, the publication journal or website for articles, the publisher for books, and the year of publication).
</div>

## Workspace and Useful Functions

In [1]:
# Import necessary libraries

import requests
from bs4 import BeautifulSoup

import time
import random
import os
import numpy as np
import pandas as pd

In [None]:
# Function to retrieve the information for each item
def get_articles_metadata(soup):
    '''
    Returns lists with articles information.

    Every result item is parsed on its own, so the four returned lists
    always have the same length and position i of each list describes the
    same item. A field that is missing from an item (for example a citation
    without a description) is stored as an empty string.

    INPUT:
        soup (bs4.BeautifulSoup) - parsed search results page
    OUTPUT:
        titles, abstracts, augmented_titles, metainfo (lists)
        augmented_titles holds (title, abstract) tuples
    '''

    titles, abstracts, metainfo = [], [], []

    def text_of(tag):
        # A missing tag becomes an empty string to keep the lists aligned
        return tag.get_text() if tag is not None else ''

    # Each element carrying a data-lid attribute is one result item.
    # NOTE(review): assumes the item's "gs_a" and "gs_rs" divs are nested
    # inside that same element - confirm against the downloaded pages.
    for entry in soup.select('[data-lid]'):

        # Skip elements that carry no title instead of failing on them
        headings = entry.select('h3')
        if not headings:
            continue

        # Get the item's title
        titles.append(headings[0].get_text())

        # Get the metainfo: authors, publication, year
        metainfo.append(text_of(entry.find("div", {"class": "gs_a"})))

        # Get the truncated abstract or description
        abstracts.append(text_of(entry.find("div", {"class": "gs_rs"})))

    # Combine titles and truncated abstract
    augmented_titles = list(zip(titles, abstracts))

    return titles, abstracts, augmented_titles, metainfo
    

In [None]:
# Module-level repository: one list per output column, shared by every call
# to add_to_repo and never cleared by it
articles_repos_dict = {
    'Title': [],
    'TruncAbstr': [],
    'AugmTitle': [],
    'Info': [],
}


def add_to_repo(titles, abstracts, augmented_titles, metainfo):
    '''
    Appends new records to the repository and returns it as a dataframe.

    INPUT:
        titles, abstracts, augmented_titles, metainfo (lists) - the values
        to append to the 'Title', 'TruncAbstr', 'AugmTitle' and 'Info'
        columns respectively
    OUTPUT:
        (pd.DataFrame) - built from the whole repository, i.e. the records
        added by this call together with those added by earlier calls
    '''
    columns = ('Title', 'TruncAbstr', 'AugmTitle', 'Info')
    new_values = (titles, abstracts, augmented_titles, metainfo)

    for column, values in zip(columns, new_values):
        articles_repos_dict[column].extend(values)

    return pd.DataFrame(articles_repos_dict)

## Scrape Google Scholar

<div class="alert alert-block alert-info">
Some of the data was collected by crawling the Google Scholar result pages and parsing them.
</div>

In [2]:
# Create a requests session (the module-level `s` used to download pages)

s = requests.Session()

# NOTE(review): the credentials and the 'x-test' header look like
# placeholder values and are attached to every request made through this
# session - confirm they are really wanted.
s.headers['x-test'] = 'true'
s.auth = ('user', 'pass')

# Trial request: both 'x-test' and 'x-test2' are sent
s.get('http://httpbin.org/headers', headers={'x-test2': 'true'})

<Response [200]>

In [3]:
# Function to retrieve the information from one page (contains 10 items)

def get_page_info(url):

    '''
    Downloads a page with a given url.

    INPUT:
        url (str) - specify the url for the page to download

    OUTPUT:
        soup (bs4 object) - Beautiful Soup object

    RAISES:
        Exception - when the response status code is not 200
    '''

    # Download the page through the module-level requests session
    response = s.get(url)

    # Check if the response is successful; fail only on a non-200 status
    # (the raise must stay inside the if, otherwise no page is ever returned)
    if response.status_code != 200:
        print('Status code:', response.status_code)
        raise Exception('Failed to fetch web page. ')

    # Parse using beautiful soup
    soup = BeautifulSoup(response.text, 'lxml')

    return soup

In [None]:
# Extract information from several pages
# Must be slow not to overload the servers

url_equine = 'https://scholar.google.com/scholar?start={}&q="equine+colic"+&hl=en&as_sdt=0,5'

# Number of articles to extract, there are 10 articles per page.
# This value drives the loop below: raise it to fetch more pages.
n = 40

# Make sure the output folder exists before the first save
os.makedirs('data', exist_ok=True)

# i is the offset of the first item on the page (0, 10, 20, ...)
for i in range(0, n, 10):

    # Get url for the each page
    url = url_equine.format(i)

    # Get the soup object for each page
    soup = get_page_info(url)

    # Collect all the information
    titles, abstracts, augmented_titles, metainfo = get_articles_metadata(soup)

    # Add records to the repository; the returned dataframe holds every
    # record added so far, not only the ones from this page
    collection = add_to_repo(titles, abstracts, augmented_titles, metainfo)

    # Save the records in a file named after the current offset
    collection.to_csv(f'data/eqcol_{i}.csv', index=False)

    # Use sleep (30-39 seconds, chosen at random) to avoid status code 429
    time.sleep(np.random.randint(30, 40))

## Parse Downloaded Files

<div class="alert alert-block alert-info">
Part of the data was collected by downloading the files separately and then parsing them with Beautiful Soup.
</div>

In [None]:
# Build the paths of the local input folders inside the current working
# directory (nothing is printed here)
directory_human = f'{os.getcwd()}/files_human'
directory_horse = f'{os.getcwd()}/files_horse'

In [None]:
# Parse each html file with BeautifulSoup
def get_local_data(directory, file_to_save):
    '''
    Parses every html file of a directory and saves the collected records.

    INPUT:
        directory (str) - folder containing the downloaded .html files
        file_to_save (str) - name (without extension) of the csv file
                             written to the data folder

    OUTPUT:
        collection (pd.DataFrame) - the dataframe returned by the last
        add_to_repo call, or None when the directory holds no .html file
        (in that case nothing is written)
    '''

    collection = None

    # Loop through all the files present in the given directory
    # (sorted, so that the record order does not depend on the file system)
    for filename in sorted(os.listdir(directory)):

        # Check the file extension
        if not filename.endswith('.html'):
            continue

        # Get the full path of the file
        fname = os.path.join(directory, filename)

        # Read the raw bytes so that Beautiful Soup detects the document
        # encoding itself instead of relying on the platform default
        with open(fname, 'rb') as file:
            soup = BeautifulSoup(file.read(), 'lxml')

        # Collect all the necessary information
        titles, abstracts, augmented_titles, metainfo = get_articles_metadata(soup)

        # Add the records to the repository
        collection = add_to_repo(titles, abstracts, augmented_titles, metainfo)

    # Nothing was parsed: there is nothing to save
    if collection is None:
        return None

    # Save the records in a file, once, after all the files were parsed
    os.makedirs('data', exist_ok=True)
    collection.to_csv(f'data/{file_to_save}.csv', index=False)

    return collection
            

<div class="alert alert-block alert-info">
Before parsing each group of files, the repository has to be cleared (re-run the cell that creates `articles_repos_dict`); otherwise the records from the new files are added on top of those collected in the previous step.
</div>

In [None]:
# Parse the locally saved files on equine colic
# (the records are written to data/horse_local.csv)
horse_local_data = get_local_data(directory=directory_horse,
                                  file_to_save='horse_local')

In [None]:
# Check the results: reload the saved file and display it
horse_check = pd.read_csv(filepath_or_buffer='data/horse_local.csv')
horse_check

## Combine the Data

In [None]:
# The data obtained via crawling the Google Scholar
# NOTE(review): the crawling loop in this notebook, as written, only saves
# files for offsets 0-30; eqcol_460.csv presumably comes from an earlier,
# longer run - confirm the file name.
eqcol_df = pd.read_csv(filepath_or_buffer='data/eqcol_460.csv')
eqcol_df

In [None]:
# Stack the crawled and the locally parsed equine colic records into one
# dataframe with a fresh 0..n-1 index, then display it
horse_full = pd.concat(
    objs=[eqcol_df, horse_check],
    sort=False,
    ignore_index=True,
)
horse_full

In [None]:
# Write the combined dataframe to disk, leaving out the index column
horse_full.to_csv(path_or_buf='data/horse_complete.csv', index=False)

## References

- [Nandini Saini, Scraping Information of Research Papers on Google Scholar using Python, Medium (Aug 18, 2021)](https://medium.com/@nandinisaini021/scraping-publications-of-aerial-image-research-papers-on-google-scholar-using-python-a0dee9744728)