# Database Creator

In [20]:
# All imports go here

import os
import sys
import requests
from bs4 import BeautifulSoup as bs
import csv 

## Google Scholar Retriever

In [21]:
def google_scholar(timeout=30):
    """Scrape Google Scholar for Indian physics researchers, grouped by interest.

    The author search uses the keyword 'physic', which will hopefully match
    most physics scientists, and restricts results to Indian institutions via
    the domains .ac.in and .res.in. Only the first page of search results is
    processed at present; this needs to be extended to all pages.

    For every author found, the author's profile page is fetched to read the
    affiliation, the homepage link (if one exists) and the listed interests.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A dict mapping each sanitised interest (lower case, spaces replaced
        by underscores) to a list of records, one per scientist listing that
        interest. Each record is the list
        ``[name, affiliation, homepage, comma-separated interests]``.
        Scientists that list no interests do not appear in the result.
    """
    url = "https://scholar.google.co.in/citations?hl=en&view_op=search_authors&mauthors=physic+%2B+.ac.in+%7C+.res.in&btnG="

    page = requests.get(url, timeout=timeout)
    soup = bs(page.content, features='lxml')
    results = {}

    for tag in soup.find_all('h3', attrs={'class': "gs_ai_name"}):

        # the heading wraps the link to the author's profile page; skip the
        # entry if the link is missing rather than crashing the whole scrape
        anchor = tag.find('a', href=True)
        if anchor is None:
            continue

        # obtain name, affiliation, interests and homepage (if exists)
        name = tag.text
        link = "https://scholar.google.com" + anchor['href']
        author_page = requests.get(link, timeout=timeout)
        author_soup = bs(author_page.content, features='lxml')

        affil_tag = author_soup.find('div', attrs={'class': "gsc_prf_il"})
        affil = affil_tag.text if affil_tag is not None else ""

        # the homepage link is optional on a profile
        homepage_tag = author_soup.find('a', text="Homepage")
        homepage = homepage_tag.get('href', "") if homepage_tag is not None else ""

        # the interests block is optional too; treat its absence as "none"
        interests_tag = author_soup.find(
            'div', attrs={'class': "gsc_prf_il", 'id': "gsc_prf_int"}
        )
        if interests_tag is None:
            interests = []
        else:
            interests = [child.text for child in interests_tag.find_all()]

        data = [name, affil, homepage, ', '.join(interests)]

        # file this scientist's record under each of their interests
        for interest in interests:

            # sanitise interest by changing space to
            # _ and converting all to lower case
            interest_sanitised = interest.replace(' ', '_').lower()

            # setdefault creates the key on first use
            results.setdefault(interest_sanitised, []).append(data)

    return results