# Database Creator

In [None]:
# All imports go here

import os
import sys
import requests
from bs4 import BeautifulSoup as bs
import csv 

## Google Scholar Retriever

In [None]:
def google_scholar():
    """Scrape physics researchers from Google Scholar, grouped by interest.

    Fetches the first page of the Google Scholar author search for the
    keyword 'physic' restricted to the .ac.in and .res.in domains, then
    visits every listed author's profile page to read the affiliation,
    the homepage link (if any) and the research interests.

    Returns:
        dict: maps each sanitised interest (spaces replaced by '_', lower
        case) to a list of rows ``[name, affiliation, homepage, interests]``
        where ``interests`` is the author's interests joined by ', '. An
        author is listed once under every interest on the profile; authors
        with no listed interests do not appear at all.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
        including when the timeout expires.
    """

    # URL to search for scientists. The keyword is 'physic', which will
    # hopefully return most of the physics scientists. The Indian search is
    # enforced through the domains .ac.in and .res.in. Currently searches only
    # through the first page, needs to be extended to all pages
    url = "https://scholar.google.co.in/citations?hl=en&view_op=search_authors&mauthors=physic+%2B+.ac.in+%7C+.res.in&btnG="

    # without a timeout requests waits forever on a stalled connection
    timeout = 30

    page = requests.get(url, timeout=timeout)
    soup = bs(page.content, features='lxml')
    results = {}

    for tag in soup.find_all('h3', attrs={'class': "gs_ai_name"}):

        # the profile link lives in the anchor inside the heading; skip
        # entries without one instead of crashing the whole run
        anchor = tag.find('a', href=True)
        if anchor is None:
            continue

        # obtain name, affiliation, interests and homepage (if exists)
        name = tag.text
        link = "https://scholar.google.com" + anchor['href']
        author_page = requests.get(link, timeout=timeout)
        author_soup = bs(author_page.content, features='lxml')

        # every lookup below may come back empty (e.g. sparse profile or a
        # page that is not a profile at all), so fall back to blank values
        affil_tag = author_soup.find('div', attrs={'class': "gsc_prf_il"})
        affil = affil_tag.text if affil_tag is not None else ""

        homepage_tag = author_soup.find('a', text="Homepage")
        homepage = homepage_tag.get('href', "") if homepage_tag is not None else ""

        interests_tag = author_soup.find(
            'div', attrs={'class': "gsc_prf_il", 'id': "gsc_prf_int"})
        if interests_tag is None:
            interests = []
        else:
            interests = [child.text for child in interests_tag.find_all()]

        data = [name, affil, homepage, ', '.join(interests)]

        # append data for this scientist to the dictionary
        for interest in interests:

            # sanitise interest by changing space to
            # _ and converting all to lower case
            interest_sanitised = interest.replace(' ', '_').lower()

            # setdefault creates the key if it does not exist
            results.setdefault(interest_sanitised, []).append(data)

    return results

# Dump results into files

In [None]:
# headings of each CSV file
headings = ['Name', 'Affiliation', 'Homepage', 'Interests']

# path of database folder, inside which all the
# csv files will live. Folder is automatically
# created if it does not exist.
folder = "./database/"
os.makedirs(folder, exist_ok=True)

# obtain dictionaries from all existing retrievers
results_arr = []
results_arr.append(google_scholar())

# write results of each dictionary into each csv file
for results in results_arr:
    for key in results:

        # an interest may contain a path separator (e.g. "optics/photonics"),
        # which would otherwise point into a sub-folder that does not exist
        filename = key.replace('/', '_').replace('\\', '_')
        path = folder + filename + ".csv"

        # the file is appended to on every run, so the heading row is
        # only needed when the file is new or still empty
        needs_headings = not os.path.exists(path) or os.path.getsize(path) == 0

        data = results[key]

        # file is opened in append mode 'a', this ensures
        # that existing files are not overwritten. newline='' is what the
        # csv module expects, utf-8 keeps non-ASCII names writable, and the
        # with-block closes (and flushes) the file.
        with open(path, 'a', newline='', encoding='utf-8') as csv_file:
            csv_w = csv.writer(csv_file, delimiter='\t')
            if needs_headings:
                csv_w.writerow(headings)
            csv_w.writerows(data)