# Database Creator

In [60]:
# All imports go here

import os
import sys
import requests
from bs4 import BeautifulSoup as bsoup
import csv
import wget

## Combine interest keywords into a single keyword dictionary

In [61]:
def add_to_results(data, results):
    """Index one scientist's record under each of their research interests.

    Parameters
    ----------
    data : list
        A record whose LAST element holds the interests, either as a list
        of strings or as a single comma-separated string.
    results : dict
        Maps a sanitised interest keyword to the list of records that
        mention it. It is updated in place.

    Returns
    -------
    dict
        The same ``results`` object, for call-site convenience.
    """
    interests = data[-1]

    # A plain string would otherwise be iterated character by character,
    # creating one bogus keyword per letter. Treat it as a
    # comma-separated list of interests instead.
    if isinstance(interests, str):
        interests = interests.split(',')

    for interest in interests:
        # sanitise interest by trimming it, changing space to
        # _ and converting all to lower case
        interest_sanitised = interest.strip().replace(' ', '_').lower()

        # an empty keyword carries no information (and would later
        # become a file called ".csv"), so skip it
        if not interest_sanitised:
            continue

        # create key if it does not exist, then record the scientist
        results.setdefault(interest_sanitised, []).append(data)
    return results

## IISER-K Retriever

In [None]:
def iiserk():
    """Scrape the IISER-K faculty listing into a keyword dictionary.

    Every ``<b>Department:</b>`` marker on the page is taken as one
    faculty entry; name, designation, research interests and homepage
    are read from the enclosing table cell.

    Returns
    -------
    dict
        Maps each sanitised interest keyword to a list of records of the
        form ``[name, "IISER-K", designation, homepage, interests]``
        where ``interests`` is a list of strings.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (including a timeout).
    """
    url = "https://www.iiserkol.ac.in/web/en/people/faculty/dps/#gsc.tab=0"
    # a timeout keeps a stalled connection from hanging the run forever
    soup = bsoup(requests.get(url, timeout=30).content, features='lxml')
    results = {}
    for tag in soup.findAll('b', text="Department:"):
        parent = tag.findPrevious('td')
        name = parent.findChild('a', attrs={'style':"color:#0976e4;"}).text.strip()
        desig = parent.findChild('b', text="Department:").findPrevious().previousSibling.strip()
        try:
            interests_string = parent.findChild('b', text="Research Area:").nextSibling
            # entries are separated by ';' on some profiles and ',' on others
            delim = ';' if ';' in interests_string else ','
            interests = [s.strip() for s in interests_string.split(delim) if s.strip()]
        except (AttributeError, TypeError):
            # no "Research Area:" label, or nothing usable after it:
            # keep the scientist but with no interests
            interests = []
        print (name, desig, interests)
        homepage = "https://www.iiserkol.ac.in" + tag.findPrevious('a')['href']
        data = [name, "IISER-K", desig, homepage, interests]
        results = add_to_results(data, results)
    return results

## Google Scholar Retriever

In [None]:
def _decode_hex_escapes(text):
    r"""Replace JavaScript-style ``\xNN`` escapes in *text* with their characters."""
    import re  # local import: only this helper needs it

    return re.sub(r'\\x([0-9a-fA-F]{2})',
                  lambda match: chr(int(match.group(1), 16)),
                  text)


def _scholar_next_url(soup):
    """Return the URL behind the "Next" button of a results page, or None."""
    next_btn = soup.find('button', attrs={'aria-label': "Next", 'class': "gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx"})
    if next_btn is None or not next_btn.has_attr('onclick'):
        # no button, or a button without a target: nothing to follow
        return None
    # onclick looks like  window.location='<escaped path>'  -- strip the
    # prefix and the closing quote, then undo the \xNN escaping
    target = str(next_btn['onclick']).replace('window.location=\'', '')[:-1]
    return 'https://scholar.google.co.in' + _decode_hex_escapes(target)


def google_scholar():
    """Scrape Google Scholar author search results into a keyword dictionary.

    Walks the result pages by following the "Next" button, opens every
    listed author profile and reads affiliation, homepage (if any) and
    interests from it.

    Returns
    -------
    dict
        Maps each sanitised interest keyword to a list of records of the
        form ``[name, affiliation, affiliation, homepage, interests]``
        where ``interests`` is a list of strings. The affiliation is
        repeated so the record has the same number of fields as the
        other retrievers' records (no separate designation is read).

    Raises
    ------
    requests.RequestException
        If the first results page or an author profile cannot be
        fetched. A failure while fetching a *later* results page only
        stops the pagination; what was collected so far is returned.
    """
    # URL to search for scientists. The keyword is 'physic', which will hopefully
    # return most of the physics scientists. The Indian search is enforced through
    # the domains .ac.in and .res.in. Further pages are reached by following the
    # "Next" button of each results page.
    url = "https://scholar.google.co.in/citations?hl=en&view_op=search_authors&mauthors=physic+%2B+.ac.in+%7C+.res.in&btnG="

    # timeouts keep a stalled connection from hanging the run forever
    page = requests.get(url, timeout=30)
    soup = bsoup(page.content, features='lxml')
    results = {}

    count = 1
    while soup is not None:
        print ("G Scholar: Page", count)
        count += 1

        tags = soup.findAll('h3', attrs={'class': "gs_ai_name"})
        if not tags:
            print ("There's a problem.")
        for tag in tags:
            # obtain name, affiliation, interests and homepage (if exists)
            name = tag.text
            anchor = tag.find('a')
            if anchor is None or not anchor.has_attr('href'):
                # without a profile link there is nothing to look up
                continue
            link = "https://scholar.google.com" + anchor['href']
            author_soup = bsoup(requests.get(link, timeout=30).content, features='lxml')

            affil_tag = author_soup.find('div', attrs={'class':"gsc_prf_il"})
            affil = affil_tag.text if affil_tag is not None else ""

            homepage_tag = author_soup.find('a', text = "Homepage")
            if homepage_tag is not None and homepage_tag.has_attr('href'):
                homepage = homepage_tag['href']
            else:
                homepage = ""

            interests_tag = author_soup.find('div', attrs={'class':"gsc_prf_il", 'id':"gsc_prf_int"})
            if interests_tag is None:
                interests = []
            else:
                interests = [child.text for child in interests_tag.findChildren()]

            # interests are passed as a list: a joined string would be
            # split into single characters when building the keywords
            data = [name, affil, affil, homepage, interests]

            # append data for this scientist to the dictionary
            results = add_to_results(data, results)

        next_url = _scholar_next_url(soup)
        if next_url is None:
            break
        try:
            page = requests.get(next_url, timeout=30)
        except requests.RequestException as err:
            # keep what has been collected so far instead of losing it
            print ("G Scholar: stopping, could not fetch next page:", err)
            break
        soup = bsoup(page.content, features='lxml')

    return results

## IISc Retriever

In [62]:
def iisc():
    """Scrape the IISc physics faculty page into a keyword dictionary.

    Every ``<div class="about-veno">`` is taken as one faculty entry;
    name, designation, research interests and homepage are read from the
    elements that follow it.

    The page is fetched with ``requests`` (like the other retrievers)
    rather than by shelling out to ``wget``/``rm`` through a scratch
    file, so no file is left behind on errors and no Unix shell is
    needed.

    Returns
    -------
    dict
        Maps each sanitised interest keyword to a list of records of the
        form ``[name, "IISc", designation, homepage, interests]`` where
        ``interests`` is a list of strings.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (including a timeout).
    """
    results = {}
    url_aap = "http://www.physics.iisc.ac.in/~jap/people-fac.html"
    # a timeout keeps a stalled connection from hanging the run forever
    soup = bsoup(requests.get(url_aap, timeout=30).content, features='lxml')
    for tag in soup.findAll('div', attrs={'class': "about-veno"}):
        name = tag.findNext('h3').text
        desig = tag.findNext('h4').text

        # the interests are the text node right after the
        # "Research Interests:" heading, one interest per sentence
        heading = tag.findNext('h5', text='Research Interests:')
        sibling = heading.next_sibling if heading is not None else None
        raw_interests = '' if sibling is None else str(sibling)
        interests = [part.strip()
                     for part in raw_interests.replace('\n', '').split('.')
                     if part.strip()]

        try:
            homepage = tag.findNext('span', text='Web:').findNext('a')['href']
        except (AttributeError, TypeError, KeyError):
            # no "Web:" entry or no link after it
            homepage = ""

        # interests are passed as a list: a joined string would be
        # split into single characters when building the keywords
        data = [name, "IISc", desig, homepage, interests]

        results = add_to_results(data, results)
    return results

# Dump results into files

**WARNING: Existing files are overwritten at present.**

In [None]:
# headings of each CSV file, one per field of a record:
# [name, affiliation, designation, homepage, interests]
headings = ['Name', 'Affiliation', 'Designation', 'Homepage', 'Interests']

# path of database folder, inside which all the
# csv files will live. Folder is automatically
# created if it does not exist.
folder = "./physics_database/"
os.makedirs(folder, exist_ok=True)

# obtain dictionaries from all existing retrievers
results_arr = []
results_arr.append(iisc())

# merge the dictionaries first, so that a keyword found by several
# retrievers ends up in ONE file holding all of its scientists instead
# of the last retriever overwriting the earlier ones
merged = {}
for results in results_arr:
    for key, rows in results.items():
        # the key becomes a file name: path separators would point
        # into (non-existent) sub-folders, so neutralise them
        file_key = key.replace('/', '_').replace('\\', '_')
        if not file_key:
            continue
        merged.setdefault(file_key, []).extend(rows)

# write the records of each keyword into its own csv file
for key, rows in merged.items():
    print (key)
    # file is opened in write mode 'w': an existing file of the
    # same name is overwritten. newline='' is what the csv module
    # expects, and the with-block makes sure the file is closed.
    with open(os.path.join(folder, key + ".csv"), 'w', newline='', encoding='utf-8') as csv_file:
        csv_w = csv.writer(csv_file, delimiter='\t')
        csv_w.writerow(headings)
        for row in rows:
            interests = row[-1]
            # retrievers may hand over the interests as a list;
            # store them as readable text rather than a list repr
            if not isinstance(interests, str):
                interests = ', '.join(interests)
            csv_w.writerow(list(row[:-1]) + [interests])