# Database Creator

In [None]:
# All imports go here

import os
import sys
import yaml
import requests
from bs4 import BeautifulSoup
import csv
import wget
from tqdm import tqdm
import time

# Column headings written as the first row of every CSV file.
headings = ['Name', 'Institute', 'Designation', 'Interests', 'Homepage', 'Email']

# Output folder and the category definition file stored inside it.
folder = "./physics_database/"
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder, exist_ok=True)
category_file = "category_file.yml"
# NOTE(review): define_cats() is defined further down in this file, so it must
# already have been executed when the category file is missing - confirm the
# intended execution order.
if not os.path.exists(folder + category_file):
    define_cats()
# Read the category -> list-of-sub-categories mapping; the context manager
# closes the file handle instead of leaving it to the garbage collector.
with open(folder + category_file, 'r') as _category_fh:
    categories = yaml.safe_load(_category_fh)
# Flat list of every sub-category across all categories.
all_subcats = [subcat for cat in categories.values() for subcat in cat]

def get_new_cats(data):
    """Collect the interests that are not listed in ``all_subcats``.

    Args:
        data: iterable of professor records; the element at index 3 of each
            record is iterated as that professor's interests.

    Returns:
        A list of lower-cased interests that are absent from the module-level
        ``all_subcats`` list, in encounter order. Repeated interests are
        returned repeatedly (no de-duplication is performed).
    """
    unseen = []
    for record in data:
        for raw_interest in record[3]:
            lowered = raw_interest.lower()
            if lowered not in all_subcats:
                unseen.append(lowered)
    return unseen

def get_category(interests, categories):
    """Return the categories that any of the given interests belongs to.

    Both the interests and the sub-category names are normalised the same way
    (``/`` and spaces become ``_``, everything lower-cased) before they are
    compared, so a sub-category stored as ``'space science'`` matches the
    interest ``'Space Science'`` just as one stored as ``'space_science'``
    does.

    Args:
        interests: iterable of interest strings.
        categories: mapping of category name -> iterable of sub-category
            strings.

    Returns:
        A list of matching category names in first-match order. Each category
        appears at most once, even when several interests map to it.
    """
    def normalise(text):
        return text.replace('/', '_').replace(' ', '_').lower()

    # Normalise the sub-categories once instead of once per interest.
    normalised_subcats = {
        key: {normalise(subcat) for subcat in val}
        for key, val in categories.items()
    }

    rel_cats = []
    for interest in interests:
        wanted = normalise(interest)
        for key, subcats in normalised_subcats.items():
            if wanted in subcats and key not in rel_cats:
                rel_cats.append(key)
    return rel_cats

## Combine interest keywords into a single keyword dictionary

In [None]:
def add_to_dict(results, dict_all):
    """Index records by interest keyword and append them to category CSV files.

    For every record, the last element is iterated as its interests. An empty
    list is appended to the record (in place) and filled with the names of the
    categories the record was written to. For each interest:

    * the record (without the trailing category list) is appended to
      ``dict_all`` under the sanitised interest keyword;
    * for every category in the module-level ``categories`` mapping that
      contains the interest, the record is appended to
      ``<folder>main-<category>.csv`` (tab-delimited, header row written when
      the file is created) - at most once per record and category.

    Interests and sub-category names are sanitised identically (``/`` and
    spaces become ``_``, lower-cased) before comparison, so sub-categories
    stored with spaces still match.

    Args:
        results: list of records (lists); mutated in place.
        dict_all: dict mapping sanitised interest -> list of records; mutated
            in place.

    Returns:
        The tuple ``(results, dict_all)``.
    """
    def sanitise(text):
        # change space and '/' to '_' and convert everything to lower case
        return text.replace('/', '_').replace(' ', '_').lower()

    # Sanitise the sub-categories once so both sides of the membership test
    # use the same spelling.
    sanitised_subcats = {
        category: {sanitise(subcat) for subcat in subcats}
        for category, subcats in categories.items()
    }

    for result in results:
        interests = result[-1]
        matched = []
        result.append(matched)  # categories this record has been written to
        for interest in interests:
            interest_sanitised = sanitise(interest)
            # result[:-1] is the record without the trailing category list
            dict_all.setdefault(interest_sanitised, []).append(result[:-1])
            for category, subcats in sanitised_subcats.items():
                if category in matched or interest_sanitised not in subcats:
                    continue
                fpath = folder + "main-" + category + ".csv"
                is_new_file = not os.path.exists(fpath)
                # newline='' is what the csv module expects; the context
                # manager guarantees the row is flushed and the handle closed.
                with open(fpath, 'a', newline='') as fh:
                    writer = csv.writer(fh, delimiter='\t')
                    if is_new_file:
                        writer.writerow(headings)
                    writer.writerow(result[:-1])
                matched.append(category)
    return results, dict_all

## IIT Kharagpur Retriever

In [None]:
def iitkgp():
    """Scrape the IIT Kharagpur physics faculty listing.

    Downloads the department faculty page, then visits each professor's own
    page for the contact entry, pausing 0.5 s after every professor.

    Returns:
        A list with one list per professor. Each record is built from: the
        name, the fixed string 'IIT Kharagpur' and a designation for every
        ``h3`` in the professor's row; a list of research areas for every
        ``blockquote``; the profile URL; and the text of the first ``li`` in
        the profile page's 'accordion-contact-list' div (presumably the
        e-mail address - verify against the live page).
    """
    site_root = 'http://www.iitkgp.ac.in'
    # Get the faculty page using requests and parse it with Beautiful Soup
    listing = requests.get('http://www.iitkgp.ac.in/department/PH/faculties')
    listing_soup = BeautifulSoup(listing.text, 'html.parser')
    prof_details = []
    # The scraper works on the third 'col-lg-12' div of the page.
    faculty_block = listing_soup.find_all('div', attrs={'class': 'col-lg-12'})[2]
    cards = faculty_block.findChildren('div', attrs={'class': 'col-lg-12'})
    for card in tqdm(cards, desc="IIT KGP"):
        rows = card.findChildren('div', attrs={'class': 'row'})
        if not rows:
            continue
        # Only the first 'row' div of each card is processed.
        row = rows[0]
        record = []
        for heading in row.findChildren('h3'):
            # The designation is the second line of the second span after the name.
            designation_span = heading.findNext('span').findNext('span')
            record.extend([
                heading.text.strip(),
                'IIT Kharagpur',
                designation_span.text.split('\n')[1].strip(),
            ])
        for quote in row.findChildren('blockquote', attrs={'class': 'blockquote'}):
            # Research areas, with non-breaking spaces removed
            areas = []
            for item in quote.findChildren('li'):
                areas.append(item.text.replace('\xa0', '').strip())
            record.append(areas)
        # Profile link: everything before the first ';' in the href
        profile_url = site_root + row.findNext('a')['href'].split(';')[0]
        record.append(profile_url)
        profile_soup = BeautifulSoup(requests.get(profile_url).text, 'html.parser')
        contact_list = profile_soup.find('div', attrs={'class': 'accordion-contact-list'})
        record.append(contact_list.findChild('li').text.strip())
        prof_details.append(record)
        time.sleep(0.5)
    return prof_details

# Run the IIT Kharagpur scraper once; the returned list is not assigned to
# anything here (presumably it is shown as the notebook cell's output).
iitkgp()

## IISER-K Retriever

In [None]:
def iiserk():
    """Scrape the IISER Kolkata physical sciences faculty listing.

    Every ``<b>Department:</b>`` tag on the page marks one faculty entry; the
    remaining fields are read from the table cell that precedes it.

    Returns:
        A list of ``[name, "IISER Kolkata", designation, interests, homepage,
        email]`` lists, where ``interests`` is a list of strings (empty when
        the entry has no usable "Research Area:" field).
    """
    url = "https://www.iiserkol.ac.in/web/en/people/faculty/dps/#gsc.tab=0"
    soup = BeautifulSoup(requests.get(url).content, features='lxml')
    prof_details = []
    for tag in tqdm(soup.findAll('b', text="Department:"), desc="IISER K"):
        parent = tag.findPrevious('td')
        name = parent.findChild('a', attrs={'style':"color:#0976e4;"}).text.strip()
        desig = parent.findChild('b', text="Department:").findPrevious().previousSibling.strip()
        try:
            interests_string = parent.findChild('b', text="Research Area:").nextSibling
            # interests are separated by ';' when one is present, otherwise by ','
            delim = ';' if ';' in interests_string else ','
            interests = [s.strip() for s in interests_string.split(delim)]
        except (AttributeError, TypeError):
            # No "Research Area:" label (findChild returned None) or nothing
            # splittable after it: record the professor without interests.
            # Narrower than a bare except so Ctrl-C and real bugs still surface.
            interests = []
        homepage = "https://www.iiserkol.ac.in" + tag.findPrevious('a')['href']
        # The page obfuscates addresses as "user [AT] domain".
        email = parent.findChild('b', text="Email:").nextSibling.replace(' [AT] ', '@')
        prof_details.append([name, "IISER Kolkata", desig, interests, homepage, email])
        time.sleep(0.5)
    return prof_details

## Google Scholar Retriever

In [None]:
def google_scholar():
    """Scrape Google Scholar author search results for Indian physicists.

    The search keyword is 'physic', restricted to the domains .ac.in and
    .res.in. Result pages are followed through the "Next" button until no
    further page can be derived or fetched. For every author the profile page
    is downloaded to read the affiliation, homepage (empty string when absent)
    and interests.

    Returns:
        Whatever ``add_to_results`` accumulates, starting from an empty dict.

    NOTE(review): ``add_to_results`` is not defined in the visible part of
    this file - confirm where it comes from before running this retriever.
    """
    url = "https://scholar.google.co.in/citations?hl=en&view_op=search_authors&mauthors=physic+%2B+.ac.in+%7C+.res.in&btnG="

    page = requests.get(url)
    soup = BeautifulSoup(page.content, features='lxml')
    results = {}

    count = 1
    while True:
        print ("G Scholar: Page", count)
        count += 1

        tags = soup.findAll('h3', attrs={'class': "gs_ai_name"})
        if not tags:
            print ("There's a problem.")
        for tag in tags:

            # obtain name, affiliation, interests and homepage (if exists)
            name = tag.text
            link = "https://scholar.google.com"+tag.next['href']
            author_soup = BeautifulSoup(requests.get(link).content, features='lxml')
            affil_tag = author_soup.find('div', attrs={'class':"gsc_prf_il"})
            affil = affil_tag.text
            try:
                homepage = author_soup.find('a', text = "Homepage")['href']
            except (TypeError, KeyError):
                # no "Homepage" link (find returned None) or link without href
                homepage = ""
            interests = [child.text for child in author_soup.find('div', attrs={'class':"gsc_prf_il", 'id':"gsc_prf_int"}).findChildren()]

            # the affiliation is stored twice: once as institute, once in the
            # designation slot
            data = [name, affil, affil, homepage, ', '.join(interests)]

            # append data for this scientist to the dictionary
            results = add_to_results(data, results)

        try:
            next_btn = soup.find('button', attrs={'aria-label': "Next", 'class': "gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx"})
            # onclick looks like window.location='<url>'; strip the prefix and
            # the closing quote
            url = 'https://scholar.google.co.in' + str(next_btn['onclick']).replace('window.location=\'', '')[:-1]
            # the URL contains literal \xNN escapes; replace each with its character
            index = url.find("\\x")
            while index >= 0:
                hexa = url[index+2:index+4]
                actual = bytearray.fromhex(hexa).decode()
                url = url.replace("\\x"+hexa, actual)
                index = url.find("\\x")
            page = requests.get(url)
        except (TypeError, KeyError, ValueError, requests.RequestException):
            # No next button, no onclick target, an undecodable escape or a
            # failed download: treat the current page as the last one.
            break
        soup = BeautifulSoup(page.content, features='lxml')

    return results

## IISc Retriever

In [None]:
def iisc():
    """Scrape the IISc physics faculty page.

    The page is fetched with ``requests`` and parsed in memory, like the other
    retrievers in this file, instead of shelling out to ``wget``/``rm`` and
    going through a ``file.html`` scratch file in the working directory.

    Returns:
        Whatever ``add_to_results`` accumulates, starting from an empty dict.
        Each professor is passed to it as ``[name, "IISc", designation,
        homepage, interests]`` with the interests joined by ', '.

    NOTE(review): ``add_to_results`` is not defined in the visible part of
    this file - confirm where it comes from before running this retriever.
    """
    results = {}
    url_aap = "http://www.physics.iisc.ac.in/~jap/people-fac.html"
    soup = BeautifulSoup(requests.get(url_aap).content, features='lxml')
    for tag in soup.findAll('div', attrs={'class': "about-veno"}):
        name = tag.findNext('h3').text
        desig = tag.findNext('h4').text
        # The interests are the node right after the "Research Interests:"
        # heading; individual interests are separated by '.'.
        interests = str(list(tag.findNext('h5', text='Research Interests:').next_siblings)[0]).replace('\n', '').strip().split('.')
        homepage = tag.findNext('span', text='Web:').findNext('a')['href']
        data = [name, "IISc", desig, homepage, ', '.join(interests)]

        results = add_to_results(data, results)
    return results

# Dump results into files

**WARNING: Existing files are overwritten at present.**

In [None]:
dict_all = {}

# obtain prof. data from all existing retrievers
insti_names = ['iiser_k', 'iit_kgp']
insti_data = [iiserk(), iitkgp()]
for insti_datum, insti_name in zip(insti_data, insti_names):
    # (Over)write the per-institute file: header row followed by one row per
    # professor. The file is only touched once scraping has succeeded, and the
    # context manager closes the handle. newline='' is what the csv module expects.
    with open(folder + insti_name + ".csv", 'w', newline='') as insti_fh:
        insti_writer = csv.writer(insti_fh, delimiter='\t')
        insti_writer.writerow(headings)
        insti_writer.writerows(insti_datum)
    for prof in insti_datum:
        # dict.fromkeys drops repeated categories (keeping their order) so a
        # professor with several interests in one category is written once.
        for rel_cat in dict.fromkeys(get_category(prof[3], categories)):
            cat_path = folder + rel_cat + ".csv"
            # Category files are appended to; the header is written only when
            # the file is created.
            is_new_file = not os.path.exists(cat_path)
            with open(cat_path, 'a', newline='') as cat_fh:
                cat_writer = csv.writer(cat_fh, delimiter='\t')
                if is_new_file:
                    cat_writer.writerow(headings)
                cat_writer.writerow(prof)

In [None]:
def define_cats():
    """Write the default category -> sub-categories mapping to the category file.

    The mapping is dumped as YAML to ``folder + category_file``; an existing
    file is overwritten.

    NOTE(review): 'gravitation &amp' looks like a fragment of an HTML-escaped
    string; it is kept verbatim because it may mirror what a scraper yields -
    verify before changing it.
    """
    categories = {
        'Astrophysics and Astronomy': [
            'space science',
            'astrophysics',
            # These two entries used to be fused into one string by a missing comma.
            'astrophysical magnetohydrodynamics',
            'sun-earth-system science',
        ],
        'Biophysics': [
            'mathematical and theoretical biology',
            'computational biology',
            'biophysics and complex systems',
            'theoretical biological physics',
            'biophotonics',
        ],
        'Condensed Matter': [
            'condensed matter physics (experimental)',
            'carrier dynamics in semiconductors',
            'quantum magnetism',
            'strongly correlated electron systems and magnetooptics',
            'experimental condensed matter physics',
            'bose-einstein condensates',
            'cold fermions',
            'quantum condensed matter physics',
            'strongly correlated electron systems',
            'condensed matter (experiment)',
            'emergent phenomena in low-dimensional quantum systems',
            'condensed matter theory',
            'soft condensed matter',
            'soft condensed matter physics',
        ],
        'Gravity and Cosmology': [
            'gravitation &amp',
            'cosmology',
            'gravitation and cosmology (classical and quantum)',
            'cosmology and its connections to particle physics',
        ],
        'High Energy Physics': ['high energy physics'],
        'Material Science': [
            'polymer physics',
            'quantum theory of functional materials',
        ],
        'Nonlinear Dynamics': [
            'non-linear dynamics',
            'nonlinear optical phenomena',
            'nonlinear dynamics',
        ],
        'Optics': [
            'spectroscopy',
            'ultrafast optical spectroscopy',
            'nonlinear optical phenomena',
            'optics and photonics',
            'photoelectron-photoion imaging spectroscopy',
            'optical physics',
            'ultrafast spectroscopy',
            'thz spectroscopy',
            'thz metamaterials',
            'nonlinear optics/thz',
            'plasmonics and nano optics',
            'polarization optics',
            'biophotonics',
        ],
        'Others': [
            'open quantum systems',
            'nmr',
            'physics',
            'polyelectrolyte physics',
            'low energy electron-molecule collisions',
            'surface science',
            'weak measurements',
            'magnetism in mesoscopic systems and spintronics application',
            'field theory and wavelet transform',
        ],
        'Quantum Information and Computation': [
            'quantum information processing',
            'quantum information and quantum computation',
            'quantum computation and quantum information',
        ],
        'Quantum Mechanics': [
            'quantum phenomena',
            'non-equilibrium quantum dynamics',
            'quantum transport',
        ],
        'Statistical Physics': [
            'statistical physics',
            'nonequilibrium statistical physics',
        ],
    }
    # The context manager flushes and closes the file once the dump is done.
    with open(folder + category_file, 'w') as category_fh:
        yaml.dump(categories, category_fh)
    
# (Re)write the default category file; define_cats() opens it in 'w' mode, so
# an existing file is replaced.
# NOTE(review): the setup code near the top of this file also calls
# define_cats() when the category file is missing, so the definition above has
# to be executed before that code - confirm the intended cell order.
define_cats()