# Database Creator

In [None]:
# All imports go here

import os
import sys
import requests
from bs4 import BeautifulSoup
import csv
import wget
import yaml
import time

# Column titles written as the first row of every CSV file produced below
headings = ['Name', 'Institute', 'Designation', 'Homepage', 'Interests']

# Output directory and the name of the YAML file that stores the
# category -> interest-keyword mapping (see define_cats)
folder = "./physics_database/"
category_file = "category_file.yml"

# Create the output directory on first run
if not os.path.exists(folder):
    os.mkdir(folder)

# Category loading is currently disabled; kept for reference.
# if not os.path.exists(folder+category_file): define_cats()
# categories = yaml.safe_load(open(folder + category_file, 'r'))

## Combine interest keywords into a single keyword dictionary

In [None]:
def add_to_dict(results, dict_all):
    """Index scraped records by interest keyword and file them under categories.

    Each record in ``results`` is a list whose last element is a list of
    interest strings. Every record is extended in place with one extra
    trailing element: the list of category names it was filed under.

    For every interest of every record:

    * the interest is normalised to a keyword ('/' and ' ' become '_',
      everything lower-cased);
    * the record (without its category list) is appended to
      ``dict_all[keyword]``;
    * for each entry of the module-level ``categories`` mapping whose keyword
      list contains the keyword, the record is appended to the tab-delimited
      file ``<folder>main-<category>.csv``, at most once per category. The
      header row is written when that file does not exist yet.

    Args:
        results: list of records (lists) as described above; mutated in place.
        dict_all: dict mapping keyword -> list of records; mutated in place.

    Returns:
        tuple: ``(results, dict_all)``, the same two objects that were passed in.
    """
    for result in results:
        interests = result[-1]
        matched = []  # categories this record has already been written to
        result.append(matched)
        for interest in interests:
            # sanitise interest: '/' and ' ' -> '_', then lower case
            keyword = interest.replace('/', '_').replace(' ', '_').lower()
            # result[:-1] drops the trailing category list from the stored row
            dict_all.setdefault(keyword, []).append(result[:-1])
            for category, subcats in categories.items():
                if category in matched or keyword not in subcats:
                    continue
                fpath = folder + "main-" + category + ".csv"
                needs_header = not os.path.exists(fpath)
                # Context manager guarantees the rows are flushed and the
                # handle closed; newline='' is what the csv module expects.
                with open(fpath, 'a', newline='') as fh:
                    writer = csv.writer(fh, delimiter='\t')
                    if needs_header:
                        writer.writerow(headings)
                    writer.writerow(result[:-1])
                matched.append(category)
    return results, dict_all

## IIT Kharagpur Retriever

In [None]:
def iitkgp():
    """Scrape the IIT Kharagpur Physics department faculty listing.

    Returns:
        list[dict]: one dict per faculty entry. Keys are ``'Name'`` and
        ``'Institute'`` (set when the entry has an ``<h3>``),
        ``'Research Area'`` (comma-joined ``<li>`` texts, set when the entry
        has a ``blockquote.blockquote``) and ``'Webpage'`` (profile URL).

    Raises:
        requests.RequestException: if the page cannot be fetched or the
            server answers with an error status.
    """
    base_url = 'http://www.iitkgp.ac.in'
    # Fail fast with a clear HTTP error instead of hanging or parsing an
    # error page.
    page = requests.get(base_url + '/department/PH/faculties', timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    prof_details = []
    # The third 'col-lg-12' div is used as the container of the listing
    # (presumably tied to the current page layout -- verify if the site changes).
    block = soup.find_all('div', attrs={'class': 'col-lg-12'})[2]
    for card in block.findChildren('div', attrs={'class': 'col-lg-12'}):
        print(len(prof_details))  # progress: entries collected so far
        # Only the first 'row' div inside each card is used.
        row = card.find('div', attrs={'class': 'row'})
        if row is None:
            continue
        prof_dict = {}
        for heading in row.findChildren('h3'):
            prof_dict['Name'] = heading.text.strip()
            prof_dict['Institute'] = 'IIT Kharagpur'
        for quote in row.findChildren('blockquote', attrs={'class': 'blockquote'}):
            # Research areas: list items with non-breaking spaces removed
            prof_dict['Research Area'] = ','.join(
                item.text.replace('\xa0', '').strip()
                for item in quote.findChildren('li')
            )
        # Profile page URL: first link after the row, cut at the first ';'
        prof_dict['Webpage'] = base_url + row.findNext('a')['href'].split(';')[0]
        prof_details.append(prof_dict)
    return prof_details

# Run the IIT Kharagpur scraper once. The returned list is not stored
# (presumably it is shown as the notebook cell output).
iitkgp()

## IISER-K Retriever

In [None]:
def iiserk():
    """Scrape the IISER Kolkata physical sciences faculty listing.

    Returns:
        list[list]: one record per faculty entry, in the form
        ``[name, "IISER Kolkata", designation, homepage, interests]`` where
        ``interests`` is a list of strings (empty when the entry has no
        parsable "Research Area:" text).

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    url = "https://www.iiserkol.ac.in/web/en/people/faculty/dps/#gsc.tab=0"
    # The file imports the parser class as ``BeautifulSoup``.
    soup = BeautifulSoup(requests.get(url, timeout=30).content, features='lxml')
    results = []
    # Each faculty entry is located through its bold "Department:" label.
    for tag in soup.findAll('b', text="Department:"):
        parent = tag.findPrevious('td')
        name = parent.findChild('a', attrs={'style': "color:#0976e4;"}).text.strip()
        desig = parent.findChild('b', text="Department:").findPrevious().previousSibling.strip()
        try:
            interests_string = parent.findChild('b', text="Research Area:").nextSibling
            # Entries use either ';' or ',' as separator; prefer ';' if present
            delim = ';' if ';' in interests_string else ','
            interests = [s.strip() for s in interests_string.split(delim)]
        except (AttributeError, TypeError):
            # No "Research Area:" label, or what follows it is not plain text
            interests = []
        homepage = "https://www.iiserkol.ac.in" + tag.findPrevious('a')['href']
        results.append([name, "IISER Kolkata", desig, homepage, interests])
    return results

## Google Scholar Retriever

In [None]:
def google_scholar():
    """Collect Indian physics authors from Google Scholar's author search.

    The search keyword is 'physic', restricted to the ``.ac.in`` and
    ``.res.in`` domains. Result pages are followed through the "Next" button
    until no further page can be obtained. Each author's profile page is
    fetched for affiliation, homepage and interests.

    Returns:
        Whatever ``add_to_results`` accumulates, starting from an empty dict.

    NOTE(review): ``add_to_results`` is not defined in the visible part of
    this file -- confirm it exists before running this retriever.
    """
    url = "https://scholar.google.co.in/citations?hl=en&view_op=search_authors&mauthors=physic+%2B+.ac.in+%7C+.res.in&btnG="

    # The file imports the parser class as ``BeautifulSoup``.
    soup = BeautifulSoup(requests.get(url, timeout=30).content, features='lxml')
    results = {}

    count = 1
    while soup is not None:
        print ("G Scholar: Page", count)
        count += 1

        tags = soup.findAll('h3', attrs={'class': "gs_ai_name"})
        if not tags:
            print ("There's a problem.")
        for tag in tags:

            # obtain name, affiliation, interests and homepage (if exists)
            name = tag.text
            link = "https://scholar.google.com" + tag.next['href']
            author_soup = BeautifulSoup(requests.get(link, timeout=30).content, features='lxml')

            # A profile lacking one of these elements must not abort the crawl
            affil_tag = author_soup.find('div', attrs={'class': "gsc_prf_il"})
            affil = affil_tag.text if affil_tag is not None else ""
            try:
                homepage = author_soup.find('a', text="Homepage")['href']
            except (TypeError, KeyError):
                # no "Homepage" link, or the link carries no href
                homepage = ""
            interests_tag = author_soup.find('div', attrs={'class': "gsc_prf_il", 'id': "gsc_prf_int"})
            if interests_tag is not None:
                interests = [child.text for child in interests_tag.findChildren()]
            else:
                interests = []

            # the affiliation fills both the institute and the designation slot
            data = [name, affil, affil, homepage, ', '.join(interests)]

            # append data for this scientist to the dictionary
            results = add_to_results(data, results)

        # Best effort: any failure to obtain the next page ends the crawl and
        # returns what has been collected so far.
        try:
            next_btn = soup.find('button', attrs={'aria-label': "Next", 'class': "gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx"})
            # onclick looks like window.location='<path>'; strip the wrapper
            url = 'https://scholar.google.co.in' + str(next_btn['onclick']).replace('window.location=\'', '')[:-1]
            # the path contains \xNN escapes; decode them one by one
            index = url.find("\\x")
            while index >= 0:
                hexa = url[index+2:index+4]
                actual = bytearray.fromhex(hexa).decode()
                url = url.replace("\\x"+hexa, actual)
                index = url.find("\\x")
            soup = BeautifulSoup(requests.get(url, timeout=30).content, features='lxml')
        except (TypeError, KeyError, ValueError, requests.RequestException):
            # no Next button / no onclick target / bad escape / network error
            soup = None

    return results

## IISc Retriever

In [None]:
def iisc():
    """Scrape the IISc physics faculty page.

    The page is fetched with ``requests``, like the other retrievers in this
    file. Nothing is written to or deleted from the working directory.

    Returns:
        Whatever ``add_to_results`` accumulates, starting from an empty dict.
        Each record passed to it is
        ``[name, "IISc", designation, homepage, interests]`` with interests
        joined into one ', '-separated string.

    Raises:
        requests.RequestException: if the page cannot be fetched.

    NOTE(review): ``add_to_results`` is not defined in the visible part of
    this file -- confirm it exists before running this retriever.
    """
    results = {}
    url_aap = "http://www.physics.iisc.ac.in/~jap/people-fac.html"
    response = requests.get(url_aap, timeout=30)
    # Parser named explicitly, matching the other retrievers.
    soup = BeautifulSoup(response.content, features='lxml')
    for tag in soup.findAll('div', attrs={'class': "about-veno"}):
        name = tag.findNext('h3').text
        desig = tag.findNext('h4').text
        # The text node right after the "Research Interests:" heading holds
        # the interests as '.'-separated phrases.
        interests = str(list(tag.findNext('h5', text='Research Interests:').next_siblings)[0]).replace('\n', '').strip().split('.')
        homepage = tag.findNext('span', text='Web:').findNext('a')['href']
        data = [name, "IISc", desig, homepage, ', '.join(interests)]

        results = add_to_results(data, results)
    return results

# Dump results into files

**WARNING: Existing files are overwritten at present.**

In [None]:
# keyword -> list of records, filled by add_to_dict
dict_all = {}

# obtain prof. data from all existing retrievers
profs = []
for results in [iiserk()]:
    new_results, dict_all = add_to_dict(results, dict_all)
    profs += new_results

# write results of each keyword into separate csv files
# (an existing file for the same keyword is overwritten)
for key, val in dict_all.items():
    # Context manager flushes and closes each file; newline='' is what the
    # csv module expects for files it writes.
    with open(folder + key + ".csv", 'w', newline='') as fh:
        csv_w = csv.writer(fh, delimiter='\t')
        csv_w.writerow(headings)
        csv_w.writerows(val)

In [None]:
def define_cats():
    """Write the built-in category mapping to ``folder + category_file``.

    The mapping goes from a category title to the list of sanitised interest
    keywords (lower case, '_' instead of spaces) that belong to it. A keyword
    may appear under more than one category. The file is written as YAML and
    any existing file is overwritten. Nothing is returned.
    """
    categories = {
        'Condensed Matter': [
            "condensed_matter_physics_(experimental)",
            "carrier_dynamics_in_semiconductors",
            "quantum_magnetism",
            "strongly_correlated_electron_systems_and_magnetooptics",
            "experimental_condensed_matter_physics",
            "bose-einstein_condensates",
            "cold_fermions",
            "quantum_condensed_matter_physics",
            "strongly_correlated_electron_systems",
            "condensed_matter_(experiment)",
            "emergent_phenomena_in_low-dimensional_quantum_systems",
            "condensed_matter_theory",
            "soft_condensed_matter",
            "soft_condensed_matter_physics",
        ],
        'Optics': [
            "spectroscopy",
            "ultrafast_optical_spectroscopy",
            "nonlinear_optical_phenomena",
            "optics_and_photonics",
            "photoelectron-photoion_imaging_spectroscopy",
            "optical_physics",
            "ultrafast_spectroscopy",
            "thz_spectroscopy",
            "thz_metamaterials",
            "nonlinear_optics_thz",
            "plasmonics_and_nano_optics",
            "polarization_optics",
            "biophotonics",
        ],
        'Quantum Information and Computation': [
            "quantum_information_processing",
            "quantum_information_and_quantum_computation",
            "quantum_computation_and_quantum_information",
        ],
        'High Energy Physics': [
            "high_energy_physics",
        ],
        'Gravity and Cosmology': [
            # NOTE(review): "gravitation_&amp" looks like the left half of an
            # HTML "&amp;" entity split on ';' -- presumably intentional so it
            # matches the scraped text; confirm.
            "gravitation_&amp",
            "cosmology",
            "gravitation_and_cosmology_(classical_and_quantum)",
            "cosmology_and_its_connections_to_particle_physics",
        ],
        'Quantum Mechanics': [
            "quantum_phenomena",
            "non-equilibrium_quantum_dynamics",
            "quantum_transport",
        ],
        'Nonlinear Dynamics': [
            "non-linear_dynamics",
            "nonlinear_optical_phenomena",
            "nonlinear_dynamics",
        ],
        'Biophysics': [
            "mathematical_and_theoretical_biology",
            "computational_biology",
            "biophysics_and_complex_systems",
            "theoretical_biological_physics",
            "biophotonics",
        ],
        'Astrophysics and Astronomy': [
            "space_science",
            "astrophysics",
            "astrophysical_magnetohydrodynamics",
        ],
        'Material Science': [
            "polymer_physics",
            "quantum_theory_of_functional_materials",
        ],
        'Statistical Physics': [
            "statistical_physics",
            "nonequilibrium_statistical_physics",
        ],
        'Others': [
            "open_quantum_systems",
            "nmr",
            "physics",
            "polyelectrolyte_physics",
            "low_energy_electron-molecule_collisions",
            "surface_science",
            "weak_measurements",
            "magnetism_in_mesoscopic_systems_and_spintronics_application",
            "field_theory_and_wavelet_transform",
        ],
    }

    # Context manager makes sure the YAML is flushed and the handle closed.
    with open(folder + category_file, 'w') as fh:
        yaml.dump(categories, fh)