In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests as rq
from bs4 import BeautifulSoup as bs
import re

In [2]:
# Ranking data endpoint, found by intercepting traffic with Postman while
# visiting topuniversities.com.
top_universities_url = 'https://www.topuniversities.com'
top_universities_ranking_url = ''.join(
    (top_universities_url, '/sites/default/files/qs-rankings-data/357051.txt')
)

In [3]:
# Download the ranking feed and keep only its 'data' list (one dict per
# university). Fail loudly on an HTTP error instead of trying to decode an
# error page as JSON, and bound the wait so a stalled connection cannot hang
# the cell forever.
ranking_response = rq.get(top_universities_ranking_url, timeout=30)
ranking_response.raise_for_status()
top_universities_json = ranking_response.json()['data']

In [4]:
# Peek at one raw ranking record (index 11) to see which fields the feed provides.
sample_record = top_universities_json[11]
print(sample_record)

{'nid': '294481', 'url': '/universities/ecole-polytechnique-f%C3%A9d%C3%A9rale-de-lausanne-epfl', 'title': 'Ecole Polytechnique Fédérale de Lausanne (EPFL)', 'logo': '<img src="https://www.topuniversities.com/sites/default/files/ecole-polytechnique-fdrale-de-lausanne-epfl_177_small_1.jpg" alt="Ecole Polytechnique Fédérale de Lausanne (EPFL) Logo">', 'core_id': '177', 'score': '91.2', 'rank_display': '12', 'country': 'Switzerland', 'cc': 'CH', 'region': 'Europe', 'stars': None, 'guide': '<a href="/where-to-study/europe/switzerland/guide" class="guide-link" target="_blank">Switzerland</a>'}


In [5]:
def match_number(text):
    r'''Extract the first number found in ``text`` and return it as a float.

    Grouping separators (commas or whitespace) sitting between two digits are
    removed first, so ``'1,679'`` reads as ``1679``. The first substring that
    matches ``[-+]?[0-9]*\.?[0-9]+`` is then converted. Matching the number
    (rather than deleting every other character) keeps stray punctuation in
    the surrounding text from being glued onto the digits: ``'No. 42'`` is
    ``42.0``, not ``0.42``.

    Args:
        text: string expected to contain a number.

    Returns:
        The number as a ``float``.

    Raises:
        ValueError: if ``text`` contains no number.
    '''
    # Only drop separators that are flanked by digits; anything else is left
    # in place so it cannot merge unrelated characters into the number.
    ungrouped = re.sub(r'(?<=[0-9])[,\s]+(?=[0-9])', '', text)
    number = re.search(r'[-+]?[0-9]*\.?[0-9]+', ungrouped)
    if number is None:
        raise ValueError('no number found in {!r}'.format(text))
    return float(number.group())

def extract_number(html, selector):
    '''Return the number held by the first element of ``html`` matching ``selector``.

    Args:
        html: parsed document exposing ``select_one`` (a BeautifulSoup object
            in this notebook).
        selector: CSS selector of the element whose text contains the number.

    Returns:
        The number parsed from the element's text by ``match_number``.

    Raises:
        ValueError: if no element matches ``selector`` (or, via
            ``match_number``, if the element's text holds no number).
    '''
    element = html.select_one(selector)
    if element is None:
        # select_one yields None when nothing matches; report the selector
        # instead of failing later with an opaque AttributeError on None.
        raise ValueError('no element matches selector {!r}'.format(selector))
    return match_number(element.get_text())

def _parse_rank(rank_display):
    '''Return the first integer found in a ``rank_display`` value.'''
    # Plain values such as '12' parse as before. Presumably the feed can also
    # carry tied or banded ranks such as '=14' or '501-550' (not visible in
    # the sample record; verify against the feed). For those the leading
    # integer is used instead of letting int() raise.
    leading_int = re.search(r'[0-9]+', str(rank_display))
    if leading_int is None:
        raise ValueError('no rank found in {!r}'.format(rank_display))
    return int(leading_int.group())


def fetch_university_infos(university):
    '''Fetch the details of one university from its topuniversities.com page.

    Name, rank, region and country are copied from the ranking record; the
    faculty and student head counts are scraped from the page found at the
    record's ``url``.

    Args:
        university: one ranking record (dict) with at least the keys
            ``'url'``, ``'title'``, ``'rank_display'``, ``'region'`` and
            ``'country'``.

    Returns:
        A dict with the keys ``name``, ``rank``, ``region``, ``country``,
        ``faculty_total``, ``faculty_inter``, ``faculty_local``,
        ``student_total``, ``student_inter`` and ``student_local``. Each
        ``*_local`` value is the total minus the international count.

    Raises:
        requests.HTTPError: if the university page request fails.
        ValueError: if the rank cannot be parsed; ``extract_number`` may
            raise as well when a count is missing from the page.
    '''
    # Bound the wait and surface HTTP failures here, rather than as a missing
    # element further down.
    university_request = rq.get(top_universities_url + university['url'], timeout=30)
    university_request.raise_for_status()
    university_html = bs(university_request.text, 'html.parser')

    def count(selector):
        # Head counts are whole numbers; extract_number hands back a float.
        return int(extract_number(university_html, selector))

    university_data = {
        'name': university['title'],
        'rank': _parse_rank(university['rank_display']),
        'region': university['region'],
        'country': university['country'],
    }
    university_data['faculty_total'] = count('div.faculty.total div.number')
    university_data['faculty_inter'] = count('div.faculty.inter div.number')
    university_data['faculty_local'] = university_data['faculty_total'] - university_data['faculty_inter']
    university_data['student_total'] = count('div.student.total div.number')
    # NOTE(review): this selector is 'div.inter.total', not 'div.student.inter'
    # as the other three would suggest. It is kept unchanged; presumably that
    # is how the page marks international students. Confirm against the page.
    university_data['student_inter'] = count('div.inter.total div.number')
    university_data['student_local'] = university_data['student_total'] - university_data['student_inter']

    return university_data

# Scrape the detail pages of the first 20 ranked universities into one table.
pd.DataFrame([fetch_university_infos(record) for record in top_universities_json[:20]])

Unnamed: 0,country,faculty_inter,faculty_local,faculty_total,name,rank,region,student_inter,student_local,student_total
0,United States,1679,1303,2982,Massachusetts Institute of Technology (MIT),1,North America,3717,7350,11067
1,United States,2042,2243,4285,Stanford University,2,North America,3611,12267,15878
2,United States,1311,3039,4350,Harvard University,3,North America,5266,17163,22429
3,United States,350,603,953,California Institute of Technology (Caltech),4,North America,647,1608,2255
4,United Kingdom,2278,3212,5490,University of Cambridge,5,Europe,6699,12071,18770
5,United Kingdom,2964,3786,6750,University of Oxford,6,Europe,7353,12367,19720
6,United Kingdom,2554,3791,6345,UCL (University College London),7,Europe,14854,16226,31080
7,United Kingdom,2071,1859,3930,Imperial College London,8,Europe,8746,7344,16090
8,United States,635,1814,2449,University of Chicago,9,North America,3379,10178,13557
9,Switzerland,1886,591,2477,ETH Zurich - Swiss Federal Institute of Techno...,10,Europe,7563,12252,19815
