In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests as rq
from bs4 import BeautifulSoup as bs
import re
import pickle

In [6]:
# Site root, plus the JSON ranking endpoint that was found with Postman by
# intercepting traffic while browsing topuniversities.com.
top_universities_url = 'https://www.topuniversities.com'
top_universities_ranking_url = '{}{}'.format(
    top_universities_url,
    '/sites/default/files/qs-rankings-data/357051.txt',
)

In [36]:
def match_number(text):
    r'''Parse the first number contained in ``text`` and return it as a float.

    Every character other than digits, ``+``, ``-`` and ``.`` is dropped first
    (so thousands separators and decorations such as ``=`` disappear), then the
    first substring matching ``[-+]?[0-9]*\.?[0-9]+`` is converted. Text such
    as ``'51-60'`` therefore yields ``51.0``.

    Raises:
        ValueError: if ``text`` contains no number.
    '''
    cleaned = re.sub(r'[^-+0-9.]', '', text)
    found = re.search(r'[-+]?[0-9]*\.?[0-9]+', cleaned)
    if found is None:
        raise ValueError('no number found in {!r}'.format(text))
    return float(found.group())

def extract_number(html, selector):
    '''Return the number held by the first element matching ``selector``.

    ``html`` is queried with ``select_one`` and the matched element's text is
    parsed with ``match_number``. NaN is returned when nothing matches the
    selector, or when the matched text contains no parseable number, so a
    single incomplete page does not abort a whole scrape.
    '''
    number_tag = html.select_one(selector)
    if number_tag is None:
        return float('NaN')
    try:
        return match_number(number_tag.get_text())
    except ValueError:
        # Element present but without digits: treat it like a missing value.
        return float('NaN')

def fetch_university_infos(university, timeout=30):
    '''Fetch one university's detail page and collect its headline figures.

    Args:
        university: one ranking entry; the keys ``url``, ``title``,
            ``rank_display``, ``region`` and ``country`` are read.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        dict with the keys ``name``, ``rank``, ``region``, ``country`` and the
        faculty/student counts (``*_total``, ``*_inter``, ``*_local``). The
        local counts are total minus international, so they are NaN whenever
        either of those two values is NaN.
    '''
    university_request = rq.get(top_universities_url + university['url'], timeout=timeout)
    # NOTE(review): the HTTP status is not checked; an error page presumably
    # just yields NaN for every count below -- confirm this is intended.
    university_html = bs(university_request.text, 'html.parser')
    university_data = dict()

    university_data['name'] = university['title']
    university_data['rank'] = int(match_number(university['rank_display']))
    university_data['region'] = university['region']
    university_data['country'] = university['country']
    university_data['faculty_total'] = extract_number(university_html, 'div.faculty.total div.number')
    university_data['faculty_inter'] = extract_number(university_html, 'div.faculty.inter div.number')
    university_data['faculty_local'] = university_data['faculty_total'] - university_data['faculty_inter']
    university_data['student_total'] = extract_number(university_html, 'div.student.total div.number')
    # International students are read from the element carrying both the
    # "inter" and "total" classes.
    university_data['student_inter'] = extract_number(university_html, 'div.inter.total div.number')
    university_data['student_local'] = university_data['student_total'] - university_data['student_inter']

    return university_data

# Load the ranking from the local pickle cache when possible; otherwise scrape
# the first 200 ranking entries and write the cache for the next run.
# NOTE: pickle.load can execute arbitrary code -- only load a cache file that
# this notebook produced itself.
try:
    with open('top_universities.pickle', 'rb') as top_universities_file:
        top_universities = pickle.load(top_universities_file)
except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError, IndexError):
    # Cache missing or unreadable: rebuild it from the website.
    top_universities_json = rq.get(top_universities_ranking_url, timeout=30).json()['data']
    top_universities = pd.DataFrame(
        [fetch_university_infos(entry) for entry in top_universities_json[0:200]]
    )
    # The context manager closes (and therefore flushes) the cache file.
    with open('top_universities.pickle', 'wb') as top_universities_file:
        pickle.dump(top_universities, top_universities_file)

In [37]:
# Bare last expression of the cell: renders top_universities as the cell output.
top_universities

Unnamed: 0,country,faculty_inter,faculty_local,faculty_total,name,rank,region,student_inter,student_local,student_total
0,United States,1679.0,1303.0,2982.0,Massachusetts Institute of Technology (MIT),1,North America,3717.0,7350.0,11067.0
1,United States,2042.0,2243.0,4285.0,Stanford University,2,North America,3611.0,12267.0,15878.0
2,United States,1311.0,3039.0,4350.0,Harvard University,3,North America,5266.0,17163.0,22429.0
3,United States,350.0,603.0,953.0,California Institute of Technology (Caltech),4,North America,647.0,1608.0,2255.0
4,United Kingdom,2278.0,3212.0,5490.0,University of Cambridge,5,Europe,6699.0,12071.0,18770.0
5,United Kingdom,2964.0,3786.0,6750.0,University of Oxford,6,Europe,7353.0,12367.0,19720.0
6,United Kingdom,2554.0,3791.0,6345.0,UCL (University College London),7,Europe,14854.0,16226.0,31080.0
7,United Kingdom,2071.0,1859.0,3930.0,Imperial College London,8,Europe,8746.0,7344.0,16090.0
8,United States,635.0,1814.0,2449.0,University of Chicago,9,North America,3379.0,10178.0,13557.0
9,Switzerland,1886.0,591.0,2477.0,ETH Zurich - Swiss Federal Institute of Techno...,10,Europe,7563.0,12252.0,19815.0


In [50]:
# Per-university ratios, stored as new columns on top_universities.
top_universities['faculty_student_ratio'] = top_universities.faculty_total / top_universities.student_total
top_universities['inter_student_ratio'] = top_universities.student_inter / top_universities.student_total

# Universities ordered by each ratio, highest first.
top_universities_by_faculty = top_universities.sort_values('faculty_student_ratio', ascending=False)
top_universities_by_inter = top_universities.sort_values('inter_student_ratio', ascending=False)

# Regional figures: sum the head counts per region, then derive the ratios
# from those sums (the summed per-university ratios are overwritten below).
# numeric_only=True keeps text columns such as name/country out of the sum;
# pandas >= 2.0 no longer drops them by default.
# NOTE(review): the summed `rank` column is kept so the table keeps its
# columns, but a sum of ranks is not a meaningful quantity.
top_regions = top_universities.groupby('region').sum(numeric_only=True)
top_regions['faculty_student_ratio'] = top_regions.faculty_total / top_regions.student_total
top_regions['inter_student_ratio'] = top_regions.student_inter / top_regions.student_total
top_regions

Unnamed: 0_level_0,faculty_inter,faculty_local,faculty_total,rank,student_inter,student_local,student_total,faculty_student_ratio,inter_student_ratio
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Africa,379.0,1354.0,1733.0,191,3325.0,16268.0,19593.0,0.08845,0.169703
Asia,25462.0,80849.0,106734.0,3369,110100.0,696903.0,807003.0,0.13226,0.136431
Europe,67598.0,150760.0,218358.0,10199,449364.0,1507887.0,1957251.0,0.111564,0.229589
Latin America,5648.0,39734.0,45382.0,1037,36871.0,398879.0,435750.0,0.104147,0.084615
North America,43836.0,138287.0,182123.0,4379,292116.0,1254237.0,1546353.0,0.117776,0.188906
Oceania,12786.0,12561.0,25347.0,874,118798.0,231369.0,350167.0,0.072385,0.339261
