In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests as rq
from bs4 import BeautifulSoup as bs
import re
import pickle

In [12]:
# Ranking data endpoints, found with Postman by intercepting the network
# traffic while browsing the ranking sites.
top_universities_url = 'https://www.topuniversities.com'
top_universities_ranking_url = ''.join((
    top_universities_url,
    '/sites/default/files/qs-rankings-data/357051.txt',
))

the_universities_url = 'https://www.timeshighereducation.com'
the_universities_ranking_url = ''.join((
    the_universities_url,
    '/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json',
))

In [97]:
def match_number(text):
    r'''Return the first number found in ``text`` as a float.

    Commas (thousands separators, as in "20,409") are removed first, then the
    first token matching ``[-+]?[0-9]*\.?[0-9]+`` is parsed. Surrounding
    decoration such as "=51" or "38%" is ignored, and for a range such as
    "201-250" only the first number is used instead of gluing the digits of
    both numbers together.

    Args:
        text: string containing a number.

    Returns:
        The parsed number as a float.

    Raises:
        ValueError: if ``text`` contains no number.
    '''
    number = re.search(r'[-+]?[0-9]*\.?[0-9]+', re.sub(',', '', text))
    if number is None:
        raise ValueError('could not find a number in {!r}'.format(text))
    return float(number.group())

def extract_number(html, selector):
    """Read the number displayed by the first element matching ``selector``.

    Args:
        html: parsed document exposing ``select_one``.
        selector: CSS selector of the element holding the number.

    Returns:
        The element's text converted with ``match_number``, or NaN when no
        element matches the selector.
    """
    tag = html.select_one(selector)
    if tag is not None:
        return float(match_number(tag.get_text()))
    # Nothing matched: report the value as missing.
    return float('NaN')

def fetch_top_university_infos(university):
    """Build one flat record for an entry of the topuniversities.com ranking.

    Downloads the university's detail page (``top_universities_url`` +
    ``university['url']``) and reads the head-counts from it with
    ``extract_number``. A count whose element is not found comes back as NaN,
    which also makes the derived ``*_local`` value NaN.

    Args:
        university: one ranking entry; the keys 'url', 'title',
            'rank_display', 'region' and 'country' are read.

    Returns:
        dict with the keys 'name', 'top_rank', 'region', 'country',
        'faculty_total', 'faculty_inter', 'faculty_local', 'student_total',
        'student_inter' and 'student_local'.
    """
    response = rq.get(top_universities_url + university['url'])
    page = bs(response.text, 'html.parser')

    # Fields copied straight from the ranking entry.
    record = {
        'name': university['title'],
        'top_rank': int(match_number(university['rank_display'])),
        'region': university['region'],
        'country': university['country'],
    }

    # Faculty head-counts scraped from the detail page.
    faculty_total = extract_number(page, 'div.faculty.total div.number')
    faculty_inter = extract_number(page, 'div.faculty.inter div.number')
    record['faculty_total'] = faculty_total
    record['faculty_inter'] = faculty_inter
    record['faculty_local'] = faculty_total - faculty_inter

    # Student head-counts.
    # NOTE(review): the international-student selector is 'div.inter.total',
    # not 'div.student.inter' -- presumably this mirrors the page markup; confirm.
    student_total = extract_number(page, 'div.student.total div.number')
    student_inter = extract_number(page, 'div.inter.total div.number')
    record['student_total'] = student_total
    record['student_inter'] = student_inter
    record['student_local'] = student_total - student_inter

    return record

# Load the scraped topuniversities.com table from the local pickle cache when
# it is available; otherwise scrape the first 200 ranking entries and write
# the cache for the next run.
try:
    # NOTE(security): pickle.load can execute arbitrary code -- only load a
    # cache file that this notebook wrote itself.
    with open('top_universities.pickle', 'rb') as top_universities_file:
        top_universities = pickle.load(top_universities_file)
except (OSError, pickle.UnpicklingError, AttributeError, EOFError, ImportError, IndexError):
    # Missing or unreadable cache (these are the errors unpickling is
    # documented to raise): rebuild the table from the website. Unlike a bare
    # ``except:``, this lets KeyboardInterrupt and unrelated bugs surface.
    top_universities_json = (rq.get(top_universities_ranking_url).json())['data']
    top_universities = pd.DataFrame(list(map(fetch_top_university_infos, top_universities_json[0:200])))

    top_universities.region = top_universities.region.astype('category')
    top_universities.country = top_universities.country.astype('category')

    # The context manager closes the file, so the cache is fully flushed to disk.
    with open('top_universities.pickle', 'wb') as top_universities_file:
        pickle.dump(top_universities, top_universities_file)
    
# Create the mapping country => region for the other scraper.
# The dict is created here when it does not exist yet, so this cell also runs
# on a fresh kernel; a mapping that already exists is extended, not replaced.
if 'country_regions' not in globals():
    country_regions = {}
# Each row contributes its own (country, region) pair.
country_regions.update(zip(top_universities['country'], top_universities['region']))

In [71]:
# Preview the first rows only instead of dumping the whole table.
top_universities.head(10)

Unnamed: 0,country,faculty_inter,faculty_local,faculty_total,name,region,student_inter,student_local,student_total,top_rank
0,United States,1679.0,1303.0,2982.0,Massachusetts Institute of Technology (MIT),North America,3717.0,7350.0,11067.0,1
1,United States,2042.0,2243.0,4285.0,Stanford University,North America,3611.0,12267.0,15878.0,2
2,United States,1311.0,3039.0,4350.0,Harvard University,North America,5266.0,17163.0,22429.0,3
3,United States,350.0,603.0,953.0,California Institute of Technology (Caltech),North America,647.0,1608.0,2255.0,4
4,United Kingdom,2278.0,3212.0,5490.0,University of Cambridge,Europe,6699.0,12071.0,18770.0,5
5,United Kingdom,2964.0,3786.0,6750.0,University of Oxford,Europe,7353.0,12367.0,19720.0,6
6,United Kingdom,2554.0,3791.0,6345.0,UCL (University College London),Europe,14854.0,16226.0,31080.0,7
7,United Kingdom,2071.0,1859.0,3930.0,Imperial College London,Europe,8746.0,7344.0,16090.0,8
8,United States,635.0,1814.0,2449.0,University of Chicago,North America,3379.0,10178.0,13557.0,9
9,Switzerland,1886.0,591.0,2477.0,ETH Zurich - Swiss Federal Institute of Techno...,Europe,7563.0,12252.0,19815.0,10


In [79]:
def ratio_stats(universities, top_n=20):
    """Add ratio columns to ``universities`` and rank its rows by each ratio.

    Two columns are written onto the passed-in frame (in place):
    ``faculty_student_ratio`` (faculty_total / student_total) and
    ``inter_student_ratio`` (student_inter / student_total).

    Args:
        universities: DataFrame with the columns 'faculty_total',
            'student_total' and 'student_inter'.
        top_n: number of rows to keep in each ranking (default 20); pass
            ``None`` to keep every row.

    Returns:
        Tuple ``(top_by_faculty, top_by_inter)``: the ``top_n`` rows with the
        highest faculty/student ratio and the ``top_n`` rows with the highest
        international-student ratio, each sorted in descending order.
    """
    universities['faculty_student_ratio'] = universities.faculty_total / universities.student_total
    universities['inter_student_ratio'] = universities.student_inter / universities.student_total

    top_by_faculty = universities.sort_values('faculty_student_ratio', ascending=False)[:top_n]
    top_by_inter = universities.sort_values('inter_student_ratio', ascending=False)[:top_n]

    return (top_by_faculty, top_by_inter)

In [85]:
# Rankings of the individual universities by each ratio.
top_universities_by_faculty, top_universities_by_inter = ratio_stats(top_universities)

# Sum the counts per country, then recompute the ratios on the totals.
# numeric_only=True keeps the text/categorical columns (name, region) out of
# the sum, which newer pandas versions would otherwise try to aggregate.
top_country = top_universities.groupby('country').sum(numeric_only=True)
top_country_by_faculty, top_country_by_inter = ratio_stats(top_country)

# Same aggregation per region.
top_regions = top_universities.groupby('region').sum(numeric_only=True)
top_regions_by_faculty, top_regions_by_inter = ratio_stats(top_regions)

top_country_by_inter

Unnamed: 0_level_0,faculty_inter,faculty_local,faculty_total,student_inter,student_local,student_total,top_rank,faculty_student_ratio,inter_student_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia,11382.0,10652.0,22034.0,106359.0,195635.0,301994.0,641,0.072962,0.352189
United Kingdom,30216.0,49718.0,79934.0,199426.0,384195.0,583621.0,2462,0.136962,0.341705
Hong Kong,6296.0,3870.0,10166.0,24499.0,54339.0,78838.0,246,0.128948,0.310751
Austria,1572.0,2545.0,4117.0,19667.0,43779.0,63446.0,336,0.06489,0.30998
Switzerland,9208.0,6115.0,15323.0,32995.0,76117.0,109112.0,655,0.140434,0.302396
Singapore,6079.0,3365.0,9444.0,16168.0,42298.0,58466.0,26,0.16153,0.276537
Canada,10734.0,18583.0,29317.0,73239.0,208275.0,281514.0,626,0.10414,0.260161
New Zealand,1404.0,1909.0,3313.0,12439.0,35734.0,48173.0,233,0.068773,0.258215
Ireland,1171.0,1682.0,2853.0,8187.0,26607.0,34794.0,256,0.081997,0.235299
Netherlands,5683.0,14604.0,20287.0,46044.0,151587.0,197631.0,1197,0.102651,0.23298


In [98]:
import math

def fetch_the_university_infos(university):
    """Build one flat record for an entry of the timeshighereducation.com ranking.

    Head-counts are derived from the entry itself: the international student
    count from the percentage of international students, and the faculty
    count from the student/staff ratio (both truncated to int). The
    international/local faculty split is not derived and is set to NaN.

    Args:
        university: one ranking entry; the keys 'name', 'rank', 'location',
            'stats_number_students', 'stats_student_staff_ratio' and
            'stats_pc_intl_students' are read.

    Returns:
        dict with the keys 'name', 'the_rank', 'region', 'country',
        'student_total', 'student_inter', 'student_local', 'faculty_total',
        'faculty_inter' and 'faculty_local'.

    Raises:
        KeyError: if the entry's location is not a key of ``country_regions``.
    """
    students_per_staff = match_number(university['stats_student_staff_ratio'])
    # The share of international students is given as a percentage.
    inter_share = match_number(university['stats_pc_intl_students']) / 100.0

    record = {
        'name': university['name'],
        'the_rank': int(match_number(university['rank'])),
        'region': country_regions[university['location']],
        'country': university['location'],
    }

    students = int(match_number(university['stats_number_students']))
    foreign_students = int(students * inter_share)
    record['student_total'] = students
    record['student_inter'] = foreign_students
    record['student_local'] = students - foreign_students
    record['faculty_total'] = int(students / students_per_staff)
    # No international/local faculty breakdown is computed for this ranking.
    record['faculty_inter'] = float('NaN')
    record['faculty_local'] = float('NaN')

    return record
    
# Load the timeshighereducation.com table from the local pickle cache when it
# is available; otherwise build it from the first 200 entries of the ranking
# JSON and write the cache for the next run.
try:
    # NOTE(security): pickle.load can execute arbitrary code -- only load a
    # cache file that this notebook wrote itself.
    with open('the_universities.pickle', 'rb') as the_universities_file:
        the_universities = pickle.load(the_universities_file)
except (OSError, pickle.UnpicklingError, AttributeError, EOFError, ImportError, IndexError):
    # Missing or unreadable cache (these are the errors unpickling is
    # documented to raise): rebuild the table from the website. Unlike a bare
    # ``except:``, this lets KeyboardInterrupt and unrelated bugs surface.
    the_universities_json = (rq.get(the_universities_ranking_url).json())['data']
    the_universities = pd.DataFrame(list(map(fetch_the_university_infos, the_universities_json[0:200])))

    the_universities.region = the_universities.region.astype('category')
    the_universities.country = the_universities.country.astype('category')

    # The context manager closes the file, so the cache is fully flushed to disk.
    with open('the_universities.pickle', 'wb') as the_universities_file:
        pickle.dump(the_universities, the_universities_file)

In [99]:
# Preview the first rows only instead of dumping the whole table.
the_universities.head(10)

Unnamed: 0,country,faculty_inter,faculty_local,faculty_total,name,region,student_inter,student_local,student_total,the_rank
0,United Kingdom,,,1822,University of Oxford,Europe,7755,12654,20409,1
1,United Kingdom,,,1687,University of Cambridge,Europe,6436,11953,18389,2
2,United States,,,339,California Institute of Technology,North America,596,1613,2209,3
3,United States,,,2112,Stanford University,North America,3485,12360,15845,3
4,United States,,,1284,Massachusetts Institute of Technology,North America,3800,7377,11177,5
5,United States,,,2283,Harvard University,North America,5284,15042,20326,6
6,United States,,,958,Princeton University,North America,1909,6046,7955,7
7,United Kingdom,,,1390,Imperial College London,Europe,8721,7136,15857,8
8,United States,,,2181,University of Chicago,North America,3381,10144,13525,9
9,Switzerland,,,1317,ETH Zurich – Swiss Federal Institute of Techno...,Europe,7308,11925,19233,10


In [100]:
# Rankings of the individual universities by each ratio.
the_universities_by_faculty, the_universities_by_inter = ratio_stats(the_universities)

# Sum the counts per country, then recompute the ratios on the totals.
# numeric_only=True keeps the text/categorical columns (name, region) out of
# the sum, which newer pandas versions would otherwise try to aggregate.
the_country = the_universities.groupby('country').sum(numeric_only=True)
the_country_by_faculty, the_country_by_inter = ratio_stats(the_country)

# Same aggregation per region.
the_regions = the_universities.groupby('region').sum(numeric_only=True)
the_regions_by_faculty, the_regions_by_inter = ratio_stats(the_regions)

the_country_by_inter

Unnamed: 0_level_0,faculty_inter,faculty_local,faculty_total,student_inter,student_local,student_total,the_rank,faculty_student_ratio,inter_student_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Luxembourg,,,340,2832,2137,4969,179,0.068424,0.569934
United Kingdom,,,44425,213045,383404,596449,3428,0.074482,0.357189
Hong Kong,,,4140,25157,52506,77663,443,0.053307,0.323925
Australia,,,9937,83807,184823,268630,616,0.036991,0.311979
Singapore,,,3364,17084,39017,56101,74,0.059963,0.304522
Switzerland,,,10048,32743,75109,107852,666,0.093165,0.303592
New Zealand,,,1614,8800,21548,30348,192,0.053183,0.28997
Ireland,,,708,4362,11795,16157,117,0.04382,0.269976
Austria,,,1700,9197,26178,35375,165,0.048057,0.259986
Canada,,,13236,55902,193499,249401,403,0.053071,0.224145
