In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests as rq
from bs4 import BeautifulSoup as bs
import re
import pickle

In [2]:
# Found using Postman and intercepting traffics while visiting topuniversities.com
top_universities_url = 'https://www.topuniversities.com'
top_universities_ranking_url = top_universities_url + '/sites/default/files/qs-rankings-data/357051.txt'

the_universities_url = 'https://www.timeshighereducation.com'
the_universities_ranking_url = the_universities_url + '/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'

## Top Universities

In [4]:
def match_number(text):
    ''' [-+]?[0-9]*\.?[0-9]+ '''
    return float(re.sub('[^-+0-9.]', '', text))

def extract_number(html, selector):
    number_tag = html.select_one(selector)
    if number_tag is None:
        return float('NaN')
    else:
        return float(match_number(number_tag.get_text()))

def fetch_top_university_infos(university):
    university_request = rq.get(top_universities_url + university['url'])
    university_html = bs(university_request.text, 'html.parser')
    university_data = dict()
    
    university_data['name'] = university['title']
    university_data['rank'] = int(match_number(university['rank_display']))
    university_data['region'] = university['region']
    university_data['country'] = university['country']
    university_data['faculty_total'] = extract_number(university_html, 'div.faculty.total div.number')
    university_data['faculty_inter'] = extract_number(university_html, 'div.faculty.inter div.number')
    university_data['faculty_local'] = university_data['faculty_total'] - university_data['faculty_inter']
    university_data['student_total'] = extract_number(university_html, 'div.student.total div.number')
    university_data['student_inter'] = extract_number(university_html, 'div.inter.total div.number')
    university_data['student_local'] = university_data['student_total'] - university_data['student_inter']
    
    return university_data

try:
    top_universities_file = open('top_universities.pickle', 'rb')
    top_universities = pickle.load(top_universities_file)
except:
    top_universities_json = (rq.get(top_universities_ranking_url).json())['data']
    top_universities = pd.DataFrame(list(map(fetch_top_university_infos, top_universities_json[0:200])))
    
    top_universities.region = top_universities.region.astype('category')
    top_universities.country = top_universities.country.astype('category')
    
    top_universities_file = open('top_universities.pickle', 'wb')
    pickle.dump(top_universities, top_universities_file)
    
# Create the mapping country => region for the other scrapper
country_regions = dict()
for country_region_pair, universities in top_universities.groupby(['country', 'region']):
    country_regions[country_region_pair[0]] = country_region_pair[1]

In [None]:
top_universities

In [5]:
def ratio_stats(universities):
    universities['faculty_student_ratio'] = universities.faculty_total / universities.student_total
    universities['inter_student_ratio'] = universities.student_inter / universities.student_total
    
    top_by_faculty = (universities.sort_values('faculty_student_ratio', ascending = False))[:20]
    top_by_inter = (universities.sort_values('inter_student_ratio', ascending = False))[:20]
    
    return (top_by_faculty, top_by_inter)

In [6]:
top_universities_by_faculty, top_universities_by_inter = ratio_stats(top_universities)

top_country = top_universities.groupby('country').sum()
top_country_by_faculty, top_country_by_inter = ratio_stats(top_country)

top_regions = top_universities.groupby('region').sum()
top_regions_by_faculty, top_regions_by_inter = ratio_stats(top_regions)

top_country_by_inter

Unnamed: 0_level_0,faculty_inter,faculty_local,faculty_total,student_inter,student_local,student_total,top_rank,faculty_student_ratio,inter_student_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia,11382.0,10652.0,22034.0,106359.0,195635.0,301994.0,641,0.072962,0.352189
United Kingdom,30216.0,49718.0,79934.0,199426.0,384195.0,583621.0,2462,0.136962,0.341705
Hong Kong,6296.0,3870.0,10166.0,24499.0,54339.0,78838.0,246,0.128948,0.310751
Austria,1572.0,2545.0,4117.0,19667.0,43779.0,63446.0,336,0.06489,0.30998
Switzerland,9208.0,6115.0,15323.0,32995.0,76117.0,109112.0,655,0.140434,0.302396
Singapore,6079.0,3365.0,9444.0,16168.0,42298.0,58466.0,26,0.16153,0.276537
Canada,10734.0,18583.0,29317.0,73239.0,208275.0,281514.0,626,0.10414,0.260161
New Zealand,1404.0,1909.0,3313.0,12439.0,35734.0,48173.0,233,0.068773,0.258215
Ireland,1171.0,1682.0,2853.0,8187.0,26607.0,34794.0,256,0.081997,0.235299
Netherlands,5683.0,14604.0,20287.0,46044.0,151587.0,197631.0,1197,0.102651,0.23298


## Times Higher Education

In [7]:
import math

def fetch_the_university_infos(university):
    university_data = dict()
    
    student_faculty_ratio = match_number(university['stats_student_staff_ratio'])
    inter_student_ratio = match_number(university['stats_pc_intl_students']) / 100.0
    
    university_data['name'] = university['name']
    university_data['rank'] = int(match_number(university['rank']))
    university_data['region'] = country_regions[university['location']]
    university_data['country'] = university['location']
    university_data['student_total'] = int(match_number(university['stats_number_students']))
    university_data['student_inter'] = int(university_data['student_total'] * inter_student_ratio)
    university_data['student_local'] = university_data['student_total'] - university_data['student_inter']
    university_data['faculty_total'] = int(university_data['student_total'] / student_faculty_ratio)
    university_data['faculty_inter'] = float('NaN')
    university_data['faculty_local'] = float('NaN')
    
    return university_data
    
try:
    the_universities_file = open('the_universities.pickle', 'rb')
    the_universities = pickle.load(the_universities_file)
except:
    the_universities_json = (rq.get(the_universities_ranking_url).json())['data']
    the_universities = pd.DataFrame(list(map(fetch_the_university_infos, the_universities_json[0:200])))
        
    the_universities.region = the_universities.region.astype('category')
    the_universities.country = the_universities.country.astype('category')
    
    the_universities_file = open('the_universities.pickle', 'wb')
    pickle.dump(the_universities, the_universities_file)

In [None]:
the_universities

In [8]:
the_universities_by_faculty, the_universities_by_inter = ratio_stats(the_universities)

the_country = the_universities.groupby('country').sum()
the_country_by_faculty, the_country_by_inter = ratio_stats(the_country)

the_regions = the_universities.groupby('region').sum()
the_regions_by_faculty, the_regions_by_inter = ratio_stats(the_regions)

the_country_by_inter

Unnamed: 0_level_0,faculty_inter,faculty_local,faculty_total,student_inter,student_local,student_total,the_rank,faculty_student_ratio,inter_student_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Luxembourg,,,340,2832,2137,4969,179,0.068424,0.569934
United Kingdom,,,44425,213045,383404,596449,3428,0.074482,0.357189
Hong Kong,,,4140,25157,52506,77663,443,0.053307,0.323925
Australia,,,9937,83807,184823,268630,616,0.036991,0.311979
Singapore,,,3364,17084,39017,56101,74,0.059963,0.304522
Switzerland,,,10048,32743,75109,107852,666,0.093165,0.303592
New Zealand,,,1614,8800,21548,30348,192,0.053183,0.28997
Ireland,,,708,4362,11795,16157,117,0.04382,0.269976
Austria,,,1700,9197,26178,35375,165,0.048057,0.259986
Canada,,,13236,55902,193499,249401,403,0.053071,0.224145


# Merging datasets

In [10]:
from difflib import SequenceMatcher
from collections import defaultdict
from operator import itemgetter

def proximity(a, b):
    return SequenceMatcher(None, a, b).ratio()

proximities = dict()
for top_index, top_uni in top_universities.iterrows():
    for the_index, the_uni in the_universities.iterrows():
        if top_uni['country'] != the_uni['country']:
            continue
        
        prox = proximity(top_uni['name'], the_uni['name'])
        if prox > .5:
            proximities[(top_index, the_index)] = prox
        
proximities

{(0, 2): 0.6753246753246753,
 (0, 4): 0.925,
 (0, 32): 0.6756756756756757,
 (1, 3): 1.0,
 (1, 5): 0.7567567567567568,
 (1, 6): 0.6666666666666666,
 (1, 11): 0.7058823529411765,
 (1, 12): 0.6046511627906976,
 (1, 13): 0.631578947368421,
 (1, 16): 0.6470588235294118,
 (1, 18): 0.7027027027027027,
 (1, 19): 0.6190476190476191,
 (1, 23): 0.6222222222222222,
 (1, 27): 0.6842105263157895,
 (1, 49): 0.6857142857142857,
 (1, 59): 0.7222222222222222,
 (1, 69): 0.7222222222222222,
 (1, 70): 0.7,
 (1, 76): 0.5833333333333334,
 (1, 83): 0.6363636363636364,
 (1, 85): 0.6470588235294118,
 (1, 97): 0.7428571428571429,
 (1, 105): 0.7,
 (1, 116): 0.7027027027027027,
 (1, 122): 0.65,
 (1, 125): 0.6511627906976745,
 (1, 157): 0.56,
 (1, 159): 0.6153846153846154,
 (1, 169): 0.6857142857142857,
 (1, 185): 0.5263157894736842,
 (1, 192): 0.6190476190476191,
 (2, 3): 0.7567567567567568,
 (2, 5): 1.0,
 (2, 6): 0.631578947368421,
 (2, 8): 0.5128205128205128,
 (2, 11): 0.7272727272727273,
 (2, 12): 0.57142857142

In [41]:
best_matches = sorted(proximities.items(), key = itemgetter(1), reverse = True)
top_universities['matched'] = False
the_universities['matched'] = False

merged_universities = list()
for indexes, proximity in best_matches:
    top_uni, the_uni = top_universities.loc[indexes[0]], the_universities.loc[indexes[1]]
        
    if not top_uni['matched'] and not the_uni['matched']:
        top_universities.loc[indexes[0],'matched'] = True
        the_universities.loc[indexes[1],'matched'] = True
        merged_uni = pd.concat([top_uni, the_uni], axis = 1)
        merged_universities.append(merged_uni)
        
merged_universities = pd.DataFrame(merged_universities)
merged_universities

Unnamed: 0,0
0,1 ...
1,2 ...
2,4...
3,5 ...
4,7...
5,8 ...
6,12 ...
7,13 ...
8,15 ...
9,1...


In [39]:
top_universities[top_universities.matched == False]

Unnamed: 0,country,faculty_inter,faculty_local,faculty_total,name,region,student_inter,student_local,student_total,top_rank,faculty_student_ratio,inter_student_ratio,matched
42,France,75.0,103.0,178.0,"Ecole normale supérieure, Paris",Europe,374.0,1533.0,1907.0,43,0.09334,0.19612,False
55,Japan,191.0,1372.0,1563.0,Tokyo Institute of Technology,Asia,1071.0,8761.0,9832.0,56,0.158971,0.10893,False
62,Japan,296.0,2518.0,2814.0,Osaka University,Asia,2106.0,20654.0,22760.0,63,0.123638,0.092531,False
65,Germany,674.0,2600.0,3274.0,Ludwig-Maximilians-Universität München,Europe,5084.0,29971.0,35055.0,66,0.093396,0.145029,False
67,Germany,756.0,3152.0,3908.0,Ruprecht-Karls-Universität Heidelberg,Europe,5298.0,23554.0,28852.0,68,0.13545,0.183627,False
74,Argentina,3165.0,13256.0,16421.0,Universidad de Buenos Aires (UBA),Latin America,27109.0,95192.0,122301.0,75,0.134267,0.221658,False
75,Japan,264.0,3147.0,3411.0,Tohoku University,Asia,1604.0,16223.0,17827.0,76,0.191339,0.089976,False
87,Ireland,544.0,705.0,1249.0,"Trinity College Dublin, The University of Dublin",Europe,4311.0,10650.0,14961.0,88,0.083484,0.288149,False
89,South Korea,339.0,3406.0,3745.0,Korea University,Asia,3638.0,22254.0,25892.0,90,0.144639,0.140507,False
94,Russia,373.0,6336.0,6709.0,Lomonosov Moscow State University,Europe,5098.0,25135.0,30233.0,95,0.22191,0.168624,False


In [40]:
the_universities[the_universities.matched == False]

Unnamed: 0,0
0,1 ...
1,2 ...
2,4...
3,5 ...
4,7...
5,8 ...
6,12 ...
7,13 ...
8,15 ...
9,1...
