In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests as rq
from bs4 import BeautifulSoup as bs
import re
import pickle

In [2]:
# Ranking data endpoints.
# Found by intercepting traffic with Postman while visiting topuniversities.com

# Site roots (also used as the prefix of relative links found in the data)
top_universities_url = 'https://www.topuniversities.com'
the_universities_url = 'https://www.timeshighereducation.com'

# Raw ranking files served by each site
top_universities_ranking_url = (
    top_universities_url
    + '/sites/default/files/qs-rankings-data/357051.txt'
)
the_universities_ranking_url = (
    the_universities_url
    + '/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'
)

## Top Universities

In [4]:
# Extract the first number embedded in a piece of text
def match_number(text):
    r"""Return the first number found in ``text`` as a float.

    Thousands separators (``,``) are removed, then the first substring
    matching ``[-+]?[0-9]*\.?[0-9]+`` is converted. Surrounding decoration
    such as ``=``, ``%``, words or whitespace is ignored, and for a range
    like ``'501-550'`` the lower bound is returned.

    Parameters:
        text: the value to parse; it is converted with ``str()`` first.

    Returns:
        float: the parsed number.

    Raises:
        ValueError: if ``text`` contains no number.
    """
    # Searching for one well-formed number (instead of deleting every
    # non-numeric character) keeps stray '.', '-' or a second number in the
    # text from being glued onto the digits we want.
    cleaned = str(text).replace(',', '')
    found = re.search(r'[-+]?[0-9]*\.?[0-9]+', cleaned)
    if found is None:
        raise ValueError('no number found in {!r}'.format(text))
    return float(found.group())

def extract_number(html, selector):
    """Read a float from the first element of ``html`` matching ``selector``.

    ``html`` must offer ``select_one(selector)``. When nothing matches, NaN is
    returned; otherwise the element's text is handed to ``match_number``.
    """
    number_tag = html.select_one(selector)

    # Guard clause: a matching element exists, so parse its text
    if number_tag is not None:
        return match_number(number_tag.get_text())

    # The value is absent from the document
    return float('NaN')

def fetch_top_university_infos(university):
    """Build one flat record for an entry of the Top Universities ranking.

    The name, rank, region and country come from the ranking entry itself;
    the faculty and student head counts are scraped from the university's
    own page, whose relative link is ``university['url']``.

    Parameters:
        university: mapping with the keys ``'url'``, ``'title'``,
            ``'rank_display'``, ``'region'`` and ``'country'``.

    Returns:
        dict with the keys ``name``, ``rank``, ``region``, ``country``,
        ``faculty_total``, ``faculty_inter``, ``faculty_local``,
        ``student_total``, ``student_inter`` and ``student_local``. A count
        that is missing from the page is NaN, and so is the local count
        derived from it.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the timeout expires).
    """
    # Fetch the Webpage of the university. The timeout keeps a single
    # unresponsive page from hanging the whole scrape indefinitely.
    university_request = rq.get(top_universities_url + university['url'], timeout=30)
    university_html = bs(university_request.text, 'html.parser')
    university_data = dict()

    # Scrap data from the JSON and the Webpage
    university_data['name'] = university['title']
    university_data['rank'] = int(match_number(university['rank_display']))
    university_data['region'] = university['region']
    university_data['country'] = university['country']
    university_data['faculty_total'] = extract_number(university_html, 'div.faculty.total div.number')
    university_data['faculty_inter'] = extract_number(university_html, 'div.faculty.inter div.number')
    university_data['faculty_local'] = university_data['faculty_total'] - university_data['faculty_inter']
    university_data['student_total'] = extract_number(university_html, 'div.student.total div.number')
    # NOTE(review): international students are read from 'div.inter.total'
    # rather than 'div.student.inter' - presumably that is the page's actual
    # markup; confirm against a live page.
    university_data['student_inter'] = extract_number(university_html, 'div.inter.total div.number')
    university_data['student_local'] = university_data['student_total'] - university_data['student_inter']

    return university_data

try:
    # Try to load the serialized data.
    # NOTE: unpickling can execute arbitrary code - only load a cache file
    # that this notebook wrote itself.
    with open('top_universities.pickle', 'rb') as top_universities_file:
        top_universities = pickle.load(top_universities_file)
except Exception:
    # If loading from the serialized data failed (missing or unreadable
    # cache), scrap from the web! `Exception` instead of a bare `except`
    # so that a kernel interrupt is not swallowed and turned into a scrape.
    top_universities_json = rq.get(top_universities_ranking_url, timeout=30).json()['data']
    top_universities = pd.DataFrame(
        [fetch_top_university_infos(university) for university in top_universities_json[0:200]]
    )

    top_universities.region = top_universities.region.astype('category')
    top_universities.country = top_universities.country.astype('category')

    # Serialize the result for future runs; the `with` block guarantees the
    # file is flushed and closed even if dumping fails half-way.
    with open('top_universities.pickle', 'wb') as top_universities_file:
        pickle.dump(top_universities, top_universities_file)

# Create the mapping country => region for the Times scrapper.
# Built straight from the rows so it does not depend on how groupby treats
# unobserved combinations of the two categorical columns.
country_regions = dict(zip(top_universities['country'], top_universities['region']))

top_universities

In [5]:
def ratio_stats(universities):
    """Rank rows by faculty/student ratio and by international-student share.

    Two columns are added to ``universities`` in place:
    ``faculty_student_ratio`` (``faculty_total / student_total``) and
    ``inter_student_ratio`` (``student_inter / student_total``).

    Returns a pair ``(top_by_faculty, top_by_inter)``: the first 20 rows of
    ``universities`` sorted in descending order of each ratio respectively.
    """
    student_counts = universities['student_total']
    universities['faculty_student_ratio'] = universities['faculty_total'].div(student_counts)
    universities['inter_student_ratio'] = universities['student_inter'].div(student_counts)

    def first_twenty_by(column):
        # Highest values first, then keep the leading 20 rows
        ordered = universities.sort_values(by=column, ascending=False)
        return ordered.head(20)

    top_by_faculty = first_twenty_by('faculty_student_ratio')
    top_by_inter = first_twenty_by('inter_student_ratio')
    return (top_by_faculty, top_by_inter)

In [None]:
# Top universities, using ratios
top_universities_by_faculty, top_universities_by_inter = ratio_stats(top_universities)

# Top universities by country, using ratios.
# numeric_only=True restricts the sum to the numeric columns; without it
# recent pandas versions also try to add up the name/region columns, which
# fails or concatenates strings. The ratio columns summed here are then
# recomputed from the summed totals by ratio_stats.
top_country = top_universities.groupby('country').sum(numeric_only=True)
top_country_by_faculty, top_country_by_inter = ratio_stats(top_country)

# Top universities by region, using ratios (same aggregation, per region)
top_regions = top_universities.groupby('region').sum(numeric_only=True)
top_regions_by_faculty, top_regions_by_inter = ratio_stats(top_regions)

## Times Higher Education

In [7]:
import math

def fetch_the_university_infos(university):
    """Build one flat record for an entry of the Times Higher Education ranking.

    Everything is derived from the ranking entry itself, no extra page is
    fetched. The entry gives the student total, the students-per-staff ratio
    and the percentage of international students; the international student
    count and the faculty total are computed from those and truncated to
    integers. The faculty international/local split is not available and is
    set to NaN. The region is looked up in ``country_regions`` using the
    entry's ``'location'``.

    Returns a dict with the keys ``name``, ``rank``, ``region``, ``country``,
    ``student_total``, ``student_inter``, ``student_local``,
    ``faculty_total``, ``faculty_inter`` and ``faculty_local``.

    Raises KeyError when the location has no entry in ``country_regions``.
    """
    # The two published ratios every derived count is based on
    students_per_staff = match_number(university['stats_student_staff_ratio'])
    inter_share = match_number(university['stats_pc_intl_students']) / 100.0

    title = university['name']
    position = int(match_number(university['rank']))
    location = university['location']
    region = country_regions[location]

    enrolled = int(match_number(university['stats_number_students']))
    enrolled_inter = int(enrolled * inter_share)
    staff = int(enrolled / students_per_staff)

    # Key order matches the record layout used for the other ranking
    return {
        'name': title,
        'rank': position,
        'region': region,
        'country': location,
        'student_total': enrolled,
        'student_inter': enrolled_inter,
        'student_local': enrolled - enrolled_inter,
        'faculty_total': staff,
        'faculty_inter': float('NaN'),
        'faculty_local': float('NaN'),
    }
    
try:
    # Try to load the serialized data.
    # NOTE: unpickling can execute arbitrary code - only load a cache file
    # that this notebook wrote itself.
    with open('the_universities.pickle', 'rb') as the_universities_file:
        the_universities = pickle.load(the_universities_file)
except Exception:
    # If loading from the serialized data failed (missing or unreadable
    # cache), scrap from the web! `Exception` instead of a bare `except`
    # so that a kernel interrupt is not swallowed and turned into a scrape.
    the_universities_json = rq.get(the_universities_ranking_url, timeout=30).json()['data']
    the_universities = pd.DataFrame(
        [fetch_the_university_infos(university) for university in the_universities_json[0:200]]
    )

    the_universities.region = the_universities.region.astype('category')
    the_universities.country = the_universities.country.astype('category')

    # Serialize the result for future runs; the `with` block guarantees the
    # file is flushed and closed even if dumping fails half-way.
    with open('the_universities.pickle', 'wb') as the_universities_file:
        pickle.dump(the_universities, the_universities_file)

In [None]:
# Top universities, using ratios
the_universities_by_faculty, the_universities_by_inter = ratio_stats(the_universities)

# Top universities by country, using ratios.
# numeric_only=True restricts the sum to the numeric columns; without it
# recent pandas versions also try to add up the name/region columns, which
# fails or concatenates strings. The ratio columns summed here are then
# recomputed from the summed totals by ratio_stats.
the_country = the_universities.groupby('country').sum(numeric_only=True)
the_country_by_faculty, the_country_by_inter = ratio_stats(the_country)

# Top universities by region, using ratios (same aggregation, per region)
the_regions = the_universities.groupby('region').sum(numeric_only=True)
the_regions_by_faculty, the_regions_by_inter = ratio_stats(the_regions)

# Merging datasets

In [None]:
from difflib import SequenceMatcher
from collections import defaultdict
from operator import itemgetter

def proximity(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``.

    No junk heuristic is supplied. The result is a float between 0 and 1,
    where 1.0 means the two sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    similarity = matcher.ratio()
    return similarity

# Score the name similarity of every (Top Universities, Times) pair of
# universities located in the same country (cross product of both rankings).
# Each row is reduced to an (index, name, country) triple up front.
top_rows = list(zip(top_universities.index, top_universities['name'], top_universities['country']))
the_rows = list(zip(the_universities.index, the_universities['name'], the_universities['country']))

proximities = dict()
for top_index, top_name, top_country in top_rows:
    for the_index, the_name, the_country in the_rows:
        # Universities in different countries cannot be the same institution
        if top_country == the_country:
            prox = proximity(top_name, the_name)

            # Only keep good proximity to reduce work in the next phase
            if prox > .5:
                proximities[(top_index, the_index)] = prox

proximities

In [47]:

# Prepare universities to be matched. `merge_name` is (re)created up front as
# an all-NaN object column so that the merge below works even when nothing
# matches, and so that re-running this cell never keeps stale names.
top_universities['matched'] = False
the_universities['matched'] = False
the_universities['merge_name'] = pd.Series(float('NaN'), index=the_universities.index, dtype=object)

# Sort match by quality, we want the best matches to be processed first in case of conflicts
best_matches = sorted(proximities.items(), key=itemgetter(1), reverse=True)

# Greedy one-to-one assignment. The score is only needed for the ordering
# above; it is deliberately NOT bound to the name `proximity`, which would
# overwrite the proximity() function and break a re-run of the previous cell.
for (top_index, the_index), _score in best_matches:
    # Do the match only if both are still free
    if top_universities.at[top_index, 'matched'] or the_universities.at[the_index, 'matched']:
        continue

    # Mark the universities as matched
    top_universities.at[top_index, 'matched'] = True
    the_universities.at[the_index, 'matched'] = True
    # Give the same name for the merge
    the_universities.at[the_index, 'merge_name'] = top_universities.at[top_index, 'name']

# Use a join to merge the datasets using the name; the outer join keeps the
# universities that appear in only one of the two rankings
merged_universities = top_universities.merge(the_universities, how='outer', left_on='name', right_on='merge_name')
merged_universities

Unnamed: 0,country_x,faculty_inter_x,faculty_local_x,faculty_total_x,name_x,region_x,student_inter_x,student_local_x,student_total_x,top_rank,...,name_y,region_y,student_inter_y,student_local_y,student_total_y,the_rank,faculty_student_ratio_y,inter_student_ratio_y,matched_y,merge_name_y
0,United States,1679.0,1303.0,2982.0,Massachusetts Institute of Technology (MIT),North America,3717.0,7350.0,11067.0,1.0,...,Massachusetts Institute of Technology,North America,3800.0,7377.0,11177.0,5.0,0.114879,0.339984,True,Massachusetts Institute of Technology (MIT)
1,United States,2042.0,2243.0,4285.0,Stanford University,North America,3611.0,12267.0,15878.0,2.0,...,Stanford University,North America,3485.0,12360.0,15845.0,3.0,0.133291,0.219943,True,Stanford University
2,United States,1311.0,3039.0,4350.0,Harvard University,North America,5266.0,17163.0,22429.0,3.0,...,Harvard University,North America,5284.0,15042.0,20326.0,6.0,0.112319,0.259963,True,Harvard University
3,United States,350.0,603.0,953.0,California Institute of Technology (Caltech),North America,647.0,1608.0,2255.0,4.0,...,California Institute of Technology,North America,596.0,1613.0,2209.0,3.0,0.153463,0.269805,True,California Institute of Technology (Caltech)
4,United Kingdom,2278.0,3212.0,5490.0,University of Cambridge,Europe,6699.0,12071.0,18770.0,5.0,...,University of Cambridge,Europe,6436.0,11953.0,18389.0,2.0,0.091740,0.349992,True,University of Cambridge
5,United Kingdom,2964.0,3786.0,6750.0,University of Oxford,Europe,7353.0,12367.0,19720.0,6.0,...,University of Oxford,Europe,7755.0,12654.0,20409.0,1.0,0.089274,0.379979,True,University of Oxford
6,United Kingdom,2554.0,3791.0,6345.0,UCL (University College London),Europe,14854.0,16226.0,31080.0,7.0,...,University College London,Europe,14848.0,15456.0,30304.0,16.0,0.095235,0.489968,True,UCL (University College London)
7,United Kingdom,2071.0,1859.0,3930.0,Imperial College London,Europe,8746.0,7344.0,16090.0,8.0,...,Imperial College London,Europe,8721.0,7136.0,15857.0,8.0,0.087658,0.549978,True,Imperial College London
8,United States,635.0,1814.0,2449.0,University of Chicago,North America,3379.0,10178.0,13557.0,9.0,...,University of Chicago,North America,3381.0,10144.0,13525.0,9.0,0.161257,0.249982,True,University of Chicago
9,Switzerland,1886.0,591.0,2477.0,ETH Zurich - Swiss Federal Institute of Techno...,Europe,7563.0,12252.0,19815.0,10.0,...,ETH Zurich – Swiss Federal Institute of Techno...,Europe,7308.0,11925.0,19233.0,10.0,0.068476,0.379972,True,ETH Zurich - Swiss Federal Institute of Techno...
