In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import seaborn
import re
import numpy as np

### Scraping topuniversities.com
Scraping topuniversities.com is done in two steps:
- first we download the university list, ordered by rank (`scraping_top_university_list`)
- then we open the detail page of each university to get additional information (`follow_detail_url_top_university`)

follow_detail_url_top_university is quite a slow function, as it has to download many webpages (200 in our case).
To scrape all the information, execute: 
`follow_detail_url_top_university(scraping_top_university_list())`

In [2]:
# Site root; follow_detail_url_top_university appends each university's 'url' value to it.
TOP_UNIVERSITY_BASE_URL = "https://www.topuniversities.com"
# Ranking data file; scraping_top_university_list reads the 'data' entry of its JSON payload.
# NOTE(review): the `_=1508344027292` query value looks like a cache-busting timestamp — confirm it is not required.
TOP_UNIVERSITY_RANK_LIST_URL = TOP_UNIVERSITY_BASE_URL + "/sites/default/files/qs-rankings-data/357051.txt?_=1508344027292"
# Number of entries kept from the start of the downloaded ranking list.
TOP_UNIVERSITY_NUMBER_OF_UNIVERSITIES_TO_SCRAPE = 200

def scraping_top_university_list(limit=None, timeout=30):
    """
    Download the topuniversities.com ranking data and return its first entries as a list.

    Args:
        limit: maximum number of entries to keep, counted from the start of the
            downloaded list. Defaults to TOP_UNIVERSITY_NUMBER_OF_UNIVERSITIES_TO_SCRAPE.
        timeout: seconds to wait for the server before giving up (passed to
            requests.get). Pass None to wait indefinitely.

    Returns:
        A list of dictionaries, one per university, in the order of the downloaded
        data. Each dictionary contains: name, rank, country, region, and url (the
        link to the detailed page of the university, which
        follow_detail_url_top_university appends to TOP_UNIVERSITY_BASE_URL).

    Raises:
        requests.RequestException: if the download fails, times out, or the server
            answers with an HTTP error status.
    """
    if limit is None:
        limit = TOP_UNIVERSITY_NUMBER_OF_UNIVERSITIES_TO_SCRAPE

    top_university_ranking_page = requests.get(TOP_UNIVERSITY_RANK_LIST_URL, timeout=timeout)
    # Report the HTTP status itself rather than an obscure JSON decoding error on an error page.
    top_university_ranking_page.raise_for_status()

    ranked_university_list = top_university_ranking_page.json()['data']
    return [{
        'name': uni['title'],
        'rank': uni['rank_display'],
        'country': uni['country'],
        'region': uni['region'],
        'url': uni['url']
    } for uni in ranked_university_list[:limit]]

# A single run of digits/commas, optionally surrounded by text that contains neither.
ACADEMIC_STAFF_NUMBERS_REGEX = re.compile(r'^[^0-9,]*([0-9,]+)[^0-9,]*$')

def parse_top_university_numbers(text):
    """
    Convert a number scraped from the topuniversities.com website into an integer.

    The text may surround the number with non-numeric characters (whitespace,
    labels) and the number may use commas as thousands separators, e.g.
    "  1,695 " is parsed as 1695.

    Raises AttributeError when the text does not match the expected pattern and
    ValueError when the captured characters do not form an integer.
    """
    matched = ACADEMIC_STAFF_NUMBERS_REGEX.match(text)
    digits_only = matched.group(1).replace(',', '')
    return int(digits_only)

def _scrape_top_university_number(parsed_detail_page, css_class):
    """
    Read one statistic from a parsed university detail page.

    Finds the first <div> matching class_=css_class, takes the text of the first
    element with class "number" inside it and parses it with
    parse_top_university_numbers. Returns None when the number cannot be read.
    """
    try:
        return parse_top_university_numbers(
            parsed_detail_page.find('div', class_=css_class).find(class_="number").text
        )
    except (AttributeError, ValueError):
        # AttributeError: find() returned None (element missing) or the text did not
        # match the number pattern; ValueError: the captured text was not an integer.
        return None


def follow_detail_url_top_university(data, timeout=30):
    """
    This function takes as input a list returned by the 'scraping_top_university_list' function,
    follows the link of the url (giving detailed information on the university), and adds the
    number of faculty members (international and total) and the number of students (international and total).
    If some information is missing, it is filled with the None value.

    Args:
        data: list of dictionaries with the keys name, rank, country, region and url.
        timeout: seconds to wait for each detail page before giving up (passed to
            requests.get). Pass None to wait indefinitely.

    Returns:
        A new list with one dictionary per input university, containing: name, rank,
        country, region, members_total, members_international, students_total and
        students_international. When a detail page cannot be loaded, a message is
        printed and the four numbers of that university are set to None.
    """
    # Output field -> CSS class of the <div> holding that number on the detail page.
    stat_css_classes = (
        ('members_total', "total faculty"),
        ('members_international', "inter faculty"),
        ('students_total', "total student"),
        ('students_international', "total inter"),
    )

    output = []
    for uni in data:
        record = {
            'name': uni['name'],
            'rank': uni['rank'],
            'country': uni['country'],
            'region': uni['region'],
        }
        try:
            raw_detail_page = requests.get(TOP_UNIVERSITY_BASE_URL + uni['url'], timeout=timeout)
            # Treat HTTP error pages as a failed load instead of silently parsing them.
            raw_detail_page.raise_for_status()
            parsed_detail_page = BeautifulSoup(raw_detail_page.text, 'html.parser')
            stats = {
                field: _scrape_top_university_number(parsed_detail_page, css_class)
                for field, css_class in stat_css_classes
            }
        except Exception:
            # Best effort: keep the university in the output with empty numbers.
            # `Exception` (not a bare except) so the long scrape can still be interrupted.
            print("Error loading details of university:", uni)
            stats = {field: None for field, _ in stat_css_classes}
        record.update(stats)
        output.append(record)
    return output

In [3]:
# Full scrape: download the ranking list, then one detail page per university (slow).
top_university_data = follow_detail_url_top_university(scraping_top_university_list())

In [4]:
# Spot-check a single scraped record (index 11 of the list).
top_university_data[11]

{'country': 'Switzerland',
 'members_international': 1300,
 'members_total': 1695,
 'name': 'Ecole Polytechnique Fédérale de Lausanne (EPFL)',
 'rank': '12',
 'region': 'Europe',
 'students_international': 5896,
 'students_total': 10343}

In [6]:
# Build a DataFrame from the list of per-university dictionaries (one row per university).
df_top_university = pd.DataFrame(top_university_data)

In [7]:
# Rows whose international faculty count could not be scraped.
# Series.isna() works for any column dtype; np.isnan raises a TypeError on object columns.
df_top_university[df_top_university.members_international.isna()]

Unnamed: 0,country,members_international,members_total,name,rank,region,students_international,students_total
51,United States,,,New York University (NYU),52,North America,,
189,India,,423.0,Indian Institute of Science (IISc) Bangalore,190,Asia,47.0,4071.0


### Scraping timeshighereducation.com
Scraping timeshighereducation.com can be done in one function this time, since the detailed pages of the universities do not bring us more information. Unfortunately, the region and the number of international staff are not given, so `scraping_higher_education_list` returns the same format as above but with `region` and `members_international` set to None. The total number of staff and the number of international students are not given directly either, but they can be computed from the given stats. We set:

`members_total = number_of_students / number_of_students_per_staff`
`students_international = number_of_students * proportion_of_international_students`

In [42]:
# Site root of timeshighereducation.com, used to build the ranking data URL below.
HIGHER_EDUCATION_BASE_URL = "https://www.timeshighereducation.com"
# Ranking data file; scraping_higher_education_list reads the 'data' entry of its JSON payload.
HIGHER_EDUCATION_RANK_LIST_URL = HIGHER_EDUCATION_BASE_URL + \
    "/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"
# Number of entries kept from the start of the downloaded ranking list.
HIGHER_EDUCATION_NUMBER_OF_UNIVERSITIES_TO_SCRAPE = 200

# A number made of digits, commas and dots, immediately followed by a percent sign.
HIGHER_EDUCATION_PERCENTAGE_REGEX = re.compile(r'^([0-9,.]+)%$')

def parse_higher_education_percentage(text):
    """
    Convert a percentage string from timeshighereducation.com (e.g. "38%") into
    a fraction returned as a float (e.g. 0.38).

    Raises AttributeError when the text does not match the expected pattern and
    ValueError when the captured characters do not form a float.
    """
    matched = HIGHER_EDUCATION_PERCENTAGE_REGEX.match(text)
    percent_value = float(matched.group(1))
    return percent_value / 100.0

def scraping_higher_education_list(limit=None, timeout=30):
    """
    This function downloads the university list rank from timeshighereducation.com and returns it in a list.

    Args:
        limit: maximum number of entries to keep, counted from the start of the
            downloaded list. Defaults to HIGHER_EDUCATION_NUMBER_OF_UNIVERSITIES_TO_SCRAPE.
        timeout: seconds to wait for the server before giving up (passed to
            requests.get). Pass None to wait indefinitely.

    Returns:
        A list of dictionaries, one per university, containing: name, rank, country,
        region, members_total, members_international, students_total and
        students_international.
        - region and members_international are always None.
        - members_total is the number of students divided by the student/staff
          ratio, truncated to an integer.
        - students_international is the number of students multiplied by the
          proportion of international students, truncated to an integer.
        A number whose source statistic is missing or cannot be parsed is set to None.

    Raises:
        requests.RequestException: if the download fails, times out, or the server
            answers with an HTTP error status.
    """
    if limit is None:
        limit = HIGHER_EDUCATION_NUMBER_OF_UNIVERSITIES_TO_SCRAPE

    higher_education_ranking_page = requests.get(HIGHER_EDUCATION_RANK_LIST_URL, timeout=timeout)
    # Report the HTTP status itself rather than an obscure JSON decoding error on an error page.
    higher_education_ranking_page.raise_for_status()
    ranked_university_list = higher_education_ranking_page.json()['data']

    data = []
    for uni in ranked_university_list[:limit]:
        # Each number falls back to None on its own, so one bad statistic neither
        # aborts the whole scrape nor discards the other numbers of the university.
        try:
            number_of_students = int(uni['stats_number_students'].replace(',', ''))
        except (KeyError, AttributeError, ValueError):
            number_of_students = None

        try:
            members_total = int(float(number_of_students) / float(uni['stats_student_staff_ratio']))
        except (KeyError, TypeError, ValueError, ZeroDivisionError):
            # TypeError covers number_of_students being None.
            members_total = None

        try:
            students_international = int(
                parse_higher_education_percentage(uni['stats_pc_intl_students']) *
                float(number_of_students))
        except (KeyError, AttributeError, TypeError, ValueError):
            # AttributeError: the percentage text did not match the expected pattern.
            students_international = None

        data.append({
            'name': uni['name'],
            'rank': uni['rank'],
            'country': uni['location'],
            'region': None,
            'members_total': members_total,
            'members_international': None,
            'students_total': number_of_students,
            'students_international': students_international
        })
    return data

In [43]:
# Download the Times Higher Education ranking and convert it to the common record format.
higher_education_data = scraping_higher_education_list()

In [44]:
# Build a DataFrame from the list of per-university dictionaries (one row per university).
df_higher_education = pd.DataFrame(higher_education_data)