In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import seaborn
import re
import numpy as np

In [2]:
# Site root; the relative detail-page paths found in the ranking feed are appended to it.
TOP_UNIVERSITY_BASE_URL = "https://www.topuniversities.com"

# Endpoint whose response is decoded as JSON to obtain the ranking entries.
# NOTE(review): the `_=` query parameter looks like a cache-busting timestamp - confirm it is still accepted.
TOP_UNIVERSITY_RANK_LIST_URL = (
    TOP_UNIVERSITY_BASE_URL
    + "/sites/default/files/qs-rankings-data/357051.txt?_=1508344027292"
)

# Only the first N entries of the ranking feed are kept.
NUMBER_OF_UNIVERSITIES_TO_SCRAPE = 200

def scraping_top_university_list(timeout=30):
    """
    Download the university ranking feed and return its leading entries as a list.

    Only the first NUMBER_OF_UNIVERSITIES_TO_SCRAPE entries of the feed are kept.

    Args:
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of dictionaries, one per university, with the keys 'name', 'rank',
        'country', 'region' and 'url'. 'rank' is the feed's 'rank_display' value and
        'url' is the path of the university's detail page, relative to the site root.

    Raises:
        requests.HTTPError: if the server answers with an error status (4xx/5xx).
        requests.RequestException: on connection failure or timeout.
    """
    top_university_ranking_page = requests.get(TOP_UNIVERSITY_RANK_LIST_URL, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON decoding error
    # when the server returns an error page.
    top_university_ranking_page.raise_for_status()
    ranked_university_list = top_university_ranking_page.json()['data']
    return [
        {
            'name': uni['title'],
            'rank': uni['rank_display'],
            'country': uni['country'],
            'region': uni['region'],
            'url': uni['url'],
        }
        for uni in ranked_university_list[:NUMBER_OF_UNIVERSITIES_TO_SCRAPE]
    ]

# Matches text containing a single run of digits and commas (e.g. "1,695"), optionally
# surrounded by characters that are neither digits nor commas (whitespace, labels, ...).
ACADEMIC_STAFF_NUMBERS_REGEX = re.compile(r'^[^0-9,]*([0-9,]+)[^0-9,]*$')

def parse_top_university_numbers(text):
    """
    Utility function that parses a number scraped from the Top Universities website into an integer.

    Commas are treated as thousands separators and removed, so "1,695" becomes 1695.

    Args:
        text: the scraped string; it must contain exactly one run of digits/commas.

    Returns:
        The number as an int.

    Raises:
        ValueError: if the text contains no number, or more than one separate number.
    """
    match = ACADEMIC_STAFF_NUMBERS_REGEX.match(text)
    # A match made only of commas leaves no digits, which is also "no number".
    digits = match.group(1).replace(',', '') if match else ''
    if not digits:
        raise ValueError("No number found in scraped text: %r" % (text,))
    return int(digits)

# Output key -> value of the `class` attribute of the <div> holding that figure on a detail page.
_TOP_UNIVERSITY_DETAIL_FIELDS = (
    ('members_total', "total faculty"),
    ('members_international', "inter faculty"),
    ('students_total', "total student"),
    ('students_international', "total inter"),
)


def _scrape_top_university_number(parsed_detail_page, css_class):
    """Return the integer found in the div with class `css_class`, or None if it is missing or unparsable."""
    try:
        return parse_top_university_numbers(
            parsed_detail_page.find('div', class_=css_class).find(class_="number").text
        )
    except Exception:
        # Best effort: a missing element (find() returned None) or text that cannot be
        # parsed only blanks this one figure, not the whole university.
        return None


def follow_detail_url_top_university(data, timeout=30):
    """
    Enrich the ranking list with the staff and student figures found on each detail page.

    For every university the detail page (site root + the entry's 'url') is downloaded and
    the number of faculty members (total and international) and the number of students
    (total and international) are extracted.

    Args:
        data: a list as returned by 'scraping_top_university_list'; each element needs the
            keys 'name', 'rank', 'country', 'region' and 'url'.
        timeout: seconds to wait for each detail page before giving up on it.

    Returns:
        A new list with one dictionary per input element, containing 'name', 'rank',
        'country', 'region', 'members_total', 'members_international', 'students_total'
        and 'students_international'. The 'url' key is not carried over. Any figure that
        is missing is filled with None; if the page cannot be loaded at all, a message is
        printed and all four figures are None.
    """
    output = []
    for uni in data:
        try:
            raw_detail_page = requests.get(TOP_UNIVERSITY_BASE_URL + uni['url'], timeout=timeout)
            parsed_detail_page = BeautifulSoup(raw_detail_page.text, 'html.parser')
            numbers = [
                (key, _scrape_top_university_number(parsed_detail_page, css_class))
                for key, css_class in _TOP_UNIVERSITY_DETAIL_FIELDS
            ]
        except Exception:
            # `except Exception` (not a bare `except:`) so that interrupting the kernel
            # (KeyboardInterrupt) actually stops this long-running loop.
            print("Error loading details of university:", uni)
            numbers = [(key, None) for key, _ in _TOP_UNIVERSITY_DETAIL_FIELDS]

        row = {
            'name': uni['name'],
            'rank': uni['rank'],
            'country': uni['country'],
            'region': uni['region'],
        }
        row.update(numbers)
        output.append(row)
    return output

In [3]:
# Run the full scrape: fetch the ranking list, then download the detail page of every
# listed university (one HTTP request per university, so this cell is network-bound).
top_university_data = follow_detail_url_top_university(scraping_top_university_list())

In [4]:
# Spot-check a single scraped record (list index 11) to confirm the figures were parsed.
top_university_data[11]

{'country': 'Switzerland',
 'members_international': 1300,
 'members_total': 1695,
 'name': 'Ecole Polytechnique Fédérale de Lausanne (EPFL)',
 'rank': '12',
 'region': 'Europe',
 'students_international': 5896,
 'students_total': 10343}

In [6]:
# One row per scraped university; the dictionary keys become the columns.
df_top_university = pd.DataFrame(top_university_data)

In [7]:
# Universities whose international-faculty figure could not be scraped.
# Series.isnull() is used rather than np.isnan because it also works when the column
# has object dtype (e.g. every value is None), where np.isnan raises a TypeError.
df_top_university[df_top_university.members_international.isnull()]

Unnamed: 0,country,members_international,members_total,name,rank,region,students_international,students_total
51,United States,,,New York University (NYU),52,North America,,
189,India,,423.0,Indian Institute of Science (IISc) Bangalore,190,Asia,47.0,4071.0
