First, we import all the needed libraries.

In [None]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import json
import math
import time
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

We noticed that the actual data from topuniversities is not directly on the webpage, but on a separate text file, which contains json information.
Thus, we first get this json, parse it, and take the first 200 entries in it.
We noticed that the university with rank 199 is actually the 198th entry, and thus the last 3 universities need to have their rank corrected.

In [None]:
# Download the text file that holds the ranking as JSON and keep the first
# 200 entries of its 'data' list.
r = requests.get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508259845358')
raw_data = json.loads(r.text)['data'][:200]

We can print the first entry of the data to see how the information is represented.

In [None]:
# Display the first raw entry to inspect which fields are available.
raw_data[0]

We can now define functions that will help us during the processing of this json.

First, process_university takes as input the raw json of a particular uni, and outputs a dictionary containing the name, rank, country, region, number of faculty members (international and total) and number of students (international and total) for that given uni.

It uses other functions defined below.

In [None]:
def process_university(uni):
    """Build a flat record for one university from its raw ranking entry.

    Reads the 'title', 'rank_display', 'country', 'region' and 'url' fields
    of ``uni``. The rank goes through ``get_rank`` and the head-counts are
    fetched with ``get_numbers``; the result is a single dict holding
    'name', 'rank', 'country', 'region' plus every key returned by
    ``get_numbers``.
    """
    record = {
        'name': uni['title'],
        'rank': get_rank(uni['rank_display']),
        'country': uni['country'],
        'region': uni['region'],
    }
    # The head-counts live on the university's own page, not in the entry.
    record.update(get_numbers(uni['url']))
    return record

As there can be ties in rank, the displayed rank is not always an integer. Furthermore, as said above, the last 3 unis have incorrect ranks and need to be fixed.

In [None]:
def get_rank(rank_display):
    """Convert a displayed rank such as ``"=5"`` into an integer.

    Every ``=`` (the tie marker) is removed before the conversion. Ranks of
    199 and above are shifted down by one.
    """
    parsed = int(rank_display.replace("=", ""))
    return parsed - 1 if parsed >= 199 else parsed

To get the number of faculty members (international and total) and number of students (international and total), we need to make another request, and this time, we will need to parse the webpage using BeautifulSoup.

By inspecting the webpage, we noticed the classes of the elements where the numbers are contained. Once we get these elements, we further need to parse their content, to get the values as integers.

During the parsing, we noticed that one university (NYU) did not have the same template as the others, and so its number of students is unknown. 

In [None]:
def get_numbers(url):
    """Scrape the faculty and student head-counts from a university page.

    Parameters
    ----------
    url : str
        Page path, appended to ``https://www.topuniversities.com/``.

    Returns
    -------
    dict
        Keys 'total_faculty', 'international_faculty', 'total_student' and
        'international_student'. Each pair is parsed with ``parse_int`` from
        the first two elements matching its CSS selector; when fewer than
        two elements match, both values of that pair are ``math.nan``.
    """
    r = requests.get("https://www.topuniversities.com/" + url)
    soup = BeautifulSoup(r.text, 'html.parser')

    def first_two_numbers(selector):
        # Each section is validated against its OWN matches, so a page that
        # lacks only the student block yields NaN for the students instead
        # of an IndexError (or NaN being forced by a missing faculty block).
        elements = soup.select(selector)
        if len(elements) < 2:
            return math.nan, math.nan
        return (
            parse_int(elements[0].decode_contents(formatter="html")),
            parse_int(elements[1].decode_contents(formatter="html")),
        )

    total_faculty, international_faculty = first_two_numbers(".text .number")
    total_student, international_student = first_two_numbers(".barp .number")

    return {
        'total_faculty': total_faculty,
        'international_faculty': international_faculty,
        'total_student': total_student,
        'international_student': international_student,
    }

In [None]:
def parse_int(str):
    """Parse an integer out of text containing newlines, spaces or commas.

    Newlines, spaces and thousands-separator commas are removed before the
    remaining text is handed to ``int``.
    """
    cleaned = str
    for junk in ("\n", " ", ","):
        cleaned = cleaned.replace(junk, "")
    return int(cleaned)

In [None]:
# Scraping cell, disabled by wrapping it in a string literal.
# Only run this (remove the triple quotes) if you want to regenerate
# data1.json: it calls process_university on every entry of raw_data, which
# performs one HTTP request per university, and writes the list to data1.json.
"""all_unis = []
for uni in raw_data:
    all_unis.append(process_university(uni))
    
with open('data1.json', 'w') as f:
    json.dump(all_unis, f)"""

In [None]:
# Load the list of university records stored in data1.json.
with open('data1.json', 'r') as f:
    all_unis = json.load(f)

In [None]:
# One row per university record; the trailing expression displays the frame.
df = pd.DataFrame(all_unis)
df

In [None]:
#df_staff_student = df.copy()
df['staff_student_ratio'] = df['total_faculty'] / df['total_student']
df.sort_values(['staff_student_ratio'], ascending=[False])[['name', 'rank', 'staff_student_ratio']]

In [None]:
#df_int_student = df.copy()
df['international_student_ratio'] = df['international_student'] / df['total_student']
df.sort_values(['international_student_ratio'], ascending=[False])[['name', 'rank', 'staff_student_ratio']]

In [None]:
# Mean staff/student ratio per country, highest first.
# The column is selected before aggregating: df also holds text columns
# (name, region), which mean() rejects on pandas >= 2.0.
df_staff_country = (
    df.groupby('country')[['staff_student_ratio']]
    .mean()
    .sort_values('staff_student_ratio', ascending=False)
)
df_staff_country

In [None]:
# Mean staff/student ratio per region, highest first.
# The column is selected before aggregating: df also holds text columns
# (name, country), which mean() rejects on pandas >= 2.0.
df_staff_region = (
    df.groupby('region')[['staff_student_ratio']]
    .mean()
    .sort_values('staff_student_ratio', ascending=False)
)
df_staff_region

In [None]:
# Mean international-student ratio per country, highest first.
# The column is selected before aggregating: df also holds text columns
# (name, region), which mean() rejects on pandas >= 2.0.
df_int_country = (
    df.groupby('country')[['international_student_ratio']]
    .mean()
    .sort_values('international_student_ratio', ascending=False)
)
df_int_country

In [None]:
# Mean international-student ratio per region, highest first.
# The column is selected before aggregating: df also holds text columns
# (name, country), which mean() rejects on pandas >= 2.0.
df_int_region = (
    df.groupby('region')[['international_student_ratio']]
    .mean()
    .sort_values('international_student_ratio', ascending=False)
)
df_int_region

In [None]:
# Bar chart of each university's staff/student ratio, bars ordered by rank.
df.sort_values('rank')['staff_student_ratio'].plot.bar()

In [None]:
# Bar chart of each university's international-student ratio, bars ordered
# by rank.
df.sort_values('rank')['international_student_ratio'].plot.bar()

In [None]:
# Bar chart of the per-country mean staff/student ratio.
df_staff_country['staff_student_ratio'].plot.bar()

In [None]:
# Bar chart of the per-region mean staff/student ratio.
df_staff_region['staff_student_ratio'].plot.bar()

In [None]:
# Bar chart of the per-country mean international-student ratio.
df_int_country['international_student_ratio'].plot.bar()

In [None]:
# Bar chart of the per-region mean international-student ratio.
df_int_region['international_student_ratio'].plot.bar()

In [None]:
# Country -> region lookup built from df: grouping on both columns yields
# (country, region) tuples as group keys, and dict() consumes each tuple as a
# (key, value) pair.
country_region = dict(df[['country', 'region']].groupby(['country', 'region']).groups.keys())

In [None]:
# Download the timeshighereducation ranking JSON and keep the first 200
# entries of its 'data' list.
r2 = requests.get('https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json')
raw_data2 = json.loads(r2.text)['data'][:200]

In [None]:
# Build one record per entry of the second ranking, using the same column
# names as df so both frames can be analysed the same way.
unis2 = []
for uni in raw_data2:
    name = uni['name']
    # Tied ranks are displayed as "=N". Store the rank as an int so that
    # sort_values('rank') orders numerically ("2" before "10") instead of
    # lexicographically.
    rank = int(uni['rank'].replace('=', ''))
    country = uni['location']
    # NOTE(review): presumably renamed so the name matches the keys of
    # country_region - confirm against the first ranking's country names.
    if country == 'Russian Federation':
        country = 'Russia'
    # Percentage text such as "38%" -> "38".
    int_students = uni['stats_pc_intl_students'].replace('%', '')
    staff_student = uni['stats_student_staff_ratio']

    info = {
        'name': name,
        'rank': rank,
        'country': country,
        # Countries absent from country_region fall back to 'Europe'.
        'region': country_region.get(country, 'Europe'),
        # Percentage -> fraction, comparable with df's ratio column.
        'international_student_ratio': int(int_students) / 100.0,
        # Inverted so it matches df's total_faculty / total_student.
        'staff_student_ratio': 1 / float(staff_student),
    }
    unis2.append(info)
df2 = pd.DataFrame(unis2)
df2

In [None]:
# Mean staff/student ratio per country in the second ranking, highest first.
# The column is selected before aggregating: df2 also holds text columns,
# which mean() rejects on pandas >= 2.0.
df_staff_country2 = (
    df2.groupby('country')[['staff_student_ratio']]
    .mean()
    .sort_values('staff_student_ratio', ascending=False)
)
df_staff_country2

In [None]:
# Mean staff/student ratio per region in the second ranking, highest first.
# The column is selected before aggregating: df2 also holds text columns,
# which mean() rejects on pandas >= 2.0.
df_staff_region2 = (
    df2.groupby('region')[['staff_student_ratio']]
    .mean()
    .sort_values('staff_student_ratio', ascending=False)
)
df_staff_region2

In [None]:
# Mean international-student ratio per country in the second ranking,
# highest first.
# The column is selected before aggregating: df2 also holds text columns,
# which mean() rejects on pandas >= 2.0.
df_int_country2 = (
    df2.groupby('country')[['international_student_ratio']]
    .mean()
    .sort_values('international_student_ratio', ascending=False)
)
df_int_country2

In [None]:
# Mean international-student ratio per region in the second ranking,
# highest first.
# The column is selected before aggregating: df2 also holds text columns,
# which mean() rejects on pandas >= 2.0.
df_int_region2 = (
    df2.groupby('region')[['international_student_ratio']]
    .mean()
    .sort_values('international_student_ratio', ascending=False)
)
df_int_region2

In [None]:
# Bar chart of each university's staff/student ratio in the second ranking,
# bars ordered by the 'rank' column.
df2.sort_values('rank')['staff_student_ratio'].plot.bar()

In [None]:
# Bar chart of each university's international-student ratio in the second
# ranking, bars ordered by the 'rank' column. (The staff/student ratio is
# already plotted by the preceding cell.)
df2.sort_values('rank')['international_student_ratio'].plot(kind='bar')

In [None]:
# Bar chart of the per-country mean staff/student ratio (second ranking).
df_staff_country2['staff_student_ratio'].plot.bar()

In [None]:
# Bar chart of the per-region mean staff/student ratio (second ranking).
df_staff_region2['staff_student_ratio'].plot.bar()

In [None]:
# Bar chart of the per-country mean international-student ratio (second
# ranking).
df_int_country2['international_student_ratio'].plot.bar()

In [None]:
# Bar chart of the per-region mean international-student ratio (second
# ranking).
df_int_region2['international_student_ratio'].plot.bar()

In [None]:
# Load the cached university-name -> identifier mapping from mapping.json.
# NOTE(review): the commented assignment below presumably starts from an
# empty mapping when no cache file exists yet - confirm.
#mapping = {}
with open('mapping.json', 'r') as f:
    mapping = json.load(f)

In [None]:
def get_identifier(name):
    """Return the identifier stored for ``name``, looking it up if needed.

    The global ``mapping`` dict acts as a cache: when ``name`` is missing,
    ``get_url(name)`` is called and its result stored. The name and its
    identifier are printed on every call.
    """
    if name not in mapping:
        mapping[name] = get_url(name)

    identifier = mapping[name]
    print(name + " => " + identifier)
    return identifier


In [None]:
# Replace every university name of the first ranking by its identifier.
# get_identifier calls get_url for names missing from mapping, so get_url
# must already be defined when this cell runs.
names1 = pd.DataFrame(df['name'].apply(get_identifier))

# Marker column: set to 1 on every row coming from the first ranking.
names1['test1'] = 1
#names1.set_index(['name'])
# Same treatment for the second ranking, with its own marker column.
names2 = pd.DataFrame(df2['name'].apply(get_identifier))
names2['test2'] = 1
#names2.set_index(['name'])
# Outer-merge both frames (on their shared 'name' column) and write the
# result to test.csv; rows found in only one ranking get NaN in the other
# ranking's marker column.
names1.merge(names2, how='outer').to_csv('test.csv')


In [None]:
def get_url(name):
    """Return the address of the first Google search hit for ``name``.

    The first ``.g a`` link of the result page is taken; the text between
    ``://`` and the first ``&`` of its ``href`` is kept, and one trailing
    ``/`` is removed.

    Parameters
    ----------
    name : str
        Search query (a university name).

    Returns
    -------
    str
        The extracted address, without scheme and without trailing slash.

    Raises
    ------
    IndexError
        If the result page contains no ``.g a`` element.
    """
    # Let requests build the query string so that characters such as '&',
    # '#' or '+' inside the name are escaped instead of corrupting the query.
    r = requests.get('https://encrypted.google.com/search', params={'q': name})
    soup = BeautifulSoup(r.text, 'html.parser')
    google_url = soup.select('.g a')[0]['href']

    start = google_url.find("://") + 3
    end = google_url.find("&")
    if end == -1:
        # No '&' in the href: keep everything up to the end rather than
        # letting the -1 index silently drop the last character.
        end = len(google_url)
    url = google_url[start:end]

    if url.endswith("/"):
        url = url[0:-1]

    # Pause after every search (presumably to avoid being rate-limited).
    time.sleep(5)
    return url

In [None]:
# Persist the (possibly extended) name -> identifier mapping to mapping.json,
# then display it.
with open('mapping.json', 'w') as f:
    json.dump(mapping, f)
    
mapping