In [15]:
# Import libraries
import requests
import pandas as pd
import json

from bs4 import BeautifulSoup

In [2]:
# Root of the QS site; the relative university URLs found in the ranking data are appended to it
BASE_URL = 'https://www.topuniversities.com'
# Make the request
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it surface
# as an opaque JSON decoding failure on the next line.
r = requests.get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508079297350', timeout=30)
r.raise_for_status()
json_result = r.json()

In [3]:
def getUniInfo(URL):
    """Scrape faculty and student head-counts from one university page.

    Downloads ``URL``, parses the HTML and reads four figures from it.

    Returns the tuple ``(facultyInt, facultyTot, studentInt, studentTot)``
    (international / total faculty members, international / total students).
    Any figure whose block is not found on the page is reported as -1.
    """
    page = BeautifulSoup(requests.get(URL).text, 'html.parser')

    # The faculty figures are searched inside their dedicated container div
    faculty_box = page.find("div", class_="faculty-main wrapper col-md-4")
    if faculty_box is not None:
        facultyInt = extract_number(faculty_box.find("div", class_="inter faculty"))
        facultyTot = extract_number(faculty_box.find("div", class_="total faculty"))
    else:
        # The container is not on the page: flag both figures as missing
        facultyInt = facultyTot = -1

    # The student figures are looked up directly on the whole page
    inter_div = page.find("div", class_="total inter")
    total_div = page.find("div", class_="total student")
    studentInt = -1 if inter_div is None else extract_number(inter_div)
    studentTot = -1 if total_div is None else extract_number(total_div)

    return facultyInt, facultyTot, studentInt, studentTot

def extract_number(t):
    """Return the integer displayed in the ``div.number`` child of ``t``.

    ``t`` is a parsed HTML element (anything exposing ``find``) or None.
    Every non-digit character of the child's text (spaces, thousands
    separators, ...) is dropped before the conversion.

    Returns -1, the sentinel used for missing data, when ``t`` is None,
    when it has no ``div.number`` child, or when that child holds no digit.
    """
    #Answer on how to extract the int out of string found here:
    #https://stackoverflow.com/questions/26825729/extract-number-from-string-python

    #In case the faculty doesn't have international faculty member
    if t is None:
        return -1

    #The block can be present without its number: report it as missing instead of crashing
    number_div = t.find("div", class_="number")
    if number_div is None:
        return -1

    digits = ''.join(filter(str.isdigit, number_div.text))
    #int('') would raise ValueError, so text without any digit also counts as missing
    return int(digits) if digits else -1

In [4]:
#Building the dataFrame
qsColumns = ['Rank','Name','Country','Region','FacultyMemberInt','FacultyMemberTot','StudentInt','StudentTot']

#The rows are gathered in a plain list and the frame is built once at the end:
#appending with df.loc[len(df)] copies the frame on every insertion and leaves
#every column with the object dtype.
qsRows = []
for i in json_result.keys():
    for x in json_result[i]:

        #Only the first 200 entries are scraped (one HTTP request per university)
        if len(qsRows) == 200:
            break

        facultyInt,facultyTot,studentInt,studentTot = getUniInfo(BASE_URL + x['url'])
        qsRows.append([x['rank_display'],x['title'],x['country'],x['region'],facultyInt,facultyTot,studentInt,studentTot])

    #The break above only leaves the inner loop; stop walking the remaining keys too
    if len(qsRows) == 200:
        break

df = pd.DataFrame(qsRows, columns=qsColumns)
#Number of universities collected
nb = len(df)

In [21]:
# Preview the last five rows collected from the QS ranking
df.tail(n=5)

Unnamed: 0,Rank,Name,Country,Region,FacultyMemberInt,FacultyMemberTot,StudentInt,StudentTot
195,=195,Universitat Autònoma de Barcelona,Spain,Europe,230,2187,3848,31986
196,=195,Texas A&M University,United States,North America,206,3446,4900,60294
197,199,Instituto Tecnológico y de Estudios Superiores...,Mexico,Latin America,821,1822,1412,13376
198,200,Maastricht University,Netherlands,Europe,502,1277,8234,16385
199,201,Universidad de Chile,Chile,Latin America,127,2256,2134,38848


In [7]:
def getBestRatio(data,col1,col2):
    """Find the row of ``data`` with the highest ``col1 / col2`` ratio.

    Rows where either value is not strictly positive (for instance the -1
    used for missing figures) are ignored. On a tie the first row wins.

    Returns ``(name, ratio)`` where ``name`` is taken from the 'Name'
    column, or ``('', 0)`` when no row qualifies.
    """
    top_name, top_ratio = '', 0

    for _, record in data.iterrows():
        # Skip rows that cannot produce a meaningful ratio
        if not (record[col1] > 0 and record[col2] > 0):
            continue

        ratio = record[col1] / record[col2]
        if ratio > top_ratio:
            top_name, top_ratio = record['Name'], ratio

    return top_name, top_ratio

In [11]:
# University with the largest share of international faculty members
uniWithBestFacRatio, maxIntFacRatio = getBestRatio(data=df, col1='FacultyMemberInt', col2='FacultyMemberTot')
print("Best ratio of international faculty member:\n  Name = {n}, Ratio = {r}".format(r=maxIntFacRatio, n=uniWithBestFacRatio))

# University with the largest share of international students
uniWithBestStudRatio, maxIntStudRatio = getBestRatio(data=df, col1='StudentInt', col2='StudentTot')
print("Best ratio of international student:\n  Name = {n}, Ratio = {r}".format(r=maxIntStudRatio, n=uniWithBestStudRatio))

Best ratio of international faculty member:
  Name = Ecole Polytechnique Fédérale de Lausanne (EPFL), Ratio = 0.7669616519174042
Best ratio of international student:
  Name = London School of Economics and Political Science (LSE), Ratio = 0.6913934426229508


In [12]:
#Aggregate the QS figures per country
statColumns = ['FacultyMemberInt','FacultyMemberTot','StudentInt','StudentTot']

#Get the list of the countries in the dataFrame
countries = df.Country.unique()

#Delete the row with -1 (missing figure). The filter does not depend on the
#country, so it is applied once instead of once per country.
validRows = df[(df.StudentInt >=0)&(df.StudentTot >=0)&(df.FacultyMemberInt >=0)&(df.FacultyMemberTot >=0)]

#Sum the columns per country. A country without any valid row keeps a line of
#zeros, which getBestRatio ignores. The frame is built in one go rather than
#grown row by row.
countryRows = [
    [country] + [validRows.loc[validRows.Country == country, column].sum() for column in statColumns]
    for country in countries
]
countryStats = pd.DataFrame(countryRows, columns=['Name'] + statColumns)

#Country with the best ratio of international faculty members
CountryWithBestFacRatio, maxIntFacCountryRatio = getBestRatio(countryStats,'FacultyMemberInt','FacultyMemberTot')
print("Name = {n}, Ratio = {r}".format(n=CountryWithBestFacRatio,r=maxIntFacCountryRatio))

#Country with the best ratio of international students
CountryWithBestStudRatio, maxIntStudCountryRatio = getBestRatio(countryStats,'StudentInt','StudentTot')
print("Name = {n}, Ratio = {r}".format(n=CountryWithBestStudRatio,r=maxIntStudCountryRatio))

Name = Singapore, Ratio = 0.6436891147818721
Name = Australia, Ratio = 0.3521891163400597


In [14]:
#Aggregate the QS figures per region
statColumns = ['FacultyMemberInt','FacultyMemberTot','StudentInt','StudentTot']

#Get the list of the regions in the dataFrame
regions = df.Region.unique()

#Delete the row with -1 (missing figure). The filter does not depend on the
#region, so it is applied once instead of once per region.
validRows = df[(df.StudentInt >=0)&(df.StudentTot >=0)&(df.FacultyMemberInt >=0)&(df.FacultyMemberTot >=0)]

#Sum the columns per region. A region without any valid row keeps a line of
#zeros, which getBestRatio ignores. The frame is built in one go rather than
#grown row by row.
regionRows = [
    [region] + [validRows.loc[validRows.Region == region, column].sum() for column in statColumns]
    for region in regions
]
RegionStats = pd.DataFrame(regionRows, columns=['Name'] + statColumns)

#Region with the best ratio of international faculty members
RegionWithBestFacRatio, maxIntFacRegionRatio = getBestRatio(RegionStats,'FacultyMemberInt','FacultyMemberTot')
print("Name = {n}, Ratio = {r}".format(n=RegionWithBestFacRatio,r=maxIntFacRegionRatio))

#Region with the best ratio of international students
RegionWithBestStudRatio, maxIntStudRegionRatio = getBestRatio(RegionStats,'StudentInt','StudentTot')
print("Name = {n}, Ratio = {r}".format(n=RegionWithBestStudRatio,r=maxIntStudRegionRatio))

Name = Oceania, Ratio = 0.5044383950763404
Name = Oceania, Ratio = 0.3392609811889755


## Retrieving data from the Times Higher Education website:

In [105]:
def extract_number(t):
    """Return the integer spelled by the digits of ``t``.

    ``t`` may be:
    * a string such as "20,409" or "38%": every non-digit character is dropped;
    * a parsed HTML element (anything exposing ``find``): the text of its
      ``div.number`` child is used. This keeps the function compatible with
      callers written for the earlier definition of the same name, which
      this one replaces once the cell has run;
    * None.

    Returns -1, the sentinel used for missing data, when ``t`` is None,
    when the element has no ``div.number`` child, or when no digit is found.
    """
    #Answer on how to extract the int out of string found here:
    #https://stackoverflow.com/questions/26825729/extract-number-from-string-python

    #In case the figure is missing
    if t is None:
        return -1

    if isinstance(t, str):
        text = t
    else:
        number_div = t.find("div", class_="number")
        if number_div is None:
            return -1
        text = number_div.text

    digits = ''.join(filter(str.isdigit, text))
    #int('') would raise ValueError, so text without any digit also counts as missing
    return int(digits) if digits else -1

In [186]:
#JSON file holding the Times Higher Education 2018 world ranking
API_URL_DATA = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"

#Building the dataFrame
json_result = requests.get(API_URL_DATA).json()["data"]

#The rows are gathered in a plain list and the frame is built once at the end:
#inserting with times_ranking.loc[nb] copies the frame on every insertion and
#leaves every column with the object dtype.
timesRows = []
#Only the first 200 entries are kept
for x in json_result[:200]:
    studentTot = extract_number(x['stats_number_students'])
    #The source gives the international students as a percentage of the total
    studentInt = int(extract_number(x['stats_pc_intl_students'])*studentTot/100)
    #The source gives a students-per-staff ratio; derive the staff head-count from it
    facultyTot = int(studentTot/float(x['stats_student_staff_ratio']))
    timesRows.append([x['rank'],x['name'],x['location'],facultyTot,studentInt,studentTot])

times_ranking = pd.DataFrame(timesRows, columns = ['Rank','Name','Country','FacultyMemberTot','StudentInt','StudentTot'])
#Number of universities collected
nb = len(times_ranking)

In [187]:
# Preview the last five rows collected from the Times ranking
times_ranking.tail(n=5)

Unnamed: 0,Rank,Name,Country,FacultyMemberTot,StudentInt,StudentTot
195,196,Paris-Sorbonne University – Paris 4,France,729,3306,22042
196,197,"Royal Holloway, University of London",United Kingdom,579,3572,8931
197,=198,"University of California, Riverside",United States,1001,2903,20740
198,=198,University of Gothenburg,Sweden,1837,2389,18378
199,=198,National Taiwan University,Taiwan,2761,2540,31758


In [193]:
def getBestRatio2(data,col1,col2,lowerIsBetter=None):
    """Find the row of ``data`` with the best ``col1 / col2`` ratio.

    Parameters
    ----------
    data : DataFrame with a 'Name' column plus the columns ``col1`` and ``col2``.
    col1, col2 : headers of the numerator and denominator columns.
    lowerIsBetter : when True the smallest ratio wins, when False the largest.
        When left to None the historical rule applies: the smallest ratio
        wins only for ``col1 == "StudentTot"`` and ``col2 == "FacultyMemberTot"``
        (students per staff member), the largest otherwise.

    Rows where either value is not strictly positive (for instance the -1
    used for missing figures) are ignored. On a tie the first row wins.

    Returns ``(name, ratio)``, or ``('', 0)`` when no row qualifies.
    """
    if lowerIsBetter is None:
        lowerIsBetter = (col1=="StudentTot" and col2=="FacultyMemberTot")

    bestName = ''
    #None means "no valid ratio seen yet". Starting from the first row's col2
    #value, as before, compared ratios against a head-count and required the
    #index to contain the label 0.
    bestRatio = None

    for index, row in data.iterrows():
        if row[col1] > 0 and row[col2] > 0:
            currentRatio = row[col1] / row[col2]

            if bestRatio is None:
                isBetter = True
            elif lowerIsBetter:
                isBetter = currentRatio < bestRatio
            else:
                isBetter = currentRatio > bestRatio

            if isBetter:
                bestRatio = currentRatio
                bestName = row['Name']

    #Same "nothing found" value for both directions
    if bestRatio is None:
        bestRatio = 0

    return bestName,bestRatio

In [197]:
# a) University with the best ratio between faculty members and students
uniWithBestRatio, maxRatio = getBestRatio2(data=times_ranking, col1='StudentTot', col2='FacultyMemberTot')
print("Best ratio between faculty members and students:\n  Name = {n}, Ratio 'number of students per staff' = {r}".format(r=maxRatio, n=uniWithBestRatio))

# b) University with the best ratio of international students
uniWithBestStudRatio, maxIntStudRatio = getBestRatio2(data=times_ranking, col1='StudentInt', col2='StudentTot')
print("Best ratio of international student:\n  Name = {n}, Ratio = {r}".format(r=maxIntStudRatio, n=uniWithBestStudRatio))

Best ratio between faculty members and students:
  Name = Vanderbilt University, Ratio 'number of students per staff' = 3.300632041769717
Best ratio of international student:
  Name = London School of Economics and Political Science, Ratio = 0.7099850968703427


In [198]:
#Aggregate the Times figures per country
statColumns = ['FacultyMemberTot','StudentInt','StudentTot']

#Get the list of the countries in the dataFrame
countries = times_ranking.Country.unique()

#Delete the row with a negative (missing) figure. The filter does not depend on
#the country, so it is applied once instead of once per country.
validRows = times_ranking[(times_ranking.StudentInt >=0)&(times_ranking.StudentTot >=0)&(times_ranking.FacultyMemberTot >=0)]

#Sum the columns per country. A country without any valid row keeps a line of
#zeros, which getBestRatio2 ignores. The frame is built in one go rather than
#grown row by row.
countryRows = [
    [country] + [validRows.loc[validRows.Country == country, column].sum() for column in statColumns]
    for country in countries
]
countryStats = pd.DataFrame(countryRows, columns=['Name'] + statColumns)

# a) Get the best ratio between faculty members and students by country
CountryWithBestFacRatio, maxIntFacCountryRatio = getBestRatio2(countryStats,'StudentTot','FacultyMemberTot')
print("Best ratio between faculty members and students by country:\n  Name = {n}, Ratio 'number of students per staff' = {r}".format(n=CountryWithBestFacRatio,r=maxIntFacCountryRatio))

# b) Get the best ratio of international student by country
CountryWithBestStudRatio, maxIntStudCountryRatio = getBestRatio2(countryStats,'StudentInt','StudentTot')
print("Best ratio of international student by country:\n  Name = {n}, Ratio = {r}".format(n=CountryWithBestStudRatio,r=maxIntStudCountryRatio))

Best ratio between faculty members and students by country:
  Name = Denmark, Ratio 'number of students per staff' = 6.05039637599094
Best ratio of international student vy country:
  Name = Luxembourg, Ratio = 0.5699335882471323


## Merging 

In [203]:
# Index both rankings by university name so they can be aligned on it.
# The guards make the cell safe to re-run: once 'Name' has become the index it
# is no longer a column, and a second set_index would raise KeyError.
if 'Name' in df.columns:
    df = df.set_index(['Name'])
if 'Name' in times_ranking.columns:
    times_ranking = times_ranking.set_index(["Name"])

In [210]:
# Preview the first five rows of the QS ranking
df.head(n=5)

Unnamed: 0_level_0,Rank,Country,Region,FacultyMemberInt,FacultyMemberTot,StudentInt,StudentTot
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Massachusetts Institute of Technology (MIT),1,United States,North America,1679,2982,3717,11067
Stanford University,2,United States,North America,2042,4285,3611,15878
Harvard University,3,United States,North America,1311,4350,5266,22429
California Institute of Technology (Caltech),4,United States,North America,350,953,647,2255
University of Cambridge,5,United Kingdom,Europe,2278,5490,6699,18770


In [211]:
# Preview the first five rows of the Times ranking
times_ranking.head(n=5)

Unnamed: 0_level_0,Rank,Country,FacultyMemberTot,StudentInt,StudentTot
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
University of Oxford,1,United Kingdom,1822,7755,20409
University of Cambridge,2,United Kingdom,1687,6436,18389
California Institute of Technology,=3,United States,339,596,2209
Stanford University,=3,United States,2112,3485,15845
Massachusetts Institute of Technology,5,United States,1284,3800,11177


In [None]:
# TODO: clean the university names so both rankings use the same full name
# TODO: remove the rows holding NaN or -1 (missing figures)

In [208]:
# Outer join of the two rankings on their index, keeping the universities
# found in only one of them (their missing side shows up as NaN)
df.merge(times_ranking, how="outer", left_index=True, right_index=True)

Unnamed: 0_level_0,Rank_x,Country_x,Region,FacultyMemberInt,FacultyMemberTot_x,StudentInt_x,StudentTot_x,Rank_y,Country_y,FacultyMemberTot_y,StudentInt_y,StudentTot_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Aalto University,=137,Finland,Europe,370,1257,1831,12147,190,Finland,630,2548,12744
Aarhus University,119,Denmark,Europe,602,2316,3762,26226,=109,Denmark,1759,3020,25167
Albert-Ludwigs-Universitaet Freiburg,171,Germany,Europe,413,1966,3897,23214,,,,,
Alma Mater Studiorum - University of Bologna,=188,Italy,Europe,153,2990,4195,63399,,,,,
Arizona State University,,,,,,,,=126,United States,2212,8450,44475
Australian National University,,,,,,,,48,Australia,828,5595,15986
Autonomous University of Barcelona,,,,,,,,=147,Spain,2504,5169,32309
Boston University,81,United States,North America,379,3157,7041,25662,=70,United States,2887,6208,24833
Brown University,53,United States,North America,379,1303,1825,9251,=50,United States,831,1779,8898
California Institute of Technology,,,,,,,,=3,United States,339,596,2209
