In [1]:
import pandas as pd

In [2]:
masters = pd.read_csv("../data-version2/raw/master-programs/master-programs.csv")

In [3]:
masters.head()

In [4]:
masters.info()

In [5]:
# masters program table will not contain university name, city and country because they will be in a separate table

column_names : dict = {
    
    "pace"  : "schedule",
    "field" : "field_of_study",
    
}

columns_to_drop = ["university", "city", "country"]

masters_clean = masters.drop(columns_to_drop, axis=1)
masters_clean.rename(column_names, axis=1, inplace=True)

universities = masters[["university", "city", "country"]]

In [6]:
masters_clean.head()

In [7]:
universities.head()

In [8]:
universities.info()

In [9]:
# we have duplicates in dataset need to remove them
university_clean = universities.drop_duplicates()

In [10]:
university_clean.info()

In [11]:
university_clean = university_clean.fillna("NULL")

In [12]:
university_clean.head()

In [13]:
# we need id, email, name, rank, city_id for each university 

# id for each university

university_clean.reset_index(drop=True, inplace=True)
university_clean.head()


In [14]:

university_clean.reset_index(inplace=True)
university_clean = university_clean.rename(columns={"index" : "id"})
university_clean.head()


In [15]:
# name for each university

university_clean = university_clean.rename(columns={"university" : "name"})
university_clean.head()


In [16]:
# some universities do not have valid name 
# remove universities with invalid names

import re

def clean_name(name: str) -> bool:
    """
    
    Simple check university names. A valid name should only contain A-Z characters or space.
    
    """
        
    clean_name = re.sub('[^a-zA-Z ]', '', name)
    
    return " ".join(clean_name.split())


In [17]:
university_clean["name"]    = university_clean["name"].apply(clean_name)
university_clean["city"]    = university_clean["city"].apply(clean_name)
university_clean["country"] = university_clean["country"].apply(clean_name)

In [18]:
university_clean.head(20)

In [19]:
# unique email for each university
# this is not a valid email but kind of a username for each university

def create_email(name: str) -> str:
    """
    Generates unique email adress for each university from name.
        
    :param name: name of university
    :return    : unique email from name
    
    """
    
    name_split = name.split()
    
    # some universities have long name no need long emails just use at most three word
    name_split = name_split[:3]
    
    # lower case
    name_lower = [name.lower() for name in name_split]
    
    # unique university
    email = ".".join(name_lower)
    
    return f"{email}@findmymasters.com"
    


In [20]:
university_clean["email"] = university_clean["name"].apply(create_email)

In [21]:
university_clean.head(20)

In [22]:

# we need rank for each university

# for rank I found 2022 ranking scores for universities we are going to match from rankings file

rankings = pd.read_excel("./raw/universities/university-rankings.xlsx", header=3)
rankings.head()



Unnamed: 0,rank in country,rank in subregion,rank display,rank display2,institution,country code,country,size,focus,research,...,er rank,fsr score,fsr rank,cpf score,cpf rank,ifr score,ifr rank,isr score,isr rank,score scaled
0,1,1,1,1,Massachusetts Institute of Technology (MIT),US,United States,M,CO,VH,...,4,100.0,12,100.0,6,100.0,45,91.4,105,100.0
1,1,1,2,5,University of Oxford,UK,United Kingdom,L,FC,VH,...,3,100.0,5,96.0,34,99.5,83,98.5,52,99.5
2,2,2,3=,2,Stanford University,US,United States,L,FC,VH,...,5,100.0,9,99.9,10,99.8,73,67.0,208,98.7
3,2,2,3=,7,University of Cambridge,UK,United Kingdom,L,FC,VH,...,2,100.0,10,92.1,48,100.0,57,97.7,64,98.7
4,3,3,5,3,Harvard University,US,United States,L,FC,VH,...,1,99.1,37,100.0,3,84.2,188,70.1,196,98.0


In [23]:

# we need rank display and instution name

rankings = rankings[["rank display", "institution"]]
rankings.head(20)


Unnamed: 0,rank display,institution
0,1,Massachusetts Institute of Technology (MIT)
1,2,University of Oxford
2,3=,Stanford University
3,3=,University of Cambridge
4,5,Harvard University
5,6,California Institute of Technology (Caltech)
6,7,Imperial College London
7,8=,ETH Zurich - Swiss Federal Institute of Techno...
8,8=,UCL
9,10,University of Chicago


In [24]:

# convert rankings csv to dict 
# it will make easier to find ranking

rankings_dict = {}

for index in rankings.index:
    rankings_dict[rankings.loc[index, "institution"]] = rankings.loc[index, "rank display"].strip('=').strip()

for key in list(rankings_dict.keys())[:10]:
    print(f"{key} = {rankings_dict[key]}")
    

Massachusetts Institute of Technology (MIT)  = 1
University of Oxford = 2
Stanford University = 3
University of Cambridge = 3
Harvard University = 5
California Institute of Technology (Caltech) = 6
Imperial College London = 7
ETH Zurich - Swiss Federal Institute of Technology = 8
UCL = 8
University of Chicago = 10


In [25]:

# the common words like 'university', 'science', 'technology' can be problem for similarity check
# we can find most common words and remove them

from collections import Counter

words_in_name = []

for name in university_clean["name"].values:
    
    words_in_name.extend([word.lower() for word in name.split()])


counter = Counter(words_in_name)

print(counter.most_common(30))

[('university', 2948), ('of', 2460), ('college', 719), ('school', 710), ('and', 513), ('de', 389), ('business', 355), ('state', 317), ('sciences', 298), ('the', 287), ('institute', 249), ('universidad', 195), ('technology', 161), ('education', 157), ('arts', 157), ('international', 147), ('online', 132), ('management', 131), ('science', 114), ('engineering', 112), ('applied', 110), ('faculty', 99), ('for', 96), ('national', 95), ('graduate', 88), ('studies', 88), ('in', 81), ('economics', 76), ('design', 73), ('new', 69)]


In [26]:

# we can remove most common 28 words when we compare names

words_to_remove = []

for common in counter.most_common(30):
    words_to_remove.append(common[0])

print(words_to_remove)  

['university', 'of', 'college', 'school', 'and', 'de', 'business', 'state', 'sciences', 'the', 'institute', 'universidad', 'technology', 'education', 'arts', 'international', 'online', 'management', 'science', 'engineering', 'applied', 'faculty', 'for', 'national', 'graduate', 'studies', 'in', 'economics', 'design', 'new']


In [27]:
import difflib


def remove_common_from_name(name: str) -> str:
    """
    
    Remove common words from name.
    
    :param name: name of university
    :return    : filtered name of university
    
    """
    
    filtered = []
    
    for word in name.split():
        
        if word.lower() in words_to_remove:
            continue
            
        filtered.append(word)
    
    return " ".join(filtered)


# taken from stackoverlow answer: https://stackoverflow.com/a/1471603/16634193
def is_similar(seq1, seq2):
    return difflib.SequenceMatcher(a=seq1.lower(), b=seq2.lower()).ratio() > 0.90

rankings_dict_copy = rankings_dict.copy()

def get_university_rank(row: str) -> str:
    """
    
    Get university rank from rankings table. 
    Because name may not match fully we will use similarity function
    
    :param name: row from university table
    :return    : rank of university if find
    
    
    """
    
    univesity_id = row["id"]
    name         = row["name"]
    
    if name == "NULL":
        return name
            
    for university in rankings_dict_copy.keys():
        
        filtered_university = remove_common_from_name(name=university)
        filtered_name       = remove_common_from_name(name=name)
                
        if is_similar(filtered_university, filtered_name):
            
            # keep rank
            rank = rankings_dict_copy[university]
        
            # if we found delete university to make it a little faster
            del rankings_dict_copy[university]
            
            print(f"[{univesity_id:05}/4573]{university}")
            print(f"            {name} => rank: {rank}\n")
        
            return rank
        
    return "NULL"
    

In [28]:
university_clean["rank"] = university_clean.apply(get_university_rank, axis=1)

[00010/4573]University of Haifa
            University of Haifa International School => rank: 701-750

[00014/4573]University of Bath
            University of Bath School of Management => rank: 166

[00015/4573]University of Groningen
            University of Groningen => rank: 128

[00023/4573]University of Delaware
            University of Delaware Online => rank: 531-540

[00028/4573]Universidade de Santiago de Compostela
            Universidade Santiago de Compostela => rank: 701-750

[00033/4573]Lund University
            Lund University => rank: 87

[00034/4573]Rochester Institute of Technology (RIT)
            Rochester Institute of Technology RIT => rank: 1001-1200

[00035/4573]Universidad de Costa Rica
            National University Costa Rica => rank: 531-540

[00046/4573]University of Wroclaw
            Wroclaw University of Science and Technology => rank: 801-1000

[00048/4573]Dublin City University
            Dublin City University => rank: 490

[00055/4573]Swanse

[00535/4573]HSE University
            HSE University => rank: 305

[00540/4573]Aalto University
            Aalto University => rank: 112

[00541/4573]Management and Science University
            The New School => rank: 601-650

[00562/4573]University College Dublin
            University College Dublin => rank: 173

[00568/4573]Diponegoro University
            Diponegoro University => rank: 1001-1200

[00573/4573]University of Hertfordshire
            University of Hertfordshire => rank: 801-1000

[00576/4573]University of Rhode Island 
            Rhode Island School of Design => rank: 1001-1200

[00577/4573]USI - Università della Svizzera italiana
            USI Universitue della Svizzera italiana => rank: 240

[00582/4573]Czech Technical University in Prague
            Czech Technical University in Prague => rank: 403

[00587/4573]City University of Hong Kong
            College of Engineering City University of Hong Kong => rank: 53

[00589/4573]Sapienza University of Rome
 

[01044/4573]Hongik University 
            Hongik University => rank: 1201+

[01047/4573]Florida State University
            University of Florida College of the Arts => rank: 475

[01057/4573]Stanford University
            Stanford University School of Engineering => rank: 3

[01062/4573]Bauman Moscow State Technical University
            BAUMAN Moscow State Technical University => rank: 281

[01064/4573]Beijing Institute of Technology
            Beijing Institute Of Technology => rank: 373

[01071/4573]University of Limerick
            University of Limerick => rank: 501-510

[01088/4573]University of Turin
            University of Turin => rank: 485

[01092/4573]Universidad Rey Juan Carlos
            Universidad Rey Juan Carlos T => rank: 1001-1200

[01110/4573]The University of Queensland
            The University Of Queensland => rank: 47

[01122/4573]University of Geneva
            The International University in Geneva => rank: 105

[01128/4573]Osaka City University
   

[01738/4573]Mendel University in Brno
            Mendel University in Brno => rank: 801-1000

[01791/4573]National Central University
            National Central University => rank: 521-530

[01796/4573]Lebanese University
            Lebanese International University => rank: 701-750

[01815/4573]Syracuse University
            Syracuse University School of Education => rank: 651-700

[01825/4573]Hunan University 
            Hunan University => rank: 571-580

[01852/4573]University of Florence
            The Florence Institute of Design International => rank: 451

[01858/4573]Carnegie Mellon University
            Carnegie Mellon University School of Design => rank: 53

[01867/4573]University of East London
            East London University => rank: 801-1000

[01869/4573]Yale University
            Yale University => rank: 14

[01884/4573]University of Bayreuth
            University of Bayreuth => rank: 521-530

[01888/4573]Loughborough University
            Loughborough Univer

[02771/4573]The "Gheorghe Asachi" Technical University of Iasi
            Gheorghe Asachi Technical University Of Iasi => rank: 1201+

[02777/4573]Khalifa University of Science and Technology
            Khalifa University => rank: 183

[02785/4573]University of Cologne
            Cologne University of Applied Sciences => rank: 311

[02806/4573]Inha University
            Inha University => rank: 561-570

[02835/4573]Lancaster University
            Lancaster University Faculty of Science and Technology => rank: 132

[02837/4573]Auburn University
            Auburn University College of Engineering => rank: 801-1000

[02843/4573]University of Bordeaux
            University of Bordeaux => rank: 501-510

[02851/4573]George Washington University
            The George Washington University School of Engineering Applied Science => rank: 355

[02863/4573]Vilnius Gediminas Technical University
            Vilnius Gediminas Technical University => rank: 751-800

[02879/4573]Universität des

[04466/4573]Chung Yuan Christian University
            Chung Yuan Christian University => rank: 1001-1200

[04471/4573]University of Macau
            University of Macau => rank: 322

[04505/4573]Boston College
            Boston University College of Engineering => rank: 494

[04516/4573]James Cook University
            James Cook University Online => rank: 424

[04520/4573]Southern University of Science and Technology
            Southern Institute of Technology => rank: 275

[04529/4573]Lebanese American University
            Lebanese American University Online => rank: 581-590

[04545/4573]Ateneo de Manila University
            Ateneo de Manila University => rank: 601-650

[04564/4573]Kyoto Institute of Technology
            Kyoto University Graduate School of Management => rank: 801-1000



In [29]:
university_clean.head(20)

Unnamed: 0,id,name,city,country,email,rank
0,0,Kent State University College of Aeronautics a...,Kent,USA,kent.state.university@findmymasters.com,
1,1,Manderson Graduate School of Business The Univ...,Tuscaloosa,USA,manderson.graduate.school@findmymasters.com,
2,2,University of Argentinian Social Studies,,Argentina,university.of.argentinian@findmymasters.com,
3,3,EM Normandie Business School,Le Havre,France,em.normandie.business@findmymasters.com,
4,4,Izmir University of Economics,Izmir,Turkey,izmir.university.of@findmymasters.com,
5,5,IMF Smart Education,Madrid,Spain,imf.smart.education@findmymasters.com,
6,6,Ural Federal University,Yekaterinburg,Russia,ural.federal.university@findmymasters.com,
7,7,Universitatea Nicolae Titulescu din Bucuresti,Bucharest,Romania,universitatea.nicolae.titulescu@findmymasters.com,
8,8,University of WisconsinMadison La Follette Sch...,Madison,USA,university.of.wisconsinmadison@findmymasters.com,
9,9,Northeastern University Global Pathways,Boston,USA,northeastern.university.global@findmymasters.com,


In [30]:

# need to find city_id for universities table

cities = pd.read_csv("./clean/cities_clean.csv")
cities.head()


Unnamed: 0,id,city,country,purchasing_power_index,safety_index,health_care_index,climate_index,cost_of_living_index,property_price_to_income_ratio,traffic_commute_time_index,pollution_index,quality_of_life_index
0,0,Kabul,Afghanistan,21.12,21.47,29.06,,23.82,14.82,56.17,92.0,
1,1,Tirana,Albania,31.24,56.95,48.69,86.43,38.1,16.02,39.93,87.82,90.91
2,2,Algiers,Algeria,21.75,45.56,54.45,94.82,29.91,28.56,56.08,70.38,78.36
3,3,Constantine,Algeria,24.92,55.32,40.28,84.71,27.61,14.0,55.5,68.82,91.58
4,4,Jijel,Algeria,,54.66,43.06,,,13.02,20.0,53.45,


In [31]:

# convert cities csv to dict 
# it will make easier to find city id

cities_dict = {}

for index in cities.index:
    cities_dict[cities.loc[index, "city"]] = cities.loc[index, "id"]

for key in list(cities_dict.keys())[:10]:
    print(f"{key} = {cities_dict[key]}")


Kabul = 0
Tirana = 1
Algiers = 2
Constantine = 3
Jijel = 4
Oran = 5
Setif = 6
Skikda = 7
Tebessa = 8
Tiziouzou = 9


In [32]:

def get_city_id(row: str) -> str:
    """
    
    Get city id for university
    Because name may not match fully we will use similarity function
    
    :param name: row from university table
    :return    : city id if we find
    
    
    """
    
    univesity_id    = row["id"]
    university_city = row["city"]
            
    if university_city == "NULL":
        return university_city
    
    for city in cities_dict.keys():
                
        if is_similar(university_city, city):
            
            # keep city id
            city_id = cities_dict[city]
            
            print(f"[{univesity_id:05}/4573]{university_city}")
            print(f"            {city} => id: {city_id}\n")
        
            return city_id
        
    return "NULL"
    

In [33]:
university_clean["city_id"] = university_clean.apply(get_city_id, axis=1)

[00004/4573]Izmir
            Izmir => id: 810

[00005/4573]Madrid
            Madrid => id: 742

[00006/4573]Yekaterinburg
            Yekaterinburg => id: 682

[00007/4573]Bucharest
            Bucharest => id: 632

[00010/4573]Haifa
            Haifa => id: 382

[00011/4573]Hong Kong
            Hong Kong => id: 282

[00013/4573]Athens
            Athens => id: 261

[00016/4573]San Jose
            San Jose => id: 179

[00018/4573]Rotterdam
            Rotterdam => id: 547

[00019/4573]London
            London => id: 119

[00020/4573]Rennes
            Rennes => id: 237

[00021/4573]London
            London => id: 119

[00028/4573]Santiago de Compostela
            Santiago de Compostela => id: 751

[00039/4573]Berlin
            Berlin => id: 246

[00042/4573]Lbeck
            Lbeck => id: 253

[00046/4573]Wrocaw
            Wroclaw => id: 619

[00047/4573]Porto
            Porto => id: 626

[00048/4573]Dublin
            Dublin => id: 379

[00052/4573]Dublin
            Dublin =

[00452/4573]Dublin
            Dublin => id: 379

[00455/4573]Singapore
            Singapore => id: 703

[00458/4573]Kuala Lumpur
            Kuala Lumpur => id: 463

[00459/4573]Berlin
            Berlin => id: 246

[00462/4573]Amsterdam
            Amsterdam => id: 542

[00463/4573]Milan
            Milan => id: 399

[00464/4573]Cotonou
            Cotonou => id: 66

[00467/4573]Dalian
            Dalian => id: 155

[00468/4573]Rome
            Rome => id: 405

[00469/4573]Kuala Lumpur
            Kuala Lumpur => id: 463

[00470/4573]Singapore
            Singapore => id: 703

[00472/4573]Paris
            Paris => id: 235

[00474/4573]Medelln
            Medellin => id: 174

[00478/4573]Chisinau
            Chisinau => id: 522

[00479/4573]Riga
            Riga => id: 442

[00483/4573]Samara
            Samara => id: 665

[00484/4573]Barcelona
            Barcelona => id: 725

[00488/4573]Brussels
            Brussels => id: 63

[00495/4573]Vancouver
            Vancouver => id: 13

[00879/4573]Milan
            Milan => id: 399

[00883/4573]Helsinki
            Helsinki => id: 215

[00888/4573]Istanbul
            Istanbul => id: 809

[00892/4573]Cambridge
            Cambridge => id: 550

[00894/4573]Rotterdam
            Rotterdam => id: 547

[00895/4573]Gothenburg
            Gothenburg => id: 760

[00897/4573]Mexico City
            Mexico City => id: 503

[00899/4573]Valletta
            Valletta => id: 480

[00900/4573]Paris
            Paris => id: 235

[00902/4573]Budapest
            Budapest => id: 283

[00903/4573]Moscow
            Moscow => id: 652

[00907/4573]Pozna
            Poznan => id: 617

[00908/4573]Paris
            Paris => id: 235

[00910/4573]Paris
            Paris => id: 235

[00916/4573]Katowice
            Katowice => id: 614

[00919/4573]Milan
            Milan => id: 399

[00921/4573]Valencia
            Valencia => id: 752

[00925/4573]Barcelona
            Barcelona => id: 725

[00926/4573]Sydney
            Sydney => id: 43

[0

[01246/4573]Dalian
            Dalian => id: 155

[01251/4573]Hamburg
            Hamburg => id: 251

[01252/4573]Barcelona
            Barcelona => id: 725

[01253/4573]Villach
            Villach => id: 49

[01254/4573]Saint Petersburg
            Saint Petersburg => id: 664

[01256/4573]Bucharest
            Bucharest => id: 632

[01257/4573]Madrid
            Madrid => id: 742

[01262/4573]Madrid
            Madrid => id: 742

[01265/4573]Lisbon
            Lisbon => id: 624

[01270/4573]Helsinki
            Helsinki => id: 215

[01272/4573]Yakutsk
            Yakutsk => id: 681

[01273/4573]Sofia
            Sofia => id: 100

[01278/4573]Barcelona
            Barcelona => id: 725

[01279/4573]Paris
            Paris => id: 235

[01280/4573]Irkutsk
            Irkutsk => id: 640

[01281/4573]Paris
            Paris => id: 235

[01287/4573]Cape Town
            Cape Town => id: 711

[01295/4573]Guatemala City
            Guatemala City => id: 276

[01299/4573]Auckland
            Au

[01618/4573]Moscow
            Moscow => id: 652

[01620/4573]Moscow
            Moscow => id: 652

[01624/4573]Auckland
            Auckland => id: 549

[01626/4573]Brussels
            Brussels => id: 63

[01628/4573]Barcelona
            Barcelona => id: 725

[01629/4573]Barcelona
            Barcelona => id: 725

[01630/4573]Monterrey
            Monterrey => id: 504

[01634/4573]Thessaloniki
            Thessaloniki => id: 272

[01635/4573]Barcelona
            Barcelona => id: 725

[01640/4573]Prague
            Prague => id: 190

[01643/4573]Zrich
            Zurich => id: 771

[01645/4573]Rennes
            Rennes => id: 237

[01647/4573]Taipei
            Taipei => id: 777

[01650/4573]Milan
            Milan => id: 399

[01653/4573]Munich
            Munich => id: 254

[01655/4573]Warsaw
            Warsaw => id: 618

[01659/4573]Madrid
            Madrid => id: 742

[01663/4573]Budapest
            Budapest => id: 283

[01665/4573]Toulouse
            Toulouse => id: 240

[0

[02085/4573]Barcelona
            Barcelona => id: 725

[02089/4573]Singapore
            Singapore => id: 703

[02090/4573]Guadalajara
            Guadalajara => id: 736

[02091/4573]Mexico City
            Mexico City => id: 503

[02092/4573]Sydney
            Sydney => id: 43

[02093/4573]Barcelona
            Barcelona => id: 725

[02096/4573]Madrid
            Madrid => id: 742

[02102/4573]Irkutsk
            Irkutsk => id: 640

[02103/4573]Mexico City
            Mexico City => id: 503

[02110/4573]Yogyakarta
            Yogyakarta => id: 365

[02114/4573]Paris
            Paris => id: 235

[02125/4573]Ankara
            Ankara => id: 802

[02129/4573]Kuala Lumpur
            Kuala Lumpur => id: 463

[02135/4573]Singapore
            Singapore => id: 703

[02136/4573]Chisinau
            Chisinau => id: 522

[02143/4573]Milan
            Milan => id: 399

[02145/4573]Novosibirsk
            Novosibirsk => id: 657

[02146/4573]Geneva
            Geneva => id: 770

[02147/4573]Lon

[02766/4573]Madrid
            Madrid => id: 742

[02772/4573]London
            London => id: 119

[02775/4573]Nizhny Novgorod
            Nizhny Novgorod => id: 654

[02776/4573]Thessaloniki
            Thessaloniki => id: 272

[02778/4573]Paris
            Paris => id: 235

[02780/4573]Moscow
            Moscow => id: 652

[02782/4573]Vienna
            Vienna => id: 48

[02784/4573]Budapest
            Budapest => id: 283

[02785/4573]Cologne
            Cologne => id: 247

[02786/4573]Lisbon
            Lisbon => id: 624

[02787/4573]Rome
            Rome => id: 405

[02794/4573]Mexico City
            Mexico City => id: 503

[02795/4573]Sofia
            Sofia => id: 100

[02797/4573]Moscow
            Moscow => id: 652

[02799/4573]Reykjavk
            Reykjavik => id: 287

[02801/4573]Pau
            Pau => id: 236

[02802/4573]London
            London => id: 119

[02803/4573]Irkutsk
            Irkutsk => id: 640

[02805/4573]Madrid
            Madrid => id: 742

[02813/4573]

[03298/4573]Yangon
            Yangon => id: 536

[03299/4573]Florianpolis
            Florianopolis => id: 84

[03302/4573]Buenos Aires
            Buenos Aires => id: 13

[03312/4573]Medelln
            Medellin => id: 174

[03313/4573]Mexico City
            Mexico City => id: 503

[03321/4573]Cairo
            Cairo => id: 204

[03323/4573]Santiago
            Santiago => id: 593

[03329/4573]Porto Alegre
            Porto Alegre => id: 90

[03330/4573]Warsaw
            Warsaw => id: 618

[03332/4573]Warsaw
            Warsaw => id: 618

[03335/4573]San Francisco
            San Francisco => id: 178

[03338/4573]Kobe
            Kobe => id: 419

[03344/4573]Kursk
            Kursk => id: 649

[03346/4573]Leon
            Leon => id: 741

[03347/4573]Warsaw
            Warsaw => id: 618

[03359/4573]Hiroshima
            Hiroshima => id: 417

[03371/4573]Taipei
            Taipei => id: 777

[03376/4573]Kaunas
            Kaunas => id: 449

[03377/4573]Bogot
            Bogota => i

[03878/4573]Adelaide
            Adelaide => id: 21

[03879/4573]Porto
            Porto => id: 626

[03880/4573]Istanbul
            Istanbul => id: 809

[03881/4573]Dublin
            Dublin => id: 379

[03884/4573]London
            London => id: 119

[03888/4573]Belgrade
            Belgrade => id: 696

[03891/4573]Bucharest
            Bucharest => id: 632

[03896/4573]Tbilisi
            Tbilisi => id: 244

[03900/4573]Rome
            Rome => id: 405

[03903/4573]Hamburg
            Hamburg => id: 251

[03905/4573]Athens
            Athens => id: 261

[03908/4573]Rome
            Rome => id: 405

[03910/4573]Madrid
            Madrid => id: 742

[03914/4573]Rome
            Rome => id: 405

[03916/4573]Moscow
            Moscow => id: 652

[03917/4573]Rotterdam
            Rotterdam => id: 547

[03924/4573]Taipei
            Taipei => id: 777

[03927/4573]Tbilisi
            Tbilisi => id: 244

[03930/4573]Puerto Vallarta
            Puerto Vallarta => id: 507

[03934/4573]Taipe

[04330/4573]Madrid
            Madrid => id: 742

[04333/4573]Berlin
            Berlin => id: 246

[04337/4573]Montreal
            Montreal => id: 123

[04341/4573]Nicosia
            Nicosia => id: 187

[04342/4573]Zrich
            Zurich => id: 771

[04343/4573]Bogot
            Bogota => id: 170

[04344/4573]Culiacn
            Culiacan => id: 492

[04345/4573]Warsaw
            Warsaw => id: 618

[04348/4573]Buenos Aires
            Buenos Aires => id: 13

[04349/4573]Paris
            Paris => id: 235

[04352/4573]Manila
            Manila => id: 609

[04354/4573]Monterrey
            Monterrey => id: 504

[04363/4573]Kaunas
            Kaunas => id: 449

[04366/4573]Barcelona
            Barcelona => id: 725

[04367/4573]Valencia
            Valencia => id: 752

[04368/4573]Riga
            Riga => id: 442

[04370/4573]Antwerp
            Antwerp => id: 62

[04375/4573]Rome
            Rome => id: 405

[04376/4573]Athens
            Athens => id: 261

[04380/4573]London
      

In [34]:
university_clean.head()

Unnamed: 0,id,name,city,country,email,rank,city_id
0,0,Kent State University College of Aeronautics a...,Kent,USA,kent.state.university@findmymasters.com,,
1,1,Manderson Graduate School of Business The Univ...,Tuscaloosa,USA,manderson.graduate.school@findmymasters.com,,
2,2,University of Argentinian Social Studies,,Argentina,university.of.argentinian@findmymasters.com,,
3,3,EM Normandie Business School,Le Havre,France,em.normandie.business@findmymasters.com,,
4,4,Izmir University of Economics,Izmir,Turkey,izmir.university.of@findmymasters.com,,810.0


In [35]:

# save universities csv

university_clean.to_csv("./clean/university_clean.csv", index=False)


In [36]:
# just check saved data

universities = pd.read_csv("../data-version2/clean/university_clean.csv")
universities.head()


Unnamed: 0,id,name,city,country,email,rank,city_id
0,0,Kent State University College of Aeronautics a...,Kent,USA,kent.state.university@findmymasters.com,,
1,1,Manderson Graduate School of Business The Univ...,Tuscaloosa,USA,manderson.graduate.school@findmymasters.com,,
2,2,University of Argentinian Social Studies,,Argentina,university.of.argentinian@findmymasters.com,,
3,3,EM Normandie Business School,Le Havre,France,em.normandie.business@findmymasters.com,,
4,4,Izmir University of Economics,Izmir,Turkey,izmir.university.of@findmymasters.com,,810.0
