In [1]:
import pandas as pd

In [2]:
universities = pd.read_csv("./universities_final.csv")
universities.head()

Unnamed: 0,id,name,rank,city_id,image
0,0,Kent State University College Of Aeronautics A...,,,https://keystoneacademic-res.cloudinary.com/im...
1,1,Manderson Graduate School Of Business The Univ...,,,https://keystoneacademic-res.cloudinary.com/im...
2,2,University Of Argentinian Social Studies,,,https://keystoneacademic-res.cloudinary.com/im...
3,3,Em Normandie Business School,,,https://keystoneacademic-res.cloudinary.com/im...
4,4,Izmir University Of Economics,,810.0,https://keystoneacademic-res.cloudinary.com/im...


In [3]:
universities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4360 entries, 0 to 4359
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       4360 non-null   int64  
 1   name     4360 non-null   object 
 2   rank     432 non-null    float64
 3   city_id  1442 non-null   float64
 4   image    3959 non-null   object 
dtypes: float64(2), int64(1), object(2)
memory usage: 170.4+ KB


In [4]:
universities["id"].value_counts()

0       1
2904    1
2910    1
2909    1
2908    1
       ..
1459    1
1460    1
1461    1
1462    1
4359    1
Name: id, Length: 4360, dtype: int64

In [5]:

# for each university we will have university-admin
# we will create university admins and add them table

# for each admin we need

# id 
# username
# firstname
# lastname
# user_id
# university_id

admin_columns = ["id", "username", "firstname", "lastname", "user_id", "university_id"]

# for each admin user we need

# id
# email = NULL
# passwordHashed
# emailVerified = false
# registrationDate = NULL
# role = UNIVERSITYADMIN
# username

user_columns = ["email", "passwordHashed", "emailVerified", "registrationDate", "role", "username"]


In [6]:

# we need username and password for each admin lets create them
universities.head()


Unnamed: 0,id,name,rank,city_id,image
0,0,Kent State University College Of Aeronautics A...,,,https://keystoneacademic-res.cloudinary.com/im...
1,1,Manderson Graduate School Of Business The Univ...,,,https://keystoneacademic-res.cloudinary.com/im...
2,2,University Of Argentinian Social Studies,,,https://keystoneacademic-res.cloudinary.com/im...
3,3,Em Normandie Business School,,,https://keystoneacademic-res.cloudinary.com/im...
4,4,Izmir University Of Economics,,810.0,https://keystoneacademic-res.cloudinary.com/im...


In [7]:
universities[universities.duplicated(subset=['name'])]['name'].value_counts()

Series([], Name: name, dtype: int64)

In [8]:

# for create username we can remove common words from the names and merge to create unique and meaningfull names
from collections import Counter

words_in_name = []

for name in universities["name"].values:
    
    words_in_name.extend([word.lower() for word in name.split()])


counter = Counter(words_in_name)

print(counter.most_common(10))


[('university', 2856), ('of', 2395), ('college', 712), ('school', 679), ('and', 504), ('de', 357), ('business', 332), ('state', 314), ('the', 283), ('sciences', 280)]


In [9]:
# we can remove most common 50 words

words_to_remove = []

for common in counter.most_common(1):
    words_to_remove.append(common[0])

print(words_to_remove)

['university']


In [10]:
def remove_commons_from_name(name: str) -> str:
    """
    
    Remove common words from name.
    
    :param name: name of university
    :return    : filtered name of university
    
    """
    
    filtered = []
    
    for word in name.split():
        
        if word.lower() in words_to_remove:
            continue
            
        filtered.append(word)
    
    return " ".join(filtered)

In [11]:
clean_names = list(universities["name"].apply(remove_commons_from_name))

In [12]:
len(clean_names)

4360

In [13]:
usernames : set = set()
    
for index, name in enumerate(clean_names):
    
    username = ""
    
    # use first word for username
    splits = name.split()
    
    if splits[0] != "":
        username += splits[0]
        
    if len(splits) > 1:
        username += splits[1]
    
    if len(splits) > 2:
        username += splits[2]
        
    if len(splits) > 3:
        username += splits[3]
        
    if len(splits) > 4:
        username += splits[4]
        
    usernames.add(username + f"{index}")
    
len(usernames)

4360

In [14]:
admins = pd.DataFrame(columns=["username", "firstname", "lastname", "user_id", "university_id"])

In [15]:
admins["username"] = pd.Series(data=list(usernames))

In [16]:
admins = admins.reset_index()

In [17]:
admins.rename(columns={"index" : "id"}, inplace=True)

In [18]:
admins["user_id"]       = admins["id"]
admins["university_id"] = admins["id"]

In [19]:
admins.fillna("NULL", inplace=True)

In [20]:
admins

Unnamed: 0,id,username,firstname,lastname,user_id,university_id
0,0,HuntsvilleBibleCollege3576,,,0,0
1,1,UnicocCollegesOfColombiaInstituciufn3296,,,1,1
2,2,ColumbiaGraduateSchoolOfArchitecture628,,,2,2
3,3,CaroYCuervoInstitute3529,,,3,3
4,4,AsiaPacificOfTechnologyInnovation1203,,,4,4
...,...,...,...,...,...,...
4355,4355,ShanghaiNationalAccountingInstitute3857,,,4355,4355
4356,4356,CollegeOfFineAppliedArts746,,,4356,4356
4357,4357,MiriamCollege4158,,,4357,4357
4358,4358,OfAlbany2037,,,4358,4358


In [21]:

# generate password for university admins
users = pd.DataFrame(columns=user_columns)
users.head()


Unnamed: 0,email,passwordHashed,emailVerified,registrationDate,role,username


In [22]:
users['username'] = admins['username']
users.head()

Unnamed: 0,email,passwordHashed,emailVerified,registrationDate,role,username
0,,,,,,HuntsvilleBibleCollege3576
1,,,,,,UnicocCollegesOfColombiaInstituciufn3296
2,,,,,,ColumbiaGraduateSchoolOfArchitecture628
3,,,,,,CaroYCuervoInstitute3529
4,,,,,,AsiaPacificOfTechnologyInnovation1203


In [23]:
users['passwordHashed'].fillna("admin", inplace=True)
users['emailVerified'].fillna("false", inplace=True)
users['role'].fillna("UNIVERSITYADMIN", inplace=True)

In [24]:
users.head()

Unnamed: 0,email,passwordHashed,emailVerified,registrationDate,role,username
0,,admin,False,,UNIVERSITYADMIN,HuntsvilleBibleCollege3576
1,,admin,False,,UNIVERSITYADMIN,UnicocCollegesOfColombiaInstituciufn3296
2,,admin,False,,UNIVERSITYADMIN,ColumbiaGraduateSchoolOfArchitecture628
3,,admin,False,,UNIVERSITYADMIN,CaroYCuervoInstitute3529
4,,admin,False,,UNIVERSITYADMIN,AsiaPacificOfTechnologyInnovation1203


In [25]:
users.fillna("NULL", inplace=True)

In [26]:
users.head()

Unnamed: 0,email,passwordHashed,emailVerified,registrationDate,role,username
0,,admin,False,,UNIVERSITYADMIN,HuntsvilleBibleCollege3576
1,,admin,False,,UNIVERSITYADMIN,UnicocCollegesOfColombiaInstituciufn3296
2,,admin,False,,UNIVERSITYADMIN,ColumbiaGraduateSchoolOfArchitecture628
3,,admin,False,,UNIVERSITYADMIN,CaroYCuervoInstitute3529
4,,admin,False,,UNIVERSITYADMIN,AsiaPacificOfTechnologyInnovation1203


In [27]:
users.reset_index(inplace=True)

In [28]:
users.head()

Unnamed: 0,index,email,passwordHashed,emailVerified,registrationDate,role,username
0,0,,admin,False,,UNIVERSITYADMIN,HuntsvilleBibleCollege3576
1,1,,admin,False,,UNIVERSITYADMIN,UnicocCollegesOfColombiaInstituciufn3296
2,2,,admin,False,,UNIVERSITYADMIN,ColumbiaGraduateSchoolOfArchitecture628
3,3,,admin,False,,UNIVERSITYADMIN,CaroYCuervoInstitute3529
4,4,,admin,False,,UNIVERSITYADMIN,AsiaPacificOfTechnologyInnovation1203


In [29]:
users.rename(columns={'index': 'id'}, inplace=True)
users.head()

Unnamed: 0,id,email,passwordHashed,emailVerified,registrationDate,role,username
0,0,,admin,False,,UNIVERSITYADMIN,HuntsvilleBibleCollege3576
1,1,,admin,False,,UNIVERSITYADMIN,UnicocCollegesOfColombiaInstituciufn3296
2,2,,admin,False,,UNIVERSITYADMIN,ColumbiaGraduateSchoolOfArchitecture628
3,3,,admin,False,,UNIVERSITYADMIN,CaroYCuervoInstitute3529
4,4,,admin,False,,UNIVERSITYADMIN,AsiaPacificOfTechnologyInnovation1203


In [30]:

# reorder university columns
universities = universities[["id", "name", "image", "city_id", "rank"]]


In [31]:
universities = universities.rename(columns={'city_id': 'cityId'})

In [32]:

# save users, admins and universities

# admins.to_csv("admins_pg.csv", index=False)
# users.to_csv("users_pg.csv",   index=False)
# universities.to_csv("universities_pg.csv",   index=False)


In [33]:
users = users[["id", "email", "passwordHashed", "emailVerified", "role", "username", "registrationDate"]]
users.head()

Unnamed: 0,id,email,passwordHashed,emailVerified,role,username,registrationDate
0,0,,admin,False,UNIVERSITYADMIN,HuntsvilleBibleCollege3576,
1,1,,admin,False,UNIVERSITYADMIN,UnicocCollegesOfColombiaInstituciufn3296,
2,2,,admin,False,UNIVERSITYADMIN,ColumbiaGraduateSchoolOfArchitecture628,
3,3,,admin,False,UNIVERSITYADMIN,CaroYCuervoInstitute3529,
4,4,,admin,False,UNIVERSITYADMIN,AsiaPacificOfTechnologyInnovation1203,


In [34]:
users["registrationDate"] = users["registrationDate"].replace("NULL", "")

In [37]:
users.head()

Unnamed: 0,id,email,passwordHashed,emailVerified,role,username,registrationDate
0,0,,admin,False,UNIVERSITYADMIN,HuntsvilleBibleCollege3576,
1,1,,admin,False,UNIVERSITYADMIN,UnicocCollegesOfColombiaInstituciufn3296,
2,2,,admin,False,UNIVERSITYADMIN,ColumbiaGraduateSchoolOfArchitecture628,
3,3,,admin,False,UNIVERSITYADMIN,CaroYCuervoInstitute3529,
4,4,,admin,False,UNIVERSITYADMIN,AsiaPacificOfTechnologyInnovation1203,


In [38]:
users = users[["id", "email", "passwordHashed", "registrationDate", "role", "username", "emailVerified"]]
users.head()

Unnamed: 0,id,email,passwordHashed,registrationDate,role,username,emailVerified
0,0,,admin,,UNIVERSITYADMIN,HuntsvilleBibleCollege3576,False
1,1,,admin,,UNIVERSITYADMIN,UnicocCollegesOfColombiaInstituciufn3296,False
2,2,,admin,,UNIVERSITYADMIN,ColumbiaGraduateSchoolOfArchitecture628,False
3,3,,admin,,UNIVERSITYADMIN,CaroYCuervoInstitute3529,False
4,4,,admin,,UNIVERSITYADMIN,AsiaPacificOfTechnologyInnovation1203,False


In [43]:
users["email"] = users["email"].replace("NULL", "")
users.head()

Unnamed: 0,id,email,passwordHashed,registrationDate,role,username,emailVerified
0,0,,admin,,UNIVERSITYADMIN,HuntsvilleBibleCollege3576,False
1,1,,admin,,UNIVERSITYADMIN,UnicocCollegesOfColombiaInstituciufn3296,False
2,2,,admin,,UNIVERSITYADMIN,ColumbiaGraduateSchoolOfArchitecture628,False
3,3,,admin,,UNIVERSITYADMIN,CaroYCuervoInstitute3529,False
4,4,,admin,,UNIVERSITYADMIN,AsiaPacificOfTechnologyInnovation1203,False


In [44]:
users.to_csv("users_pg.csv",   index=False)

In [45]:
universities.head()

Unnamed: 0,id,name,image,cityId,rank
0,0,Kent State University College Of Aeronautics A...,https://keystoneacademic-res.cloudinary.com/im...,,
1,1,Manderson Graduate School Of Business The Univ...,https://keystoneacademic-res.cloudinary.com/im...,,
2,2,University Of Argentinian Social Studies,https://keystoneacademic-res.cloudinary.com/im...,,
3,3,Em Normandie Business School,https://keystoneacademic-res.cloudinary.com/im...,,
4,4,Izmir University Of Economics,https://keystoneacademic-res.cloudinary.com/im...,810.0,


In [47]:
universities = universities.fillna("")

In [None]:
def float_int(row)

In [60]:
universities["cityId"] = universities["cityId"].apply(lambda x: int(x) if x != "" else "")

In [61]:
universities.to_csv("universities_pg.csv",   index=False)