# This notebook scrapes websites for the names and years of establishment of higher institutions in Nigeria using Python's Beautiful Soup.

In [1]:
#importing the dependencies
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

__Federal Universities__

__Scraping the names and years of establishment of Federal Government-owned universities in Nigeria.__

__Web scraping involves four steps, namely:__

__a. document load (downloading html contents of the page),__


__b. parsing (converting the downloaded content into a useful format),__

__c. extraction (extracting useful information)__

__d. transformation (saving the information into a structured format).__

In [4]:
#document load
#a timeout keeps the cell from hanging forever on an unresponsive server, and
#raise_for_status() stops here on an HTTP error instead of parsing an error page later
result = requests.get("https://www.nuc.edu.ng/nigerian-univerisities/federal-univeristies/", timeout=30)
result.raise_for_status()

In [None]:
#result.content

In [8]:
#parsing the content
#build a searchable parse tree from the raw response bytes using the "html.parser" backend
Federal_Universities = BeautifulSoup(markup=result.content, features="html.parser")

__Extracting the University names__

In [9]:
#extraction
#preview the first <td class="column-2"> cell to check the selector before collecting them all
Federal_Universities.find("td", class_="column-2")

<td class="column-2">Abubakar Tafawa Balewa University, Bauchi</td>

In [10]:
#collect every <td class="column-2"> cell; these are the cells holding the names
University_name = Federal_Universities.find_all("td", class_="column-2")

In [11]:
#Transformation
#keep only the text of each name cell, dropping the surrounding <td> markup
Name = [cell.text for cell in University_name]

__Extracting the date of establishment__

In [13]:
#collect every <td class="column-5"> cell; these are the cells holding the year of establishment
Year_established = Federal_Universities.find_all("td", class_="column-5")

In [14]:
#keep only the text of each year cell
Year_Founded = [cell.text for cell in Year_established]

__Transforming the data into a dataframe__

In [18]:
#pair each name with its year of establishment, one row per institution
Fed_Uni_data = pd.DataFrame({
    'Name': Name,
    'Year of establishment': Year_Founded,
})

In [22]:
#adding columns for Ownership and category
#both are constants, so the same value is applied to every row
Fed_Uni_data = Fed_Uni_data.assign(Ownership='Federal Government', Category='University')

In [25]:
#keep only the first 50 rows (all columns)
#NOTE(review): presumably the rows past 50 are not federal universities - confirm against the page
Fed_Uni_data = Fed_Uni_data.head(50)

In [27]:
#Fed_Uni_data

__State Universities__

In [39]:
#document load for the state universities page
#a timeout keeps the cell from hanging forever on an unresponsive server, and
#raise_for_status() stops here on an HTTP error instead of parsing an error page later
state_uni = requests.get("https://www.nuc.edu.ng/nigerian-univerisities/state-univerisity/", timeout=30)
state_uni.raise_for_status()

In [40]:
#build a searchable parse tree from the raw response bytes using the "html.parser" backend
state_uni_content = BeautifulSoup(markup=state_uni.content, features="html.parser")

In [41]:
#collect every <td class="column-2"> cell (the name cells)
State_Uni_names = state_uni_content.find_all("td", class_="column-2")

In [42]:
#keep only the text of each name cell
State_University = [cell.text for cell in State_Uni_names]

In [45]:
#number of state university names collected
len(State_University)

60

In [49]:
#collect every <td class="column-5"> cell (the year cells)
State_Year_Founded = state_uni_content.find_all("td", class_="column-5")

In [50]:
#keep only the text of each year cell
State_University_Date = [cell.text for cell in State_Year_Founded]

In [51]:
#number of year values collected; the DataFrame constructor needs this to equal the number of names
len(State_University_Date)

60

In [52]:
#one row per state university, tagged with constant ownership and category labels
State_Uni_data = pd.DataFrame({
    'Name': State_University,
    'Year of establishment': State_University_Date,
}).assign(Ownership='State Government', Category='University')

In [54]:
#State_Uni_data

__Privately Owned Universities__

In [28]:
#document load for the private universities page
#a timeout keeps the cell from hanging forever on an unresponsive server, and
#raise_for_status() stops here on an HTTP error instead of parsing an error page later
pri_uni = requests.get("https://www.nuc.edu.ng/nigerian-univerisities/private-univeristies/", timeout=30)
pri_uni.raise_for_status()

In [29]:
#build a searchable parse tree from the raw response bytes using the "html.parser" backend
pri_uni_content = BeautifulSoup(markup=pri_uni.content, features="html.parser")

In [30]:
#collect every <td class="column-2"> cell (the name cells)
Pri_Uni = pri_uni_content.find_all("td", class_="column-2")

In [31]:
#keep only the text of each name cell
Private_University = [cell.text for cell in Pri_Uni]

In [32]:
#checking the length of the list: how many private university names were collected
len(Private_University)

111

In [33]:
#collect every <td class="column-5"> cell (the year cells)
Date_founded = pri_uni_content.find_all("td", class_="column-5")

In [34]:
#keep only the text of each year cell
Private_University_Date = [cell.text for cell in Date_founded]

In [35]:
#number of year values collected; the DataFrame constructor needs this to equal the number of names
len(Private_University_Date)

111

In [36]:
#one row per private university, tagged with constant ownership and category labels
Pri_Uni_data = pd.DataFrame({
    'Name': Private_University,
    'Year of establishment': Private_University_Date,
}).assign(Ownership='Private', Category='University')

In [38]:
#Pri_Uni_data

__Federal Polytechnics__

In [55]:
#document load for the federal polytechnics page
#a timeout keeps the cell from hanging forever on an unresponsive server, and
#raise_for_status() stops here on an HTTP error instead of parsing an error page later
federal_poly = requests.get("https://net.nbte.gov.ng/Federal%20Polytechnics", timeout=30)
federal_poly.raise_for_status()

In [56]:
#build a searchable parse tree from the raw response bytes; this page is parsed with the "lxml" backend
federal_poly_content = BeautifulSoup(markup=federal_poly.content, features="lxml")

In [None]:
#This extraction technique is different from the one used above because the needed information is in a table.

In [57]:
#locate the first <table> carrying the "MsoTableColorfulGridAccent3" class
fed_poly_table = federal_poly_content.find('table', class_="MsoTableColorfulGridAccent3")

In [58]:
#the first <tr> supplies the column titles (<th> cells); every later <tr> is a data record (<td> cells)
fed_poly_trs = fed_poly_table.find_all('tr')
header = [el.text.strip() for el in fed_poly_trs[0].find_all('th')] if fed_poly_trs else []
rows = [[el.text.strip() for el in tr.find_all('td')] for tr in fed_poly_trs[1:]]

In [59]:
#number of data rows parsed from the table (the header row is not included)
len(rows)

40

In [60]:
#defining a function that returns a dataframe of the institutions' name, year, ownership and category
def name_date_list(rows,  ownership, category):
    """Build a DataFrame of institutions from parsed table rows.

    Parameters
    ----------
    rows : sequence of sequences
        One entry per institution; index 1 must hold the name and index 2
        the year of establishment.
    ownership : scalar
        Value placed in the 'Ownership' column of every row.
    category : scalar
        Value placed in the 'Category' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Name', 'Year of establishment', 'Ownership', 'Category'.

    Raises
    ------
    IndexError
        If any row has fewer than three items.
    """
    columns = {
        'Name': [record[1] for record in rows],  #name is the second item (index 1)
        'Year of establishment': [record[2] for record in rows],  #year is the third item (index 2)
        'Ownership': ownership,
        'Category': category,
    }
    return pd.DataFrame(columns)

In [61]:
#turn the parsed federal polytechnic rows into a labelled DataFrame
Fed_Poly_data = name_date_list(rows, ownership='Federal Government', category='Polytechnic')

In [63]:
#Fed_Poly_data

__State Polytechnics__

In [64]:
#document load for the state polytechnics page
#a timeout keeps the cell from hanging forever on an unresponsive server, and
#raise_for_status() stops here on an HTTP error instead of parsing an error page later
state_poly = requests.get("https://net.nbte.gov.ng/state%20polytechnics", timeout=30)
state_poly.raise_for_status()

In [65]:
#build a searchable parse tree from the raw response bytes; this page is parsed with the "lxml" backend
state_poly_content = BeautifulSoup(markup=state_poly.content, features="lxml")

In [66]:
#locate the first <table> carrying the "MsoTableColorfulGridAccent3" class
state_poly_table = state_poly_content.find('table', class_="MsoTableColorfulGridAccent3")

In [67]:
#the first <tr> supplies the column titles (<th> cells); every later <tr> is a data record (<td> cells)
state_poly_trs = state_poly_table.find_all('tr')
header = [el.text.strip() for el in state_poly_trs[0].find_all('th')] if state_poly_trs else []
statepoly = [[el.text.strip() for el in tr.find_all('td')] for tr in state_poly_trs[1:]]

In [69]:
#turn the parsed state polytechnic rows into a labelled DataFrame
State_Poly_data = name_date_list(statepoly, ownership='State Government', category='Polytechnic')

In [71]:
#removing '\n' from the names
#regex=False treats '\n' as a literal newline character rather than a pattern
State_Poly_data['Name'] = State_Poly_data['Name'].str.replace('\n', '', regex=False)

In [72]:
#filling some missing data
#a single .loc[row, column] assignment writes into State_Poly_data itself; the chained form
#frame[column][row] = value can write to a temporary copy and raises SettingWithCopyWarning
#NOTE(review): the scraped years are text while these values are ints - confirm whether the column should be normalized
State_Poly_data.loc[26, 'Year of establishment'] = 2013
State_Poly_data.loc[2, 'Year of establishment'] = 1988
State_Poly_data.loc[24, 'Year of establishment'] = 2007

In [74]:
#State_Poly_data