# Web Scraping with Beautiful Soup

In [93]:
import requests as r
import pandas as pd
from bs4 import BeautifulSoup
import os
import wptools as wp
import time
import json

In [9]:
# Download the Webometrics 2019 ranking page and cache the raw HTML on disk
# so later cells can parse it without hitting the network again.
url = 'https://www.theabusites.com/webometrics-ranking-2019/'
# A timeout keeps the cell from hanging forever on an unresponsive server.
html = r.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently caching an error page.
html.raise_for_status()

folder = 'top_100_universities_in_ng'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(folder, exist_ok=True)
# Written as bytes so the page is stored exactly as the server sent it.
with open(os.path.join(folder, "webometrics_ranking_2019.html"), mode='wb') as file:
    file.write(html.content)

In [44]:
# Parse the cached ranking page and pick out the first <table> element.
# The file is opened in binary mode (it was saved as raw bytes) so that
# BeautifulSoup detects the document encoding itself instead of the read
# depending on the platform's default text encoding.
with open(os.path.join(folder, "webometrics_ranking_2019.html"), mode='rb') as file:
    soup = BeautifulSoup(file, 'lxml')

table = soup.find('table')
if table is None:
    # Surface a clear error here rather than an AttributeError in a later cell.
    raise ValueError("no <table> element found in the cached ranking page")

In [46]:
# Turn every data row of the ranking table into a record, then build a DataFrame.
rows = table.find_all('tr')
sub = rows[1:]  # drop the first row (presumably the header row -- confirm against the page)

top = []
for row in sub:
    # Look the cells up once per row instead of re-scanning the row for every field.
    cells = row.find_all('td')
    # Bound to `website` (not `url`) so the download URL defined in the
    # fetch cell is not overwritten as a side effect of this loop.
    website = row.find_all('a')[0]['href']

    top.append({
        'ranking': int(cells[0].text),
        'world_rank': int(cells[1].text),
        'universities': cells[2].text,
        'website': website,
        # Cell index 3 is intentionally not read as text; the link is taken from the <a> tag.
        # NOTE(review): the four sub-ranks below are kept as scraped text, unlike
        # the two integer ranks above -- convert if numeric operations are needed.
        'presence_rank': cells[4].text,
        'impact_rank': cells[5].text,
        'openness_rank': cells[6].text,
        'excellence_rank': cells[7].text,
    })

col = ['ranking', 'world_rank', 'universities', 'website', 'presence_rank',
       'impact_rank', 'openness_rank', 'excellence_rank']
df = pd.DataFrame(top, columns=col)

In [47]:
# Display the scraped rankings table (the rendered output elides the middle rows).
df

Unnamed: 0,ranking,world_rank,universities,website,presence_rank,impact_rank,openness_rank,excellence_rank
0,1,1322,University of Ibadan,https://www.ui.edu.ng/,2113,2088,1057,1561
1,2,1742,Covenant University Ota,http://covenantuniversity.edu.ng/,1169,3884,1356,1797
2,3,1805,University of Nigeria,http://www.unn.edu.ng/,1311,3279,1038,2243
3,4,1984,University of Lagos,https://unilag.edu.ng/,161,4143,1521,2312
4,5,2053,Obafemi Awolowo University,http://oauife.edu.ng/,2916,4560,1616,2025
...,...,...,...,...,...,...,...,...
95,96,14054,Yobe State University (Bukar Abba Ibrahim Univ...,https://www.ysu.edu.ng/,17040,15459,6490,6084
96,97,14091,Taraba State University Jalingo,http://www.tsuniversity.edu.ng/,18418,14883,6690,6084
97,98,14110,Ondo State University of Science & Technology ...,http://www.osustech.edu.ng/,23528,12868,7168,6084
98,99,14347,Anchor University Lagos,https://aul.edu.ng/,9304,17841,5779,6084


In [124]:
# Wikipedia article URLs whose infoboxes are fetched by the next cell.
# Only the University of Lagos entry is active; the others are disabled:
#   https://en.wikipedia.org/wiki/University_of_Ibadan
#   https://en.wikipedia.org/wiki/Covenant_University
#   https://en.wikipedia.org/wiki/University_of_Nigeria
top_100_urls = ['https://en.wikipedia.org/wiki/University_of_Lagos']

In [125]:
# Fetch each university's Wikipedia infobox with wptools and store the results
# as JSON Lines (one JSON object per line) for the parsing cell to read back.
# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
with open(os.path.join(folder, 'wikipedia_info.txt'), 'w') as opened_file:
    for url in top_100_urls:
        # The page title is the last path segment; rstrip guards against a
        # trailing slash producing an empty title.
        title = url.rstrip('/').split('/')[-1]
        page = wp.page(title, silent=True)
        pg = page.get()
        all_info = pg.data.get('infobox')
        if not all_info:
            # Writing `null` here would make the reader cell fail on its first
            # key lookup, so pages without an infobox are reported and skipped.
            print("no infobox found for %s; skipped" % url)
            continue

        json.dump(all_info, opened_file)
        opened_file.write('\n')

print("%s seconds"%(time.perf_counter() - start))

10.620264768600464 seconds


In [123]:
def _strip_wikilink(value):
    """Strip leading/trailing '[' and ']' from an infobox value; pass non-strings through."""
    return value.strip("[]") if isinstance(value, str) else value


# Read the JSON Lines file of infoboxes back and keep the fields of interest.
# Infoboxes differ between articles, so every field is read with .get():
# a missing field becomes None instead of aborting the loop with a KeyError.
info = []
with open(os.path.join(folder, 'wikipedia_info.txt'), 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines
        # `or {}` covers a line holding JSON null (a page stored without an infobox).
        each_uni = json.loads(line) or {}

        info.append({
            'name': each_uni.get('name'),
            'motto': each_uni.get('motto'),
            'established': each_uni.get('established'),
            # These fields are stripped of surrounding square brackets,
            # e.g. '[[Nsukka]]' -> 'Nsukka'.
            'type': _strip_wikilink(each_uni.get('type')),
            'chancellor': _strip_wikilink(each_uni.get('chancellor')),
            'vice_chancellor': _strip_wikilink(each_uni.get('vice_chancellor')),
            'students': each_uni.get('students'),
            'undergrad': each_uni.get('undergrad'),
            'postgrad': each_uni.get('postgrad'),
            'academic_staff': each_uni.get('academic_staff'),
            'administrative_staff': each_uni.get('administrative_staff'),
            'city': _strip_wikilink(each_uni.get('city')),
            'state': _strip_wikilink(each_uni.get('state')),
            'campus': each_uni.get('campus'),
        })

In [82]:
# Inspect the raw infobox dict of the most recently fetched page.
# NOTE(review): this relies on `pg` left over from the fetch loop and rebinds
# `info` (a list in the parsing cell) to a dict, so cell execution order matters.
info = pg.data['infobox']
info

{'name': 'University of Nigeria',
 'native_name': 'Nsukka',
 'image': 'UNN Fountain.png',
 'motto': "''To Restore the Dignity of Man''",
 'established': '1955',
 'type': '[[public university|Public]]',
 'vice_chancellor': '[[Charles Igwe Arizechukwu]]',
 'students': '36,000',
 'city': '[[Nsukka]]',
 'state': '[[Enugu state|Enugu]]',
 'country': '[[Nigeria]]',
 'campus': 'Rural<br /> {{convert|871|ha|acre}} (Nsukka campus)<br />Urban<br /> {{convert|200|ha|acre}} (Enugu campus)<br/> {{convert|500|ha|acre}} (Ituku-Ozalla campus)',
 'former_names': '{{plainlist|\n* University of Nigeria (1960 - 1967)\n* University of Biafra (6 July 1967 – 15 January 1970)}}',
 'nickname': 'Lions and Lionesses',
 'mascot': 'Lion',
 'website': '[https://unn.edu.ng/ unn.edu.ng]',
 'coor': '{{coord|6|51|24|N|7|23|45|E|type:edu|format|=|dms|display|=|inline}}',
 'founder': '[[Nnamdi Azikiwe]]',
 'faculty': '1,519',
 'colors': 'Green and white<br/> {{color box|#008000}} &nbsp; {{color box|#FFFFFF}}'}

In [80]:
# Pull a few fields out of the infobox for a quick look; `info` is indexed
# by string key here, so it must be the infobox dict rather than a list.
name, motto, est = (info[key] for key in ('name', 'motto', 'established'))
est

'1955'