# Task 1 : WEB SCRAPING

### Importing packages

In [1]:
import pandas as pd

In [2]:
from bs4 import BeautifulSoup

In [3]:
import requests

### Scraping the webpage data

In [4]:
url = "https://www.citypopulation.de/en/nepal/mun/admin/"

In [5]:
page = requests.get(url)

In [6]:
soup = BeautifulSoup(page.text)

In [7]:
soup

<!DOCTYPE html>
<html lang="en">
<head itemscope="" itemtype="https://schema.org/WebPage">
<meta charset="utf-8"/>
<meta content="Nepal: Municipal Division (Districts and Local Units) with population statistics, charts and maps." itemprop="description" name="description"/>
<title itemprop="name">Nepal: Municipal Division (Districts and Local Units) - Population Statistics, Charts and Map</title>
<meta content="Thomas Brinkhoff: City Population, http://www.citypopulation.de" itemprop="author" name="author"/>
<link href="/favicon.ico" rel="shortcut icon"/>
<link href="http://creativecommons.org/licenses/by/4.0/" itemprop="license" rel="licence"/>
<script>var pagemode = 'adminpage'; var pagecat = 'admin'; var isAdmin = true; var pageid = 'nepal-mun-admin'; var pagelang = 'en'; var pagelabel = "Municipal Division"; var popDate = 'C 2021-11-25'; var popcolnum  = 3; var start_x = 84.132; var start_y = 28.398; var start_level = 7; var swap_width = 1560; var hor_percent = 45; var vert_percent 

### Scraping the tabular informations

In [8]:
table = soup.find_all('table')

In [9]:
table

[<table class="data" id="tl">
 <colgroup><col data-width="0"/><col data-width="0"/><col data-width="575"/><col data-width="705"/><col data-width="445"/><col data-width="380"/><col data-width="0"/><col style="visibility:collapse"/></colgroup>
 <thead>
 <tr><th class="rname" data-coltype="name" onclick="javascript:sort('tl',0,false)"><a href="javascript:sort('tl',0,false)">Name</a></th><th class="rstatus" data-coltype="status" onclick="javascript:sort('tl',1,false)"><a href="javascript:sort('tl',1,false)">Status</a></th><th class="rnative" data-coltype="othername" onclick="javascript:sort('tl',2,false)"><a href="javascript:sort('tl',2,false)">Transcription</a></th><th class="rnative" data-coltype="othername" onclick="javascript:sort('tl',3,false)"><a href="javascript:sort('tl',3,false)">Native</a></th><th class="rpop" data-coldate="2001-05-28" data-colhead="C 2001-05-28" data-coltype="pop" onclick="javascript:sort('tl',4,true)"><a href="javascript:sort('tl',4,true)">Population</a><br/><s

#### Title of the table

In [10]:
table_title = soup.find_all('th')[:7]

In [11]:
table_title

[<th class="rname" data-coltype="name" onclick="javascript:sort('tl',0,false)"><a href="javascript:sort('tl',0,false)">Name</a></th>,
 <th class="rstatus" data-coltype="status" onclick="javascript:sort('tl',1,false)"><a href="javascript:sort('tl',1,false)">Status</a></th>,
 <th class="rnative" data-coltype="othername" onclick="javascript:sort('tl',2,false)"><a href="javascript:sort('tl',2,false)">Transcription</a></th>,
 <th class="rnative" data-coltype="othername" onclick="javascript:sort('tl',3,false)"><a href="javascript:sort('tl',3,false)">Native</a></th>,
 <th class="rpop" data-coldate="2001-05-28" data-colhead="C 2001-05-28" data-coltype="pop" onclick="javascript:sort('tl',4,true)"><a href="javascript:sort('tl',4,true)">Population</a><br/><span class="unit">Census<br/>2001-05-28</span></th>,
 <th class="rpop" data-coldate="2011-06-22" data-colhead="C 2011-06-22" data-coltype="pop" onclick="javascript:sort('tl',5,true)"><a href="javascript:sort('tl',5,true)">Population</a><br/><sp

In [12]:
table_data = [title.text for title in table_title][0:7]
table_data

['Name',
 'Status',
 'Transcription',
 'Native',
 'PopulationCensus2001-05-28',
 'PopulationCensus2011-06-22',
 'PopulationCensus2021-11-25']

In [13]:
df = pd.DataFrame(columns = table_data)

In [14]:
df

Unnamed: 0,Name,Status,Transcription,Native,PopulationCensus2001-05-28,PopulationCensus2011-06-22,PopulationCensus2021-11-25


### District Informations

In [15]:
district_names_table = soup.find_all('tr', class_ = 'rname')
for dis_names in district_names_table:
    district_names = dis_names.find_all('td')[:7]
    individual_dis_names = [data.text for data in district_names]
    print(individual_dis_names)
    length = len(df)
    df.loc[length] = individual_dis_names

['Achham', 'District', 'Achāma', 'अछाम जिल्ला', '231,285', '257,477', '228,852']
['Arghakhanchi', 'District', 'Arghākhā̃cī', 'अर्घाखाँची जिल्ला', '208,391', '197,632', '177,086']
['Baglung', 'District', 'Bāgaluṅa', 'बागलुङ जिल्ला', '268,937', '268,613', '249,211']
['Baitadi', 'District', 'Baitaḍī', 'बैतडी जिल्ला', '234,418', '250,898', '242,157']
['Bajhang', 'District', 'Bajhāṅa', 'बझाङ जिल्ला', '167,026', '195,159', '189,085']
['Bajura', 'District', 'Bājurā', 'बाजुरा जिल्ला', '108,781', '134,912', '138,523']
['Banke', 'District', 'Bā̃kē', 'बाँके जिल्ला', '385,840', '491,313', '603,194']
['Bara', 'District', 'Bārā', 'बारा जिल्ला', '559,135', '687,708', '763,137']
['Bardiya', 'District', 'Bardiyā', 'बर्दिया जिल्ला', '382,649', '426,576', '459,900']
['Bhaktapur', 'District', 'Bhaktapura', 'भक्तपुर जिल्ला', '225,461', '304,651', '432,132']
['Bhojpur', 'District', 'Bhōjapura', 'भोजपुर जिल्ला', '203,018', '182,459', '157,923']
['Chitwan', 'District', 'Citavana', 'चितवन जिल्ला', '472,048', '

In [16]:
df

Unnamed: 0,Name,Status,Transcription,Native,PopulationCensus2001-05-28,PopulationCensus2011-06-22,PopulationCensus2021-11-25
0,Achham,District,Achāma,अछाम जिल्ला,231285,257477,228852
1,Arghakhanchi,District,Arghākhā̃cī,अर्घाखाँची जिल्ला,208391,197632,177086
2,Baglung,District,Bāgaluṅa,बागलुङ जिल्ला,268937,268613,249211
3,Baitadi,District,Baitaḍī,बैतडी जिल्ला,234418,250898,242157
4,Bajhang,District,Bajhāṅa,बझाङ जिल्ला,167026,195159,189085
...,...,...,...,...,...,...,...
72,Tanahun,District,Tanahũ,तनहुँ जिल्ला,315237,323288,321153
73,Taplejung,District,Tāplējuṅa,ताप्लेजुङ जिल्ला,134698,127461,120590
74,Tehrathum,District,Tēhrathuma,तेह्रथुम जिल्ला,113111,101577,88731
75,Udayapur,District,Udayapura,उदयपुर जिल्ला,287689,317532,340721


### Scraping the municipalities from the table

In [17]:
admin2_data = soup.find_all('tbody', class_ = 'admin2')
for admin2 in admin2_data:
    municipality_data = admin2.find_all('tr')
    for names in municipality_data:
        municipality_rows = names.find_all('td')
        municipality_names = [data.text for data in municipality_rows][:7]
        print(municipality_names)
        length = len(df)
        df.loc[length] = municipality_names
        
        
        
    # print(x_data)

['Bannigadhi Jaygadh', 'Rural Municipality', 'Bānnigaḍhī Jayagaḍha', 'बान्निगढी जयगढ गाउँपालिका', '17,102', '17,426', '13,519']
['Chaurpati', 'Rural Municipality', 'Caurapāṭī', 'चौरपाटी गाउँपालिका', '23,323', '25,215', '21,681']
['Dhakari', 'Rural Municipality', 'Ḍhakārī', 'ढकारी गाउँपालिका', '...', '21,570', '21,998']
['Kamalbazar', 'Municipality', 'Kamalabajāra', 'कमलबजार नगरपालिका', '...', '23,770', '21,032']
['Khaptad National Park', 'National Park', 'Khaptaḍa Rāṣṭriya Nikuñja', 'खप्तड राष्ट्रिय निकुञ्ज', '0', '0', '0']
['Mangalsen', 'Municipality', 'Maṅgalasēna', 'मंगलसेन नगरपालिका', '...', '33,191', '26,557']
['Mellekh', 'Rural Municipality', 'Mēllēkha', 'मेल्लेख गाउँपालिका', '21,219', '24,728', '22,785']
['Panchadewal Binayak', 'Municipality', 'Pañcadēvala Vināyaka', 'पञ्चदेवल विनायक नगरपालिका', '...', '27,495', '26,088']
['Ramaroshan', 'Rural Municipality', 'Rāmārōśana', 'रामारोशन गाउँपालिका', '20,749', '25,172', '23,600']
['Sanphebagar', 'Municipality', 'Sā̃phēbagara', 'साँफेब

In [18]:
df.head()

Unnamed: 0,Name,Status,Transcription,Native,PopulationCensus2001-05-28,PopulationCensus2011-06-22,PopulationCensus2021-11-25
0,Achham,District,Achāma,अछाम जिल्ला,231285,257477,228852
1,Arghakhanchi,District,Arghākhā̃cī,अर्घाखाँची जिल्ला,208391,197632,177086
2,Baglung,District,Bāgaluṅa,बागलुङ जिल्ला,268937,268613,249211
3,Baitadi,District,Baitaḍī,बैतडी जिल्ला,234418,250898,242157
4,Bajhang,District,Bajhāṅa,बझाङ जिल्ला,167026,195159,189085


In [19]:
df.tail()

Unnamed: 0,Name,Status,Transcription,Native,PopulationCensus2001-05-28,PopulationCensus2011-06-22,PopulationCensus2021-11-25
845,Banphikot,Rural Municipality,Bā̃phikōṭa,बाँफिकोट गाउँपालिका,...,18696,21033
846,Chaurjahari,Municipality,Caurajahārī,चौरजहारी नगरपालिका,...,27583,28956
847,Musikot,Municipality,Musikōṭa,मुसिकोट नगरपालिका,...,33882,34270
848,Sani Bheri,Rural Municipality,Sānī Bhērī,सानी भेरी गाउँपालिका,...,22194,24759
849,Triveni,Rural Municipality,Trivēṇī,त्रिवेणी गाउँपालिका,...,19404,20525


## Scraping the overall table data as it is

In [20]:
current_district = None
rows = []

get_body = soup.find_all('tbody')
for tbody in get_body:
    cls = tbody.get("class", [])

    if "admin1" in cls:
        district_row = tbody.find("tr", class_ = 'rname')
        district_names = district_row.find_all("td")

        current_district = district_names[0].get_text(strip=True)

        rows.append({
            "District": current_district,
            "Name": current_district,
            "Level": "District",
            "Status": district_names[1].get_text(strip=True),
            "PopulationCensus2001-05-28": district_names[4].get_text(strip=True),
            "PopulationCensus2011-06-22": district_names[5].get_text(strip=True),
            "PopulationCensus2021-11-25": district_names[6].get_text(strip=True),
        })

    elif "admin2" in cls:
        municipality_rows = tbody.find_all("tr")
        for names in municipality_rows:
            municipality_name = names.find_all("td")

            rows.append({
                "District": current_district,   
                "Name": municipality_name[0].get_text(strip=True),
                "Level": municipality_name[1].get_text(strip=True),
                "Status": municipality_name[1].get_text(strip=True),
                "PopulationCensus2001-05-28": municipality_name[4].get_text(strip=True),
                "PopulationCensus2011-06-22": municipality_name[5].get_text(strip=True),
                "PopulationCensus2021-11-25": municipality_name[6].get_text(strip=True),
            })

df = pd.DataFrame(rows)


In [21]:
df.tail(20)

Unnamed: 0,District,Name,Level,Status,PopulationCensus2001-05-28,PopulationCensus2011-06-22,PopulationCensus2021-11-25
830,Tehrathum,Menchhayayem,Rural Municipality,Rural Municipality,9694,8078,6678
831,Tehrathum,Myanglung,Municipality,Municipality,...,20337,18750
832,Tehrathum,Phedap,Rural Municipality,Rural Municipality,20596,17700,15169
833,Udayapur,Udayapur,District,District,287689,317532,340721
834,Udayapur,Belaka,Municipality,Municipality,38727,42450,51043
835,Udayapur,Chaudandigadhi,Municipality,Municipality,45023,48750,53631
836,Udayapur,Katari,Municipality,Municipality,50505,56545,59507
837,Udayapur,Koshi Tappu Wildlife Reserve,Wildlife Reserve,Wildlife Reserve,0,0,0
838,Udayapur,Limchungbung[Sunkoshi],Rural Municipality,Rural Municipality,13800,11992,9689
839,Udayapur,Rautamai,Rural Municipality,Rural Municipality,23113,23481,20324


### Cleaning Populations data

In [22]:
def clean_pop(x):
    return None if x == "..." else int(x.replace(",", ""))

df['PopulationCensus2001-05-28'] = df['PopulationCensus2001-05-28'].apply(clean_pop)
df['PopulationCensus2011-06-22'] = df['PopulationCensus2011-06-22'].apply(clean_pop)
df['PopulationCensus2021-11-25'] = df['PopulationCensus2021-11-25'].apply(clean_pop)


In [23]:
df.head(10)

Unnamed: 0,District,Name,Level,Status,PopulationCensus2001-05-28,PopulationCensus2011-06-22,PopulationCensus2021-11-25
0,Achham,Achham,District,District,231285.0,257477,228852
1,Achham,Bannigadhi Jaygadh,Rural Municipality,Rural Municipality,17102.0,17426,13519
2,Achham,Chaurpati,Rural Municipality,Rural Municipality,23323.0,25215,21681
3,Achham,Dhakari,Rural Municipality,Rural Municipality,,21570,21998
4,Achham,Kamalbazar,Municipality,Municipality,,23770,21032
5,Achham,Khaptad National Park,National Park,National Park,0.0,0,0
6,Achham,Mangalsen,Municipality,Municipality,,33191,26557
7,Achham,Mellekh,Rural Municipality,Rural Municipality,21219.0,24728,22785
8,Achham,Panchadewal Binayak,Municipality,Municipality,,27495,26088
9,Achham,Ramaroshan,Rural Municipality,Rural Municipality,20749.0,25172,23600


### Exporting tabular data to Excel file

In [25]:
df.to_excel("DetailedCityPopulation.xlsx", index=False)


#### We have successfully performed web scraping to extract valuable information from webpage