### Import the required packages


In [15]:
import mechanicalsoup as soup
import pandas as pd

### Scrape the elevation from the url


In [16]:
# Wikipedia page listing countries by average elevation

url = "https://en.m.wikipedia.org/wiki/List_of_countries_by_average_elevation"

# Create a browser object

browser = soup.StatefulBrowser()

# Open the url and stop early on an HTTP error status, so that an error page
# is never parsed as if it were the article

response = browser.open(url)
response.raise_for_status()

# Get the parsed page

page = browser.get_current_page()

# Get the first table tag with the class 'wikitable sortable'

table = page.find("table", class_="wikitable sortable")

# find() returns None when nothing matches; fail with a clear message here
# instead of an AttributeError on the find_all() call below

if table is None:
    raise ValueError(f"No 'wikitable sortable' table found at {url}")

# Get the rows using the tr tag

rows = table.find_all("tr")

### Inspect the Scraped Data


In [17]:
# Print the text content of every scraped row, header included

for row in rows:
    print(row.text)

Country

Elevation

  Afghanistan
1,884 m (6,181 ft)[2]
  Albania
708 m (2,323 ft)[3]
  Algeria
800 m (2,625 ft)

  Andorra
1,996 m (6,549 ft)[2]
  Angola
1,112 m (3,648 ft)

  Antarctica
2,300 m (7,546 ft)[2]
  Argentina
595 m (1,952 ft)[4]
  Armenia
1,792 m (5,879 ft)

  Australia
330 m (1,083 ft)

  Austria
910 m (2,986 ft)

  Azerbaijan
384 m (1,260 ft)[5]
  Bangladesh
85 m (279 ft)[6]
  Belarus
170 m (558 ft)[6]
  Belgium
181 m (594 ft)[6]
  Belize
173 m (568 ft)[4]
  Benin
273 m (896 ft)[6]
  Bhutan
3,280 m (10,761 ft)[2]
  Bolivia
1,192 m (3,911 ft)[4]
  Bosnia and Herzegovina
500 m (1,640 ft)[6]
  Botswana
1,013 m (3,323 ft)[6]
  Brazil
320 m (1,050 ft)[4]
  Brunei
478 m (1,568 ft)[6]
  Bulgaria
470 m (1,542 ft)

  Burkina Faso
297 m (974 ft)[6]
  Burundi
1,504 m (4,934 ft)[6]
  Cambodia
126 m (413 ft)[6]
  Cameroon
667 m (2,188 ft)[6]
  Canada
487 m (1,598 ft)[6]
  Central African Republic
635 m (2,083 ft)[6]
  Chad
543 m (1,781 ft)[6]
  Chile
1,871 m (6,138 ft)[4]
  China
1,8

In [18]:
# Row 0 is the header: the column titles as one string, separated by "\n"

print(rows[0].get_text())

# Row 1 is the first data row: the Country name, then the average elevation
# (in "m" and "ft"), again separated by "\n"

print(rows[1].get_text())

Country

Elevation

  Afghanistan
1,884 m (6,181 ft)[2]


### Load the scraped data into a dataframe


In [19]:
# Collect the text of every data row (skipping the header at index 0) and
# build the dataframe in a single call, rather than growing it with one
# pd.concat per row

df = pd.DataFrame(
    {"Country-elevations": [row.text for row in rows[1:]]},
    columns=["Country-elevations"],
)

df.head(1)

Unnamed: 0,Country-elevations
0,"Afghanistan\n1,884 m (6,181 ft)[2]"


### Data Cleaning


#### Clean the Elevation Column

In [20]:
# Each scraped string holds the country and its elevation separated by "\n",
# so split on the newline once and reuse the pieces for both new columns

country_elevation_parts = df["Country-elevations"].str.split("\n")

# 1. The first piece is the Country name

df["Country"] = country_elevation_parts.str[0]

# 2. The second piece is the average elevation; a row that has no second
#    piece gets NaN here rather than raising IndexError

df["Average-elevation"] = country_elevation_parts.str[1]

df.head(1)

Unnamed: 0,Country-elevations,Country,Average-elevation
0,"Afghanistan\n1,884 m (6,181 ft)[2]",Afghanistan,"1,884 m (6,181 ft)[2]"


In [21]:
# 3. Keep only the text in front of the first "m" of the Average-elevation
#    string and trim the whitespace left at its end, e.g.
#    "1,884 m (6,181 ft)[2]" -> "1,884"

df["Numeric-Elevation"] = (
    df["Average-elevation"]
    .str.split("m", n=1)
    .str.get(0)
    .str.rstrip()
)

df.head(1)

Unnamed: 0,Country-elevations,Country,Average-elevation,Numeric-Elevation
0,"Afghanistan\n1,884 m (6,181 ft)[2]",Afghanistan,"1,884 m (6,181 ft)[2]",1884


In [22]:
# 4. Remove the "," separators from the numeric-elevation strings, then cast
#    the column to float
elevation_without_commas = df["Numeric-Elevation"].str.replace(",", "")

df["Numeric-Elevation"] = elevation_without_commas.astype(float)

df.head(1)

Unnamed: 0,Country-elevations,Country,Average-elevation,Numeric-Elevation
0,"Afghanistan\n1,884 m (6,181 ft)[2]",Afghanistan,"1,884 m (6,181 ft)[2]",1884.0


In [23]:
# Drop the raw 'Country-elevations' text and the intermediate
# 'Average-elevation' string. drop() returns a new frame, which is rebound to
# df instead of mutating the existing object with inplace=True

df = df.drop(columns=["Country-elevations", "Average-elevation"])

df.head(1)

Unnamed: 0,Country,Numeric-Elevation
0,Afghanistan,1884.0


#### Clean the Country Column

In [24]:
# Look at the Country string stored at index label 0

df.loc[0, "Country"]

'\xa0\xa0Afghanistan'

In [25]:
# Strip the whitespace from both ends of every Country string

stripped_country = df["Country"].str.strip()
df["Country"] = stripped_country

# Look at the Country string at index label 0 again

df.loc[0, "Country"]

'Afghanistan'

### Save the dataframe as a CSV file

In [26]:
# Write the cleaned dataframe to a CSV file, leaving out the index

df.to_csv(path_or_buf="clean-elevation-data.csv", index=False)

In [27]:
# Show the row(s) whose Country is exactly "Cyprus"

df.loc[df["Country"].eq("Cyprus")]

Unnamed: 0,Country,Numeric-Elevation
36,Cyprus,91.0


In [29]:
# Read AirQuality_RenewableEnergy_data.csv (relative path) into a dataframe
df_airquality = pd.read_csv(
    filepath_or_buffer="AirQuality_RenewableEnergy_data.csv"
)

In [30]:
# For each row of df_airquality, flag whether its Country also appears in the
# Country column of the elevation dataframe
elevation_countries = df["Country"]
df_airquality["Country"].isin(elevation_countries)

0      True
1      True
2      True
3      True
4      True
5      True
6     False
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17    False
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32    False
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42     True
43    False
44     True
45     True
46     True
47     True
48     True
49     True
50     True
51     True
Name: Country, dtype: bool

In [31]:
# Country at index label 6 of df_airquality
df_airquality.loc[6, "Country"]

'European Union'

In [32]:
# Country at index label 17 of df_airquality
df_airquality.loc[17, "Country"]

'Malta'

In [33]:
# Show the row(s) of the elevation dataframe whose Country is exactly "Malta"
df.loc[df["Country"].eq("Malta")]

Unnamed: 0,Country,Numeric-Elevation
