In [1]:
import pandas as pd
import pymongo
import json


### Extract and Clean Data

#### Population of Cities

In [2]:
# Page that lists world cities together with their populations
cities_url = "https://worldpopulationreview.com/world-cities"

# `read_html` yields one dataframe per table found at the url; the first
# one is taken, then tidied in a single chain.
df_cityPop = (
    pd.read_html(cities_url, header=0)[0]
    # shorten the headers: 'Name' -> 'City', population columns -> bare year
    .rename(columns={
        'Name': 'City',
        '2020 Population': '2020',
        '2019 Population': '2019',
    })
    # every missing value (in any column) is replaced with 0
    .fillna(0)
)
df_cityPop

Unnamed: 0,Rank,City,Country,2020,2019,Change
0,1,Tokyo,Japan,37393128,37435192.0,-0.11%
1,2,Delhi,India,30290936,29399140.0,3.03%
2,3,Shanghai,China,27058480,26317104.0,2.82%
3,4,Sao Paulo,Brazil,22043028,21846508.0,0.90%
4,5,Mexico City,Mexico,21782378,21671908.0,0.51%
...,...,...,...,...,...,...
1140,1141,Tanta,Egypt,501423,0.0,0
1141,1142,Sacramento,United States,500930,500930.0,0.00%
1142,1143,Chuxiong,China,500867,0.0,0
1143,1144,Douai Lens,France,500776,501078.0,-0.06%


#### Live Population Data Scraped

In [3]:
# url to scrape for the Live population data
countries_url = "https://worldpopulationreview.com"
# Use pandas' `read_html` to parse the url and take the first table on the page
df_LatestPop = pd.read_html(countries_url, header=0)[0]
# Eliminate unnecessary data. `.copy()` makes the selection an independent
# frame, so the column assignments below do not write into a slice of the
# scraped table (which raises SettingWithCopyWarning).
df_LatestPop = df_LatestPop.iloc[:, [1, 2, 5, 6, 7, 8]].copy()
# rename the columns
df_LatestPop = df_LatestPop.rename(columns={
    '2019 Density': '2019 Density_PerSqKm',
    'Growth Rate': 'Growth_Percentage',
    'World %': 'World_Percentage',
})
# Keep only the text before the unit marker in each column ('/' for the
# density, '%' for the two percentages).
# The previous `.str.rsplit(sep, 0)` passed `n` positionally, which pandas 2.x
# rejects; `n=0` meant "split everywhere", so `split(sep)` followed by
# `.str.get(0)` selects the same leading piece.
for column, separator in (
    ('2019 Density_PerSqKm', '/'),
    ('Growth_Percentage', '%'),
    ('World_Percentage', '%'),
):
    df_LatestPop[column] = df_LatestPop[column].str.split(separator).str.get(0)
df_LatestPop

Unnamed: 0,Country,2021 (Live),2019 Density_PerSqKm,Growth_Percentage,World_Percentage,Rank
0,China,1442166775,148,0.39,18.47,1
1,India,1387177258,420,0.99,17.70,2
2,United States,332041150,35,0.59,4.25,3
3,Indonesia,275056196,144,1.07,3.51,4
4,Pakistan,223154088,250,2.00,2.83,5
...,...,...,...,...,...,...
227,Montserrat,4992,49,0.06,0.00,228
228,Falkland Islands,3480,0,3.05,0.00,229
229,Niue,1626,6,0.68,0.00,230
230,Tokelau,1357,113,1.27,0.00,231


#### Population of Countries

In [4]:
# Country populations, exported as csv from https://worldpopulationreview.com
# NOTE(review): the displayed figures look like thousands of people - confirm
# against the source before combining with other datasets.

# Map the csv headers to the names used in the rest of the notebook:
# 'name' -> 'Country' and 'pop<year>' -> '<year>'
column_names = {'name': 'Country'}
for year in ('2020', '2019', '2015', '2010', '2000', '1990'):
    column_names['pop' + year] = year

df_countries = (
    pd.read_csv('static/data/csvData.csv')
    # keep only the columns at these positions; the rest is not needed
    .iloc[:, [1, 2, 3, 6, 7, 8, 9]]
    .rename(columns=column_names)
)
df_countries

Unnamed: 0,Country,2020,2019,2015,2010,2000,1990
0,China,1439323.776,1433783.686,1406847.870,1368810.615,1290550.765,1176883.674
1,India,1380004.385,1366417.754,1310152.403,1234281.170,1056575.549,873277.798
2,United States,331002.651,329064.917,320878.310,309011.475,281710.909,252120.309
3,Indonesia,273523.615,270625.568,258383.256,241834.215,211513.823,181413.402
4,Pakistan,220892.340,216565.318,199426.964,179424.641,142343.578,107647.921
...,...,...,...,...,...,...,...
227,Montserrat,4.992,4.989,4.967,4.899,4.929,10.615
228,Falkland Islands,3.480,3.377,2.834,2.901,2.892,1.982
229,Niue,1.626,1.615,1.619,1.618,1.899,2.329
230,Tokelau,1.357,1.340,1.252,1.140,1.554,1.608


In [5]:
# Another Dataset
# Population data downloaded as csv from https://datacatalog.worldbank.org;
# the raw file is loaded here and cleaned in the steps that follow.
df_population = pd.read_csv('static/data/population.csv')

# Positional indexes of the rows to keep: 0 through 216, plus row 263.
# NOTE(review): presumably the individual-country rows plus one aggregate
# row of population.csv - confirm against the file.
row_list = [*range(217), 263]

# Helper that trims a raw dataframe down to the rows/columns of interest
def clean_dataFrames(df):
    """Return a trimmed, relabelled copy of ``df``.

    Keeps the rows whose positions are listed in the module-level
    ``row_list`` and the columns at positions 2, 11, 12 and 13. Every column
    label then loses its last nine characters, and the first remaining
    column is named "Country".

    NOTE(review): the nine-character trim presumably removes a suffix such
    as " [YR2016]" from the year columns - confirm against population.csv.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least 14 columns and enough rows for ``row_list``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # positional selection of the wanted rows and columns
    selected = df.iloc[row_list, [2, 11, 12, 13]]
    # cut the trailing nine characters off every column label
    shortened = selected.rename(columns=lambda label: str(label)[:-9])
    # whatever the first label was trimmed to, call it "Country"
    first_label = shortened.columns[0]
    return shortened.rename(columns={first_label: "Country"})

# Calling clean_dataFrames function passing the dataframe as parameter
df_population = clean_dataFrames(df_population)

# Countries that have records in df_countries but not in df_population
mismatch_df = df_countries[~df_countries.Country.isin(df_population.Country)]

# Where a mismatched df_countries name is a substring of a df_population name,
# overwrite the df_population name with the df_countries spelling.
# Assignments go through `df.loc[mask, column]`: the previous chained form
# (`df["Country"].loc[mask] = value`) writes via an intermediate Series, which
# pandas warns about and which no longer updates the frame under Copy-on-Write.
for country in mismatch_df['Country']:
    # regex=False: the name is matched as literal text, not as a pattern;
    # na=False: rows with a missing name simply do not match
    contains_name = df_population['Country'].str.contains(country, regex=False, na=False)
    df_population.loc[contains_name, 'Country'] = country

# Remaining df_population names that need an explicit df_countries equivalent
country_aliases = {
    "Congo, Dem. Rep.": "DR Congo",
    "Congo, Rep.": "Republic of the Congo",
    "Korea, Rep.": "South Korea",
    "Korea, Dem. People’s Rep.": "North Korea",
    "Cote d'Ivoire": "Ivory Coast",
    "Lao PDR": "Laos",
    "Macao SAR, China": "Macau",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Slovak Republic": "Slovakia",
    "Eswatini": "Swaziland",
    "Cabo Verde": "Cape Verde",
    "St. Lucia": "Saint Lucia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "Virgin Islands (U.S.)": "United States Virgin Islands",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "St. Martin (French part)": "Saint Martin",
}
for old_name, new_name in country_aliases.items():
    df_population.loc[df_population['Country'] == old_name, 'Country'] = new_name

# Recompute the mismatch: the rows left are countries with no df_population record
mismatch_df = df_countries[~df_countries.Country.isin(df_population.Country)]

mismatch_df

Unnamed: 0,Country,2020,2019,2015,2010,2000,1990
55,Taiwan,23816.775,23773.876,23557.477,23187.551,21966.527,20478.52
120,Palestine,5101.414,4981.42,4529.166,4055.631,3224.003,2101.446
161,Reunion,895.312,888.927,863.363,830.519,736.71,610.582
169,Western Sahara,597.339,582.463,526.216,480.274,314.118,217.258
175,Guadeloupe,400.124,400.056,400.255,406.071,422.051,389.249
178,Martinique,375.265,375.554,378.478,394.663,387.004,358.449
181,French Guiana,298.682,290.832,260.999,233.002,163.165,115.784
185,Mayotte,272.815,266.15,240.02,208.718,150.331,94.784
220,Cook Islands,17.564,17.548,17.586,18.391,17.93,18.191
221,Anguilla,15.003,14.869,14.279,13.438,11.252,8.899


In [6]:
# Add the extra year columns from df_population to df_countries.
# NOTE(review): in the displayed output the 2016-2018 columns hold absolute
# head counts while the other year columns look like thousands - the units
# are not reconciled here; confirm how consumers of this table handle that.
df_countries = (
    df_countries
    # left join: every df_countries row is kept, matched or not
    .merge(df_population, on="Country", how="left")
    # positional reorder so the year columns run from newest to oldest
    .iloc[:, [0, 1, 2, 9, 8, 7, 3, 4, 5, 6]]
    # countries without a df_population match get 0 instead of null
    .fillna(0)
)

df_countries

Unnamed: 0,Country,2020,2019,2018,2017,2016,2015,2010,2000,1990
0,China,1439323.776,1433783.686,1392730000,1386395000,1378665000,1406847.870,1368810.615,1290550.765,1176883.674
1,India,1380004.385,1366417.754,1352617328,1338658835,1324509589,1310152.403,1234281.170,1056575.549,873277.798
2,United States,331002.651,329064.917,326687501,324985539,322941311,320878.310,309011.475,281710.909,252120.309
3,Indonesia,273523.615,270625.568,267663435,264645886,261554226,258383.256,241834.215,211513.823,181413.402
4,Pakistan,220892.340,216565.318,212215030,207896686,203627284,199426.964,179424.641,142343.578,107647.921
...,...,...,...,...,...,...,...,...,...,...
227,Montserrat,4.992,4.989,0,0,0,4.967,4.899,4.929,10.615
228,Falkland Islands,3.480,3.377,0,0,0,2.834,2.901,2.892,1.982
229,Niue,1.626,1.615,0,0,0,1.619,1.618,1.899,2.329
230,Tokelau,1.357,1.340,0,0,0,1.252,1.140,1.554,1.608


### Load Data into MongoDB

In [7]:
# Loading Data into MongoDB
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

db_name = "populationDB"
# Drop the database if it already exists, so each run starts from empty
# collections. `list_database_names()` replaces `database_names()`, which is
# deprecated and no longer available in PyMongo 4.
if db_name in client.list_database_names():
    client.drop_database(db_name)

# Handles for the database and its three collections
# (MongoDB creates them on first insert)
db = client[db_name]
countriesPop = db["countriesPopulation"]
citiesPop = db["citiesPopulation"]
latestPop = db["latestPopulation"]


# Function to insert Dataframes into mongodb collections
def insertToDB(df, collection):
    """Store a DataFrame as one document in a MongoDB collection.

    The frame's index is turned into a regular column, the rows are
    converted to a list of dicts, and a single document of the form
    ``{"index": "populationData", "data": [<row dicts>]}`` is inserted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store. It is not modified, so the function can be called
        repeatedly with the same frame.
    collection
        Object exposing ``insert_one(document)``, e.g. a pymongo collection.
    """
    # reset_index() without inplace returns a new frame; the previous
    # inplace call added an "index" column to the caller's DataFrame and
    # broke a second call on the same frame.
    records = df.reset_index().to_dict("records")
    collection.insert_one({"index": "populationData", "data": records})
    
# Write each prepared dataframe into its own collection
for frame, target in (
    (df_countries, countriesPop),
    (df_cityPop, citiesPop),
    (df_LatestPop, latestPop),
):
    insertToDB(frame, target)

# Show which collections the database now contains
print(db.list_collection_names())

  import sys


['citiesPopulation', 'latestPopulation', 'countriesPopulation']
