In [72]:
import pandas as pd
import pymongo
import json
import math


# Let pandas render up to 300 rows before truncating a displayed frame
pd.options.display.max_rows = 300

### Extract and Clean Data

#### Population of Cities

In [43]:
# Page that lists city populations
cities_url = "https://worldpopulationreview.com/world-cities"

# `read_html` returns one dataframe per table found on the page; the first
# table is taken, its population/name headers are shortened, and missing
# cells are replaced with 0.
df_cityPop = (
    pd.read_html(cities_url, header=0)[0]
    .rename(columns={
        'Name': 'City',
        '2020 Population': '2020',
        '2019 Population': '2019',
    })
    .fillna(0)
)
df_cityPop

Unnamed: 0,Rank,City,Country,2020,2019,Change
0,1,Tokyo,Japan,37393128,37435192.0,-0.11%
1,2,Delhi,India,30290936,29399140.0,3.03%
2,3,Shanghai,China,27058480,26317104.0,2.82%
3,4,Sao Paulo,Brazil,22043028,21846508.0,0.90%
4,5,Mexico City,Mexico,21782378,21671908.0,0.51%
...,...,...,...,...,...,...
1140,1141,Tanta,Egypt,501423,0.0,0
1141,1142,Sacramento,United States,500930,500930.0,0.00%
1142,1143,Chuxiong,China,500867,0.0,0
1143,1144,Douai Lens,France,500776,501078.0,-0.06%


#### Live Population Data Scraped

In [44]:
# Page that carries the live population table
countries_url = "https://worldpopulationreview.com"

# `read_html` returns one dataframe per table found on the page; use the first
df_LatestPop = pd.read_html(countries_url, header=0)[0]

# Keep only the needed columns (selected by position). The explicit copy makes
# the column assignments below act on an independent frame instead of a slice.
df_LatestPop = df_LatestPop.iloc[:, [1, 2, 5, 6, 7, 8]].copy()

# Rename the columns
df_LatestPop = df_LatestPop.rename(columns={
    '2019 Density': 'Density_PerSqKm',
    'Growth Rate': 'Growth_Percentage',
    'World %': 'World_Percentage',
})

# Convert the string values to numbers.
# Everything from the first '/' (density) or '%' (percentages) onward is
# discarded. `n` is no longer passed positionally to the string splitter
# because recent pandas releases accept it as a keyword only, and the comma
# removal is an explicit literal (non-regex) replacement.
df_LatestPop['Density_PerSqKm'] = pd.to_numeric(
    df_LatestPop['Density_PerSqKm']
    .str.split('/').str.get(0)
    .str.replace(',', '', regex=False)
)
df_LatestPop['Growth_Percentage'] = pd.to_numeric(
    df_LatestPop['Growth_Percentage'].str.split('%').str.get(0)
)
df_LatestPop['World_Percentage'] = pd.to_numeric(
    df_LatestPop['World_Percentage'].str.split('%').str.get(0)
)
df_LatestPop

Unnamed: 0,Country,2021 (Live),Density_PerSqKm,Growth_Percentage,World_Percentage,Rank
0,China,1442166775,148,0.39,18.47,1
1,India,1387177258,420,0.99,17.70,2
2,United States,332041150,35,0.59,4.25,3
3,Indonesia,275056196,144,1.07,3.51,4
4,Pakistan,223154088,250,2.00,2.83,5
...,...,...,...,...,...,...
227,Montserrat,4992,49,0.06,0.00,228
228,Falkland Islands,3480,0,3.05,0.00,229
229,Niue,1626,6,0.68,0.00,230
230,Tokelau,1357,113,1.27,0.00,231


#### Population of Countries

In [119]:
# Read the countries population data from csv (source: https://worldpopulationreview.com)
df_countries = pd.read_csv('static/data/csvData.csv')

# Keep only the needed columns (selected by position). The explicit copy makes
# the column assignments below act on an independent frame instead of a slice.
df_countries = df_countries.iloc[:, [1, 2, 3, 6, 7, 8, 9]].copy()

# Rename the columns
df_countries = df_countries.rename(columns={
    'name': 'Country',
    'pop2020': '2020',
    'pop2019': '2019',
    'pop2015': '2015',
    'pop2010': '2010',
    'pop2000': '2000',
    'pop1990': '1990',
})


def _drop_decimal_point(value):
    """Return `value` as an int with its decimal point moved three places right.

    The value is rendered as text, split at the first '.', and the first
    three fractional digits (right-padded with zeros) are appended to the
    whole part, e.g. 1439323.776 -> 1439323776 and 12.5 -> 12500. Digits
    beyond the third are truncated, not rounded. A value without any decimal
    point (for instance from an integer-typed column) is treated as having
    an all-zero fraction instead of raising IndexError.

    NOTE(review): missing values (NaN) are not handled and still fail at the
    final int() conversion - confirm the csv never contains them.
    """
    whole, _, fraction = str(value).partition(".")
    return int(whole + fraction[:3].ljust(3, "0"))


# Remove the decimal point from every column except the country name
for col in df_countries.columns:
    if col != "Country":
        df_countries[col] = df_countries[col].map(_drop_decimal_point)

df_countries

Unnamed: 0,Country,2020,2019,2015,2010,2000,1990
0,China,1439323776,1433783686,1406847870,1368810615,1290550765,1176883674
1,India,1380004385,1366417754,1310152403,1234281170,1056575549,873277798
2,United States,331002651,329064917,320878310,309011475,281710909,252120308
3,Indonesia,273523615,270625568,258383256,241834215,211513823,181413402
4,Pakistan,220892340,216565318,199426963,179424641,142343578,107647921
5,Brazil,212559417,211049527,204471768,195713635,174790340,149003223
6,Nigeria,206139589,200963599,181137448,158503197,122283850,95212450
7,Bangladesh,164689383,163046161,156256275,147575430,127657854,103171955
8,Russia,145934462,145872256,144985057,143479274,146404903,147531561
9,Mexico,128932753,127575529,121858258,114092963,98899845,83943132


In [132]:
# Second dataset: population csv downloaded from https://datacatalog.worldbank.org
df_population = pd.read_csv('static/data/population.csv')

# Row positions to keep from that csv: 0 through 216, plus row 263
row_list = list(range(217)) + [263]

# Function to clean a World Bank style dataframe
def clean_dataFrames(df, rows=None, cols=(2, 11, 12, 13)):
    """Trim a dataframe to selected rows/columns and shorten its headers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to trim; it is not modified.
    rows : sequence of int, optional
        Row positions to keep. When omitted, the module-level `row_list`
        is used, which is what the function always did before this
        parameter existed.
    cols : sequence of int, optional
        Column positions to keep. The first one is renamed to "Country".
        Defaults to the positions that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected cells. The last 9 characters of
        every column label are removed (e.g. "2018 [YR2018]" -> "2018"),
        and the first column is then named "Country".
    """
    if rows is None:
        rows = row_list  # fall back to the notebook-level selection
    # eliminating unnecessary data
    trimmed = df.iloc[list(rows), list(cols)]
    # strip the trailing 9 characters from each header
    trimmed = trimmed.rename(columns=lambda label: str(label)[:-9])
    # whatever is left of the first header is replaced by "Country"
    return trimmed.rename(columns={trimmed.columns[0]: "Country"})

# Trim the World Bank frame and shorten its headers
df_population = clean_dataFrames(df_population)

# Eritrea is removed before the numeric conversion below
# NOTE(review): presumably its row holds non-numeric placeholders - confirm.
df_population = df_population.drop(
    df_population.index[df_population['Country'] == 'Eritrea']
)

# Loop through the columns
for col in df_population.columns:
    # performing operations on columns other than Country column
    if col != "Country":
        # string -> float first, then float -> integer
        df_population[col] = df_population[col].astype(float).astype(int)

# Countries that have records in df_countries, but not in df_population
mismatch_df = df_countries[~df_countries.Country.isin(df_population.Country)]

# When a df_countries name is a substring of a df_population name, adopt the
# df_countries spelling. The name is matched literally (not as a regular
# expression) and the assignment uses a single .loc call on the frame:
# the chained form `df["Country"].loc[mask] = value` can end up writing to a
# temporary object and leave the frame unchanged.
for country in mismatch_df['Country']:
    contains_name = df_population['Country'].str.contains(country, regex=False, na=False)
    df_population.loc[contains_name, 'Country'] = country

# Remaining df_population names mapped to the df_countries spelling
country_renames = {
    "Congo, Dem. Rep.": "DR Congo",
    "Congo, Rep.": "Republic of the Congo",
    "Korea, Rep.": "South Korea",
    "Korea, Dem. People’s Rep.": "North Korea",
    "Cote d'Ivoire": "Ivory Coast",
    "Lao PDR": "Laos",
    "Macao SAR, China": "Macau",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Slovak Republic": "Slovakia",
    "Eswatini": "Swaziland",
    "Cabo Verde": "Cape Verde",
    "St. Lucia": "Saint Lucia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "Virgin Islands (U.S.)": "United States Virgin Islands",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "St. Martin (French part)": "Saint Martin",
}
df_population['Country'] = df_population['Country'].replace(country_renames)

# Countries that still have no counterpart after the renaming
mismatch_df = df_countries[~df_countries.Country.isin(df_population.Country)]

mismatch_df

Unnamed: 0,Country,2020,2019,2015,2010,2000,1990
55,Taiwan,23816775,23773876,23557477,23187551,21966527,20478520
120,Palestine,5101414,4981420,4529166,4055631,3224002,2101446
132,Eritrea,3546421,3497117,3342817,3170435,2292416,2258653
161,Reunion,895312,888927,863363,830518,736710,610582
169,Western Sahara,597339,582463,526216,480274,314118,217257
175,Guadeloupe,400124,400056,400255,406070,422051,389249
178,Martinique,375265,375554,378478,394663,387004,358449
181,French Guiana,298682,290832,260999,233002,163165,115783
185,Mayotte,272815,266150,240020,208718,150331,94783
220,Cook Islands,17564,17548,17586,18391,17930,18191


In [133]:
# Bring in the extra year columns from df_population (left join on Country,
# so every df_countries row is kept), put the columns into the desired
# positional order, and replace the gaps left by unmatched countries with 0.
df_countries = (
    df_countries
    .merge(df_population, how="left", on="Country")
    .iloc[:, [0, 1, 2, 9, 8, 7, 3, 4, 5, 6]]
    .fillna(0)
)

df_countries

Unnamed: 0,Country,2020,2019,2018,2017,2016,2015,2010,2000,1990
0,China,1439323776,1433783686,1392730000.0,1386395000.0,1378665000.0,1406847870,1368810615,1290550765,1176883674
1,India,1380004385,1366417754,1352617000.0,1338659000.0,1324510000.0,1310152403,1234281170,1056575549,873277798
2,United States,331002651,329064917,326687500.0,324985500.0,322941300.0,320878310,309011475,281710909,252120308
3,Indonesia,273523615,270625568,267663400.0,264645900.0,261554200.0,258383256,241834215,211513823,181413402
4,Pakistan,220892340,216565318,212215000.0,207896700.0,203627300.0,199426963,179424641,142343578,107647921
5,Brazil,212559417,211049527,209469300.0,207833800.0,206163100.0,204471768,195713635,174790340,149003223
6,Nigeria,206139589,200963599,195874700.0,190873300.0,185960300.0,181137448,158503197,122283850,95212450
7,Bangladesh,164689383,163046161,161356000.0,159670600.0,157970800.0,156256275,147575430,127657854,103171955
8,Russia,145934462,145872256,144477900.0,144496700.0,144342400.0,144985057,143479274,146404903,147531561
9,Mexico,128932753,127575529,126190800.0,124777300.0,123333400.0,121858258,114092963,98899845,83943132


#### Population Density

In [111]:
# Population density csv downloaded from https://datacatalog.worldbank.org,
# loaded as-is; the clean-up call below is left disabled.
df_density = pd.read_csv(filepath_or_buffer='static/data/populationDensity.csv')
# df_density =  clean_dataFrames(df_density, 14)
df_density

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],2000 [YR2000],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,Population density (people per sq. km of land ...,EN.POP.DNST,Afghanistan,AFG,19.0122047605919,31.8291103758846,46.1315029255889,47.7305639800263,49.4280381705113,51.1147780534877,52.7120715007812,54.1971142358239,55.5959930153479,56.937760009803,..,..
1,Population density (people per sq. km of land ...,EN.POP.DNST,Albania,ALB,119.946788321168,112.738211678832,106.02901459854,105.854051094891,105.660291970803,105.441751824818,105.135145985401,104.967189781022,104.870693430657,104.612262773723,..,..
2,Population density (people per sq. km of land ...,EN.POP.DNST,Algeria,DZA,10.8151473292635,13.0334272422683,15.3927145700202,15.6960402898721,16.0135581549623,16.3425424269652,16.680252672416,17.0259574932612,17.3777146120063,17.7300750711665,..,..
3,Population density (people per sq. km of land ...,EN.POP.DNST,American Samoa,ASM,236.735,289.105,278.795,278.335,278.565,278.955,279.06,278.705,278.1,277.325,..,..
4,Population density (people per sq. km of land ...,EN.POP.DNST,Andorra,AND,115.976595744681,139.127659574468,178.185106382979,175.376595744681,171.859574468085,168.53829787234,165.98085106383,164.46170212766,163.831914893617,163.842553191489,..,..
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,,,,,,,,,,,,,,,,
265,,,,,,,,,,,,,,,,
266,,,,,,,,,,,,,,,,
267,Data from database: World Development Indicators,,,,,,,,,,,,,,,


### Load Data into MongoDB

In [112]:
# Connect to the local MongoDB server
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

db_name = "populationDB"
# Start from scratch: drop the database when the server already has it
if db_name in client.list_database_names():
    client.drop_database(db_name)

# Handles for the database and its three collections
db = client.get_database(db_name)
countriesPop = db.get_collection("countriesPopulation")
citiesPop = db.get_collection("citiesPopulation")
latestPop = db.get_collection("latestPopulation")


# Function to insert Dataframes into mongodb collections
def insertToDB(df, collection):
    """Store a dataframe as a single document in `collection`.

    The frame's index is turned into a regular "index" column on a copy,
    the rows are converted to a list of dicts, and that list is inserted as
    one document of the form {"data": [row, row, ...]}.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store. It is left untouched; resetting the index in place
        would add an extra column to the caller's frame on every call and
        make a repeated call fail.
    collection : object with an `insert_one(document)` method
        Target collection.
    """
    records = df.reset_index().to_dict("records")  # one dict per row
    collection.insert_one({"data": records})


# Store every cleaned dataframe in its matching MongoDB collection
for frame, target in (
    (df_countries, countriesPop),
    (df_cityPop, citiesPop),
    (df_LatestPop, latestPop),
):
    insertToDB(frame, target)


# List the collections the database now holds
print(db.list_collection_names())

['citiesPopulation', 'countriesPopulation', 'latestPopulation']
