In [429]:
import pandas as pd
import pymongo
import json
import math


pd.set_option("display.max_rows", 300)

### Extract and Clean Data

#### Population of Cities

In [430]:
# url to scrape for the city population
cities_url ="https://worldpopulationreview.com/world-cities"
# Use panda's `read_html` to parse the url
df_cityPop = pd.read_html(cities_url, header=0)[0]
# rename the columns
df_cityPop.rename(columns={'Name':'City', 
                           '2020 Population':'2020',
                           '2019 Population':'2019'
                          },inplace=True)
# Replace null values with 0
df_cityPop.fillna(0,inplace = True)
df_cityPop.head()

Unnamed: 0,Rank,City,Country,2020,2019,Change
0,1,Tokyo,Japan,37393128,37435192.0,-0.11%
1,2,Delhi,India,30290936,29399140.0,3.03%
2,3,Shanghai,China,27058480,26317104.0,2.82%
3,4,Sao Paulo,Brazil,22043028,21846508.0,0.90%
4,5,Mexico City,Mexico,21782378,21671908.0,0.51%


#### Live Population Data Scraped

In [431]:
# url to scrape for the Live population data
countries_url ="https://worldpopulationreview.com"
# Use panda's `read_html` to parse the url
df_LatestPop = pd.read_html(countries_url, header=0)[0]
# eliminating unnessasary data
df_LatestPop = df_LatestPop.iloc[:,[1,2,5,6,7,8]]
# rename the columns
df_LatestPop.rename(columns={'2019 Density':'Density_PerSqKm', 
                             'Growth Rate':'Growth_Percentage', 
                             'World %':'World_Percentage'
                            },inplace=True)
                            
# Converting string values to numbers
df_LatestPop['Density_PerSqKm'] = pd.to_numeric(df_LatestPop['Density_PerSqKm'].str.rsplit('/', 0).str.get(0).str.replace(r',', ''))
df_LatestPop['Growth_Percentage'] = pd.to_numeric(df_LatestPop['Growth_Percentage'].str.rsplit('%', 0).str.get(0))
df_LatestPop['World_Percentage'] = pd.to_numeric(df_LatestPop['World_Percentage'].str.rsplit('%', 0).str.get(0))
df_LatestPop.head()

Unnamed: 0,Country,2021 (Live),Density_PerSqKm,Growth_Percentage,World_Percentage,Rank
0,China,1442166775,148,0.39,18.47,1
1,India,1387177258,420,0.99,17.7,2
2,United States,332041150,35,0.59,4.25,3
3,Indonesia,275056196,144,1.07,3.51,4
4,Pakistan,223154088,250,2.0,2.83,5


#### Scraping Country codes to merge datasets with

In [432]:
# url to scrape for ISO 3166 country codes Alpha-2 and Alpha-3 from www.iban.com
country_code_url ="https://www.iban.com/country-codes"
# Use panda's `read_html` to parse the url
df_countryCode = pd.read_html(country_code_url, header=0)[0]
# eliminating unnessasary data
df_countryCode = df_countryCode.iloc[:,[1,2]]
# rename the columns
df_countryCode.rename(columns={'Alpha-2 code':'Country_Code',
                               'Alpha-3 code':'Country_Code_3'
                              },inplace=True)
df_countryCode.head()

Unnamed: 0,Country_Code,Country_Code_3
0,AF,AFG
1,AX,ALA
2,AL,ALB
3,DZ,DZA
4,AS,ASM


#### Population of Countries

In [433]:
# read Countries population data from csv(source:https://worldpopulationreview.com) into dataframe
df_countries = pd.read_csv('static/data/csvData.csv')
# rename the columns
df_countries.rename(columns={'cca2':'Country_Code',
                             'name':'Country',
                             'pop2020':'2020',
                             'pop2019':'2019',
                             'pop2015':'2015',
                             'pop2010':'2010',
                             'pop2000':'2000',
                             'pop1990':'1990' 
                            },inplace=True)

# eliminating unnessasary data
df_countries = df_countries.iloc[:,[0,1,2,3,6,7,8,9]]

# Removing decimal point from data
# Loop through the columns
for col in df_countries:
    # performing operations on columns other than Country column
    if col not in ["Country_Code", "Country"]:
        df_countries[col] = df_countries[col].astype(str)  # Converting to string

        df_countries[col] = [x.split(".") for x in df_countries[col]]    # Split into 2 strings at the decimal point

        # concatenating both strings choosing only 3 digits from the second string(decimal part)
        df_countries[col] = [ x[0] + x[1][0:3] if len(x[1]) >= 3 \
                         else x[0] + x[1][0:3] + '0' if len(x[1]) == 2 \
                         else x[0] + x[1][0:3] + '00' \
                            for x in df_countries[col]]

        df_countries[col] = df_countries[col].astype(int)     # Converting back to number 

df_countries.head()

Unnamed: 0,Country_Code,Country,2020,2019,2015,2010,2000,1990
0,CN,China,1439323776,1433783686,1406847870,1368810615,1290550765,1176883674
1,IN,India,1380004385,1366417754,1310152403,1234281170,1056575549,873277798
2,US,United States,331002651,329064917,320878310,309011475,281710909,252120308
3,ID,Indonesia,273523615,270625568,258383256,241834215,211513823,181413402
4,PK,Pakistan,220892340,216565318,199426963,179424641,142343578,107647921


In [434]:
# Another Dataset
# Cleaning csv Population data from https://datacatalog.worldbank.org
# reading csv's into dataframes
df_population = pd.read_csv('static/data/population.csv')

# Function to Clean each dataframes
def clean_dataFrames(df, col_list):
    # eliminating unnecessary data
    df = df.iloc[0:217, col_list]
    # renaming columns
    df.rename(columns= {df.columns[0]: "Name"}, inplace = True)
    df = df.rename(columns = lambda x : (str(x)[:-9]))
    df.rename(columns= {df.columns[0]: "Country", df.columns[1]: "Country_Code_3"}, inplace = True)
    return df

# list of required column indexes
col_list = [2,3,11,12,13]
# Calling clean_dataFrames function passing the dataframe as parameter
df_population = clean_dataFrames(df_population, col_list)

# Removing row with no values for the required years(Country Eritrea)
df_population.drop(df_population.index[df_population['Country'] == 'Eritrea'], inplace = True)

# Loop through the columns to covert values from string to 
for col in df_population:
    # performing operations on columns other than Country and Country_Code columns
    if col not in ["Country_Code_3", "Country"]:
        df_population[col] = df_population[col].astype(float)  # Converting string to number

df_population.head()

Unnamed: 0,Country,Country_Code_3,2016,2017,2018
0,Afghanistan,AFG,35383128.0,36296400.0,37172386.0
1,Albania,ALB,2876101.0,2873457.0,2866376.0
2,Algeria,DZA,40551404.0,41389198.0,42228429.0
3,American Samoa,ASM,55741.0,55620.0,55465.0
4,Andorra,AND,77297.0,77001.0,77006.0


In [435]:
# merging two dataframes for additional years data

# merging df_population with df_countryCode
df_population = df_countryCode.merge(df_population, on="Country_Code_3", how="right")
# removing Country_Code_3 column
del df_population['Country_Code_3'] 


# merging df_population with df_countries
df_countries = df_countries.merge(df_population, on="Country_Code", how="left")
# removing Country_Code_3 column
del df_countries['Country_y']
# renaming columns
df_countries.rename(columns= {"Country_x": "Country"}, inplace = True)
# reordering the columns
df_countries = df_countries.iloc[:,[0,1,2,3,10,9,8,4,5,6,7]]
# # Replace null values with 0
df_countries.fillna(0,inplace = True)

df_countries.head()

Unnamed: 0,Country_Code,Country,2020,2019,2018,2017,2016,2015,2010,2000,1990
0,CN,China,1439323776,1433783686,1392730000.0,1386395000.0,1378665000.0,1406847870,1368810615,1290550765,1176883674
1,IN,India,1380004385,1366417754,1352617000.0,1338659000.0,1324510000.0,1310152403,1234281170,1056575549,873277798
2,US,United States,331002651,329064917,326687500.0,324985500.0,322941300.0,320878310,309011475,281710909,252120308
3,ID,Indonesia,273523615,270625568,267663400.0,264645900.0,261554200.0,258383256,241834215,211513823,181413402
4,PK,Pakistan,220892340,216565318,212215000.0,207896700.0,203627300.0,199426963,179424641,142343578,107647921


### Load Data into MongoDB

In [436]:
# Loading Data into MongoDB
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

db_name = "populationDB"
# # Drop database if exists
if bool(db_name in client.list_database_names()):
    client.drop_database(db_name)

# Creating Database and collection in mongodb
db = client[db_name]
countriesPop = db["countriesPopulation"]
citiesPop = db["citiesPopulation"]
latestPop = db["latestPopulation"]


# Function to insert Dataframes into mongodb collections
def insertToDB(df, collection):
    data_dict = df.to_dict("records") # Convert to dictionary
    # removing index from data
    data_dict = [{k: v for k, v in d.items() if k != 'index'} for d in data_dict]
    collection.insert_one({"data":data_dict}) # Insert dict to collection


# Calling function to insert each dataframes into mongoDB collections
insertToDB(df_countries, countriesPop)
insertToDB(df_cityPop, citiesPop)
insertToDB(df_LatestPop, latestPop)


print(db.list_collection_names())

['countriesPopulation', 'citiesPopulation', 'latestPopulation']
