# Imports

In [2]:
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import pandas as pd
from xml.etree import ElementTree

import matplotlib.pyplot as plt
# Only works inside notebook
%matplotlib inline 
#import matplotlib as mpl
#mpl.rcParams['agg.path.chunksize'] = 10000 # assists with processor speed

# import preprocessing
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer

from sklearn.neighbors import KNeighborsClassifier

# import helper files from local environment
from env import user, password, host
import QMCBT_00_quicktips as qt
import QMCBT_01_acquire as acquire
import QMCBT_02_prepare as prepare
import QMCBT_03_explore as explore
import QMCBT_04_model as model
import QMCBT_05_evaluate as evaluate
import QMCBT_explore_evaluate as ee
import QMCBT_wrangle as w

# allows import reload without needing to clear kernel and rerun
from importlib import reload
# reload(packagename) 

import warnings
warnings.filterwarnings("ignore")

**CUSTOM EXPLORATION FUNCTIONS
nunique_column_all(df): PRINT NUNIQUE OF ALL COLUMNS
nunique_column_objects(df): PRINT NUNIQUE OF COLUMNS THAT ARE OBJECTS
nunique_column_qty(df): PRINT NUNIQUE OF COLUMNS THAT ARE *NOT* OBJECTS
numeric_range(df): COMPUTE RANGE FOR ALL NUMERIC VARIABLES

**USEFUL EXPLORATORY CODE**
DFNAME.head()
DFNAME.shape
DFNAME.shape[0] #read row count
DFNAME.describe().T
DFNAME.columns.to_list()
DFNAME.COLUMNNAME.value_counts(dropna=False)
DFNAME.dtypes
DFNAME.select_dtypes(include='object').columns
DFNAME.select_dtypes(include='float').columns
pd.crosstab(DFNAME.COLUMN-1, DFNAME.COLUMN-2)


In [3]:
# Left Align Tables in Jupyter Notebook
from IPython.core.display import HTML
table_css = 'table {align:left;display:block}'
HTML('<style>{}</style>'.format(table_css))

<div class="alert alert-success">

# Life Expectancy & Mortality across the Globe
* **GHO Selections:** https://apps.who.int/gho/athena/api/GHO
* **Homepage:** https://www.who.int/data/gho/info/athena-api-examples
* **CSV Data Pull:** https://apps.who.int/gho/athena/api/GHO/WHOSIS_000001,WHOSIS_000002,WHOSIS_000003,WHOSIS_000004,WHOSIS_000005,WHOSIS_000006,WHOSIS_000007,WHOSIS_000008,WHOSIS_000009,WHOSIS_000010,WHOSIS_000012,WHOSIS_000013,WHOSIS_000014,WHOSIS_000015,WHOSIS_000016,MDG_0000000001,MDG_0000000003,MDG_0000000005,MDG_0000000007,MDG_0000000025,MDG_0000000026?format=csv

In [5]:
leam_df = pd.read_csv('leam.csv')

In [6]:
leam_df

Unnamed: 0,GHO,DATASOURCE,PUBLISHSTATE,YEAR,REGION,UNREGION,WORLDBANKINCOMEGROUP,COUNTRY,AGEGROUP,SEX,UNSDGREGION,Display Value,Numeric,Low,High,StdErr,StdDev,Comments
0,WHOSIS_000004,,PUBLISHED,2009,AFR,,,MUS,,MLE,,225,224.88920,,,,,
1,WHOSIS_000004,,PUBLISHED,2009,AFR,,,MUS,,FMLE,,102,102.27720,,,,,
2,WHOSIS_000004,,PUBLISHED,2009,AFR,,,MUS,,BTSX,,165,165.37340,,,,,
3,WHOSIS_000004,,PUBLISHED,2014,AFR,,,MUS,,MLE,,194,194.30180,,,,,
4,WHOSIS_000004,,PUBLISHED,2014,AFR,,,MUS,,FMLE,,99,98.80576,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122810,MDG_0000000003,,PUBLISHED,2010-2015,EUR,,,,YEARS15-19,FMLE,,19.9,19.92918,,,,,World population prospects: 2019 revision. Uni...
122811,MDG_0000000003,,PUBLISHED,2015-2020,EUR,,,,YEARS15-19,FMLE,,17.1,17.09032,,,,,World population prospects: 2019 revision. Uni...
122812,MDG_0000000003,,PUBLISHED,2005-2010,GLOBAL,,,,YEARS15-19,FMLE,,49.2,49.16947,,,,,World population prospects: 2019 revision. Uni...
122813,MDG_0000000003,,PUBLISHED,2010-2015,GLOBAL,,,,YEARS15-19,FMLE,,46.7,46.69116,,,,,World population prospects: 2019 revision. Uni...


In [9]:
leam_df.isnull().sum().sort_values()

GHO                          0
PUBLISHSTATE                 0
YEAR                         0
Numeric                      0
Display Value                0
REGION                     800
COUNTRY                   3411
SEX                      10541
High                     26727
Low                      26727
Comments                114650
AGEGROUP                117470
WORLDBANKINCOMEGROUP    121954
UNSDGREGION             122395
UNREGION                122661
DATASOURCE              122809
StdDev                  122815
StdErr                  122815
dtype: int64

In [48]:
leam_df.columns.value_counts()

GHO                     1
DATASOURCE              1
StdDev                  1
StdErr                  1
High                    1
Low                     1
Numeric                 1
Display Value           1
UNSDGREGION             1
SEX                     1
AGEGROUP                1
COUNTRY                 1
WORLDBANKINCOMEGROUP    1
UNREGION                1
REGION                  1
YEAR                    1
PUBLISHSTATE            1
Comments                1
dtype: int64

In [38]:
leam_df.shape[0]

122815

In [42]:
ee.nunique_column_all(leam_df)

MDG_0000000007    36084
MDG_0000000001    35933
WHOSIS_000004      9690
WHOSIS_000003      9648
WHOSIS_000016      6262
MDG_0000000003     5345
WHOSIS_000014      4353
MDG_0000000026     3420
WHOSIS_000002      2328
WHOSIS_000007      2328
WHOSIS_000015      2328
WHOSIS_000001      2328
MDG_0000000025     2153
WHOSIS_000006       615
Name: GHO, dtype: int64

NUT_TCD2019SMART    1
NUT_TCD2019MICS     1
NUT_NGA2018SMART    1
NUT_NGA2018DHS      1
NUT_PAK2018NNS      1
NUT_PAK2018DHS      1
Name: DATASOURCE, dtype: int64

PUBLISHED    122815
Name: PUBLISHSTATE, dtype: int64

2000         4689
2010         4685
2015         4661
2019         3911
2014         2342
             ... 
1940            1
1933            1
1932            1
2020-2021       1
2010-2013       1
Name: YEAR, Length: 207, dtype: int64

EUR       31696
AFR       29557
AMR       23368
WPR       16407
EMR       13519
SEAR       7092
GLOBAL      376
Name: REGION, dtype: int64

143.0    8
62.0     8
9.0      8
145.0    8


#### Misc

In [39]:
leam_df.DATASOURCE.unique()

array([nan, 'NUT_TCD2019SMART', 'NUT_TCD2019MICS', 'NUT_NGA2018SMART',
       'NUT_NGA2018DHS', 'NUT_PAK2018NNS', 'NUT_PAK2018DHS'], dtype=object)

In [15]:
leam_df.PUBLISHSTATE.unique()

array(['PUBLISHED'], dtype=object)

In [18]:
leam_df.Numeric.nunique(), leam_df.Numeric.min(), leam_df.Numeric.max(), 

(112780, 0.0, 2480.0)

In [10]:
leam_df.YEAR.unique()

array([2009, 2014, 2015, 2002, 2003, 2008, 2013, 2007, 2001, 2012, 2006,
       2000, 2011, 2005, 2016, 2010, 2004, 2017, 2018, 2019, 1990, 1991,
       1993, 1996, 1999, 2020, 1959, 1961, 1962, 1963, 1964, 1966, 1967,
       1969, 1970, 1972, 1973, 1975, 1978, 1980, 1981, 1983, 1984, 1986,
       1987, 1989, 1992, 1994, 1995, 1997, 1998, 1960, 1965, 1971, 1976,
       1977, 1982, 1988, 1985, 1974, 1968, 1979, 1951, 1952, 1953, 1957,
       1958, 1956, 1949, 1954, 1955, 1950, 1946, 1947, 1948, 1944, 1943,
       1945, 1941, 1942, 1940, 1934, 1939, 1937, 1938, 1935, 1936, 1933,
       1932, '1983', '1986', '1989', '1991', '1992', '1994', '1995',
       '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2005',
       '2006', '2008', '2009', '2011', '2014', '2017', '2019', '2020',
       '1970', '1973', '1975', '1976', '1978', '1979', '1981', '1982',
       '1984', '1987', '1990', '2004', '2007', '2012', '2015', '1956',
       '1958', '1959', '1961', '1962', '1964', '1965', '1966', 

#### GHO Stats

In [11]:
leam_df.GHO.unique()

array(['WHOSIS_000004', 'MDG_0000000026', 'WHOSIS_000014',
       'WHOSIS_000002', 'WHOSIS_000007', 'WHOSIS_000015', 'WHOSIS_000001',
       'MDG_0000000001', 'WHOSIS_000016', 'WHOSIS_000003',
       'MDG_0000000007', 'WHOSIS_000006', 'MDG_0000000003',
       'MDG_0000000025'], dtype=object)

In [12]:
leam_df.GHO.isnull().sum()

0

In [13]:
leam_df.GHO.value_counts()

MDG_0000000007    36084
MDG_0000000001    35933
WHOSIS_000004      9690
WHOSIS_000003      9648
WHOSIS_000016      6262
MDG_0000000003     5345
WHOSIS_000014      4353
MDG_0000000026     3420
WHOSIS_000002      2328
WHOSIS_000007      2328
WHOSIS_000015      2328
WHOSIS_000001      2328
MDG_0000000025     2153
WHOSIS_000006       615
Name: GHO, dtype: int64

#### MDG Stats

<div class="alert alert-info">

#### Over 35K value counts
* 'MDG_0000000007': 'Under-five mortality rate (per 1000 live births)', 
* 'MDG_0000000001': 'Infant mortality rate (between birth and 11 months per 1000 live births)', 

#### Between 2-5k value counts
* 'MDG_0000000003': 'Adolescent birth rate (per 1000 women)', 
* 'MDG_0000000026': 'Maternal mortality ratio (per 100 000 live births)', 
* 'MDG_0000000025': 'Births attended by skilled health personnel (%)', 

#### MDG...05 returned no data
* 'MDG_0000000005': 'Contraceptive prevalence (%)', 

In [36]:
print(f'MDG_0000000007    {round(36084/leam_df.shape[0]*100,2)}%')
print(f'MDG_0000000001    {round(35933/leam_df.shape[0]*100,2)}%')
print(f'MDG_0000000003     {round(5345/leam_df.shape[0]*100,2)}%')
print(f'MDG_0000000026     {round(3420/leam_df.shape[0]*100,2)}%')
print(f'MDG_0000000025     {round(2153/leam_df.shape[0]*100,2)}%')

MDG_0000000007    29.38%
MDG_0000000001    29.26%
MDG_0000000003     4.35%
MDG_0000000026     2.78%
MDG_0000000025     1.75%


'MDG_0000000001': 'Infant mortality rate (between birth and 11 months per 1000 live births)', 
'MDG_0000000003': 'Adolescent birth rate (per 1000 women)', 
'MDG_0000000005': 'Contraceptive prevalence (%)', 
'MDG_0000000007': 'Under-five mortality rate (per 1000 live births)', 
'MDG_0000000025': 'Births attended by skilled health personnel (%)', 
'MDG_0000000026': 'Maternal mortality ratio (per 100 000 live births)', 

|[GHO Code](https://apps.who.int/gho/athena/api/GHO)      |[Documentation](https://www.who.int/data/gho/indicator-metadata-registry)|[Global Health Observatory](https://www.who.int/data/gho) (GHO) Code Description|
|:-------------|:-----------:|:------------------------------------------------|
|MDG_0000000001|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/1)|Infant mortality rate (between birth and 11 months per 1000 live births)| 
|MDG_0000000003|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/4669)|Adolescent birth rate (per 1000 women)| 
|MDG_0000000005|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/5)|Contraceptive prevalence (%)| 
|MDG_0000000007|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/7)|Under-five mortality rate (per 1000 live births)| 
|MDG_0000000025|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/25)|Births attended by skilled health personnel (%)| 
|MDG_0000000026|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/26)|Maternal mortality ratio (per 100 000 live births)| 

#### WHOSIS Stats

<div class="alert alert-info">

#### About 24% of data
* 'WHOSIS_000004': 'Adult mortality rate (probability of dying between 15 and 60 years per 1000 population)',
* 'WHOSIS_000003': 'Neonatal mortality rate (0 to 27 days) per 1000 live births)',

#### Between 10-15% of data
* 'WHOSIS_000016': 'Mortality rate among children ages 5 to 9 years (per 1000 children aged 5)', 
* 'WHOSIS_000014': 'Stillbirth rate (per 1000 total births)', 

#### These are all exactly the same record count at 5.84%
* 'WHOSIS_000001': 'Life expectancy at birth (years)', 
* 'WHOSIS_000002': 'Healthy life expectancy (HALE) at birth (years)', 
* 'WHOSIS_000007': 'Healthy life expectancy (HALE) at age 60 (years)', 
* 'WHOSIS_000015': 'Life expectancy at age 60 (years)', 

#### Very little representation at only 1.54% with just 615 records
* 'WHOSIS_000006': 'Infants exclusively breastfed for the first six months of life (%)', 

In [37]:
print(f'WHOSIS_000004    {round(9690/leam_df.shape[0]*100,2)}%')
print(f'WHOSIS_000003    {round(9648/leam_df.shape[0]*100,2)}%')
print(f'WHOSIS_000016     {round(6262/leam_df.shape[0]*100,2)}%')
print(f'WHOSIS_000014     {round(4353/leam_df.shape[0]*100,2)}%')
print(f'WHOSIS_000002     {round(2328/leam_df.shape[0]*100,2)}%')
print(f'WHOSIS_000007     {round(2328/leam_df.shape[0]*100,2)}%')
print(f'WHOSIS_000015     {round(2328/leam_df.shape[0]*100,2)}%')
print(f'WHOSIS_000001     {round(2328/leam_df.shape[0]*100,2)}%')
print(f'WHOSIS_000006     {round(615/leam_df.shape[0]*100,2)}%')

WHOSIS_000004    7.89%
WHOSIS_000003    7.86%
WHOSIS_000016     5.1%
WHOSIS_000014     3.54%
WHOSIS_000002     1.9%
WHOSIS_000007     1.9%
WHOSIS_000015     1.9%
WHOSIS_000001     1.9%
WHOSIS_000006     0.5%


'WHOSIS_000001': 'Life expectancy at birth (years)', 
'WHOSIS_000002': 'Healthy life expectancy (HALE) at birth (years)', 
'WHOSIS_000003': 'Neonatal mortality rate (0 to 27 days) per 1000 live births) (SDG 3.2.2)',
'WHOSIS_000004': 'Adult mortality rate (probability of dying between 15 and 60 years per 1000 population)',
'WHOSIS_000006': 'Infants exclusively breastfed for the first six months of life (%)', 
'WHOSIS_000007': 'Healthy life expectancy (HALE) at age 60 (years)', 
'WHOSIS_000014': 'Stillbirth rate (per 1000 total births)', 
'WHOSIS_000015': 'Life expectancy at age 60 (years)', 
'WHOSIS_000016': 'Mortality rate among children ages 5 to 9 years (per 1000 children aged 5)', 

|[GHO Code](https://apps.who.int/gho/athena/api/GHO)      |[Documentation](https://www.who.int/data/gho/indicator-metadata-registry)|[Global Health Observatory](https://www.who.int/data/gho) (GHO) Code Description|
|:-------------|:-----------:|:------------------------------------------------|
|WHOSIS_000001|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/65)|Life expectancy at birth (years)| 
|WHOSIS_000002|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/66)|Healthy life expectancy (HALE) at birth (years)| 
|WHOSIS_000003|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/67)|Neonatal mortality rate (0 to 27 days) per 1000 live births)|
|WHOSIS_000004|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/64)|Adult mortality rate (probability of dying between 15 and 60 years per 1000 population)|
|WHOSIS_000006|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/130)|Infants exclusively breastfed for the first six months of life (%)| 
|WHOSIS_000007|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/3443)|Healthy life expectancy (HALE) at age 60 (years)| 
|WHOSIS_000014|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/2444)|Stillbirth rate (per 1000 total births)| 
|WHOSIS_000015|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/2977)|Life expectancy at age 60 (years)| 
|WHOSIS_000016|[📖](https://www.who.int/data/gho/indicator-metadata-registry/imr-details/5669)|Mortality rate among children ages 5 to 9 years (per 1000 children aged 5)| 


#### Country

In [21]:
leam_df.COUNTRY.isnull().sum()

3411

In [19]:
leam_df.COUNTRY.nunique()

195

In [29]:
print(f'Country Null is {round(leam_df.COUNTRY.isnull().sum() / leam_df.shape[0] *100, 2)} % of entire database')

Country Null is 2.78 % of entire database


In [20]:
leam_df.COUNTRY.unique()

array(['MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC', 'NLD', 'NOR',
       'NPL', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG', 'POL',
       'PRK', 'PRT', 'PRY', 'QAT', 'ROU', 'RUS', 'RWA', 'SAU', 'SDN',
       'SEN', 'SGP', 'SLB', 'SLE', 'SLV', 'SOM', 'SRB', 'SSD', 'STP',
       'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SYC', nan, 'AFG', 'AGO', 'ALB',
       'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL',
       'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ',
       'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE',
       'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV',
       'CRI', 'CUB', 'CYP', 'CZE', 'DEU', 'DJI', 'DNK', 'DOM', 'DZA',
       'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA',
       'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB', 'GNQ',
       'GRC', 'GRD', 'GTM', 'GUY', 'HND', 'HRV', 'HTI', 'HUN', 'IDN',
       'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR',
       'JPN', '

#### Sex

In [30]:
leam_df.SEX.isnull().sum()

10541

In [32]:
print(f'Sex Null is {round(leam_df.SEX.isnull().sum() / leam_df.shape[0] *100, 2)} % of entire database')

Sex Null is 8.58 % of entire database


<div class="alert alert-success">

# Checkpoint #1

1. a brief (one-sentence) description of your project
* Predict future Life Expectancy or Healthy Life Expectancy

2. a link to the data source
* **GHO Selections:** https://apps.who.int/gho/athena/api/GHO
* **Homepage:** https://www.who.int/data/gho/info/athena-api-examples
* **CSV Data Pull:** https://apps.who.int/gho/athena/api/GHO/WHOSIS_000001,WHOSIS_000002,WHOSIS_000003,WHOSIS_000004,WHOSIS_000005,WHOSIS_000006,WHOSIS_000007,WHOSIS_000008,WHOSIS_000009,WHOSIS_000010,WHOSIS_000012,WHOSIS_000013,WHOSIS_000014,WHOSIS_000015,WHOSIS_000016?format=csv 

3. your target variable
* Numeric (Expected Years of continued Life)

4. what one observation represents
* Numeric for one year in specified country according to category of life expectancy
    * 'WHOSIS_000001': 'Life expectancy at birth (years)', 
    * 'WHOSIS_000002': 'Healthy life expectancy (HALE) at birth (years)', 
    * 'WHOSIS_000007': 'Healthy life expectancy (HALE) at age 60 (years)', 
    * 'WHOSIS_000015': 'Life expectancy at age 60 (years)', 


<div class="alert alert-danger">

# Sometimes it is faster to read the unreadable data than it is to read about how to make the data readable!