### Import necessary libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Read the data from the csv file


In [2]:
# read csv
df = pd.read_csv('Life_Expectancy_Data.csv')

### About the data


The Global Health Observatory (GHO) data repository under World Health Organization (WHO) keeps track of the health status as well as many other related factors for all countries The datasets are made available to public for the purpose of health data analysis. The dataset related to life expectancy, health factors for 193 countries has been collected from the same WHO data repository website and its corresponding economic data was collected from United Nation website. Among all categories of health-related factors only those critical factors were chosen which are more representative. It has been observed that in the past 15 years , there has been a huge development in health sector resulting in improvement of human mortality rates especially in the developing nations in comparison to the past 30 years.


#### Feature information

| Heeader name                    | Description |
| ------------------------------- | ----------- |
| Country                         |             |
| Year                            |             |
| Status                          |             |
| Life expectancy                 |             |
| Adult Mortality                 |             |
| infant deaths                   |             |
| Alcohol                         |             |
| percentage expenditure          |             |
| Hepatitis B                     |             |
| Measles                         |             |
| BMI                             |             |
| under-five deaths               |             |
| Polio                           |             |
| Total expenditure               |             |
| Diphtheria                      |             |
| HIV/AIDS                        |             |
| GDP                             |             |
| Population                      |             |
| thinness 1-19 years             |             |
| thinness 5-9 years              |             |
| Income composition of resources |             |
| Schooling                       |             |


In [3]:
# country is the


In [4]:
# show first 5 rows
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [5]:
# size of data
df.shape

(2938, 22)

#### Number of rows and columns of the DataFrame


In [6]:
# get number of rows
num_rows = df.shape[0]

# get number of columns
num_cols = df.shape[1]

print('Number of rows:', num_rows)
print('Number of columns:', num_cols)


Number of rows: 2938
Number of columns: 22


#### Data types of the columns


In [7]:
# get DataFrame info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10  BMI                              2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

#### Check duplicates rows


In [8]:
# get number of duplicated rows
duplicated_rows = df.duplicated().sum()

# check if there are duplicated rows
if duplicated_rows > 0:
    print("Number of duplicated rows:", duplicated_rows)
else:
    print("No duplicated rows found")

No duplicated rows found


### Statistical


In [9]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,2938.0,2007.519,4.613841,2000.0,2004.0,2008.0,2012.0,2015.0
Life expectancy,2928.0,69.22493,9.523867,36.3,63.1,72.1,75.7,89.0
Adult Mortality,2928.0,164.7964,124.2921,1.0,74.0,144.0,228.0,723.0
infant deaths,2938.0,30.30395,117.9265,0.0,0.0,3.0,22.0,1800.0
Alcohol,2744.0,4.602861,4.052413,0.01,0.8775,3.755,7.7025,17.87
percentage expenditure,2938.0,738.2513,1987.915,0.0,4.685343,64.91291,441.5341,19479.91
Hepatitis B,2385.0,80.94046,25.07002,1.0,77.0,92.0,97.0,99.0
Measles,2938.0,2419.592,11467.27,0.0,0.0,17.0,360.25,212183.0
BMI,2904.0,38.32125,20.04403,1.0,19.3,43.5,56.2,87.3
under-five deaths,2938.0,42.03574,160.4455,0.0,0.0,4.0,28.0,2500.0


In [10]:
# get number of unique values of Country column
df['Country'].unique().size

193

### Handle missing values


In [11]:
# check missing values
df.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness 1-19 years                 34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

In [12]:
# calculate missing percentage
missing_percentage = df.isnull().mean() * 100
missing_percentage[missing_percentage.values > 0].sort_values(ascending=False)

Population                         22.191967
Hepatitis B                        18.822328
GDP                                15.248468
Total expenditure                   7.692308
Alcohol                             6.603131
Income composition of resources     5.684139
Schooling                           5.547992
BMI                                 1.157250
thinness 1-19 years                 1.157250
thinness 5-9 years                  1.157250
Polio                               0.646698
Diphtheria                          0.646698
Life expectancy                     0.340368
Adult Mortality                     0.340368
dtype: float64

In [13]:
# replacing the Null Values with mean values of the data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,
                        strategy='mean', fill_value=None)
df['Life expectancy'] = imputer.fit_transform(df[['Life expectancy']])
df['Adult Mortality'] = imputer.fit_transform(df[['Adult Mortality']])
df['Alcohol'] = imputer.fit_transform(df[['Alcohol']])
df['Hepatitis B'] = imputer.fit_transform(df[['Hepatitis B']])
df['BMI'] = imputer.fit_transform(df[['BMI']])
df['Polio'] = imputer.fit_transform(df[['Polio']])
df['Total expenditure'] = imputer.fit_transform(df[['Total expenditure']])
df['Diphtheria'] = imputer.fit_transform(df[['Diphtheria']])
df['GDP'] = imputer.fit_transform(df[['GDP']])
df['Population'] = imputer.fit_transform(df[['Population']])
df['thinness 1-19 years'] = imputer.fit_transform(
    df[['thinness 1-19 years']])
df['thinness 5-9 years'] = imputer.fit_transform(df[['thinness 5-9 years']])
df['Income composition of resources'] = imputer.fit_transform(
    df[['Income composition of resources']])
df['Schooling'] = imputer.fit_transform(df[['Schooling']])

In [14]:
# check missing values again
df.isnull().sum()

Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
BMI                                0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
HIV/AIDS                           0
GDP                                0
Population                         0
thinness 1-19 years                0
thinness 5-9 years                 0
Income composition of resources    0
Schooling                          0
dtype: int64

### Correlation


In [15]:
%%capture
# correlation matrix
corr = df.corr()

In [16]:
corr.style.background_gradient(cmap='Greens')

Unnamed: 0,Year,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
Year,1.0,0.169623,-0.078861,-0.037415,-0.048168,0.0314,0.089398,-0.082493,0.108327,-0.042937,0.09382,0.08186,0.133853,-0.139741,0.093351,0.014951,-0.047592,-0.050627,0.236333,0.203471
Life expectancy,0.169623,1.0,-0.696359,-0.196535,0.391598,0.381791,0.203771,-0.157574,0.559255,-0.222503,0.461574,0.207981,0.475418,-0.556457,0.430493,-0.019638,-0.472162,-0.466629,0.692483,0.715066
Adult Mortality,-0.078861,-0.696359,1.0,0.078747,-0.190408,-0.242814,-0.138591,0.031174,-0.381449,0.094135,-0.272694,-0.110875,-0.273014,0.523727,-0.277053,-0.012501,0.299863,0.305366,-0.440062,-0.435108
infant deaths,-0.037415,-0.196535,0.078747,1.0,-0.113812,-0.085612,-0.178783,0.501128,-0.22722,0.996629,-0.170674,-0.126564,-0.175156,0.025231,-0.107109,0.548522,0.46559,0.471228,-0.143663,-0.191757
Alcohol,-0.048168,0.391598,-0.190408,-0.113812,1.0,0.339634,0.075447,-0.051055,0.31807,-0.110777,0.213744,0.294898,0.215242,-0.04865,0.318591,-0.030765,-0.416946,-0.405881,0.416099,0.497546
percentage expenditure,0.0314,0.381791,-0.242814,-0.085612,0.339634,1.0,0.011679,-0.056596,0.228537,-0.087852,0.147203,0.173414,0.14357,-0.097857,0.88814,-0.024648,-0.25119,-0.252725,0.380374,0.388105
Hepatitis B,0.089398,0.203771,-0.138591,-0.178783,0.075447,0.011679,1.0,-0.090317,0.134929,-0.184413,0.408519,0.050084,0.499958,-0.102405,0.062318,-0.109811,-0.105144,-0.108334,0.150992,0.171755
Measles,-0.082493,-0.157574,0.031174,0.501128,-0.051055,-0.056596,-0.090317,1.0,-0.175925,0.507809,-0.136146,-0.104569,-0.141861,0.030899,-0.06806,0.23625,0.224742,0.221007,-0.115764,-0.122609
BMI,0.108327,0.559255,-0.381449,-0.22722,0.31807,0.228537,0.134929,-0.175925,1.0,-0.237586,0.282156,0.231814,0.281059,-0.243548,0.276645,-0.063238,-0.532025,-0.538911,0.479837,0.508105
under-five deaths,-0.042937,-0.222503,0.094135,0.996629,-0.110777,-0.087852,-0.184413,0.507809,-0.237586,1.0,-0.188703,-0.128269,-0.195651,0.038062,-0.11064,0.535864,0.467626,0.472099,-0.161533,-0.207111


In [17]:
# histogram of Life expectancy
df['Life expectancy '].hist(bins=50, figsize=(
    12, 5), color='#86bf91', grid=False, zorder=2, rwidth=0.9)

KeyError: 'Life expectancy '

#### Comment: 


References: https://www.kaggle.com/code/varunsaikanuri/life-expectancy-visualization
