### Import necessary libraries


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


### Read the data from the csv file


In [3]:
# Load the life-expectancy dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Life_Expectancy_Data.csv')


In [8]:
# Preview the top of the table (first 5 rows).
df.head(n=5)


Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


### Feature information


| Header name                     | Description |
| ------------------------------- | ----------- |
| Country                         |             |
| Year                            |             |
| Status                          |             |
| Life expectancy                 |             |
| Adult Mortality                 |             |
| infant deaths                   |             |
| Alcohol                         |             |
| percentage expenditure          |             |
| Hepatitis B                     |             |
| Measles                         |             |
| BMI                             |             |
| under-five deaths               |             |
| Polio                           |             |
| Total expenditure               |             |
| Diphtheria                      |             |
| HIV/AIDS                        |             |
| GDP                             |             |
| Population                      |             |
| thinness 1-19 years             |             |
| thinness 5-9 years              |             |
| Income composition of resources |             |
| Schooling                       |             |


### Number of rows and columns of the DataFrame


In [7]:
# df.shape is a (rows, columns) tuple; unpack both dimensions at once
num_rows, num_cols = df.shape

print('Number of rows:', num_rows)
print('Number of columns:', num_cols)


Number of rows: 2938
Number of columns: 22


### Data types of the columns


In [9]:
# Print a concise summary of the DataFrame: the index range plus, for each
# column, its non-null count and dtype.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

### Check duplicate rows


In [10]:
# Count rows that are exact repeats of an earlier row.
duplicated_rows = df.duplicated().sum()

# Report the result: the clean case first, otherwise how many repeats exist.
if duplicated_rows == 0:
    print("No duplicated rows found")
else:
    print("Number of duplicated rows:", duplicated_rows)


No duplicated rows found


### Statistical summary


In [13]:
# Summary statistics, transposed so each row describes one column of df.
df.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,2938.0,2007.519,4.613841,2000.0,2004.0,2008.0,2012.0,2015.0
Life expectancy,2928.0,69.22493,9.523867,36.3,63.1,72.1,75.7,89.0
Adult Mortality,2928.0,164.7964,124.2921,1.0,74.0,144.0,228.0,723.0
infant deaths,2938.0,30.30395,117.9265,0.0,0.0,3.0,22.0,1800.0
Alcohol,2744.0,4.602861,4.052413,0.01,0.8775,3.755,7.7025,17.87
percentage expenditure,2938.0,738.2513,1987.915,0.0,4.685343,64.91291,441.5341,19479.91
Hepatitis B,2385.0,80.94046,25.07002,1.0,77.0,92.0,97.0,99.0
Measles,2938.0,2419.592,11467.27,0.0,0.0,17.0,360.25,212183.0
BMI,2904.0,38.32125,20.04403,1.0,19.3,43.5,56.2,87.3
under-five deaths,2938.0,42.03574,160.4455,0.0,0.0,4.0,28.0,2500.0


### Handle missing values


In [14]:
# Number of missing (NaN) entries in each column.
df.isna().sum()


Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [16]:
# Share of missing entries per column, expressed in percent.
missing_percentage = df.isna().mean().mul(100)
# Show only the columns that actually have gaps, largest share first.
missing_percentage.loc[missing_percentage > 0].sort_values(ascending=False)


Population                         22.191967
Hepatitis B                        18.822328
GDP                                15.248468
Total expenditure                   7.692308
Alcohol                             6.603131
Income composition of resources     5.684139
Schooling                           5.547992
 BMI                                1.157250
 thinness  1-19 years               1.157250
 thinness 5-9 years                 1.157250
Polio                               0.646698
Diphtheria                          0.646698
Life expectancy                     0.340368
Adult Mortality                     0.340368
dtype: float64

In [24]:
# Fill the missing values of every numeric column with that column's mean.
#
# numeric_only=True restricts the mean to numeric columns. The frame also
# holds object columns (Country, Status); without the flag, pandas >= 2.0
# raises TypeError when it tries to average them, and older versions emit a
# FutureWarning while silently dropping them. The %%capture magic that used
# to sit on this cell presumably only hid that warning, so it is no longer
# needed. Non-numeric columns are left untouched by the fill.
df = df.fillna(df.mean(numeric_only=True))

In [25]:
# Re-count missing entries per column to confirm the imputation left none.
df.isna().sum()


Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
 BMI                               0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
 HIV/AIDS                          0
GDP                                0
Population                         0
 thinness  1-19 years              0
 thinness 5-9 years                0
Income composition of resources    0
Schooling                          0
dtype: int64