In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw CSV into the working frame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Life Expectancy Data.csv")
df.head(n=5)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
# Structural overview of the loaded frame: its dimensions, its column labels,
# then the per-column dtype / non-null summary printed by DataFrame.info().
frame_shape, frame_columns = df.shape, df.columns
print("Shape:", frame_shape)
print("\nColumns:\n", frame_columns)
df.info()

Shape: (2938, 22)

Columns:
 Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infan

In [4]:
# Number of missing values in each column (isna is the canonical name of isnull).
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [5]:
# Fill missing values in every numeric column with that column's median,
# computed over all rows currently in the frame. Non-numeric columns are not
# given a fill value (numeric_only=True), so they are left as they are.
#
# The result is rebound to `df` rather than using inplace=True: the outcome is
# the same, but the step no longer mutates an object that other names may
# still reference, and it can be chained.
#
# NOTE(review): this also fills the missing 'Life expectancy ' values, and the
# medians are taken before the frame is restricted to Status == 'Developing'.
# Confirm that imputing that column from the full data set is intended.
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)

# Verify that no missing values remain.
df.isna().sum()

Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
 BMI                               0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
 HIV/AIDS                          0
GDP                                0
Population                         0
 thinness  1-19 years              0
 thinness 5-9 years                0
Income composition of resources    0
Schooling                          0
dtype: int64

In [6]:
# Remove rows that are exact duplicates across all columns; pandas keeps the
# first occurrence by default. The result is rebound to `df` instead of using
# inplace=True so the step does not mutate the existing object and stays
# chainable.
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)

Shape after removing duplicates: (2938, 22)


In [7]:
# Keep only the rows whose Status is exactly 'Developing'.
is_developing = df['Status'].eq('Developing')
df = df.loc[is_developing]
print("Shape after filtering developing countries:", df.shape)

Shape after filtering developing countries: (2426, 22)


In [8]:
# Narrow the frame to the five columns used from here on, in this order.
# 'Life expectancy ' carries a trailing space in the raw header, so the label
# must be spelled exactly like that.
selected_columns = [
    'Country',
    'Year',
    'Life expectancy ',
    'infant deaths',
    'Status',
]
df = df.loc[:, selected_columns]

df.head()

Unnamed: 0,Country,Year,Life expectancy,infant deaths,Status
0,Afghanistan,2015,65.0,62,Developing
1,Afghanistan,2014,59.9,64,Developing
2,Afghanistan,2013,59.9,66,Developing
3,Afghanistan,2012,59.5,69,Developing
4,Afghanistan,2011,59.2,71,Developing


In [9]:
# Give the two measurement columns identifier-style names. The source label
# 'Life expectancy ' includes a trailing space, which the new name drops.
#
# `df` at this point comes from row/column subsetting; rebinding to the frame
# returned by rename() (instead of inplace=True) avoids mutating a derived
# frame in place and keeps the step chainable. The resulting columns are the
# same either way.
df = df.rename(columns={
    'Life expectancy ': 'LifeExpectancy',
    'infant deaths': 'InfantMortality',
})
df.head()

Unnamed: 0,Country,Year,LifeExpectancy,InfantMortality,Status
0,Afghanistan,2015,65.0,62,Developing
1,Afghanistan,2014,59.9,64,Developing
2,Afghanistan,2013,59.9,66,Developing
3,Afghanistan,2012,59.5,69,Developing
4,Afghanistan,2011,59.2,71,Developing


In [10]:
# Drop rows where either LifeExpectancy or InfantMortality lies outside the
# 1.5 * IQR fences of its own column.
# NOTE: the quartiles are computed from the current `df`, so running this cell
# again recomputes the fences on the already-trimmed data and trims further.
outlier_cols = ['LifeExpectancy', 'InfantMortality']
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1

# Per-column lower / upper fences (Series indexed by column name).
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is an outlier if any of the checked columns falls outside its fences.
candidates = df[outlier_cols]
is_outlier = (candidates.lt(lower_fence) | candidates.gt(upper_fence)).any(axis=1)
df = df[~is_outlier]
print("Shape after outlier removal:", df.shape)

Shape after outlier removal: (2204, 5)


In [11]:
# Replace InfantMortality with log(1 + x); log1p maps a value of 0 to 0.
# NOTE: the column is overwritten, so running this cell again applies the
# transform a second time.
df = df.assign(InfantMortality=np.log1p(df['InfantMortality']))
df.head()

Unnamed: 0,Country,Year,LifeExpectancy,InfantMortality,Status
0,Afghanistan,2015,65.0,4.143135,Developing
1,Afghanistan,2014,59.9,4.174387,Developing
2,Afghanistan,2013,59.9,4.204693,Developing
16,Albania,2015,77.8,0.0,Developing
17,Albania,2014,77.5,0.0,Developing


In [12]:
# Standardize InfantMortality with a StandardScaler fitted on the rows
# currently in `df`; the scaled values overwrite the column.
# `scaler` stays bound so the fitted transform remains available afterwards.
scaler = StandardScaler()
infant_mortality = df[['InfantMortality']]  # 2-D (single-column) selection
scaled_values = scaler.fit_transform(infant_mortality)
df[['InfantMortality']] = scaled_values
df.head()

Unnamed: 0,Country,Year,LifeExpectancy,InfantMortality,Status
0,Afghanistan,2015,65.0,1.736966,Developing
1,Afghanistan,2014,59.9,1.759476,Developing
2,Afghanistan,2013,59.9,1.781304,Developing
16,Albania,2015,77.8,-1.247157,Developing
17,Albania,2014,77.5,-1.247157,Developing


In [13]:
# Write the cleaned frame to CSV without the index column, then report completion.
df.to_csv(path_or_buf="life_expectancy_cleaned.csv", index=False)
print("Cleaned dataset saved successfully")

Cleaned dataset saved successfully
