In [36]:
import pandas as pd

# 1. Project Overview

Objective: Analyze which countries have the best life expectancy.

Data source: The data comes from https://www.kaggle.com/datasets/shreyasg23/life-expectancy-averaged-dataset and contains several different data points for 179 unique countries.

# 2. Data Collecting and Loading

## Load Data:

In [37]:
# Read the averaged life-expectancy CSV (expected next to the notebook) into a DataFrame.
life_expectancy = pd.read_csv(
    filepath_or_buffer='life-expectancy-data-averaged.csv'
)

## Initial Check:

In [38]:
# Preview the first five rows of the freshly loaded data.
life_expectancy.head(n=5)

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status,Life_expectancy
0,Afghanistan,Asia,2007.5,71.08125,98.6125,265.804969,0.016125,64.5625,24.375,22.4625,55.375,55.125,0.0225,408.5625,27.450625,16.58125,15.58125,2.9,1.0,59.65625
1,Albania,Rest of Europe,2007.5,15.25625,17.14375,83.132969,4.696875,98.0,95.9375,25.85625,98.125,98.0625,0.025625,3071.125,2.969375,1.61875,1.7,9.24375,1.0,75.95
2,Algeria,Africa,2007.5,26.75625,31.19375,113.439281,0.400625,88.3125,93.25,24.86875,91.75,91.875,0.021875,3745.125,34.820625,6.09375,5.975,6.99375,1.0,73.7875
3,Angola,Africa,2007.5,88.76875,144.1625,297.844063,4.935625,68.8125,64.0,22.51875,35.75,55.5625,1.30375,2647.8125,21.62375,6.19375,6.66875,4.60625,1.0,52.825
4,Antigua and Barbuda,Central America and Caribbean,2007.5,9.475,11.51875,142.478813,7.755,98.25,75.4375,25.85,96.9375,98.3125,0.125,14678.75,0.085,3.425,3.375,9.01875,1.0,75.35


## Select Columns

In [45]:
# Keep only the columns this analysis uses. Selecting them by name states the
# intent directly; the previous version evaluated the same `.loc[...]`
# selection without storing or displaying it, then repeated the choice as an
# inverse list of columns to drop.
columns_to_keep = [
    'Country', 'Region', 'Year', 'Infant_deaths', 'Adult_mortality',
    'Population_mln', 'Life_expectancy'
]
# `.copy()` makes the result an independent frame so later column
# assignments never write through to the originally loaded data.
life_expectancy = life_expectancy.loc[:, columns_to_keep].copy()



In [46]:
# Sanity check: preview the frame to confirm only the intended columns remain.
life_expectancy.head(n=5)

Unnamed: 0,Country,Region,Year,Infant_deaths,Adult_mortality,Population_mln,Life_expectancy,Life_expectancy_category
0,Afghanistan,Asia,2008,71.08,265.8,27.45,59.66,Medium
1,Albania,Rest of Europe,2008,15.26,83.13,2.97,75.95,High
2,Algeria,Africa,2008,26.76,113.44,34.82,73.79,High
3,Angola,Africa,2008,88.77,297.84,21.62,52.82,Medium
4,Antigua and Barbuda,Central America and Caribbean,2008,9.48,142.48,0.08,75.35,High


# 3. Data Cleaning and Preparation 

## Handle missing values:

I will mark any missing values as 'Unknown' instead of dropping the affected rows, so that the other data present in those rows is kept.

In [40]:
# Build a copy of the frame in which every missing entry is replaced by the
# placeholder text 'Unknown'; `life_expectancy` itself is left unchanged.
# NOTE(review): a numeric column that actually receives this string would
# presumably become object dtype - confirm before doing arithmetic on the
# filled frame. The cells visible below keep working on `life_expectancy`.
life_expectancy_filled = life_expectancy.fillna(value='Unknown')

## Data Type Adjustments: 

Rounding the columns that have decimals to 2 decimal places, converting the 'Year' column to an int, and converting the 'Country' and 'Region' columns to categories.

In [47]:
# Fractional columns are rounded to 2 decimal places. 'Population_mln' is
# included so that re-running the notebook reproduces the 2-decimal
# population values shown in the displayed tables.
columns_to_round = [
    'Infant_deaths', 'Adult_mortality', 'Population_mln', 'Life_expectancy'
]
life_expectancy[columns_to_round] = life_expectancy[columns_to_round].round(2)

# 'Year' is fractional in this dataset (2007.5 in the initial preview), so it
# is rounded to a whole number before the integer cast.
life_expectancy['Year'] = life_expectancy['Year'].round().astype(int)

# Store the two text columns as categoricals.
life_expectancy['Country'] = life_expectancy['Country'].astype('category')
life_expectancy['Region'] = life_expectancy['Region'].astype('category')

# Print the resulting dtypes and non-null counts.
life_expectancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Country                   179 non-null    category
 1   Region                    179 non-null    category
 2   Year                      179 non-null    int32   
 3   Infant_deaths             179 non-null    float64 
 4   Adult_mortality           179 non-null    float64 
 5   Population_mln            179 non-null    float64 
 6   Life_expectancy           179 non-null    float64 
 7   Life_expectancy_category  179 non-null    object  
dtypes: category(2), float64(4), int32(1), object(1)
memory usage: 14.2+ KB


## Feature Engineering: 

I will make a category that will tell if a country has high, medium, or low life expectancy. 

### Defining these:
- High: Life expectancy >= 70 years.
- Medium: Life expectancy is at least 50 but below 70 years.
- Low: Life expectancy is < 50 years.

In [42]:
def categorize_life_expectancy(value, low_cutoff=50, high_cutoff=70):
    """Classify a life expectancy (in years) as "Low", "Medium" or "High".

    Parameters
    ----------
    value : float
        Life expectancy in years. Missing values (NaN, None, pd.NA) are
        accepted.
    low_cutoff : float, default 50
        Values strictly below this are "Low".
    high_cutoff : float, default 70
        Values at or above this are "High".

    Returns
    -------
    str
        "Low" if value < low_cutoff, "Medium" if
        low_cutoff <= value < high_cutoff, "High" if value >= high_cutoff,
        and "Unknown" if the value is missing.
    """
    # NaN fails every comparison, so without this guard a missing value
    # would fall through to the final branch and be labelled "High".
    if pd.isna(value):
        return "Unknown"
    if value < low_cutoff:
        return "Low"
    if value < high_cutoff:
        return "Medium"
    return "High"

# Add the category label for every row by running the categoriser over the
# 'Life_expectancy' column.
life_expectancy['Life_expectancy_category'] = (
    life_expectancy['Life_expectancy'].apply(categorize_life_expectancy)
)


In [48]:
# Sanity check: preview the frame to confirm the new category column is present.
life_expectancy.head(n=5)

Unnamed: 0,Country,Region,Year,Infant_deaths,Adult_mortality,Population_mln,Life_expectancy,Life_expectancy_category
0,Afghanistan,Asia,2008,71.08,265.8,27.45,59.66,Medium
1,Albania,Rest of Europe,2008,15.26,83.13,2.97,75.95,High
2,Algeria,Africa,2008,26.76,113.44,34.82,73.79,High
3,Angola,Africa,2008,88.77,297.84,21.62,52.82,Medium
4,Antigua and Barbuda,Central America and Caribbean,2008,9.48,142.48,0.08,75.35,High


# 4. Exploratory Data Analysis (EDA)

##