<a href="https://colab.research.google.com/github/Samuela31/Life-Expectancy-Analysis/blob/main/Life_Expectancy_Dataset_Analysis_Assignment71762108039.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reading from CSV and Cleaning Data



First, the dataset is read and checked for missing values. Population, Hepatitis B, and GDP have the largest numbers of missing values, which may bias the analysis results.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Load the life-expectancy dataset from the working directory
df = pd.read_csv('Life Expectancy Data.csv')

# Report the dataset dimensions
num_rows = df.shape[0]
num_cols = df.shape[1]
print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

# Number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

We clean the dataset via interpolation, which is a technique used to estimate missing values based on values of neighbouring data points.

In [None]:
# Clean the dataset by interpolation: each missing value is estimated
# from the values of the neighbouring rows in the same column.
# Only numeric columns are interpolated; the text columns (Country, Status)
# cannot be interpolated and newer pandas versions reject object columns here.
# NOTE(review): interpolation follows the row order of the file, so a gap at the
# boundary between two countries is filled from the adjacent country's rows - confirm
# this is acceptable for the analysis.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].interpolate()

# Remaining missing values per column after cleaning
print(df.isnull().sum())

In [None]:
# Distinct country names in the dataset, followed by how many there are
countries = pd.unique(df['Country'])
print(countries, len(countries))

# Sample Visualizations for whole dataset

In [None]:
# Every column except the two identifier columns (Country, Year);
# these are the attributes plotted throughout the notebook.
cat_cols = [name for name in df.columns.tolist() if name not in ('Country', 'Year')]

In [None]:
# One boxplot per attribute showing its yearly distribution; Status is skipped
for column in cat_cols:
    if column == 'Status':
        continue
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df, x='Year', y=column, palette='Set3')
    plt.title('Distribution of {} by Year'.format(column))
    plt.show()


In [None]:
# One pie chart per attribute.
# Life expectancy is grouped into four age bands; every other column is
# charted by the frequency of its individual values.
for col in cat_cols:
    plt.figure(figsize=(8, 6))

    if col != 'Life expectancy ':
        df[col].value_counts().plot(kind='pie', autopct='%1.1f%%', cmap='coolwarm')
    else:
        life = df['Life expectancy ']

        # Number of rows falling in each band
        counts = [
            (life < 20).sum(),
            ((life >= 20) & (life < 50)).sum(),
            ((life >= 50) & (life < 70)).sum(),
            (life >= 70).sum(),
        ]

        # Band labels, in the same order as the counts
        labels = ['<20', '20-50', '50-70', '>70']

        plt.pie(counts, labels=labels, autopct='%1.1f%%')

    plt.title(col)
    plt.show()

# Global summary (summary of entire dataset)

In [None]:
# Summary statistics for the numeric columns of the entire dataset (global values).
# Left as the last expression so the notebook renders the resulting table.
df.describe()

In [None]:
# Countries with the lowest and highest life expectancy in the whole dataset.
# Series.min()/max() are used instead of the built-in min()/max(): they skip NaN
# (the built-ins give order-dependent results when NaN is present) and each
# extreme is computed once instead of once per lookup.
life_exp = df['Life expectancy ']
lowest_rows = df[life_exp == life_exp.min()]
highest_rows = df[life_exp == life_exp.max()]

print("Country with lowest life expectancy", lowest_rows['Country'],
      "\n", lowest_rows['Life expectancy '])

print("Country with highest life expectancy", highest_rows['Country'],
      "\n", highest_rows['Life expectancy '])

In [None]:
# Median, mode and interquartile range of life expectancy (whole dataset)
global_life = df['Life expectancy ']
print("Median: ", global_life.median())
print("Mode: ", global_life.mode())

# IQR = 75th percentile minus 25th percentile
q1, q3 = global_life.quantile(0.25), global_life.quantile(0.75)
iqr = q3 - q1
print("IQR: ", iqr)

In [None]:
# Convert the categorical Status column to integer codes so it can take part
# in the correlation matrix (LabelEncoder assigns codes in sorted label order).
le = LabelEncoder()
df['Status'] = le.fit_transform(df['Status'])

# Correlation matrix over the numeric columns only: the text column Country
# cannot be correlated and makes DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = df.select_dtypes(include='number').corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Names of developed countries (Status code 0), followed by how many there are
is_developed = df['Status'] == 0
d = df.loc[is_developed, 'Country'].unique()
print(d, len(d))

In [None]:
# Scatter plots of every attribute against life expectancy, coloured by
# life-expectancy band.
life = df['Life expectancy ']
dless_than_20 = df[life < 20]
dbetween_20_50 = df[(life >= 20) & (life < 50)]
dbetween_50_70 = df[(life >= 50) & (life < 70)]
dgreater_than_70 = df[life >= 70]

# Legend label paired with the rows of each band
# (the third label previously read '50<=age70', missing the '<')
bands = [
    ('age<20', dless_than_20),
    ('20<=age<50', dbetween_20_50),
    ('50<=age<70', dbetween_50_70),
    ('age>=70', dgreater_than_70),
]

for col in cat_cols:
    if col == 'Life expectancy ':
        continue
    plt.figure(figsize=(8, 6))
    for band_label, band_rows in bands:
        plt.scatter(band_rows[col], band_rows['Life expectancy '], label=band_label)
    plt.title("{} vs Life expectancy(age)".format(col))
    plt.xlabel(col)
    plt.ylabel("Life expectancy")
    plt.legend()
    plt.show()


In [None]:
# Distribution (kernel density estimate) of every attribute except life expectancy.
# `fill=True` replaces the deprecated `shade=True`, and the title / y-axis label now
# describe what is drawn: the density of the column itself, not life expectancy.
for col in cat_cols:
    if col == 'Life expectancy ':
        continue
    plt.figure(figsize=(8, 5))
    sns.kdeplot(df[col], fill=True)
    plt.title("Distribution of {}".format(col))
    plt.xlabel(col)
    plt.ylabel("Density")
    plt.show()

In [None]:
#Central Tendency
#group the data by country (one group per distinct value of the Country column)
grouped_by_country = df.groupby('Country')

#Calculate the mean, median, and mode for each attribute
#The result has one row per country and, for every remaining column,
#one sub-column per aggregation function.
#NOTE(review): pd.Series.mode can return more than one value for a group -
#confirm the aggregate behaves as intended when a country has several modes.
central_tendency = grouped_by_country.aggregate(['mean', 'median', pd.Series.mode])

#resulting DataFrame (printed in full as plain text)
print(central_tendency)

#to print for specific country
#print(central_tendency.loc['India'])


# Continent-wise Split

In [None]:
!pip install pycountry-convert


In [None]:
# Continent-wise split: look up the continent of every country, then collect
# the country names into one list per continent.
import pycountry_convert as pc

# Dataset spellings that are replaced by a shorter name before the lookup
name_aliases = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Iran (Islamic Republic of)': 'Iran',
    'Micronesia (Federated States of)': 'Micronesia',
    'Republic of Korea': 'South Korea',
    'The former Yugoslav republic of Macedonia': 'North Macedonia',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
}

# continents[i] is the continent name of countries[i]
continents = []
for country in countries:
    if country == 'Timor-Leste':
        # assigned by hand instead of going through the lookup
        continents.append('Asia')
        continue

    lookup_name = name_aliases.get(country, country)
    country_alpha2 = pc.country_name_to_country_alpha2(lookup_name)
    continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
    continents.append(pc.convert_continent_code_to_continent_name(continent_code))

africa, asia, europe, namerica, samerica, australia = [], [], [], [], [], []

# Continent name -> list that collects its countries (Oceania goes to `australia`)
members_of = {
    'Asia': asia,
    'Africa': africa,
    'Europe': europe,
    'North America': namerica,
    'South America': samerica,
    'Oceania': australia,
}
for country_name, continent in zip(countries, continents):
    bucket = members_of.get(continent)
    if bucket is not None:
        bucket.append(country_name)

print(asia, africa, europe, australia, namerica, samerica, sep='\n')

In [None]:
# Dataset of each continent.
# Each subset is an explicit copy: later cells assign into these frames, and
# assigning into a filtered view of `df` triggers pandas chained-assignment warnings.

#dataset of African countries
africa_df = df[df['Country'].isin(africa)].copy()

#dataset of Asian countries
asia_df = df[df['Country'].isin(asia)].copy()

#dataset of Australian (Oceania) countries
aus_df = df[df['Country'].isin(australia)].copy()

#dataset of European countries
eu_df = df[df['Country'].isin(europe)].copy()

#dataset of North American countries
na_df = df[df['Country'].isin(namerica)].copy()

#dataset of South American countries
sa_df = df[df['Country'].isin(samerica)].copy()

#continent dataframe list
continent_dfs = [africa_df, asia_df, eu_df, na_df, sa_df, aus_df]

#list of continent names, in the same order as continent_dfs
continents = ['Africa', 'Asia', 'Europe', 'North America', 'South America', 'Australia']

In [None]:
# Mean life expectancy of each continent, in the order of continent_dfs.
# The loop variable is deliberately NOT called `df`: reusing that name would
# rebind the global dataset to the last continent frame for every later cell.
mean_values = [continent_df['Life expectancy '].mean() for continent_df in continent_dfs]

plt.figure(figsize=(8, 6))
# the continent names are used directly as the bar categories
sns.barplot(x=continents, y=mean_values, palette='muted')
plt.title('Mean Life Expectancy by Continent')
plt.xlabel('Continent')
plt.ylabel('Mean Life Expectancy')
plt.show()


In [None]:
# One-way ANOVA: does mean life expectancy differ significantly between continents?
from scipy import stats

# One sample of life-expectancy values per continent
life_samples = [
    frame['Life expectancy ']
    for frame in (africa_df, asia_df, eu_df, na_df, sa_df, aus_df)
]
f_value, p_value = stats.f_oneway(*life_samples)

print("F-value:", f_value)
print("p-value:", p_value)

# 0.05 is the significance threshold
verdict = (
    "There is a significant difference between at least one of the continents."
    if p_value < 0.05
    else "There is no significant difference between the continents."
)
print(verdict)


In [None]:
#ML model: random forest regression of life expectancy on selected attributes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score

# Status is label-encoded earlier in the notebook. Encode here only if it is still
# text: re-fitting an encoder on an already encoded column is redundant, and on a
# frame holding a single status value it would remap that value to 0.
le = LabelEncoder()
if not pd.api.types.is_numeric_dtype(df['Status']):
    df['Status'] = le.fit_transform(df['Status'])

X = df[['Status', 'Adult Mortality', ' HIV/AIDS', ' thinness  1-19 years', ' thinness 5-9 years',
        'Schooling', 'Income composition of resources', ' BMI ', 'GDP', 'Diphtheria ', 'Polio']]
y = df['Life expectancy ']

# 80/20 train/test split with a fixed seed so the result is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# r2_score is a goodness-of-fit score (higher is better), not an error
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

#create dataframe with actual, predicted, and country names
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Country': df.loc[X_test.index, 'Country']})

#filter predictions where predicted life expectancy is less than 65
low_expectancy_predictions = predictions_df[predictions_df['Predicted'] < 65]

print("Predicted countries with life expectancy < 65:", low_expectancy_predictions)
print("\nActual countries:")
print(df[df['Life expectancy '] < 65][['Country', 'Life expectancy ']])
print(df[df['Life expectancy '] < 65]['Country'].unique())

# Hard-coded list of countries with a life expectancy below 65.
# NOTE(review): presumably copied from the output of the unique() print above -
# regenerate it if the data or the cleaning step changes.
l = ['Afghanistan', 'Angola', 'Benin', 'Bhutan', 'Bolivia (Plurinational State of)', 'Botswana',
     'Burkina Faso', 'Burundi', "Côte d'Ivoire", 'Cambodia', 'Cameroon', 'Central African Republic',
     'Chad', 'Comoros', 'Congo', 'Democratic Republic of the Congo', 'Djibouti', 'Equatorial Guinea',
     'Eritrea', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Haiti', 'India',
     'Iraq', 'Kazakhstan', 'Kenya', 'Kiribati', "Lao People's Democratic Republic", 'Lesotho', 'Liberia',
     'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mongolia', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru',
     'Nepal', 'Niger', 'Nigeria', 'Niue', 'Pakistan', 'Papua New Guinea', 'Russian Federation', 'Rwanda',
     'Saint Kitts and Nevis', 'Sao Tome and Principe', 'Senegal', 'Sierra Leone', 'Somalia', 'South Africa',
     'South Sudan', 'Sudan', 'Swaziland', 'Syrian Arab Republic', 'Tajikistan', 'Timor-Leste', 'Togo',
     'Turkmenistan', 'Tuvalu', 'Uganda', 'United Republic of Tanzania', 'Yemen', 'Zambia', 'Zimbabwe']

# Continent label paired with its member list, in the order the counts are reported.
# This is kept in its own variable: rebinding the global `continents` here would put
# it out of step with `continent_dfs` and mislabel the continent plots that follow.
continent_members = [
    ('Africa', africa),
    ('Asia', asia),
    ('Oceania', australia),
    ('Europe', europe),
    ('North America', namerica),
    ('South America', samerica),
]

# cnt[i] = number of listed countries belonging to continent_members[i]
cnt = [0] * len(continent_members)
for c in l:
    for position, (continent_label, members) in enumerate(continent_members):
        if c in members:
            cnt[position] += 1
            print(c + ' belongs to ' + continent_label)
            break

print("Total countries with life expectancy less than 65 years: ", len(l))
for (continent_label, _), count in zip(continent_members, cnt):
    print(continent_label, ': ', count)

In [None]:
# Continent-wise share of developed vs. developing rows.
# Each frame is paired with its name explicitly: the global `continents` list is
# reassigned in a different order by the ML cell, so indexing it in step with
# `continent_dfs` would title the charts with the wrong continent.
named_continent_dfs = [
    ('Africa', africa_df),
    ('Asia', asia_df),
    ('Europe', eu_df),
    ('North America', na_df),
    ('South America', sa_df),
    ('Oceania', aus_df),
]

# Readable slice labels for the label-encoded Status codes (0 = Developed, 1 = Developing)
status_names = {0: 'Developed', 1: 'Developing'}

for continent_name, continent_df in named_continent_dfs:
    status_counts = continent_df['Status'].value_counts()
    slice_labels = [status_names.get(code, code) for code in status_counts.index]

    plt.figure()
    plt.title(f'{continent_name} - Developed vs. Developing')
    plt.pie(status_counts, labels=slice_labels, autopct='%1.1f%%')
    plt.show()


In [None]:
# Continent-wise scatter plots of every attribute against life expectancy.
# Frames are paired with their names explicitly (the global `continents` list is
# reassigned in a different order by the ML cell), and the loop variable is not
# called `df`, which would rebind the global dataset to the last continent frame.
named_continent_dfs = [
    ('Africa', africa_df),
    ('Asia', asia_df),
    ('Europe', eu_df),
    ('North America', na_df),
    ('South America', sa_df),
    ('Oceania', aus_df),
]

for col in cat_cols:
    if col == 'Life expectancy ':
        continue
    plt.figure(figsize=(8, 6))

    for continent, continent_df in named_continent_dfs:
        plt.scatter(continent_df[col], continent_df['Life expectancy '], label=continent)

    plt.title("{} vs Life Expectancy (Continent-wise)".format(col))
    plt.xlabel(col)
    plt.ylabel("Life Expectancy")
    plt.legend()
    plt.show()


# 1) Africa



In [None]:
# Summary statistics for the numeric columns of the Africa subset
africa_df.describe()

In [None]:
#Basic details
print("Number of African countries: ", len(africa))

# Countries with the lowest and highest life expectancy.
# Series.min()/max() skip NaN (the built-in min()/max() are order-dependent when
# NaN is present) and each extreme is computed only once.
africa_life = africa_df['Life expectancy ']
lowest_rows = africa_df[africa_life == africa_life.min()]
highest_rows = africa_df[africa_life == africa_life.max()]

print("Country with lowest life expectancy", lowest_rows['Country'],
      "\n", lowest_rows['Life expectancy '])

print("Country with highest life expectancy", highest_rows['Country'],
      "\n", highest_rows['Life expectancy '])

In [None]:
# Median, mode and interquartile range of life expectancy (Africa)
africa_life = africa_df['Life expectancy ']
print("Median: ", africa_life.median())
print("Mode: ", africa_life.mode())

# IQR = 75th percentile minus 25th percentile
q1, q3 = africa_life.quantile(0.25), africa_life.quantile(0.75)
iqr = q3 - q1
print("IQR: ", iqr)

In [None]:
# Correlation matrix for Africa.
# Status is dropped because it is 1 (developing) for every African country.
cafrica = africa_df.drop(['Status'], axis=1)

# Correlate numeric columns only: the text column Country makes
# DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = cafrica.select_dtypes(include='number').corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Interactive time-series plot of every attribute over the years, one line per country.
# The rows are sorted by Year once, outside the loop, so each line is drawn in
# chronological order. (The disabled matplotlib variant that used to follow as a
# trailing string literal was removed: as the last expression of the cell it was
# echoed into the output.)
africa_by_year = africa_df.sort_values(by='Year')

for attr in cat_cols:
    fig = px.line(
        africa_by_year, x='Year', y=attr, color='Country', markers=True,
        title='<b> Country-wise {} over Years'.format(attr),
    )
    fig.show()

In [None]:
# Pivot-table heatmap for every attribute except Status.
# Rows are countries, columns are years, cells hold the attribute value; this is
# easier to read than the time-series plot, which gets crowded.
for attr in cat_cols:
    if attr == 'Status':
        continue
    africa_pivot = africa_df.pivot(index='Country', columns='Year', values=attr)

    # annotated heatmap of the pivot table, values shown with one decimal
    plt.figure(figsize=(15, 15))
    sns.heatmap(africa_pivot, cmap='YlGnBu', annot=True, fmt=".1f")
    plt.title('{} in African Countries from 2000 to 2015'.format(attr))
    plt.show()


In [None]:
# Distribution (kernel density estimate) of every attribute except life expectancy
# and Status. `fill=True` replaces the deprecated `shade=True`, and the title /
# y-axis label now describe what is drawn: the density of the column itself.
for col in cat_cols:
    if col in ('Life expectancy ', 'Status'):
        continue
    plt.figure(figsize=(8, 5))
    sns.kdeplot(africa_df[col], fill=True)
    plt.title("Distribution of {}".format(col))
    plt.xlabel(col)
    plt.ylabel("Density")
    plt.show()

In [None]:
# Scatter plots of every attribute against life expectancy for Africa,
# coloured by life-expectancy band.
africa_life = africa_df['Life expectancy ']
dless_than_20 = africa_df[africa_life < 20]
dbetween_20_50 = africa_df[(africa_life >= 20) & (africa_life < 50)]
dbetween_50_70 = africa_df[(africa_life >= 50) & (africa_life < 70)]
dgreater_than_70 = africa_df[africa_life >= 70]

# Legend label paired with the rows of each band
# (the third label previously read '50<=age70', missing the '<')
bands = [
    ('age<20', dless_than_20),
    ('20<=age<50', dbetween_20_50),
    ('50<=age<70', dbetween_50_70),
    ('age>=70', dgreater_than_70),
]

for col in cat_cols:
    if col in ('Life expectancy ', 'Status'):
        continue
    plt.figure(figsize=(8, 6))
    for band_label, band_rows in bands:
        plt.scatter(band_rows[col], band_rows['Life expectancy '], label=band_label)
    plt.title("{} vs Life expectancy(age)".format(col))
    plt.xlabel(col)
    plt.ylabel("Life expectancy")
    plt.legend()
    plt.show()

In [None]:
# One boxplot per attribute showing its yearly distribution in Africa; Status is skipped
for column in cat_cols:
    if column == 'Status':
        continue
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=africa_df, x='Year', y=column)
    plt.title('Distribution of {} by Year'.format(column))
    plt.show()

# 2) Asia

In [None]:
# Summary statistics for the numeric columns of the Asia subset
asia_df.describe()

In [None]:
#Basic details
print("Number of Asian countries: ", len(asia))

# Status is label-encoded earlier in the notebook (0 = Developed), so comparing it
# with the text 'Developed' alone matches nothing. Accept both representations.
asia_status = asia_df['Status']
developed_rows = asia_df[(asia_status == 'Developed') | (asia_status == 0)]
print("Developed countries: ", developed_rows['Country'].unique())

# Countries with the lowest and highest life expectancy.
# Series.min()/max() skip NaN, unlike the built-in min()/max().
asia_life = asia_df['Life expectancy ']
lowest_rows = asia_df[asia_life == asia_life.min()]
highest_rows = asia_df[asia_life == asia_life.max()]

print("Country with lowest life expectancy", lowest_rows['Country'],
      "\n", lowest_rows['Life expectancy '])

print("Country with highest life expectancy", highest_rows['Country'],
      "\n", highest_rows['Life expectancy '])

In [None]:
# Median, mode and interquartile range of life expectancy (Asia)
asia_life = asia_df['Life expectancy ']
print("Median: ", asia_life.median())
print("Mode: ", asia_life.mode())

# IQR = 75th percentile minus 25th percentile
q1, q3 = asia_life.quantile(0.25), asia_life.quantile(0.75)
iqr = q3 - q1
print("IQR: ", iqr)

In [None]:
# Correlation matrix for Asia.
# Work on a copy so the continent frame is not modified (assigning into a filtered
# subset of `df` triggers chained-assignment warnings), and encode Status only if it
# is still text - it is already label-encoded earlier in the notebook.
le = LabelEncoder()
asia_numeric = asia_df.copy()
if not pd.api.types.is_numeric_dtype(asia_numeric['Status']):
    asia_numeric['Status'] = le.fit_transform(asia_numeric['Status'])

# Correlate numeric columns only: the text column Country makes
# DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = asia_numeric.select_dtypes(include='number').corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Interactive time-series plot of every attribute over the years, one line per country.
# Rows are sorted by Year once, outside the loop.
asia_by_year = asia_df.sort_values(by='Year')

for attr in cat_cols:
    fig = px.line(
        asia_by_year, x='Year', y=attr, color='Country', markers=True,
        title='<b> Country-wise {} over Years'.format(attr),
    )
    fig.show()

In [None]:
# Pivot-table heatmap (countries x years) for every attribute except Status
for attr in cat_cols:
    if attr == 'Status':
        continue
    asia_pivot = asia_df.pivot(index='Country', columns='Year', values=attr)

    # annotated heatmap of the pivot table, values shown with one decimal
    plt.figure(figsize=(15, 15))
    sns.heatmap(asia_pivot, cmap='YlGnBu', annot=True, fmt=".1f")
    plt.title('{} in Asian Countries from 2000 to 2015'.format(attr))
    plt.show()

In [None]:
# One boxplot per attribute showing its yearly distribution in Asia; Status is skipped
for column in cat_cols:
    if column == 'Status':
        continue
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=asia_df, x='Year', y=column, palette='Spectral')
    plt.title('Distribution of {} by Year'.format(column))
    plt.show()


# 3) Oceania

In [None]:
# Summary statistics for the numeric columns of the Oceania subset
aus_df.describe()

In [None]:
#Basic details
print("Number of Australian countries: ", len(australia))

# Status is label-encoded earlier in the notebook (0 = Developed), so comparing it
# with the text 'Developed' alone matches nothing. Accept both representations.
aus_status = aus_df['Status']
developed_rows = aus_df[(aus_status == 'Developed') | (aus_status == 0)]
print("Developed countries: ", developed_rows['Country'].unique())

# Countries with the lowest and highest life expectancy.
# Series.min()/max() skip NaN, unlike the built-in min()/max().
aus_life = aus_df['Life expectancy ']
lowest_rows = aus_df[aus_life == aus_life.min()]
highest_rows = aus_df[aus_life == aus_life.max()]

print("Country with lowest life expectancy", lowest_rows['Country'],
      "\n", lowest_rows['Life expectancy '])

print("Country with highest life expectancy", highest_rows['Country'],
      "\n", highest_rows['Life expectancy '])

In [None]:
# Median, mode and interquartile range of life expectancy (Oceania)
aus_life = aus_df['Life expectancy ']
print("Median: ", aus_life.median())
print("Mode: ", aus_life.mode())

# IQR = 75th percentile minus 25th percentile
q1, q3 = aus_life.quantile(0.25), aus_life.quantile(0.75)
iqr = q3 - q1
print("IQR: ", iqr)

In [None]:
# Correlation matrix for Oceania.
# Work on a copy so the continent frame is not modified (assigning into a filtered
# subset of `df` triggers chained-assignment warnings), and encode Status only if it
# is still text - it is already label-encoded earlier in the notebook.
le = LabelEncoder()
aus_numeric = aus_df.copy()
if not pd.api.types.is_numeric_dtype(aus_numeric['Status']):
    aus_numeric['Status'] = le.fit_transform(aus_numeric['Status'])

# Correlate numeric columns only: the text column Country makes
# DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = aus_numeric.select_dtypes(include='number').corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Interactive time-series plot of every attribute over the years, one line per country.
# Rows are sorted by Year once, outside the loop.
aus_by_year = aus_df.sort_values(by='Year')

for attr in cat_cols:
    fig = px.line(
        aus_by_year, x='Year', y=attr, color='Country', markers=True,
        title='<b> Country-wise {} over Years'.format(attr),
    )
    fig.show()

In [None]:
# Pivot-table heatmap (countries x years) for every attribute except Status
for attr in cat_cols:
    if attr == 'Status':
        continue
    aus_pivot = aus_df.pivot(index='Country', columns='Year', values=attr)

    # annotated heatmap of the pivot table, values shown with one decimal
    plt.figure(figsize=(15, 15))
    sns.heatmap(aus_pivot, cmap='YlGnBu', annot=True, fmt=".1f")
    plt.title('{} in Australian Countries from 2000 to 2015'.format(attr))
    plt.show()

In [None]:
# One boxplot per attribute showing its yearly distribution in Oceania; Status is skipped
for column in cat_cols:
    if column == 'Status':
        continue
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=aus_df, x='Year', y=column, palette='Greens')
    plt.title('Distribution of {} by Year'.format(column))
    plt.show()

# 4) Europe

In [None]:
# Summary statistics for the numeric columns of the Europe subset
eu_df.describe()

In [None]:
#Basic details
print("Number of European countries: ", len(europe))

# Status is label-encoded earlier in the notebook (0 = Developed), so comparing it
# with the text 'Developed' alone matches nothing. Accept both representations.
eu_status = eu_df['Status']
developed_rows = eu_df[(eu_status == 'Developed') | (eu_status == 0)]
print("Developed countries: ", developed_rows['Country'].unique())

# Countries with the lowest and highest life expectancy.
# Series.min()/max() skip NaN, unlike the built-in min()/max().
eu_life = eu_df['Life expectancy ']
lowest_rows = eu_df[eu_life == eu_life.min()]
highest_rows = eu_df[eu_life == eu_life.max()]

print("Country with lowest life expectancy", lowest_rows['Country'],
      "\n", lowest_rows['Life expectancy '])

print("Country with highest life expectancy", highest_rows['Country'],
      "\n", highest_rows['Life expectancy '])

In [None]:
# Median, mode and interquartile range of life expectancy (Europe)
eu_life = eu_df['Life expectancy ']
print("Median: ", eu_life.median())
print("Mode: ", eu_life.mode())

# IQR = 75th percentile minus 25th percentile
q1, q3 = eu_life.quantile(0.25), eu_life.quantile(0.75)
iqr = q3 - q1
print("IQR: ", iqr)

In [None]:
# Correlation matrix for Europe.
# Work on a copy so the continent frame is not modified (assigning into a filtered
# subset of `df` triggers chained-assignment warnings), and encode Status only if it
# is still text - it is already label-encoded earlier in the notebook.
le = LabelEncoder()
eu_numeric = eu_df.copy()
if not pd.api.types.is_numeric_dtype(eu_numeric['Status']):
    eu_numeric['Status'] = le.fit_transform(eu_numeric['Status'])

# Correlate numeric columns only: the text column Country makes
# DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = eu_numeric.select_dtypes(include='number').corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Interactive time-series plot of every attribute over the years, one line per country.
# Rows are sorted by Year once, outside the loop.
eu_by_year = eu_df.sort_values(by='Year')

for attr in cat_cols:
    fig = px.line(
        eu_by_year, x='Year', y=attr, color='Country', markers=True,
        title='<b> Country-wise {} over Years'.format(attr),
    )
    fig.show()

In [None]:
# Pivot-table heatmap (countries x years) for every attribute except Status
for attr in cat_cols:
    if attr == 'Status':
        continue
    eu_pivot = eu_df.pivot(index='Country', columns='Year', values=attr)

    # annotated heatmap of the pivot table, values shown with one decimal
    plt.figure(figsize=(15, 15))
    sns.heatmap(eu_pivot, cmap='YlGnBu', annot=True, fmt=".1f")
    plt.title('{} in European Countries from 2000 to 2015'.format(attr))
    plt.show()

In [None]:
# One boxplot per attribute showing its yearly distribution in Europe; Status is skipped
for column in cat_cols:
    if column == 'Status':
        continue
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=eu_df, x='Year', y=column, palette='flare')
    plt.title('Distribution of {} by Year'.format(column))
    plt.show()

# 5) North America

In [None]:
# Summary statistics for the numeric columns of the North America subset
na_df.describe()

In [None]:
#Basic details
print("Number of North American countries: ", len(namerica))

# Status is label-encoded earlier in the notebook (0 = Developed), so comparing it
# with the text 'Developed' alone matches nothing. Accept both representations.
na_status = na_df['Status']
developed_rows = na_df[(na_status == 'Developed') | (na_status == 0)]
print("Developed countries: ", developed_rows['Country'].unique())

# Countries with the lowest and highest life expectancy.
# Series.min()/max() skip NaN, unlike the built-in min()/max().
na_life = na_df['Life expectancy ']
lowest_rows = na_df[na_life == na_life.min()]
highest_rows = na_df[na_life == na_life.max()]

print("Country with lowest life expectancy", lowest_rows['Country'],
      "\n", lowest_rows['Life expectancy '])

print("Country with highest life expectancy", highest_rows['Country'],
      "\n", highest_rows['Life expectancy '])

In [None]:
# Median, mode and interquartile range of life expectancy (North America)
na_life = na_df['Life expectancy ']
print("Median: ", na_life.median())
print("Mode: ", na_life.mode())

# IQR = 75th percentile minus 25th percentile
q1, q3 = na_life.quantile(0.25), na_life.quantile(0.75)
iqr = q3 - q1
print("IQR: ", iqr)

In [None]:
# Correlation matrix for North America.
# Work on a copy so the continent frame is not modified (assigning into a filtered
# subset of `df` triggers chained-assignment warnings), and encode Status only if it
# is still text - it is already label-encoded earlier in the notebook.
le = LabelEncoder()
na_numeric = na_df.copy()
if not pd.api.types.is_numeric_dtype(na_numeric['Status']):
    na_numeric['Status'] = le.fit_transform(na_numeric['Status'])

# Correlate numeric columns only: the text column Country makes
# DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = na_numeric.select_dtypes(include='number').corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Interactive time-series plot of every attribute over the years, one line per country.
# Rows are sorted by Year once, outside the loop.
na_by_year = na_df.sort_values(by='Year')

for attr in cat_cols:
    fig = px.line(
        na_by_year, x='Year', y=attr, color='Country', markers=True,
        title='<b> Country-wise {} over Years'.format(attr),
    )
    fig.show()

In [None]:
# Pivot-table heatmap (countries x years) for every attribute except Status
for attr in cat_cols:
    if attr == 'Status':
        continue
    na_pivot = na_df.pivot(index='Country', columns='Year', values=attr)

    # annotated heatmap of the pivot table, values shown with one decimal
    plt.figure(figsize=(15, 15))
    sns.heatmap(na_pivot, cmap='YlGnBu', annot=True, fmt=".1f")
    plt.title('{} in North American Countries from 2000 to 2015'.format(attr))
    plt.show()

In [None]:
# One boxplot per attribute showing its yearly distribution in North America; Status is skipped
for column in cat_cols:
    if column == 'Status':
        continue
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=na_df, x='Year', y=column, palette="Reds")
    plt.title('Distribution of {} by Year'.format(column))
    plt.show()

# 6) South America

In [None]:
# Summary statistics for the numeric columns of the South America subset
sa_df.describe()

In [None]:
#Basic details
print("Number of South American countries: ", len(samerica))

# Status is label-encoded earlier in the notebook (0 = Developed), so comparing it
# with the text 'Developed' alone matches nothing. Accept both representations.
sa_status = sa_df['Status']
developed_rows = sa_df[(sa_status == 'Developed') | (sa_status == 0)]
print("Developed countries: ", developed_rows['Country'].unique())

# Countries with the lowest and highest life expectancy.
# Series.min()/max() skip NaN, unlike the built-in min()/max().
sa_life = sa_df['Life expectancy ']
lowest_rows = sa_df[sa_life == sa_life.min()]
highest_rows = sa_df[sa_life == sa_life.max()]

print("Country with lowest life expectancy", lowest_rows['Country'],
      "\n", lowest_rows['Life expectancy '])

print("Country with highest life expectancy", highest_rows['Country'],
      "\n", highest_rows['Life expectancy '])

In [None]:
# Median, mode and interquartile range of life expectancy (South America)
sa_life = sa_df['Life expectancy ']
print("Median: ", sa_life.median())
print("Mode: ", sa_life.mode())

# IQR = 75th percentile minus 25th percentile
q1, q3 = sa_life.quantile(0.25), sa_life.quantile(0.75)
iqr = q3 - q1
print("IQR: ", iqr)

In [None]:
# Correlation matrix for South America.
# Status is dropped because the number of developed countries is 0.
csa = sa_df.drop(['Status'], axis=1)

# Correlate numeric columns only: the text column Country makes
# DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = csa.select_dtypes(include='number').corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Interactive time-series plot of every attribute over the years, one line per country.
# Rows are sorted by Year once, outside the loop.
sa_by_year = sa_df.sort_values(by='Year')

for attr in cat_cols:
    fig = px.line(
        sa_by_year, x='Year', y=attr, color='Country', markers=True,
        title='<b> Country-wise {} over Years'.format(attr),
    )
    fig.show()

In [None]:
# Pivot-table heatmap (countries x years) for every attribute except Status
for attr in cat_cols:
    if attr == 'Status':
        continue
    sa_pivot = sa_df.pivot(index='Country', columns='Year', values=attr)

    # annotated heatmap of the pivot table, values shown with one decimal
    plt.figure(figsize=(15, 15))
    sns.heatmap(sa_pivot, cmap='YlGnBu', annot=True, fmt=".1f")
    plt.title('{} in South American Countries from 2000 to 2015'.format(attr))
    plt.show()

In [None]:
# One boxplot per attribute showing its yearly distribution in South America; Status is skipped
for column in cat_cols:
    if column == 'Status':
        continue
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=sa_df, x='Year', y=column, palette="Blues")
    plt.title('Distribution of {} by Year'.format(column))
    plt.show()

# Descriptive ML Models

# 1) Global

In [None]:
# Linear relationship between positively correlated variables.
# Each entry maps the predictor column (x-axis) to the response column (y-axis).
p = {
    'infant deaths': 'under-five deaths ',
    ' thinness  1-19 years': ' thinness 5-9 years',
    'percentage expenditure': 'GDP',
    'Income composition of resources': 'Schooling',
}

for x_name, y_name in p.items():
    # scikit-learn expects 2-D arrays, hence the reshape to a single column
    d1 = df[x_name].to_numpy().reshape(-1, 1)
    d2 = df[y_name].to_numpy().reshape(-1, 1)

    # fit the line, then evaluate it at the observed x values
    linear_regressor = LinearRegression()
    linear_regressor.fit(d1, d2)
    pred = linear_regressor.predict(d1)

    plt.scatter(d1, d2)
    plt.plot(d1, pred, color='purple')
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title('{} VS {}'.format(x_name, y_name))
    plt.show()

In [None]:
# Logistic regression of Status on Alcohol
d1 = df['Alcohol'].values
d2 = df['Status'].values

# scikit-learn expects a 2-D feature array, hence the single-column reshape
alcohol_feature = d1.reshape(-1, 1)
logis = LogisticRegression()
logis.fit(alcohol_feature, d2)

# Evaluate the fitted curve on an evenly spaced grid over the observed range
d1_range = np.arange(d1.min(), d1.max(), 0.1)
grid_feature = d1_range.reshape(-1, 1)
#[:, 1] gives the predicted probabilities for class label 1 (Developing).
prob = logis.predict_proba(grid_feature)[:, 1]

plt.plot(d1_range, prob, color='red')
plt.scatter(d1, d2)
plt.xlabel('Alcohol')
plt.ylabel('Probability of Status(0-Developed, 1-Developing)')
plt.show()

# 2) Africa

In [None]:
# Linear relationship between positively correlated variables (Africa).
# Each entry maps the predictor column (x-axis) to the response column (y-axis).
p = {
    'infant deaths': 'under-five deaths ',
    ' thinness  1-19 years': ' thinness 5-9 years',
    'Income composition of resources': 'Schooling',
    'Diphtheria ': 'Polio',
}

for x_name, y_name in p.items():
    # scikit-learn expects 2-D arrays, hence the reshape to a single column
    d1 = africa_df[x_name].to_numpy().reshape(-1, 1)
    d2 = africa_df[y_name].to_numpy().reshape(-1, 1)

    # fit the line, then evaluate it at the observed x values
    linear_regressor = LinearRegression()
    linear_regressor.fit(d1, d2)
    pred = linear_regressor.predict(d1)

    plt.scatter(d1, d2)
    plt.plot(d1, pred, color='purple')
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title('{} VS {}'.format(x_name, y_name))
    plt.show()

# 3) Asia

In [None]:
# Linear relationship between positively correlated variables (Asia).
# Each entry maps the predictor column (x-axis) to the response column (y-axis).
p = {
    'infant deaths': 'under-five deaths ',
    ' thinness  1-19 years': ' thinness 5-9 years',
    'GDP': 'percentage expenditure',
}

for x_name, y_name in p.items():
    # scikit-learn expects 2-D arrays, hence the reshape to a single column
    d1 = asia_df[x_name].to_numpy().reshape(-1, 1)
    d2 = asia_df[y_name].to_numpy().reshape(-1, 1)

    # fit the line, then evaluate it at the observed x values
    linear_regressor = LinearRegression()
    linear_regressor.fit(d1, d2)
    pred = linear_regressor.predict(d1)

    plt.scatter(d1, d2)
    plt.plot(d1, pred, color='purple')
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title('{} VS {}'.format(x_name, y_name))
    plt.show()

In [None]:
# Logistic regression of Status on percentage expenditure (Asia)
d1 = asia_df['percentage expenditure'].values
d2 = asia_df['Status'].values

# scikit-learn expects a 2-D feature array, hence the single-column reshape
expenditure_feature = d1.reshape(-1, 1)
logis = LogisticRegression()
logis.fit(expenditure_feature, d2)

# Evaluate the fitted curve on an evenly spaced grid over the observed range;
# column 1 of predict_proba is the probability of the second class
d1_range = np.arange(d1.min(), d1.max(), 0.1)
grid_feature = d1_range.reshape(-1, 1)
prob = logis.predict_proba(grid_feature)[:, 1]

plt.plot(d1_range, prob, color='red')
plt.scatter(d1, d2)
plt.xlabel('Percentage expenditure')
plt.ylabel('Probability of Status(0-Developed, 1-Developing)')
plt.show()

In [None]:
# Logistic regression of Status on life expectancy (Asia)
d1 = asia_df['Life expectancy '].values
d2 = asia_df['Status'].values

# scikit-learn expects a 2-D feature array, hence the single-column reshape
life_feature = d1.reshape(-1, 1)
logis = LogisticRegression()
logis.fit(life_feature, d2)

# Evaluate the fitted curve on an evenly spaced grid over the observed range;
# column 1 of predict_proba is the probability of the second class
d1_range = np.arange(d1.min(), d1.max(), 0.1)
grid_feature = d1_range.reshape(-1, 1)
prob = logis.predict_proba(grid_feature)[:, 1]

plt.plot(d1_range, prob, color='red')
plt.scatter(d1, d2)
plt.xlabel('Life expectancy ')
plt.ylabel('Probability of Status(0-Developed, 1-Developing)')
plt.show()

# 4) Australia (Oceania)

In [None]:
#Linear relationship between positively correlated variables
# The pairs are stored as a list of (x column, y column) tuples rather than a
# dict: ' HIV/AIDS' is paired with two different y columns, and a dict literal
# keeps only the last value for a repeated key, so the
# ' HIV/AIDS' -> 'under-five deaths ' plot was silently dropped.
p=[('infant deaths','under-five deaths '),
   (' thinness  1-19 years',' thinness 5-9 years'),
   (' HIV/AIDS','under-five deaths '),
   (' HIV/AIDS','infant deaths'),
   ('GDP','percentage expenditure'),
   ('Alcohol','Schooling'),
   ('Income composition of resources','Schooling')]

for x_col, y_col in p:
    # scikit-learn estimators take 2-D arrays, hence the (n, 1) reshape
    d1=aus_df[x_col].values.reshape(-1, 1)
    d2=aus_df[y_col].values.reshape(-1, 1)

    linear_regressor = LinearRegression()
    linear_regressor.fit(d1, d2)  #linear regression
    pred = linear_regressor.predict(d1)  # predict on the same x values to draw the fitted line

    plt.scatter(d1, d2)
    plt.plot(d1, pred, color='purple')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('{} VS {}'.format(x_col,y_col))
    plt.show()

In [None]:
#Logistic regression
# For each predictor column, fit a one-feature logistic model of Status on the
# Australia subset and draw the fitted probability curve over the raw points.
n=['Alcohol','Schooling','GDP','percentage expenditure']

# The target does not depend on the predictor, so read it once outside the loop.
d2 = aus_df['Status'].values

for c in n:
    d1 = aus_df[c].values

    logis = LogisticRegression()
    logis.fit(d1.reshape(-1, 1), d2)  # the estimator needs a 2-D feature matrix

    # Evaluate the curve on a fixed-size grid spanning the whole observed range.
    # np.arange(min, max, 0.1) excluded the maximum, produced a grid whose
    # length grew with the magnitude of the column, and was empty for a range
    # below 0.1; np.linspace includes both endpoints and always yields the same
    # number of points.
    d1_range = np.linspace(d1.min(), d1.max(), 500)

    # Column 1 of predict_proba is the probability of the second class in
    # logis.classes_; per the axis label this is presumably the "Developing"
    # code (1) -- assumes Status was encoded as 0/1 earlier in the notebook.
    prob = logis.predict_proba(d1_range.reshape(-1, 1))[:, 1]

    plt.plot(d1_range, prob, color='red')
    plt.scatter(d1, d2)
    plt.xlabel(c)
    plt.ylabel('Probability of Status(0-Developed, 1-Developing)')
    plt.show()

# 5) Europe

In [None]:
# Europe: straight-line fits between pairs of positively correlated variables.
# Each entry maps the x-axis column to the y-axis column.
p = {
    'infant deaths': 'under-five deaths ',
    ' thinness  1-19 years': ' thinness 5-9 years',
    'GDP': 'percentage expenditure',
}

for x_col, y_col in p.items():
    # scikit-learn estimators take 2-D arrays, hence the (n, 1) reshape
    x_vals = eu_df[x_col].values.reshape(-1, 1)
    y_vals = eu_df[y_col].values.reshape(-1, 1)

    # fit on the pair and predict back on the same x values to draw the line
    fitted = LinearRegression().fit(x_vals, y_vals).predict(x_vals)

    plt.scatter(x_vals, y_vals)
    plt.plot(x_vals, fitted, color='purple')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('{} VS {}'.format(x_col, y_col))
    plt.show()

In [None]:
# Europe: straight-line fits between pairs of negatively correlated variables.
# Each entry maps the x-axis column to the y-axis column.
n = {
    ' thinness  1-19 years': 'Income composition of resources',
    ' thinness 5-9 years': 'Income composition of resources',
}

for x_col, y_col in n.items():
    # scikit-learn estimators take 2-D arrays, hence the (n, 1) reshape
    x_vals = eu_df[x_col].values.reshape(-1, 1)
    y_vals = eu_df[y_col].values.reshape(-1, 1)

    # fit on the pair and predict back on the same x values to draw the line
    fitted = LinearRegression().fit(x_vals, y_vals).predict(x_vals)

    plt.scatter(x_vals, y_vals)
    plt.plot(x_vals, fitted, color='red')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('{} VS {}'.format(x_col, y_col))
    plt.show()

# 6) North America

In [None]:
# North America: straight-line fits between pairs of positively correlated
# variables. Each entry maps the x-axis column to the y-axis column.
p = {
    'infant deaths': 'under-five deaths ',
    ' thinness  1-19 years': ' thinness 5-9 years',
    'GDP': 'percentage expenditure',
}

for x_col, y_col in p.items():
    # scikit-learn estimators take 2-D arrays, hence the (n, 1) reshape
    x_vals = na_df[x_col].values.reshape(-1, 1)
    y_vals = na_df[y_col].values.reshape(-1, 1)

    # fit on the pair and predict back on the same x values to draw the line
    fitted = LinearRegression().fit(x_vals, y_vals).predict(x_vals)

    plt.scatter(x_vals, y_vals)
    plt.plot(x_vals, fitted, color='purple')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('{} VS {}'.format(x_col, y_col))
    plt.show()

In [None]:
#Logistic regression
# Fit a one-feature logistic model of Status on total expenditure for the North
# America subset, then draw the fitted probability curve over the raw points.
d1 = na_df['Total expenditure'].values
d2 = na_df['Status'].values

logis = LogisticRegression()
logis.fit(d1.reshape(-1, 1), d2)  # the estimator needs a 2-D feature matrix

# Evaluate the curve on a fixed-size grid spanning the whole observed range.
# np.arange(min, max, 0.1) excluded the maximum and was empty for a range below
# 0.1; np.linspace includes both endpoints and always yields the same number
# of points.
d1_range = np.linspace(d1.min(), d1.max(), 500)

# Column 1 of predict_proba is the probability of the second class in
# logis.classes_; per the axis label this is presumably the "Developing" code (1)
# -- assumes Status was encoded as 0/1 earlier in the notebook.
prob = logis.predict_proba(d1_range.reshape(-1, 1))[:, 1]

plt.plot(d1_range, prob, color='red')
plt.scatter(d1, d2)
plt.xlabel('Total expenditure')
plt.ylabel('Probability of Status(0-Developed, 1-Developing)')
plt.show()

# 7) South America

In [None]:
# South America: straight-line fits between pairs of positively correlated
# variables. Each entry maps the x-axis column to the y-axis column.
p = {
    'infant deaths': 'under-five deaths ',
    ' thinness  1-19 years': ' thinness 5-9 years',
}

for x_col, y_col in p.items():
    # scikit-learn estimators take 2-D arrays, hence the (n, 1) reshape
    x_vals = sa_df[x_col].values.reshape(-1, 1)
    y_vals = sa_df[y_col].values.reshape(-1, 1)

    # fit on the pair and predict back on the same x values to draw the line
    fitted = LinearRegression().fit(x_vals, y_vals).predict(x_vals)

    plt.scatter(x_vals, y_vals)
    plt.plot(x_vals, fitted, color='purple')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title('{} VS {}'.format(x_col, y_col))
    plt.show()