In [None]:
# 24 May 2025

<h1 style="line-height:3rem;">GDP Vs Life Expectency </h1>

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import os 

cwd = os.getcwd() 
env_name = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost')

# Load appropriate files depending on environment
if env_name == "Localhost":
    life_df = pd.read_csv('life_expectancy.csv')
    gdp_df = pd.read_csv('gdp_per_capita.csv')
else:
    life_df = pd.read_csv('/kaggle/input/life-expectancy/life_expectancy.csv')
    gdp_df = pd.read_csv('/kaggle/input/gdp-per-capita/gdp_per_capita.csv')



print("Life Expectancy data shape:", life_df.shape)
print("GDP data shape:", gdp_df.shape)


def clustering_life_expectancy_against_gdp(
    top_n=40,
    include_countries=["Finland", "Sweden", "Denmark"],
    n_years=1
):
  
    
    # Preprocess GDP: convert wide format to long
    gdp_long = gdp_df.melt(id_vars=['Country Name'], var_name='Year', value_name='GDP per capita')
    gdp_long = gdp_long.rename(columns={'Country Name': 'Country'})
    gdp_long['Year'] = gdp_long['Year'].astype(str).str.strip()
    gdp_long = gdp_long[gdp_long['Year'].str.isnumeric()]  # keep numeric years only
    gdp_long['Year'] = gdp_long['Year'].astype(int)
    gdp_long['GDP per capita'] = pd.to_numeric(gdp_long['GDP per capita'], errors='coerce')
    
    # Life expectancy: Use average of men and women life expectancy if possible
    life_df['Life expectancy (men)'] = pd.to_numeric(life_df['Life expectancy (men)'], errors='coerce')
    life_df['Life expectancy(women)'] = pd.to_numeric(life_df['Life expectancy(women)'], errors='coerce')
    life_df['Avg_Life_Expectancy'] = life_df[['Life expectancy (men)', 'Life expectancy(women)']].mean(axis=1)
    
    # Filter life expectancy columns and drop rows missing Avg_Life_Expectancy or Year
    life_df = life_df[['Country', 'Year', 'Avg_Life_Expectancy']].dropna()
    life_df['Year'] = pd.to_numeric(life_df['Year'], errors='coerce')
    life_df = life_df.dropna(subset=['Year'])
    life_df['Year'] = life_df['Year'].astype(int)
    
    # Filter for last n_years (intersection)
    max_life_year = life_df['Year'].max()
    max_gdp_year = gdp_long['Year'].max()
    max_year = min(max_life_year, max_gdp_year)
    min_year = max_year - n_years + 1
    
    life_recent = life_df[(life_df['Year'] >= min_year) & (life_df['Year'] <= max_year)]
    gdp_recent = gdp_long[(gdp_long['Year'] >= min_year) & (gdp_long['Year'] <= max_year)]
    
    # Compute average life expectancy and GDP per capita per country in recent years
    avg_life = life_recent.groupby('Country')['Avg_Life_Expectancy'].mean()
    avg_gdp = gdp_recent.groupby('Country')['GDP per capita'].mean()
    
    # Combine into a DataFrame
    combined = pd.concat([avg_life, avg_gdp], axis=1).dropna()
    
    # Include specific countries even if not in top N by GDP
    for country in include_countries:
        if country not in combined.index:
            combined.loc[country] = [
                life_recent[life_recent['Country'] == country]['Avg_Life_Expectancy'].mean(),
                gdp_recent[gdp_recent['Country'] == country]['GDP per capita'].mean()
            ]
    
    # Sort by GDP per capita descending and select top N
    combined = combined.sort_values(by='GDP per capita', ascending=False)
    combined = combined.head(top_n)
    
    # Clustering
    kmeans = KMeans(n_clusters=3, random_state=42)
    combined['Cluster'] = kmeans.fit_predict(combined[['GDP per capita', 'Avg_Life_Expectancy']])
    
    # Plot
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        combined['GDP per capita'],
        combined['Avg_Life_Expectancy'],
        c=combined['Cluster'],
        cmap='viridis',
        s=100,
        alpha=0.7,
        edgecolors='k'
    )
    plt.colorbar(scatter, label='Cluster')
    
    # Add labels for each point
    for i, country in enumerate(combined.index):
        plt.text(
            combined['GDP per capita'].iloc[i],
            combined['Avg_Life_Expectancy'].iloc[i],
            country,
            fontsize=9,
            alpha=0.8,
            ha='right',
            va='bottom'
        )
    
    plt.xlabel('Average GDP per capita (last {} years)'.format(n_years))
    plt.ylabel('Average Life Expectancy (last {} years)'.format(n_years))
    plt.title('Clustering of Countries by GDP per Capita and Life Expectancy')
    plt.grid(True)
    plt.show()
    
    # Style the DataFrame with gradient
    styled_combined = combined.style.background_gradient(
        subset=['GDP per capita', 'Avg_Life_Expectancy'],
        cmap='YlGnBu'
    ).highlight_max(axis=0, color='lightgreen').format("{:,.2f}")

    return styled_combined

# Run example
result_df = clustering_life_expectancy_against_gdp()
result_df  # For Jupyter display


Life Expectancy data shape: (2938, 25)
GDP data shape: (266, 64)


UnboundLocalError: cannot access local variable 'life_df' where it is not associated with a value