# 10/16/2024

In [44]:
import pandas as pd
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", None)

df = pd.read_csv("https://raw.githubusercontent.com/wcj365/python-stats-dataviz/refs/heads/master/fall2024/data/World_Development_Indicators_(WDI).csv")

print(df.shape)
df.sample(3)

(4123, 9)


Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type
3633,2008,Tanzania,671.244446,58.086,42870884.0,TZA,Sub-Saharan Africa,Lower middle income,IDA
3411,2014,Spain,29513.65118,83.229268,46480882.0,ESP,Europe & Central Asia,High income,Not classified
2966,2006,Poland,9035.532758,75.143902,38141267.0,POL,Europe & Central Asia,High income,IBRD


In [45]:
column_names_dict = {
    "GDP per capita (current US$)" : "GDP per Capita",
    "Life expectancy at birth, total (years)" : "Life Expectancy",
    "Population, total" : "Population"
}

df = df.rename(columns=column_names_dict)
df.sample(3)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type
1426,2005,Gibraltar,,81.442,29155.0,GIB,Europe & Central Asia,High income,Not classified
2276,2019,Malawi,584.362867,64.119,18867337.0,MWI,Sub-Saharan Africa,Low income,IDA
2291,2015,Malaysia,9699.600463,75.094,31068833.0,MYS,East Asia & Pacific,Upper middle income,IBRD


In [46]:
df.sample(3).T

Unnamed: 0,2957,2512,3312
Year,2016,2008,2010
Country,Philippines,Montenegro,Slovenia
GDP per Capita,3038.152037,7367.751909,23532.480855
Life Expectancy,71.387,75.136585,79.421951
Population,104875266.0,616969.0,2048583.0
Country Code,PHL,MNE,SVN
Region,East Asia & Pacific,Europe & Central Asia,Europe & Central Asia
Income Group,Lower middle income,Upper middle income,High income
Lending Type,IBRD,IBRD,Not classified


## Section One

Summary Statistics with describe() function

In [47]:
df[["Population","Life Expectancy"]].describe()

Unnamed: 0,Population,Life Expectancy
count,4123.0,3777.0
mean,33195750.0,71.060853
std,131643000.0,8.499806
min,9791.0,42.125
25%,743620.0,65.351
50%,5872624.0,72.765
75%,21484940.0,77.529
max,1417173000.0,85.497561


In [48]:
df.describe()

Unnamed: 0,Year,GDP per Capita,Life Expectancy,Population
count,4123.0,3962.0,3777.0,4123.0
mean,2013.0,17007.692848,71.060853,33195750.0
std,5.47789,25733.109164,8.499806,131643000.0
min,2004.0,128.538423,42.125,9791.0
25%,2008.0,1862.8924,65.351,743620.0
50%,2013.0,6048.304202,72.765,5872624.0
75%,2018.0,22137.309568,77.529,21484940.0
max,2022.0,240862.182448,85.497561,1417173000.0


In [49]:
df.describe(include="object")

Unnamed: 0,Country,Country Code,Region,Income Group,Lending Type
count,4123,4123,4123,4123,4123
unique,217,217,7,5,4
top,Afghanistan,AFG,Europe & Central Asia,High income,Not classified
freq,19,19,1102,1558,1387


In [50]:
df.describe(include="object").T

Unnamed: 0,count,unique,top,freq
Country,4123,217,Afghanistan,19
Country Code,4123,217,AFG,19
Region,4123,7,Europe & Central Asia,1102
Income Group,4123,5,High income,1558
Lending Type,4123,4,Not classified,1387


## Section Two

Compute quartiles and transform a numerical column into a categorical column based on the quartiles.

In [51]:
def assign_quartile(gdp):
    if gdp <= 1862.89:
        quartile = "1st Quartile (<=25%)"
    elif gdp > 1862.89 and gdp <= 6048.30:
        quartile = "2nd Quartile (25-50%)"
    elif gdp > 6048.30 and gdp <= 22137.31:
        quartile = "3rd Quartile (50-75%)"
    else:
        quartile = "4th Quartile (>75%)"
    return quartile


hungary = assign_quartile(14294.25)
hungary

'3rd Quartile (50-75%)'

In [52]:
df_usa = df[df["Country Code"] == "USA"]
df_usa

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type
3914,2004,United States,41724.631629,77.487805,292805298.0,USA,North America,High income,Not classified
3915,2005,United States,44123.407068,77.487805,295516599.0,USA,North America,High income,Not classified
3916,2006,United States,46302.00088,77.687805,298379912.0,USA,North America,High income,Not classified
3917,2007,United States,48050.223777,77.987805,301231207.0,USA,North America,High income,Not classified
3918,2008,United States,48570.04598,78.039024,304093966.0,USA,North America,High income,Not classified
3919,2009,United States,47194.943355,78.390244,306771529.0,USA,North America,High income,Not classified
3920,2010,United States,48650.643128,78.541463,309327143.0,USA,North America,High income,Not classified
3921,2011,United States,50065.966504,78.641463,311583481.0,USA,North America,High income,Not classified
3922,2012,United States,51784.418574,78.741463,313877662.0,USA,North America,High income,Not classified
3923,2013,United States,53291.127689,78.741463,316059947.0,USA,North America,High income,Not classified


In [53]:
assign_quartile(76329.58)

'4th Quartile (>75%)'

In [54]:
df_2020 = df[df["Year"] == 2020]
print(df_2020.shape[0])
print(f"There are {df_2020.shape[0]} countries in the 2020 dataset")
df_2020.sample(3)

217
There are 217 countries in the 2020 dataset


Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type
2847,2020,Pakistan,1322.314785,66.269,227196741.0,PAK,South Asia,Lower middle income,Blend
2144,2020,Liberia,597.529692,60.948,5087584.0,LBR,Sub-Saharan Africa,Low income,IDA
1612,2020,Haiti,1283.141228,64.052,11306801.0,HTI,Latin America & Caribbean,Lower middle income,IDA


In [55]:
# Tuple
df_2020.shape

(217, 9)

## In-class Exercise

Create a new column call "GDP Quartile" in the df_2020 dataframe and assign each country the quartile it belongs to by applying the assign_quartile() function.


In [56]:
df_2020["GDP Quartile"] = df_2020["GDP per Capita"].apply(assign_quartile)

print(df_2020.shape[0])
print(df_2020["GDP Quartile"].value_counts())

217
GDP Quartile
4th Quartile (>75%)      61
2nd Quartile (25-50%)    58
3rd Quartile (50-75%)    54
1st Quartile (<=25%)     44
Name: count, dtype: int64


In [57]:
df_2020.sample(10)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,GDP Quartile
3588,2020,Switzerland,85897.784334,83.0,8638167.0,CHE,Europe & Central Asia,High income,Not classified,4th Quartile (>75%)
3417,2020,Spain,26984.296277,82.331707,47365655.0,ESP,Europe & Central Asia,High income,Not classified,4th Quartile (>75%)
2942,2020,Peru,6063.626923,73.665,33304756.0,PER,Latin America & Caribbean,Upper middle income,IBRD,3rd Quartile (50-75%)
3493,2020,St. Martin (French part),,80.149,32553.0,MAF,Latin America & Caribbean,High income,Not classified,4th Quartile (>75%)
2410,2020,Mauritius,9011.042884,74.177073,1266014.0,MUS,Sub-Saharan Africa,Upper middle income,IBRD,3rd Quartile (50-75%)
3379,2020,South Africa,5753.066494,65.252,58801927.0,ZAF,Sub-Saharan Africa,Upper middle income,IBRD,2nd Quartile (25-50%)
2467,2020,Moldova,4376.242493,70.166,2635130.0,MDA,Europe & Central Asia,Upper middle income,IBRD,2nd Quartile (25-50%)
1251,2020,Faroe Islands,62234.96868,83.197561,52415.0,FRO,Europe & Central Asia,High income,Not classified,4th Quartile (>75%)
3398,2020,South Sudan,,55.48,10606227.0,SSD,Sub-Saharan Africa,Low income,IDA,4th Quartile (>75%)
3911,2020,United Kingdom,40217.009012,80.35122,67081234.0,GBR,Europe & Central Asia,High income,Not classified,4th Quartile (>75%)


In [58]:
df_4th = df_2020[df_2020["GDP Quartile"] == "4th Quartile (>75%)"]
df_4th.shape

(61, 10)

In [59]:
", ".join(list(df_4th["Country"]))

"Andorra, Aruba, Australia, Austria, Bahamas, The, Bahrain, Belgium, Bermuda, British Virgin Islands, Brunei Darussalam, Canada, Cayman Islands, Channel Islands, Cyprus, Czechia, Denmark, Eritrea, Estonia, Faroe Islands, Finland, France, Germany, Gibraltar, Greenland, Guam, Hong Kong SAR, China, Iceland, Ireland, Isle of Man, Israel, Italy, Japan, Korea, Dem. People's Rep., Korea, Rep., Kuwait, Liechtenstein, Luxembourg, Macao SAR, China, Malta, Monaco, Netherlands, New Caledonia, New Zealand, Norway, Portugal, Puerto Rico, Qatar, San Marino, Singapore, Sint Maarten (Dutch part), Slovenia, South Sudan, Spain, St. Martin (French part), Sweden, Switzerland, United Arab Emirates, United Kingdom, United States, Venezuela, RB, Virgin Islands (U.S.)"

In [60]:


quartile_groups = df_2020.groupby("GDP Quartile")["Country"].apply(list)


for quartile, countries in quartile_groups.items():
  print(f"{quartile}:")
  for country in countries:
      print(f"  - {country}")
  print("\n")

1st Quartile (<=25%):
  - Afghanistan
  - Angola
  - Benin
  - Burkina Faso
  - Burundi
  - Cambodia
  - Cameroon
  - Central African Republic
  - Chad
  - Comoros
  - Congo, Dem. Rep.
  - Ethiopia
  - Gambia, The
  - Guinea
  - Guinea-Bissau
  - Haiti
  - Kiribati
  - Kyrgyz Republic
  - Lesotho
  - Liberia
  - Madagascar
  - Malawi
  - Mali
  - Mauritania
  - Mozambique
  - Myanmar
  - Nepal
  - Niger
  - Pakistan
  - Rwanda
  - Senegal
  - Sierra Leone
  - Somalia
  - Sudan
  - Syrian Arab Republic
  - Tajikistan
  - Tanzania
  - Timor-Leste
  - Togo
  - Uganda
  - Uzbekistan
  - Yemen, Rep.
  - Zambia
  - Zimbabwe


2nd Quartile (25-50%):
  - Albania
  - Algeria
  - Armenia
  - Azerbaijan
  - Bangladesh
  - Belize
  - Bhutan
  - Bolivia
  - Botswana
  - Cabo Verde
  - Colombia
  - Congo, Rep.
  - Cote d'Ivoire
  - Djibouti
  - Ecuador
  - Egypt, Arab Rep.
  - El Salvador
  - Eswatini
  - Fiji
  - Georgia
  - Ghana
  - Guatemala
  - Honduras
  - India
  - Indonesia
  - Iran, Islamic

In [61]:
for quartile, countries in quartile_groups.items():
  print(f"{quartile}:")
  for country in countries:
      print(f"  - {country}")
  print("\n")

1st Quartile (<=25%):
  - Afghanistan
  - Angola
  - Benin
  - Burkina Faso
  - Burundi
  - Cambodia
  - Cameroon
  - Central African Republic
  - Chad
  - Comoros
  - Congo, Dem. Rep.
  - Ethiopia
  - Gambia, The
  - Guinea
  - Guinea-Bissau
  - Haiti
  - Kiribati
  - Kyrgyz Republic
  - Lesotho
  - Liberia
  - Madagascar
  - Malawi
  - Mali
  - Mauritania
  - Mozambique
  - Myanmar
  - Nepal
  - Niger
  - Pakistan
  - Rwanda
  - Senegal
  - Sierra Leone
  - Somalia
  - Sudan
  - Syrian Arab Republic
  - Tajikistan
  - Tanzania
  - Timor-Leste
  - Togo
  - Uganda
  - Uzbekistan
  - Yemen, Rep.
  - Zambia
  - Zimbabwe


2nd Quartile (25-50%):
  - Albania
  - Algeria
  - Armenia
  - Azerbaijan
  - Bangladesh
  - Belize
  - Bhutan
  - Bolivia
  - Botswana
  - Cabo Verde
  - Colombia
  - Congo, Rep.
  - Cote d'Ivoire
  - Djibouti
  - Ecuador
  - Egypt, Arab Rep.
  - El Salvador
  - Eswatini
  - Fiji
  - Georgia
  - Ghana
  - Guatemala
  - Honduras
  - India
  - Indonesia
  - Iran, Islamic

In [62]:
df_2020 = df_2020.dropna()
df_2020["GDP (Rounded)"] = df_2020["GDP per Capita"].apply(round)
df_2020.sample(5)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,GDP Quartile,GDP (Rounded)
2771,2020,North Macedonia,5965.450232,74.395122,2072531.0,MKD,Europe & Central Asia,Upper middle income,IBRD,2nd Quartile (25-50%),5965
2638,2020,Nepal,1139.189892,69.246,29348627.0,NPL,South Asia,Lower middle income,IDA,1st Quartile (<=25%),1139
1726,2020,Indonesia,3895.618152,68.808,271857970.0,IDN,East Asia & Pacific,Upper middle income,IBRD,2nd Quartile (25-50%),3896
2182,2020,Liechtenstein,165287.186767,81.658537,38756.0,LIE,Europe & Central Asia,High income,Not classified,4th Quartile (>75%),165287
2258,2020,Madagascar,462.404229,65.182,28225177.0,MDG,Sub-Saharan Africa,Low income,IDA,1st Quartile (<=25%),462


In [63]:
def pop_million(pop):
    return pop / 1000000


df_2020["Population (Million)"] = df_2020["Population"].apply(pop_million)

df_2020.sample(5)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,GDP Quartile,GDP (Rounded),Population (Million)
1194,2020,Estonia,23595.243684,78.595122,1329522.0,EST,Europe & Central Asia,High income,Not classified,4th Quartile (>75%),23595,1.329522
2657,2020,Netherlands,52162.570115,81.358537,17441500.0,NLD,Europe & Central Asia,High income,Not classified,4th Quartile (>75%),52163,17.4415
3094,2020,Rwanda,773.773261,66.774,13146362.0,RWA,Sub-Saharan Africa,Low income,IDA,1st Quartile (<=25%),774,13.146362
643,2020,Cambodia,1577.91174,70.416,16396860.0,KHM,East Asia & Pacific,Lower middle income,IDA,1st Quartile (<=25%),1578,16.39686
3778,2020,Turkiye,8638.739133,75.85,83384680.0,TUR,Europe & Central Asia,Upper middle income,IBRD,3rd Quartile (50-75%),8639,83.38468


In [64]:
df_2020["Population2 (Million)"] = df_2020["Population"].apply(lambda pop : pop / 1000000)
df_2020.sample(5)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,GDP Quartile,GDP (Rounded),Population (Million),Population2 (Million)
1992,2020,"Korea, Rep.",31721.298914,83.426829,51836239.0,KOR,East Asia & Pacific,High income,Not classified,4th Quartile (>75%),31721,51.836239,51.836239
4101,2020,Zambia,956.831729,62.38,18927715.0,ZMB,Sub-Saharan Africa,Lower middle income,IDA,1st Quartile (<=25%),957,18.927715,18.927715
757,2020,Channel Islands,57339.526871,81.264,171113.0,CHI,Europe & Central Asia,High income,Not classified,4th Quartile (>75%),57340,0.171113,0.171113
643,2020,Cambodia,1577.91174,70.416,16396860.0,KHM,East Asia & Pacific,Lower middle income,IDA,1st Quartile (<=25%),1578,16.39686,16.39686
1935,2020,Kenya,1936.250755,62.675,51985780.0,KEN,Sub-Saharan Africa,Lower middle income,Blend,2nd Quartile (25-50%),1936,51.98578,51.98578


## Homework Question 1

Save the summary statistics of all numerical columns and all categorical columns to only one Excel spreadsheet file with two worksheets, one for numerical and one for categorical columns.

In [74]:
import pandas as pd


df = pd.read_csv("https://raw.githubusercontent.com/wcj365/python-stats-dataviz/refs/heads/master/fall2024/data/World_Development_Indicators_(WDI).csv")  # or use another method to load data

# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Summary for numerical columns
numerical_summary = df[numerical_cols].describe()
print("Numerical Summary Statistics:")
print(numerical_summary)

# Summary for categorical columns (unique counts and frequencies)
categorical_summary = df[categorical_cols].apply(lambda x: x.value_counts()).T
print("Categorical Summary Statistics:")
print(categorical_summary)

# Save to Excel file with two sheets
with pd.ExcelWriter('summary_statistics.xlsx') as writer:
    numerical_summary.to_excel(writer, sheet_name='Numerical')
    categorical_summary.to_excel(writer, sheet_name='Categorical')






Numerical Summary Statistics:
             Year  GDP per capita (current US$)  \
count  4123.00000                   3962.000000   
mean   2013.00000                  17007.692848   
std       5.47789                  25733.109164   
min    2004.00000                    128.538423   
25%    2008.00000                   1862.892400   
50%    2013.00000                   6048.304202   
75%    2018.00000                  22137.309568   
max    2022.00000                 240862.182448   

       Life expectancy at birth, total (years)  Population, total  
count                              3777.000000       4.123000e+03  
mean                                 71.060853       3.319575e+07  
std                                   8.499806       1.316430e+08  
min                                  42.125000       9.791000e+03  
25%                                  65.351000       7.436200e+05  
50%                                  72.765000       5.872624e+06  
75%                               

In [75]:
# for df_2020

numerical_summary = df_2020.describe()
print("Numerical Summary Statistics:")
print(numerical_summary)

categorical_summary = df_2020.describe(include="object").T
print("Categorical Summary Statistics:")
print(categorical_summary)


# Save to Excel file with two sheets
with pd.ExcelWriter('summary_statistics.xlsx') as writer:
    numerical_summary.to_excel(writer, sheet_name='Numerical')
    categorical_summary.to_excel(writer, sheet_name='Categorical')

Numerical Summary Statistics:
         Year  GDP per Capita  Life Expectancy    Population  GDP (Rounded)  \
count   202.0      202.000000       202.000000  2.020000e+02     202.000000   
mean   2020.0    16198.267524        72.314599  3.825505e+07   16198.277228   
std       0.0    23391.117592         7.451962  1.446623e+08   23391.118369   
min    2020.0      216.827417        52.777000  1.106900e+04     217.000000   
25%    2020.0     2188.047693        66.779750  1.307377e+06    2188.250000   
50%    2020.0     5920.260419        72.871500  6.916570e+06    5920.000000   
75%    2020.0    20761.211199        77.981750  2.673161e+07   20761.000000   
max    2020.0   165287.186767        85.497561  1.411100e+09  165287.000000   

       Population (Million)  Population2 (Million)  GDP per Capita (Rounded)  
count            202.000000             202.000000                202.000000  
mean              38.255054              38.255054              16198.267475  
std              144.

## Homework Question 2

Create a new column that is the GDP per Capita rounded to 2 decimal points by using the Python's build function "round"
Hint: apply the round function/method and provide a parameter 2.

In [66]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/wcj365/python-stats-dataviz/refs/heads/master/fall2024/data/World_Development_Indicators_(WDI).csv")

# Apply the round function to GDP per Capita and create a new column
df['GDP per Capita (rounded)'] = df['GDP per capita (current US$)'].apply(lambda x: round(x, 2))

# Display the first few rows to verify the new column
print(df[['GDP per capita (current US$)', 'GDP per Capita (rounded)']].head())




   GDP per capita (current US$)  GDP per Capita (rounded)
0                    221.830531                    221.83
1                    254.115274                    254.12
2                    274.015394                    274.02
3                    376.318296                    376.32
4                    382.533804                    382.53


In [67]:
df_2020["GDP per Capita (Rounded)"] = df_2020["GDP per Capita"].apply(lambda x: round(x, 2))

df_2020.sample(5)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,GDP Quartile,GDP (Rounded),Population (Million),Population2 (Million),GDP per Capita (Rounded)
2315,2020,Maldives,7216.816371,79.875,514438.0,MDV,South Asia,Upper middle income,IDA,3rd Quartile (50-75%),7217,0.514438,0.514438,7216.82
4025,2020,Viet Nam,3586.347176,75.378,96648685.0,VNM,East Asia & Pacific,Lower middle income,IBRD,2nd Quartile (25-50%),3586,96.648685,96.648685,3586.35
928,2020,Croatia,14269.908855,77.72439,4047680.0,HRV,Europe & Central Asia,High income,IBRD,3rd Quartile (50-75%),14270,4.04768,4.04768,14269.91
3664,2020,Thailand,7001.78546,79.274,71475664.0,THA,East Asia & Pacific,Upper middle income,IBRD,3rd Quartile (50-75%),7002,71.475664,71.475664,7001.79
2125,2020,Lesotho,917.356381,54.693,2254100.0,LSO,Sub-Saharan Africa,Lower middle income,IDA,1st Quartile (<=25%),917,2.2541,2.2541,917.36


## Homework Question 3 (bonus, will cover next week)

step 1 - create a column "Life Expectancy Quartile" similar to the GDP per Capita Quartile above.
Step 2 - Create a column "health and wealth status" based on the following definition
- rich and healthy (1st GDP per capita quartile and 1st life expentancy quartile)
- poor and unhealthy ((4th GDP per capita quartile and 4th life expentancy quartile)
- rich and unhealthy (1st GDP per capita quartile and 4th life expentancy quartile)
- poor and healthy ((4th GDP per capita quartile and 1st life expentancy quartile)
- Other

In [68]:
import pandas as pd

# Load the dataset
df = pd.read_csv("https://raw.githubusercontent.com/wcj365/python-stats-dataviz/refs/heads/master/fall2024/data/World_Development_Indicators_(WDI).csv")

# Step 1: Create the "GDP per Capita Quartile" and "Life Expectancy Quartile"

# Dropping NA values from the relevant columns for clean quartile calculation

df = df.dropna(subset=['GDP per capita (current US$)', 'Life expectancy at birth, total (years)'])

# Create GDP per Capita Quartile
df['GDP per Capita Quartile'] = pd.qcut(df['GDP per capita (current US$)'], 4, labels=[1, 2, 3, 4])

# Create Life Expectancy Quartile
df['Life Expectancy Quartile'] = pd.qcut(df['Life expectancy at birth, total (years)'], 4, labels=[1, 2, 3, 4])

# Step 2: Create the "health and wealth status" column based on the given definition
def health_wealth_status(row):
    if row['GDP per Capita Quartile'] == 1 and row['Life Expectancy Quartile'] == 1:
        return 'rich and healthy'
    elif row['GDP per Capita Quartile'] == 4 and row['Life Expectancy Quartile'] == 4:
        return 'poor and unhealthy'
    elif row['GDP per Capita Quartile'] == 1 and row['Life Expectancy Quartile'] == 4:
        return 'rich and unhealthy'
    elif row['GDP per Capita Quartile'] == 4 and row['Life Expectancy Quartile'] == 1:
        return 'poor and healthy'
    else:
        return 'Other'

# Apply the function to each row to create the new column
df['health and wealth status'] = df.apply(health_wealth_status, axis=1)

# Display the first few rows to verify the results
print(df[['GDP per capita (current US$)', 'Life expectancy at birth, total (years)',
          'GDP per Capita Quartile', 'Life Expectancy Quartile', 'health and wealth status']].head())



   GDP per capita (current US$)  Life expectancy at birth, total (years)  \
0                    221.830531                                   57.944   
1                    254.115274                                   58.361   
2                    274.015394                                   58.684   
3                    376.318296                                   59.111   
4                    382.533804                                   59.852   

  GDP per Capita Quartile Life Expectancy Quartile health and wealth status  
0                       1                        1         rich and healthy  
1                       1                        1         rich and healthy  
2                       1                        1         rich and healthy  
3                       1                        1         rich and healthy  
4                       1                        1         rich and healthy  


In [69]:
df.sample(55)

Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type,GDP per Capita Quartile,Life Expectancy Quartile,health and wealth status
1376,2012,Georgia,4421.930712,72.412,3728874.0,GEO,Europe & Central Asia,Upper middle income,IBRD,2,2,Other
765,2009,Chile,10204.558854,78.741,16833447.0,CHL,Latin America & Caribbean,High income,IBRD,3,4,Other
1507,2010,Guam,30011.21858,76.879,164905.0,GUM,East Asia & Pacific,High income,Not classified,4,3,Other
2645,2008,Netherlands,57879.943755,80.25122,16445593.0,NLD,Europe & Central Asia,High income,Not classified,4,4,poor and unhealthy
1096,2017,Ecuador,6246.404252,76.972,16696944.0,ECU,Latin America & Caribbean,Upper middle income,IBRD,3,3,Other
381,2005,Benin,805.904685,57.125,8149419.0,BEN,Sub-Saharan Africa,Lower middle income,IDA,1,1,rich and healthy
434,2020,Bhutan,3181.339747,71.609,772506.0,BTN,South Asia,Lower middle income,IDA,2,2,Other
3926,2016,United States,57866.744934,78.539024,323071755.0,USA,North America,High income,Not classified,4,4,poor and unhealthy
2328,2014,Mali,818.430341,57.9,17551814.0,MLI,Sub-Saharan Africa,Low income,IDA,1,1,rich and healthy
1780,2017,Ireland,70150.737016,82.156098,4807388.0,IRL,Europe & Central Asia,High income,Not classified,4,4,poor and unhealthy


In [70]:
df_2020.sample(5)

Unnamed: 0,Year,Country,GDP per Capita,Life Expectancy,Population,Country Code,Region,Income Group,Lending Type,GDP Quartile,GDP (Rounded),Population (Million),Population2 (Million),GDP per Capita (Rounded)
3854,2020,Uganda,846.881199,62.851,44404611.0,UGA,Sub-Saharan Africa,Low income,IDA,1st Quartile (<=25%),847,44.404611,44.404611,846.88
2885,2020,Panama,13293.333195,76.657,4294396.0,PAN,Latin America & Caribbean,High income,IBRD,3rd Quartile (50-75%),13293,4.294396,4.294396,13293.33
2429,2020,Mexico,8894.89065,70.133,125998302.0,MEX,Latin America & Caribbean,Upper middle income,IBRD,3rd Quartile (50-75%),8895,125.998302,125.998302,8894.89
871,2020,"Congo, Rep.",2011.269479,63.785,5702174.0,COG,Sub-Saharan Africa,Lower middle income,Blend,2nd Quartile (25-50%),2011,5.702174,5.702174,2011.27
1859,2020,Jamaica,4897.26475,71.869,2820436.0,JAM,Latin America & Caribbean,Upper middle income,IBRD,2nd Quartile (25-50%),4897,2.820436,2.820436,4897.26
