In [7]:
!pip install missingno shap

Collecting shap
  Downloading shap-0.46.0-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp312-cp312-win_amd64.whl (456 kB)
   ---------------------------------------- 0.0/456.2 kB ? eta -:--:--
   -------- ------------------------------- 92.2/456.2 kB 1.8 MB/s eta 0:00:01
   -------------------------------- ------- 368.6/456.2 kB 3.9 MB/s eta 0:00:01
   ---------------------------------------- 456.2/456.2 kB 3.6 MB/s eta 0:00:00
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [1]:
### 1. Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno  # For missing values visualization
import shap  # For explainability
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the HDI CSV into a DataFrame, then preview its leading rows.
hdi_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\sonal\Portfolio_Projects/dataset/HDI.csv",
)
hdi_df.head().pipe(display)

Unnamed: 0,iso3,country,hdicode,region,hdi_rank_2021,hdi_1990,hdi_1991,hdi_1992,hdi_1993,hdi_1994,...,mf_2012,mf_2013,mf_2014,mf_2015,mf_2016,mf_2017,mf_2018,mf_2019,mf_2020,mf_2021
0,AFG,Afghanistan,Low,SA,180.0,0.273,0.279,0.287,0.297,0.292,...,1.86,1.88,1.66,1.62,1.66,1.41,1.32,1.38,1.38,1.38
1,AGO,Angola,Medium,SSA,148.0,,,,,,...,4.09,4.53,3.97,3.59,2.79,2.64,2.28,2.18,2.18,2.18
2,ALB,Albania,High,ECA,67.0,0.647,0.629,0.614,0.617,0.624,...,12.44,11.49,13.14,12.61,14.39,14.46,12.85,12.96,12.96,12.96
3,AND,Andorra,Very High,,40.0,,,,,,...,,,,,,,,,,
4,ARE,United Arab Emirates,Very High,AS,26.0,0.728,0.739,0.742,0.748,0.755,...,49.56,49.68,55.49,59.76,64.95,75.61,65.97,68.95,68.95,68.95


In [3]:
# Read the World Bank Excel workbook into a DataFrame, then preview its leading rows.
worldbank_df = pd.read_excel(
    io=r"C:\Users\sonal\Portfolio_Projects/dataset/WorldBank.xlsx",
)
worldbank_df.head().pipe(display)

Unnamed: 0,Country Name,Country Code,Region,IncomeGroup,Year,"Birth rate, crude (per 1,000 people)","Death rate, crude (per 1,000 people)",Electric power consumption (kWh per capita),GDP (USD),GDP per capita (USD),Individuals using the Internet (% of population),"Infant mortality rate (per 1,000 live births)",Life expectancy at birth (years),Population density (people per sq. km of land area),Unemployment (% of total labor force) (modeled ILO estimate)
0,Afghanistan,AFG,South Asia,Low income,2018,,,,19363000000.0,520.897,,47.9,,56.9378,1.542
1,Afghanistan,AFG,South Asia,Low income,2017,33.211,6.575,,20191800000.0,556.302,13.5,49.5,64.13,55.596,1.559
2,Afghanistan,AFG,South Asia,Low income,2016,33.981,6.742,,19362600000.0,547.228,11.2,51.2,63.763,54.1971,1.634
3,Afghanistan,AFG,South Asia,Low income,2015,34.809,6.929,,19907100000.0,578.466,8.26,53.1,63.377,52.7121,1.679
4,Afghanistan,AFG,South Asia,Low income,2014,35.706,7.141,,20484900000.0,613.856,7.0,55.1,62.966,51.1148,1.735


In [4]:
# Read the data dictionary CSV (decoded as ISO-8859-1), then preview its leading rows.
data_dict = pd.read_csv(
    filepath_or_buffer=r"C:\Users\sonal\Portfolio_Projects/dataset/world_indicators_data_dictionary.csv",
    encoding="ISO-8859-1",
)
data_dict.head().pipe(display)

Unnamed: 0,Table,Field,Description
0,WorldBank,Country Name,The name of the country
1,WorldBank,Country Code,The three letter code representing the country
2,WorldBank,Region,The World Bank region of the country
3,WorldBank,IncomeGroup,The World Bank Income Group of the country
4,WorldBank,Year,The Year in which the statistics were recorded


In [26]:
# Summary statistics for the HDI table, rendered with the rich display.
hdi_df.describe().pipe(display)

Unnamed: 0,hdi_rank_2021,hdi_1990,hdi_1991,hdi_1992,hdi_1993,hdi_1994,hdi_1995,hdi_1996,hdi_1997,hdi_1998,...,mf_2012,mf_2013,mf_2014,mf_2015,mf_2016,mf_2017,mf_2018,mf_2019,mf_2020,mf_2021
count,191.0,152.0,152.0,152.0,152.0,152.0,163.0,163.0,163.0,163.0,...,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,95.811518,0.595112,0.597862,0.600493,0.604474,0.609329,0.613908,0.619663,0.624975,0.630417,...,15.249784,15.385926,15.347629,15.352632,15.342119,15.49178,14.597416,14.776332,14.775408,14.774396
std,55.307333,0.161918,0.161921,0.162193,0.163122,0.163818,0.162789,0.163461,0.164299,0.165177,...,14.134572,14.555342,14.363092,14.68034,15.015809,15.333709,13.581871,14.266308,14.266504,14.266706
min,1.0,0.216,0.218,0.222,0.227,0.232,0.238,0.243,0.248,0.256,...,0.43,0.42,0.45,0.56,0.6,0.61,0.5,0.47,0.47,0.47
25%,48.5,0.47775,0.477,0.47525,0.47425,0.4765,0.4805,0.4885,0.49,0.492,...,4.5975,4.525,4.452375,4.4575,4.5175,4.575,4.6125,4.7325,4.7325,4.7325
50%,96.0,0.6215,0.6235,0.622,0.624,0.6235,0.642,0.65,0.651,0.657,...,10.907725,11.272314,10.745,10.26,10.59,10.895,11.215,11.03,11.03,11.03
75%,143.5,0.7255,0.727,0.72375,0.72425,0.73375,0.737,0.743,0.7525,0.759,...,20.345,20.3975,20.2275,20.1875,19.35,19.7575,18.4325,17.9325,17.9325,17.9325
max,191.0,0.872,0.873,0.878,0.88,0.884,0.885,0.887,0.89,0.895,...,74.56,82.97,81.1,86.78,83.51,88.7,71.16,85.7,85.7,85.7


In [27]:
# Summary statistics for the World Bank table, rendered with the rich display.
worldbank_df.describe().pipe(display)

Unnamed: 0,Year,"Birth rate, crude (per 1,000 people)","Death rate, crude (per 1,000 people)",Electric power consumption (kWh per capita),GDP (USD),GDP per capita (USD),Individuals using the Internet (% of population),"Infant mortality rate (per 1,000 live births)",Life expectancy at birth (years),Population density (people per sq. km of land area),Unemployment (% of total labor force) (modeled ILO estimate)
count,12449.0,11440.0,11416.0,5848.0,9578.0,9575.0,5064.0,9984.0,11176.0,11845.0,5208.0
mean,1989.0,28.643276,10.588539,3175.294686,170074000000.0,8231.812259,23.334471,51.704437,64.044692,318.86137,8.295079
std,17.03007,13.131893,5.489382,4467.139298,897986600000.0,16173.539954,28.319388,46.131039,11.491087,1593.406041,6.290703
min,1960.0,6.9,1.127,0.0,8824450.0,34.7906,0.0,1.4,18.907,0.098625,0.14
25%,1974.0,16.6,6.86375,390.38575,1393010000.0,513.1455,0.594949,14.475,55.91775,19.7834,3.687
50%,1989.0,27.5455,9.2,1541.895,7275305000.0,1852.81,8.406225,37.0,67.276,64.0075,6.775
75%,2004.0,40.88125,12.687,4313.7675,48577820000.0,7774.565,41.29595,78.2,72.69225,144.823,11.21225
max,2018.0,58.227,54.444,54799.2,20500000000000.0,189171.0,100.0,279.4,85.4171,21389.1,37.94


In [28]:
# Summary (count / unique / top / freq) of the data dictionary's text columns.
data_dict.describe().pipe(display)

Unnamed: 0,Table,Field,Description
count,58,58,58
unique,2,58,55
top,hdi,Country Name,The name of the country
freq,43,1,2


In [6]:
# Structural overview of the HDI table (index range, column count, dtypes, memory).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line to the output.
hdi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Columns: 1008 entries, iso3 to mf_2021
dtypes: float64(1004), object(4)
memory usage: 1.6+ MB
None


In [8]:
# Structural overview of the World Bank table (per-column non-null counts and dtypes).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line to the output.
worldbank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12449 entries, 0 to 12448
Data columns (total 15 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Country Name                                                  12449 non-null  object 
 1   Country Code                                                  12449 non-null  object 
 2   Region                                                        12449 non-null  object 
 3   IncomeGroup                                                   12449 non-null  object 
 4   Year                                                          12449 non-null  int64  
 5   Birth rate, crude (per 1,000 people)                          11440 non-null  float64
 6   Death rate, crude (per 1,000 people)                          11416 non-null  float64
 7   Electric power consumption (kWh per capita)                   5848 

In [9]:
# Structural overview of the data dictionary (per-column non-null counts and dtypes).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line to the output.
data_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Table        58 non-null     object
 1   Field        58 non-null     object
 2   Description  58 non-null     object
dtypes: object(3)
memory usage: 1.5+ KB
None


In [11]:
# Per-column count of null entries in the HDI table, printed under a labelled header.
print("\nMissing Values in HDI dataset:", hdi_df.isna().sum(), sep="\n")


Missing Values in HDI dataset:
iso3              0
country           0
hdicode          15
region           55
hdi_rank_2021    15
                 ..
mf_2017          38
mf_2018          38
mf_2019          38
mf_2020          38
mf_2021          38
Length: 1008, dtype: int64


In [12]:
# Per-column count of null entries in the World Bank table.
worldbank_df.isna().sum().pipe(print)

Country Name                                                       0
Country Code                                                       0
Region                                                             0
IncomeGroup                                                        0
Year                                                               0
Birth rate, crude (per 1,000 people)                            1009
Death rate, crude (per 1,000 people)                            1033
Electric power consumption (kWh per capita)                     6601
GDP (USD)                                                       2871
GDP per capita (USD)                                            2874
Individuals using the Internet (% of population)                7385
Infant mortality rate (per 1,000 live births)                   2465
Life expectancy at birth (years)                                1273
Population density (people per sq. km of land area)              604
Unemployment (% of total labor for

In [13]:
# Per-column count of null entries in the data dictionary.
data_dict.isna().sum().pipe(print)

Table          0
Field          0
Description    0
dtype: int64
