In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input mount.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Load libraries and read the data

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# Never truncate the text inside a displayed cell.
pd.set_option('display.max_colwidth', None)

# For scaling the data
from sklearn.preprocessing import StandardScaler

# To perform K-means clustering
from sklearn.cluster import KMeans


#To perform hierarchical clustering
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:
# Load the data dictionary CSV that ships alongside the country dataset.
data_dict = pd.read_csv('/kaggle/input/unsupervised-learning-on-country-data/data-dictionary.csv')

In [None]:
# Display the data dictionary.
data_dict

In [None]:
# Load the per-country dataset that the rest of the notebook works on.
countries = pd.read_csv('/kaggle/input/unsupervised-learning-on-country-data/Country-data.csv')

In [None]:
# Preview the first rows of the data as loaded.
countries.head()

In [None]:
# (rows, columns) of the data as loaded.
countries.shape

Since exports, health and imports are given as percentages of GDPP, we first convert them to their actual values. This allows a like-for-like comparison between the countries.


In [None]:
# Turn the three percentage-of-gdpp columns into absolute values: value * gdpp / 100, row by row.
# NOTE: this overwrites the columns in place, so running the cell twice applies the scaling twice.
pct_of_gdpp_cols = ['exports', 'health', 'imports']
gdpp_fraction = countries['gdpp'] / 100
countries[pct_of_gdpp_cols] = countries[pct_of_gdpp_cols].mul(gdpp_fraction, axis='index')


In [None]:
# Preview the first rows again to inspect the current column values.
countries.head()

In [None]:
# Print column dtypes and non-null counts.
countries.info()

In [None]:
# Count the rows whose country name occurs more than once (every occurrence is counted).
countries['country'].duplicated(keep=False).sum()

There are no duplicated rows

In [None]:
# Number of missing values in each column.
countries.isna().sum()

There are no NULL values in the dataset

#### Checking for outliers and treating them

In [None]:
# Summary statistics, including the upper percentiles (90/95/99) to expose extreme values.
countries.describe(percentiles=[.25,.5,.75,.90,.95,.99])

### Performing Univariate Analysis on all the features

In [None]:
# Keep the column labels; later cells iterate over features[1:] (everything after the first column).
features = countries.columns
features

In [None]:
# One horizontal box plot per column after the first, placed on a 5x3 subplot grid.
fig = plt.figure(figsize=(15, 25))
for position, feature in enumerate(features[1:], start=1):
    fig.add_subplot(5, 3, position)
    sns.boxplot(x=countries[feature], data=countries)
                 

From the above plots, we can conclude the following:
- There are outliers in every feature.
- Most of the outliers are in health, exports and gdpp.
- There are only 167 rows (countries), so removing all of these outliers would delete crucial information from the dataset.
- In particular, the countries with exceptionally high child mortality (outliers) would be lost, even though they are the ones that actually need aid from the NGO. For example, the country with a child mortality rate of `208` is an outlier, yet that country could be in dire need of aid.
- So I chose to treat the outliers only in the `gdpp` column. Countries with a high `gdpp` would not require aid, so I will remove them from the dataset.

In [None]:
# Compute the 5th and 95th percentiles of gdpp. Despite the names, Q1 and Q3 are NOT
# quartiles here; Q3 (the 95th percentile) is the upper cut-off used for outlier removal.
Q1 = countries.gdpp.quantile(0.05)
Q3 = countries.gdpp.quantile(0.95)

# Show the countries whose gdpp lies above the 95th percentile.
countries[(countries.gdpp > Q3)]

###### In the above table we can see that Qatar has a high child mortality rate of 9.0, but its other features such as gdpp, total fertility rate and income are in the higher range. So it would not be first in the list of needy countries.

After removing the outlier

In [None]:
# Keep only the rows whose gdpp is at or below the Q3 cut-off. The original row index is
# retained (no reset), which later cells rely on when attaching cluster labels.
countries = countries[(countries.gdpp <= Q3)]

In [None]:
# (rows, columns) of the current `countries` frame.
countries.shape

In [None]:
# Preview the first five rows of the current `countries` frame.
countries.head(5)

#### Visualizing the data using Pair Plot

In [None]:
# `sns.pairplot` is figure-level: it creates its own figure, so a preceding
# `plt.figure(...)` would only render as an extra, empty figure and is omitted.
# corner=True draws the lower triangle only; the diagonal shows a KDE per feature.
sns.pairplot(countries, diag_kind='kde', corner=True);

From the above plots we can conclude the following about gdpp:
- It is highly and positively correlated with life expectancy and income
- It is inversely correlated with child_mortality, total fertility and inflation
- health, imports and exports are also positively correlated with gdpp
- There is also a linear relationship between gdpp and income, imports and exports, and total_fer and child_mort

### Top 10 underdeveloped countries based on the data visualization

###### Top 10 countries with the highest child mortality rate

In [None]:
# Ten countries with the highest child mortality, in descending order.
child_mort_top10 = (
    countries[['country', 'child_mort']]
    .sort_values('child_mort', ascending=False)
    .head(10)
)

plt.figure(figsize=(15, 5))
ax = sns.barplot(x='country', y='child_mort', data=child_mort_top10)

# Write each bar's value centred just above the bar. Anchoring on the bar centre
# (x + width / 2) keeps every label aligned with its bar; scaling x by a constant
# factor would push the labels further right with every successive bar.
for bar in ax.patches:
    ax.annotate(
        str(bar.get_height()),
        (bar.get_x() + bar.get_width() / 2, bar.get_height()),
        ha='center',
        va='bottom',
    )
ax.set(xlabel='', ylabel='Child Mortality Rate')
plt.xticks(rotation=45);

###### Top 10 countries with the `lowest` spending on health

In [None]:
# Ten countries with the smallest `health` values, in ascending order.
health = (
    countries.loc[:, ['country', 'health']]
    .sort_values(by='health', ascending=True)
    .head(10)
)
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=health, x='country', y='health')
ax.set_xlabel('')
ax.set_ylabel('Health')
plt.xticks(rotation=90)

###### Top 10 countries with lowest per capita income.

In [None]:
# Ten countries with the smallest `income` values, in ascending order.
income_lowets_10 = (
    countries.loc[:, ['country', 'income']]
    .sort_values(by='income', ascending=True)
    .head(10)
)
plt.figure(figsize=(15, 5))
axes = sns.barplot(data=income_lowets_10, x='country', y='income')
axes.set_xlabel('')
axes.set_ylabel('Per Capita Income')
plt.xticks(rotation=90)

###### Top 10 Countries with the highest Inflation rate.

In [None]:
# Ten countries with the largest `inflation` values, in descending order.
high_inflation = (
    countries.loc[:, ['country', 'inflation']]
    .sort_values(by='inflation', ascending=False)
    .head(10)
)
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=high_inflation, x='country', y='inflation')
ax.set_xlabel('')
ax.set_ylabel('Inflation')
plt.xticks(rotation=90)


From the above plot we can see that Nigeria clearly stands out with the highest inflation.

#### From the above visualizations we get a good idea of the underdeveloped countries which might be in need of the NGO's aid.


#### Checking whether the dataset is suitable for clustering

In [None]:
#Find the Hopkins statistic score
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, sample_frac=0.1, random_state=None):
    """Estimate the Hopkins statistic of ``X`` (a measure of clustering tendency).

    The statistic is ``H = sum(u) / (sum(u) + sum(w))`` where

    * ``u_j`` is the distance from a point drawn uniformly inside the bounding
      box of ``X`` to its nearest row of ``X``;
    * ``w_j`` is the distance from a randomly sampled row of ``X`` to its
      nearest *other* row of ``X``.

    Parameters
    ----------
    X : DataFrame or 2-D array-like of numbers
        One observation per row, one feature per column.
    sample_frac : float, default 0.1
        Fraction of the rows used as the sample size ``m``; at least one and
        at most ``n`` points are always used.
    random_state : int, numpy Generator or None, default None
        Seed for the sampling. ``None`` gives a different draw on every call.

    Returns
    -------
    float
        The Hopkins statistic, or 0 when it is undefined (all distances zero).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2 or data.shape[0] < 2:
        raise ValueError("hopkins() needs a 2-D input with at least two rows")

    n, d = data.shape
    # Never let the sample size collapse to zero (int(0.1 * n) is 0 for n < 10).
    m = min(n, max(1, int(sample_frac * n)))

    rng = np.random.default_rng(random_state)
    nbrs = NearestNeighbors(n_neighbors=2).fit(data)

    # u: synthetic uniform points are not part of the fitted data, so their
    # nearest neighbour is the FIRST result (column 0).
    synthetic = rng.uniform(data.min(axis=0), data.max(axis=0), size=(m, d))
    u_dist, _ = nbrs.kneighbors(synthetic, n_neighbors=1, return_distance=True)

    # w: sampled rows ARE part of the fitted data, so the first result is the
    # row itself (distance 0) and the nearest other row is the SECOND result.
    sampled_rows = rng.choice(n, size=m, replace=False)
    w_dist, _ = nbrs.kneighbors(data[sampled_rows], n_neighbors=2, return_distance=True)

    u_total = float(u_dist[:, 0].sum())
    w_total = float(w_dist[:, 1].sum())
    denominator = u_total + w_total
    if not denominator > 0:
        # Zero (or NaN) denominator: the statistic is undefined, report 0.
        return 0

    return u_total / denominator

In [None]:
# Hopkins statistic of the numeric columns (the `country` name column is dropped first).
hopkins(countries.drop('country', axis = 1))

### We have got the Hopkins score close to 1 which tends to indicate the data is highly clustered

### Preparing the dataset for clustering

#### Scaling the dataset

In [None]:
# Standardise every column except `country` with StandardScaler.
features = countries.columns
standard_scaler = StandardScaler()
numeric_part = countries.drop(columns='country')
countries_scaled = standard_scaler.fit_transform(numeric_part)
countries_scaled

In [None]:
# Wrap the scaled array in a DataFrame labelled with the column names after the first one.
# The frame gets a fresh 0..n-1 row index, not the index of `countries`.
country_norm = pd.DataFrame(countries_scaled, columns=features[1:])
country_norm.head()

#### Finding the  Silhouette score

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of K-Means for k = 2..14. A fixed random_state makes the curve
# (and therefore the choice of k read off it) reproducible across runs.
ss = []
for k in range(2, 15):
    kmean = KMeans(n_clusters=k, random_state=50).fit(country_norm)
    ss.append([k, silhouette_score(country_norm, kmean.labels_)])
temp = pd.DataFrame(ss)

plt.plot(temp[0], temp[1], marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette score')
plt.title('Silhouette score by number of clusters');

#### Elbow score

In [None]:
# Within-cluster sum of squared distances (inertia) of K-Means for k = 1..20.
# A fixed random_state makes the elbow curve reproducible across runs.
cluster_range = range(1, 21)
ssd = []
for num_clusters in cluster_range:
    model_clus = KMeans(n_clusters=num_clusters, max_iter=50, random_state=50)
    model_clus.fit(country_norm)
    ssd.append(model_clus.inertia_)

fig, ax = plt.subplots()
ax.plot(cluster_range, ssd, marker='+', color='red', ls='--',
        markeredgecolor='blue', markersize=10)
ax.set(xlabel='Number of clusters (k)', ylabel='Sum of squared distances',
       title='Elbow curve');

###### Looking at the silhouette plot, we see that the highest peak is at k = 4, and in the sum of squared distances plot we see that the elbow is in the range of 3 to 5, so let us take k as 4.

### Performing K-Means Clustering on the scaled dataset


In [None]:
# Fit the final 4-cluster K-Means model on the scaled features; random_state fixes the initialisation.
kmeans_clus4 = KMeans(n_clusters=4, max_iter=50, random_state= 50)
kmeans_clus4.fit(country_norm)

In [None]:
# Cluster label assigned to each row of `country_norm`, in row order.
kmeans_clus4.labels_

In [None]:
# Wrap the labels in a Series indexed like `countries`. The labels are in row order of
# `country_norm`, whose index is a fresh 0..n-1 range, so re-using `countries.index`
# is what lets the labels line up with the right rows when they are assigned below.
labels_kmeans = pd.Series(kmeans_clus4.labels_, index=countries.index)

In [None]:
# Build a labelled copy of `countries`; `assign` returns a new frame, so `countries` is untouched.
countries_kmeans = countries.assign(Labels=labels_kmeans)
countries_kmeans.head()

### Checking the number of countries assigned to each clusters

In [None]:
# Number of countries in each K-Means cluster.
countries_kmeans['Labels'].value_counts()

## Profiling the clusters based on  - [gdpp, child_mort and income] 

#### Bivariate analysis of clustered countries 

In [None]:
# gdpp (x) against child mortality (y), one colour per K-Means cluster label.
cluster_palette = ['green', 'orange', 'brown', 'red']
axes = sns.scatterplot(
    data=countries_kmeans,
    x='gdpp',
    y='child_mort',
    hue='Labels',
    palette=cluster_palette,
    legend='full',
)
axes.set(xlabel='GDPP', ylabel='Child Mortality Rate')

In [None]:
# gdpp (x) against income (y), one colour per K-Means cluster label.
cluster_palette = ['green', 'orange', 'brown', 'red']
axes = sns.scatterplot(
    data=countries_kmeans,
    x='gdpp',
    y='income',
    hue='Labels',
    palette=cluster_palette,
    legend='full',
)
axes.set(xlabel='GDPP', ylabel='Income')

In [None]:
# income (x) against child mortality (y), one colour per K-Means cluster label.
# The axis labels must follow the plotted columns: x is income, y is child_mort.
axes = sns.scatterplot(x='income', y='child_mort', hue='Labels', legend='full',
                       data=countries_kmeans, palette=['green', 'orange', 'brown', 'red'])
axes.set(xlabel='Income', ylabel='Child Mortality Rate')

### From the above 3 scatterplots we can deduce the following insights about the clusters, especially about the cluster labeled 1.

- In the gdpp vs child_mort plot, the child_mort is high when gdpp is low.
- In the gdpp vs income, when the gdpp is average the income is average.
- In the income vs child_mort, when income is high the child mortality is low and vice versa.

#### Taking the means and analyzing the clusters further

In [None]:
# Per-cluster mean of every feature, each kept as a one-column DataFrame indexed by label.
_kmeans_groups = countries_kmeans.groupby(["Labels"])
child_mort_mean = _kmeans_groups["child_mort"].mean().to_frame()
exports_mean = _kmeans_groups["exports"].mean().to_frame()
health_mean = _kmeans_groups["health"].mean().to_frame()
imports_mean = _kmeans_groups["imports"].mean().to_frame()
income_mean = _kmeans_groups["income"].mean().to_frame()
inflat_mean = _kmeans_groups["inflation"].mean().to_frame()
life_expec_mean = _kmeans_groups["life_expec"].mean().to_frame()
total_fer_mean = _kmeans_groups["total_fer"].mean().to_frame()
gdpp_mean = _kmeans_groups["gdpp"].mean().to_frame()

In [None]:
# Put the label column and the nine per-cluster mean frames side by side (aligned on index 0..3).
_kmeans_mean_frames = [
    child_mort_mean,
    exports_mean,
    health_mean,
    imports_mean,
    income_mean,
    inflat_mean,
    life_expec_mean,
    total_fer_mean,
    gdpp_mean,
]
cluster_analysis_df = pd.concat([pd.Series([0, 1, 2, 3]), *_kmeans_mean_frames], axis=1)

In [None]:
# Give the combined frame readable column names: the label column first, then one "<feature>_mean" per feature.
cluster_analysis_df.columns = ["Labels","child_mort_mean","exports_mean","health_mean","imports_mean","income_mean","inflation_mean","life_expec_mean","total_fer_mean","gdpp_mean"]
cluster_analysis_df

In [None]:
# One bar chart per mean column (every column after `Labels`) on a 3x3 grid.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 onwards.
features = cluster_analysis_df.columns
fig = plt.figure(figsize=(18, 15))
for plot, feature in enumerate(features[1:]):
    ax = fig.add_subplot(3, 3, plot + 1)
    sns.barplot(x='Labels', y=feature, data=cluster_analysis_df, ax=ax)

From the above plot we can see that the criteria, namely `gdpp, income and child mortality`, are satisfied by the cluster labeled 1.

In [None]:
# Take the countries of K-Means cluster 1 and rank them by the keys below
# (True = ascending, False = descending); show the first ten rows.
clustered_kmeans = countries.loc[countries_kmeans['Labels'] == 1]
_kmeans_sort_order = {
    'gdpp': True,
    'income': True,
    'child_mort': False,
    'health': True,
    'inflation': False,
    'life_expec': True,
    'total_fer': False,
    'imports': False,
    'exports': True,
}
clustered_kmeans.sort_values(
    list(_kmeans_sort_order.keys()),
    ascending=list(_kmeans_sort_order.values()),
).head(10)


### So from the above results and analysis, we can conclude that, as per K-means clustering, the countries in direst need of aid are:
    1. Burundi
    2. Liberia
    3. Congo, Dem. Rep
    4. Niger
    5. Sierra Leone

##  Taking Hierarchical Clustering approach

#### Using the already cleaned and scaled data, done earlier in  kmeans analysis

In [None]:
# (rows, columns) of the scaled feature frame.
country_norm.shape

In [None]:
# Preview the scaled features.
country_norm.head()

In [None]:
# Work on a copy of the scaled features and build a single-linkage hierarchy (Euclidean distance).
countries_hrcl = country_norm.copy()
mergings = linkage(countries_hrcl, method="single", metric='euclidean')

# Draw the resulting dendrogram on a wide figure.
fig = plt.figure(figsize=(25, 8))
dendrogram(mergings)
plt.show()

### With the single-linkage method the hierarchical clustering is not clear, so let us use the complete-linkage method.

In [None]:
# Rebuild the hierarchy with complete linkage (Euclidean distance); this overwrites `mergings`.
mergings = linkage(countries_hrcl, method="complete", metric='euclidean')

# Draw the resulting dendrogram on a wide figure.
fig = plt.figure(figsize=(25, 8))
dendrogram(mergings)
plt.show()

##### At 12.5 there are 3 clusters and at 10.0 there are 4 clusters
### Let us cut the dendrogram  such that there are  n_clusters = 4 


In [None]:
# Display the (unscaled) `countries` frame that the hierarchical labels will be attached to.
countries

In [None]:
# Cut the hierarchy into 4 clusters and flatten the (n, 1) result into a 1-D label array.
clusterCut = cut_tree(mergings, n_clusters=4).ravel()
clusterCut

In [None]:
# Work on a copy so that attaching labels does not modify `countries`.
countries_hrcl_cluster = countries.copy()
countries_hrcl_cluster.head()

In [None]:
# Attach the hierarchical cluster labels. The Series is indexed with `countries.index`
# so that the positional labels in `clusterCut` align with the frame's row index.
countries_hrcl_cluster['Labels'] = pd.Series(clusterCut, index=countries.index)
countries_hrcl_cluster.head()

### Checking the number of countries assigned to each clusters

In [None]:
# Number of countries in each hierarchical cluster.
countries_hrcl_cluster['Labels'].value_counts()

The above numbers are quite different from the ones we got with the K-Means method.
But that is understandable, because hierarchical clustering forms different clusters than K-Means does.

## Profiling the clusters based on  - [gdpp, child_mort and income] 

#### Bivariate analysis of clustered countries 

In [None]:
# gdpp (x) against child mortality (y), one colour per hierarchical cluster label.
cluster_palette = ['green', 'orange', 'brown', 'red']
axes = sns.scatterplot(
    data=countries_hrcl_cluster,
    x='gdpp',
    y='child_mort',
    hue='Labels',
    palette=cluster_palette,
    legend='full',
)
axes.set(xlabel='GDPP', ylabel='Child Mortality Rate')

In [None]:
# gdpp (x) against income (y), one colour per hierarchical cluster label.
cluster_palette = ['green', 'orange', 'brown', 'red']
axes = sns.scatterplot(
    data=countries_hrcl_cluster,
    x='gdpp',
    y='income',
    hue='Labels',
    palette=cluster_palette,
    legend='full',
)
axes.set(xlabel='GDPP', ylabel='Income')

In [None]:
# income (x) against child mortality (y), one colour per hierarchical cluster label.
# The axis labels must follow the plotted columns: x is income, y is child_mort.
axes = sns.scatterplot(x='income', y='child_mort', hue='Labels', legend='full',
                       data=countries_hrcl_cluster, palette=['green', 'orange', 'brown', 'red'])
axes.set(xlabel='Income', ylabel='Child Mortality Rate')

### From the above 3 scatterplots we can deduce the following insights about the clusters, especially about the cluster labeled 0.

- In the gdpp vs child_mort plot, the child_mort is high when gdpp is low.
- In the gdpp vs income, when the gdpp is average the income is average.
- In the income vs child_mort, when income is high the child mortality is low and vice versa.

#### Taking the means of the features and analyzing the clusters further

In [None]:
# Per-cluster mean of every feature, each kept as a one-column DataFrame indexed by label.
# These names are re-used from the K-Means section and are overwritten here.
_hrcl_groups = countries_hrcl_cluster.groupby(["Labels"])
child_mort_mean = _hrcl_groups["child_mort"].mean().to_frame()
exports_mean = _hrcl_groups["exports"].mean().to_frame()
health_mean = _hrcl_groups["health"].mean().to_frame()
imports_mean = _hrcl_groups["imports"].mean().to_frame()
income_mean = _hrcl_groups["income"].mean().to_frame()
inflat_mean = _hrcl_groups["inflation"].mean().to_frame()
life_expec_mean = _hrcl_groups["life_expec"].mean().to_frame()
total_fer_mean = _hrcl_groups["total_fer"].mean().to_frame()
gdpp_mean = _hrcl_groups["gdpp"].mean().to_frame()

In [None]:
# Put the label column and the nine per-cluster mean frames side by side (aligned on index 0..3).
_hrcl_mean_frames = [
    child_mort_mean,
    exports_mean,
    health_mean,
    imports_mean,
    income_mean,
    inflat_mean,
    life_expec_mean,
    total_fer_mean,
    gdpp_mean,
]
hrcl_cluster_analysis_df = pd.concat([pd.Series([0, 1, 2, 3]), *_hrcl_mean_frames], axis=1)

In [None]:
# Give the combined frame readable column names: the label column first, then one "<feature>_mean" per feature.
hrcl_cluster_analysis_df.columns = ["Labels","child_mort_mean","exports_mean","health_mean","imports_mean","income_mean","inflation_mean","life_expec_mean","total_fer_mean","gdpp_mean"]
hrcl_cluster_analysis_df

In [None]:
# One bar chart per mean column (every column after `Labels`) on a 3x3 grid.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 onwards.
features = hrcl_cluster_analysis_df.columns
fig = plt.figure(figsize=(18, 15))
for plot, feature in enumerate(features[1:]):
    ax = fig.add_subplot(3, 3, plot + 1)
    sns.barplot(x='Labels', y=feature, data=hrcl_cluster_analysis_df, ax=ax)

In [None]:
# Take the countries of hierarchical cluster 0 and rank them by the keys below
# (True = ascending, False = descending); show the first ten rows.
clustered_hrcl = countries.loc[countries_hrcl_cluster['Labels'] == 0]
_hrcl_sort_order = {
    'gdpp': True,
    'income': True,
    'child_mort': False,
    'health': True,
    'inflation': False,
    'life_expec': True,
    'total_fer': False,
    'imports': False,
    'exports': True,
}
clustered_hrcl.sort_values(
    list(_hrcl_sort_order.keys()),
    ascending=list(_hrcl_sort_order.values()),
).head(10)


### So from the above results and analysis, we can conclude that, as per hierarchical clustering, the countries in direst need of aid are:
    1. Burundi
    2. Liberia
    3. Congo, Dem. Rep
    4. Niger
    5. Sierra Leone

### We got the same countries with both the K-means and hierarchical clustering techniques:
##### Therefore, the following are the countries in direst need of aid, taking socio-economic factors into consideration:

    1. Burundi
    2. Liberia
    3. Congo, Dem. Rep.
    4. Niger
    5. Sierra Leone