In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import the StandardScaler()
from sklearn.preprocessing import StandardScaler

#Improting the PCA module
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
from math import isnan

# To perform KMeans clustering 
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

#Let's check the silhouette score first to identify the ideal number of clusters
from sklearn.metrics import silhouette_score

## Read and Understand the data

In [None]:
# reading datasets
country_data = pd.read_csv('../input/pca-kmeans-hierarchical-clustering/Country-data.csv')
country_data.head()

In [None]:
country_data.shape

In [None]:
country_data.columns

In [None]:
country_data.info()

## Data Cleaning
- Identifying Missing data
- Identifying wrong data type
- Removing duplicates

In [None]:
# Missing-value audit: absolute count and percentage of nulls per column,
# each sorted with the worst-affected column first.
null_mask = country_data.isnull()
null_counts = null_mask.sum()
total_null = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)
print("Total records (country_data Data) = ", country_data.shape[0])

missing_data = pd.concat(
    [total_null, percent.round(2)],
    axis=1,
    keys=['Total Missing', 'In Percent'],
)
missing_data.head(5)

## Inference
- None of the columns have null values hence no drop required.

In [None]:
country_data.dtypes

## Inference
None of the columns have inconsistent datatype, hence no conversion is required.

## Data Preparation
* Derived Metrics
* Exploratory Data Analysis
    * Visualize the data (understanding the top or bottom 10 countries) on various factors
* Correlation between different variables: how they are correlated
* Scaling the Data
* PCA (Principal Component Analysis) on the data to remove redundancies.

In [None]:
country_data.describe()

## Derived Metrics
* The variables exports, health & imports are percentage values and hence don't give a clear picture of the actual spending by a country. For example, two countries (Afghanistan & Albania) have a similar import % but not necessarily the same gdpp, so the percentage alone doesn't give an accurate idea of whether a country is developed or under-developed. Hence we need to derive the actual value of each of these variables.

In [None]:
# Convert exports, imports and health spending from percentages to absolute
# values (percentage * gdpp / 100).
# NOTE: the percentage columns are overwritten in place, so this cell must be
# run exactly once per kernel session; re-running it multiplies by gdpp again.
for pct_col in ('exports', 'imports', 'health'):
    country_data[pct_col] = country_data[pct_col] * country_data['gdpp'] / 100

country_data.head(10)

## Exploratory Data Analytics
* With the new derived variables let's visualise the top/bottom countries on different socio-economic and health factors

In [None]:
def plot_ranked_bars(position, column, ascending, palette, title):
    """Draw one bar-chart panel of the 10 countries ranked by ``column``.

    Parameters
    ----------
    position : int
        1-based slot in the 2x3 subplot grid of the current figure.
    column : str
        Name of the ``country_data`` column to rank on.
    ascending : bool
        True plots the 10 lowest values, False the 10 highest.
    palette : str
        Seaborn palette name used for the bars.
    title : str
        Panel title.

    Returns
    -------
    pandas.DataFrame
        The 10-row (country, column) frame that was plotted.
    """
    ranked = (
        country_data[['country', column]]
        .sort_values(column, ascending=ascending)
        .head(10)
    )
    plt.subplot(2, 3, position)
    sns.barplot(x='country', y=column, data=ranked, palette=palette)
    plt.title(title, fontsize=9)
    plt.xticks(rotation=90, fontsize=10, family="Comic Sans MS")
    return ranked


# One inner list per figure; each panel is (column, ascending, title).
# Note: exports / imports / health hold absolute values at this point (they
# were converted from percentages of GDP earlier in the notebook).
eda_figures = [
    [
        # Health spending, exports and imports: 10 lowest values
        ('health', True, "Top 10 Countries with lowest spent on Health overall gdp"),
        ('exports', True, "Top 10 Countries with lowest exports"),
        ('imports', True, "Top 10 Countries with lowest imports"),
    ],
    [
        # Child mortality: deaths of children under 5 per 1000 live births
        ('child_mort', False, "Top 10 Countries with highest child Death Rate"),
        # Fertility rate: children born per woman at current age-fertility rates
        ('total_fer', False, "Top 10 Countries with highest Fertility Rate"),
        # Life expectancy: expected years of life for a new-born at current mortality
        ('life_expec', True, "Top 10 Countries with lowest life Expectancy"),
    ],
    [
        # GDP per capita and net income per person: 10 lowest values
        ('gdpp', True, "Bottom 10 Countries with overall gdpp"),
        ('income', True, "Bottom 10 Countries with net income per person"),
        # Inflation is sorted descending, i.e. this panel shows the 10 HIGHEST
        # rates; the title now says so (it previously read "Bottom 10").
        ('inflation', False, "Top 10 Countries with highest inflation rate"),
    ],
]
panel_palettes = ("BuGn_r", "Blues", "Reds")

for panels in eda_figures:
    fig = plt.figure(figsize=(14, 6))
    fig.set_facecolor("lightgrey")
    for position, (panel, palette) in enumerate(zip(panels, panel_palettes), start=1):
        column, ascending, title = panel
        plot_ranked_bars(position, column, ascending, palette, title)

plt.show()

## Correlation coefficients
- We have a lot of variables, so creating and visualising proper clusters will be difficult. Let's see if we can apply PCA for dimensionality reduction. First, let's plot the correlation matrix and check whether the data is indeed highly correlated, so that the usage of PCA in this scenario is justified.

In [None]:
# Correlation coefficients to see which variables are highly correlated.
# Only the numeric columns are correlated: `country` is a string column, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.

plt.figure(figsize = (16, 10))
sns.heatmap(country_data.select_dtypes(include='number').corr(),annot=True,cmap="Greens")
plt.show()

## Inference
- child_mortality and life_expectancy are highly correlated, with a correlation of -0.89
- child_mortality and total_fertility are highly correlated, with a correlation of 0.85
- imports and exports are highly correlated, with a correlation of 0.99
- life_expectancy and total_fertility are highly correlated, with a correlation of -0.76

In [None]:
# Pairplot of all numeric columns
sns.pairplot(country_data)

## Inferences
- A lot of highly correlated variables exist, hence the usage of PCA is justified. Now let's proceed to doing it on the dataset

In [None]:
country_data_tmp = country_data.copy()
country_data_tmp.head()

## Rescaling the features

- Most software packages use SVD to compute the principal components and assume that the data is scaled and centred, so it is important to do standardisation/normalisation.

In [None]:
# Columns to standardise: every numeric indicator in the dataset
col_list = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

# Fit the scaler on those columns and write the scaled values back in place
scaler = StandardScaler()
scaled_values = scaler.fit_transform(country_data_tmp[col_list])
country_data_tmp[col_list] = scaled_values

country_data_tmp.head()

## Separate out the Feature variable and response variable

In [None]:
# Feature matrix: every column of the scaled frame except the country name
x = country_data_tmp.drop(['country'],axis=1)

# Country names, kept aside as row identifiers (a label column, not a model target)
country = country_data_tmp['country']

In [None]:
# Only a cell's last expression is auto-displayed, so print the shape explicitly
print(x.shape)
x.head()

In [None]:
# Only a cell's last expression is auto-displayed, so print the shape explicitly
print(country.shape)
country.head()

## Applying PCA on data

### PCA
- Principal component analysis (PCA) is one of the most commonly used dimensionality reduction techniques to improve model performance

In [None]:
pca = PCA(svd_solver='randomized',random_state=42)
pca.fit(x)

In [None]:
pca.components_

In [None]:
# Variance Ratio
pca.explained_variance_ratio_

In [None]:
# Variance Ratio bar plot for each PCA components.
fig = plt.figure(figsize = (8,6))
plt.bar(range(1,len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_)
plt.xlabel("PCA Components",fontsize=10,family = "Comic Sans MS")
plt.ylabel("Variance Ratio",fontsize=10,family = "Comic Sans MS")
plt.title("Variance Ratio for each PCA component",fontsize=14,family = "Comic Sans MS")

## Inference
* With first component variance explained is almost 60%.
* For second component variance explained is almost 20%.
* For third component variance explained is around 10%

## Making Scree plot

In [None]:
# Scree plot: cumulative share of variance explained by the first k components
fig = plt.figure(figsize = (8,6))
fig.set_facecolor("lightgrey")
var_cumu = np.cumsum(pca.explained_variance_ratio_)
n_components = len(var_cumu)
plt.plot(range(1, n_components + 1), var_cumu)
# Guide lines marking the chosen cut-off: 3 components, ~88% cumulative variance
plt.vlines(x=3,ymax=1,ymin=0.0,color="g",linestyles="--")
# Span the full x range instead of a hard-coded xmax=8
plt.hlines(y=0.88,xmax=n_components,xmin=0.0,color="b",linestyles="--")

plt.xlabel('Number of PCA Components',fontsize=12,family = "Comic Sans MS")
plt.ylabel('Cumulative Explained Variance',fontsize=12,family = "Comic Sans MS")
plt.title("Scree plot to Visualize Cumulative Variance",fontsize=14,family = "Comic Sans MS")

## Inference

- From the above it is clear that the first 3 principal components explain around 90% of the variance. Hence we will use them in the clustering process.

In [None]:
# Loadings of every original attribute on the first three principal components
colnames = list(x.columns)
pca_attr = pd.DataFrame(pca.components_[:3].T, columns=['PC1', 'PC2', 'PC3'])
pca_attr.insert(0, 'Attribute', colnames)
pca_attr

In [None]:
# Scatter the attribute loadings on PC1 vs PC2 and label each point
sns.set(style='darkgrid')
sns.pairplot(data=pca_attr, x_vars=["PC1"], y_vars=["PC2"], hue = "Attribute" ,height=8)
plt.xlabel("Principal Component 1",fontsize=12,family = "Comic Sans MS")
plt.ylabel("Principal Component 2",fontsize=12,family = "Comic Sans MS")

# Annotate every point with the name of the attribute it represents
for label, pc1, pc2 in zip(pca_attr['Attribute'], pca_attr['PC1'], pca_attr['PC2']):
    plt.annotate(label, (pc1, pc2))

## Inference
- life expectancy, income, gdpp and health are very well explained by PC1.
- imports and exports are well explained by both the components PC1 and PC2.
- child mortality,inflation and total fertility are well explained by PC2.


In [None]:
# Building the dataframe using Incremental PCA for better efficiency.
inc_pca = IncrementalPCA(n_components=3)

In [None]:
pca_final = inc_pca.fit_transform(x)
pca_final.shape

In [None]:
pca_final = pd.DataFrame(pca_final, columns=["PC1", "PC2","PC3"])
df = pd.concat([country, pca_final], axis=1)
df.head()

In [None]:
# Plotting a heatmap to check whether there is still dependency in the dataset.
# `df` also carries the string `country` column, so restrict the correlation
# to numeric columns (DataFrame.corr() raises on strings in pandas >= 2.0).

plt.figure(figsize = (8,6))
ax = sns.heatmap(df.select_dtypes(include='number').corr(),annot = True,cmap="Blues")

## Inference:

From above heatmap  - Correlation among the attributes is almost 0, we can proceed with this dataframe

In [None]:
sns.set(style='white')

# Create the figure first and style THAT figure; the previous code called
# fig.set_facecolor on a stale `fig` left over from an earlier cell.
fig = plt.figure(figsize=(20, 8))
fig.set_facecolor("lightgrey")

# Pairwise scatter plots of the three principal components
pc_pairs = [('PC1', 'PC2'), ('PC1', 'PC3'), ('PC3', 'PC2')]
for position, (x_pc, y_pc) in enumerate(pc_pairs, start=1):
    plt.subplot(2, 3, position)
    sns.scatterplot(data=df, x=x_pc, y=y_pc)

## Outlier Analysis and Treatment

In [None]:
# Function to draw box plots for a list of numeric columns side by side
def box_plot(colname, data=None):
    """Draw one horizontal box plot per column in a single-row figure.

    Parameters
    ----------
    colname : sequence of str
        Names of the columns to plot.
    data : pandas.DataFrame, optional
        Frame holding the columns. Defaults to the notebook-level ``df`` so
        existing calls ``box_plot(cols)`` keep working.
    """
    frame = df if data is None else data
    columns = list(colname)
    plt.figure(figsize=(15, 4))
    # Keep the original 3-slot layout for up to three columns, and widen the
    # grid instead of failing when more are requested.
    n_slots = max(3, len(columns))
    # enumerate (rather than list.index) gives each column its own slot even
    # when a name is repeated.
    for position, var in enumerate(columns, start=1):
        plt.subplot(1, n_slots, position)
        sns.boxplot(x = var, data = frame)
        plt.xlabel(var, fontsize=12,family = "Comic Sans MS")
    plt.show()

In [None]:
colnames = ['PC1', 'PC2', 'PC3']
box_plot(colnames[:])

In [None]:
# Percentile-based outlier treatment: for each principal component in turn,
# keep only the rows lying between its 5th and 95th percentiles.
# The filters are applied sequentially, so the PC2 and PC3 percentiles are
# computed on the rows that survived the previous step.
# NOTE: this cell rebinds `df`; re-running it trims the data a second time.
for pc in ['PC1', 'PC2', 'PC3']:
    lower = df[pc].quantile(0.05)
    upper = df[pc].quantile(0.95)
    df = df[(df[pc] >= lower) & (df[pc] <= upper)]

In [None]:
colnames = ['PC1', 'PC2', 'PC3']
box_plot(colnames[:])

In [None]:
# Reindexing the df after outlier removal
df = df.reset_index(drop=True)
# Numeric-only view (principal components) used as clustering input
df_final = df.drop(['country'],axis=1)
# Only the last expression of a cell is displayed, so print the shape
# explicitly and leave the preview as the final expression.
print(df_final.shape)
df.head()

## Hopkins Statistics Test

- A way of measuring the cluster tendency of a data set.
- With the statistic as computed below, a value close to 1 indicates that the data is highly clustered, randomly distributed data tends to give values around 0.5, and regularly spaced (evenly gridded) data tends to give values close to 0.

In [None]:
def hopkins(X):
    """Estimate the Hopkins statistic (clustering tendency) of ``X``.

    A sample of ``m`` real rows and ``m`` synthetic points drawn uniformly
    from the bounding box of the data are each matched to their nearest data
    point. The statistic is ``sum(u) / (sum(u) + sum(w))`` where ``u`` are the
    synthetic-point distances and ``w`` the real-point distances.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric data, one row per observation. Needs at least two rows.

    Returns
    -------
    float
        The Hopkins statistic, or 0 when it is undefined (all distances zero).
        The result is stochastic: it depends on the global ``random`` and
        ``numpy.random`` state.
    """
    d = X.shape[1]
    n = len(X)
    # Sample 10% of the rows, but never zero (int(0.1 * n) is 0 for n < 10)
    m = max(1, int(0.1 * n))
    values = X.values
    nbrs = NearestNeighbors(n_neighbors=1).fit(values)

    # Bounding box of the data, hoisted out of the loop
    lower = values.min(axis=0)
    upper = values.max(axis=0)

    rand_X = sample(range(0, n, 1), m)

    ujd = []
    wjd = []
    for j in range(0, m):
        # Synthetic point: it is not part of the fitted data, so its nearest
        # data point is neighbour 0 (the previous code took neighbour 1, i.e.
        # the second-nearest, which inflates the statistic).
        probe = uniform(lower, upper, d).reshape(1, -1)
        u_dist, _ = nbrs.kneighbors(probe, 1, return_distance=True)
        ujd.append(u_dist[0][0])
        # Real point: neighbour 0 is the point itself (distance 0), so the
        # nearest *other* point is neighbour 1.
        w_dist, _ = nbrs.kneighbors(values[rand_X[j]].reshape(1, -1), 2, return_distance=True)
        wjd.append(w_dist[0][1])

    denominator = sum(ujd) + sum(wjd)
    HS = sum(ujd) / denominator if denominator else float('nan')
    if isnan(HS):
        print(ujd, wjd)
        HS = 0

    return HS

In [None]:
#Let's check the Hopkins measure
hopkins(df_final)


## Inference

- 0.75 is a good Hopkins score for Clustering.

## Building Model

- K Means Clustering

K-means clustering is one of the simplest and popular unsupervised machine learning algorithms.

The algorithm works as follows:

First we initialize k points, called means, randomly. We categorize each item to its closest mean and we update the mean’s coordinates, which are the averages of the items categorized in that mean so far. We repeat the process for a given number of iterations and at the end, we have our clusters.

### Finding the Optimal Number of Clusters

Elbow Curve to get the right number of Clusters

A fundamental step for any unsupervised algorithm is to determine the optimal number of clusters into which the data may be clustered. The Elbow Method is one of the most popular methods to determine this optimal value of k.

In [None]:
# Elbow curve method to find the ideal number of clusters:
# fit K-means for k = 1..9 and record the within-cluster sum of squares.
cluster_range = list(range(1, 10))
ssd = []
for num_clusters in cluster_range:
    kmeans = KMeans(n_clusters = num_clusters, max_iter=50,random_state= 100)
    kmeans.fit(df_final)
    ssd.append(kmeans.inertia_)

# Plot against the actual k values; plt.plot(ssd) alone labels the x axis
# 0..8, which shifts the apparent elbow by one cluster.
plt.plot(cluster_range, ssd, marker='o')
plt.xlabel("Number of clusters (k)")
plt.ylabel("Sum of squared distances (inertia)")
plt.show()

## Inference
* Looking at the above elbow curve it looks good to proceed with either 4 or 5 clusters.

In [None]:
# Silhouette analysis: report the mean silhouette score of K-means for k = 2..8
range_n_clusters = list(range(2, 9))

for num_clusters in range_n_clusters:
    # Fit K-means and take the cluster label of every row in one step
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=100)
    cluster_labels = kmeans.fit_predict(df_final)

    # Mean silhouette coefficient over all rows for this k
    silhouette_avg = silhouette_score(df_final, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

In [None]:
# K-means with k=5 clusters (the model is fitted with n_clusters=5)

cluster5 = KMeans(n_clusters=5, max_iter=50, random_state= 100)
cluster5.fit(df_final)

# Cluster labels
cluster5.labels_

In [None]:
# Assign the label
df['Cluster_Id'] = cluster5.labels_
df.head()

In [None]:
# Number of countries in each cluster
df['Cluster_Id'].value_counts()


## Inference:

- It seems there are good number of countries in each clusters.

In [None]:
# Pairwise scatter plots of the principal components, coloured by K-means cluster

fig, axes = plt.subplots(1,3, figsize=(15,7))

pc_pairs = [('PC1', 'PC2'), ('PC1', 'PC3'), ('PC2', 'PC3')]
for ax, (x_pc, y_pc) in zip(axes, pc_pairs):
    sns.scatterplot(x=x_pc, y=y_pc, hue='Cluster_Id', legend='full', palette="Set1", data=df, ax=ax)

## Inference:
* We have visualized the data on the principal components and saw some good clusters were formed but some were not so good hence let's now visualize the data on the original attributes.

In [None]:
# Attach the K-means cluster labels to the original indicator columns

df_merge = pd.merge(country_data,df,on='country')
df_merge_col = df_merge[['country','child_mort','exports','imports','health','income','inflation','life_expec','total_fer','gdpp','Cluster_Id']]

# Per-cluster mean of every indicator, computed in a single groupby so the
# table adapts to however many clusters exist (no hard-coded list of ids).
# Keys are the source columns, values the display names, in output order.
mean_column_names = {
    'child_mort': 'Child_Mortality',
    'exports': 'Exports',
    'imports': 'Imports',
    'health': 'Health_Spending',
    'income': 'Income',
    'inflation': 'Inflation',
    'life_expec': 'Life_Expectancy',
    'total_fer': 'Total_Fertility',
    'gdpp': 'GDPpcapita',
}
df_concat = (
    df_merge_col
    .groupby('Cluster_Id')[list(mean_column_names)]
    .mean()
    .rename(columns=mean_column_names)
    .reset_index()
)
df_concat.head()

## Inferences
- From the business understanding we have learnt that Child_Mortality, Income, Gdpp are some important factors which decides the development of any country. We have also cross checked with Principal components and found that these variables have good score in PCA. Hence, we will proceed with analyzing these 3 components to build some meaningful clusters.

In [None]:
df_merge_col.head(5)

In [None]:
# A bare `figsize=(15,12)` only binds a variable; create the figure with that size
plt.figure(figsize=(15,12))
sns.scatterplot(x='income',y='child_mort',hue='Cluster_Id',data = df_merge_col,legend='full',palette="Set1")

In [None]:
# A bare `figsize=(15,12)` only binds a variable; create the figure with that size
plt.figure(figsize=(15,12))
sns.scatterplot(x='child_mort',y='gdpp',hue='Cluster_Id',data=df_merge_col,legend='full',palette="Set1")

In [None]:
# A bare `figsize=(15,12)` only binds a variable; create the figure with that size
plt.figure(figsize=(15,12))
sns.scatterplot(x='gdpp',y='income',hue='Cluster_Id',data=df_merge_col,legend='full',palette="Set1")

In [None]:
# Box plots of selected original attributes, one panel each, split by K-means cluster
fig, axes = plt.subplots(2,2, figsize=(15,12))

# axes.flat walks the 2x2 grid row by row: top-left, top-right, bottom-left, bottom-right
for ax, indicator in zip(axes.flat, ['child_mort', 'income', 'inflation', 'gdpp']):
    sns.boxplot(x='Cluster_Id', y=indicator, data=df_merge_col, ax=ax)



## Inference:

- Child Mortality is highest for Cluster 0 and Cluster 3.These clusters need some aid.
- Income and Gdpp are measures of development. Higher the per capita income and gdpp better is the country's development. Income per capita and gdpp seems lowest for countries in clusters 0 and 3. Hence, these countries need some help.



In [None]:
# Bar plots of the per-cluster mean of a few original attributes.
# df_concat holds exactly one (mean) value per cluster, so a box plot would
# collapse to a single line per cluster; bars show the magnitudes directly.

fig, axes = plt.subplots(2,2, figsize=(15,12))

for ax, mean_col in zip(axes.flat, ['Child_Mortality', 'Income', 'Inflation', 'GDPpcapita']):
    sns.barplot(x='Cluster_Id', y=mean_col, data=df_concat, ax=ax)

## Inferences
 - Mean values also suggests same trends as above.

In [None]:
# List of countries in Cluster 0
df_merge_col[df_merge_col['Cluster_Id']==0]

In [None]:
# List of countries in Cluster 3
df_merge_col[df_merge_col['Cluster_Id']==3]

## Hierarchical Clustering

Hierarchical clustering involves creating clusters that have a predetermined ordering from top to bottom. For example, all files and folders on the hard disk are organized in a hierarchy. There are two types of hierarchical clustering,

- Divisive
- Agglomerative.

In [None]:
df_final.head()

In [None]:
# Single linkage
single_link = linkage(df_final, method='single',metric='euclidean')
dendrogram(single_link)
plt.show()

## Inference 
- No Good results. Let's try Complete linkage Method.

In [None]:
# Complete Linkage

complete_link = linkage(df_final, method='complete',metric='euclidean')
dendrogram(complete_link)
plt.show()

In [None]:
df_hc = df.copy()
df_hc = df_hc.drop('Cluster_Id',axis=1)
df_hc.head()

In [None]:
# Cut the complete-linkage tree into 4 flat clusters and attach the labels to
# the principal-component frame, to see whether the clusters come out better.

clusterCut = pd.Series(cut_tree(complete_link, n_clusters = 4).reshape(-1,))
# df_hc holds country, PC1, PC2, PC3; the labels are appended as Cluster_Id
df_hc_cut = df_hc.assign(Cluster_Id=clusterCut)
df_hc_cut.head()

In [None]:
# Scatter plots of the principal components, coloured by hierarchical cluster

fig, axes = plt.subplots(1,2, figsize=(15,8))

for ax, y_pc in zip(axes, ['PC2', 'PC3']):
    sns.scatterplot(x='PC1', y=y_pc, hue='Cluster_Id', legend='full', palette="Set1", data=df_hc_cut, ax=ax)

In [None]:
# Merging the df with PCA with original df

df_merge_hc=pd.merge(country_data,df_hc_cut,on='country')
df_merge_hc_col=df_merge_hc[['country','child_mort','exports','imports','health','income','inflation','life_expec','total_fer','gdpp','Cluster_Id']]
df_merge_hc_col.head()

In [None]:
df_merge_hc_col['Cluster_Id'].value_counts()

## Inference
- The number of countries in Cluster 0 using Hierarchical clustering is the same as the number of countries in Cluster 0 using K-Means clustering.

In [None]:
# A bare `figsize=(15,12)` only binds a variable; create the figure with that size
plt.figure(figsize=(15,12))
sns.scatterplot(x='income',y='child_mort',hue='Cluster_Id',data = df_merge_hc_col,legend='full',palette="Set1")

In [None]:
# A bare `figsize=(15,12)` only binds a variable; create the figure with that size
plt.figure(figsize=(15,12))
sns.scatterplot(x='child_mort',y='gdpp',hue='Cluster_Id',data=df_merge_hc_col,legend='full',palette="Set1")

In [None]:
# A bare `figsize=(15,12)` only binds a variable; create the figure with that size
plt.figure(figsize=(15,12))
sns.scatterplot(x='gdpp',y='income',hue='Cluster_Id',data=df_merge_hc_col,legend='full',palette="Set1")

In [None]:
# Box plots of selected original attributes, one panel each, split by hierarchical cluster
fig, axes = plt.subplots(2,2, figsize=(15,12))

# axes.flat walks the 2x2 grid row by row: top-left, top-right, bottom-left, bottom-right
for ax, indicator in zip(axes.flat, ['child_mort', 'income', 'inflation', 'gdpp']):
    sns.boxplot(x='Cluster_Id', y=indicator, data=df_merge_hc_col, ax=ax)


## Inference
- We have analyzed both K-means and Hierarchical clustering and found that the clusters formed are identical. The clusters formed in both cases are not that great, but they give some idea about the countries which are in need of aid.

In [None]:
# List of countries in Cluster 0
df_merge_hc_col[df_merge_hc_col['Cluster_Id']==0]

## Inference 
 - The country list for cluster =0 using Hierarchical and K-Means are same. 

In [None]:
country_list = df_merge_hc_col[df_merge_hc_col['Cluster_Id']==0]
country_list.head()

In [None]:
country_list['country']

In [None]:
# Bar plot of child mortality for the countries in need of aid, highest first
country_childmort = (
    country_list.groupby(['country'])['child_mort']
    .mean()
    .sort_values(ascending=False)
    .to_frame()
)
country_childmort.plot.bar(figsize=(15,10),facecolor='g')
plt.title('Country vs Child Mortality',fontsize=16,family = "Comic Sans MS")
plt.xlabel("Country",fontweight = 'bold')
plt.ylabel("Child Mortality", fontsize = 12, fontweight = 'bold')
plt.show()

In [None]:
# Bar plot of per-capita income for the countries in need of aid, lowest first

country_income = (
    country_list.groupby(['country'])['income']
    .mean()
    .sort_values(ascending=True)
    .to_frame()
)
country_income.plot.bar(figsize=(15,10),facecolor='b')
plt.title('Country vs Per Capita Income',fontsize=16,family = "Comic Sans MS")
plt.xlabel("Country",fontweight = 'bold')
plt.ylabel("Per Capita Income", fontsize = 12, fontweight = 'bold')
plt.show()


In [None]:
# Bar plot of GDP per capita for the countries in need of aid, lowest first

country_gdp = pd.DataFrame(country_list.groupby(['country'])['gdpp'].mean().sort_values(ascending = True))
country_gdp.plot.bar(figsize=(15,10),facecolor='r')
plt.title('Country vs GDP per capita',fontsize=16,family = "Comic Sans MS")
plt.xlabel("Country",fontweight = 'bold')
plt.ylabel("GDP per capita", fontsize = 12, fontweight = 'bold')
plt.show()

## Closing Statement

- We used PCA above to reduce the number of variables involved and then clustered the countries based on those principal components. Later we identified a few factors, like child mortality and income, which play a vital role in deciding the development status of a country, and built clusters of countries based on them. Based on those clusters we have identified the below list of countries which are in dire need of aid. The list of countries is subject to change, as it depends on choices such as the number of components chosen, the number of clusters chosen and the clustering method used to build the model.

In [None]:
# Final countries list
country_list.reset_index(drop=True).country