# Importing modules

In [None]:
import warnings 
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

# 1. Importing the dataset and data inspection

In [None]:
cntry_df = pd.read_csv(r'https://cdn.upgrad.com/UpGrad/temp/2f181ade-814a-430a-b721-769d6a9bf814/Country-data.csv')
cntry_df.head()

In [None]:
cntry_df.dtypes

- All features except country are numerical as expected

In [None]:
cntry_df.shape

In [None]:
cntry_df.country.nunique()

- The number of unique entries in the `country` column equals the number of rows in the dataframe, which means no country appears more than once.

In [None]:
cntry_df.info()

In [None]:
cntry_df.isnull().sum()

- No null values in the dataframe

In [None]:
cntry_df.describe()

### INSIGHT
   - Dataframe describes various socio-economic factors of 167 different countries.
   - Dataframe has no multiple entries and no null values
   - All features except 'country' are numerical as expected. So, no need of any type casting
   - Descriptive statistics indicates that there is some variability in the data and requires scaling while model building.

## 2. Data Transformation

### INSIGHT
- In the dataframe (`cntry_df`) provided, the `imports`, `exports` and `health` features are expressed as a percentage of `gdpp`, which makes them hard to compare between countries with very different `gdpp`. To avoid this, the values in `imports`, `exports` and `health` are converted to absolute values (`value * gdpp / 100`).

In [None]:
# Turn the percentage-of-gdpp columns into absolute amounts in one vectorised step.
# NOTE: cntry_df is overwritten in place, so run this cell only once per kernel
# session -- running it again would rescale the already converted values.
pct_cols = ['imports', 'exports', 'health']
cntry_df[pct_cols] = cntry_df[pct_cols].mul(cntry_df['gdpp'], axis=0) / 100
cntry_df.head()

## 2.1 Outlier Treatment

In [None]:
# Box plot of every numeric feature to eyeball the outliers.
numeric_cols = cntry_df.columns[1:]  # first column is 'country', so skip it

plt.figure(figsize=(20, 10))
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(3, 3, position)
    # Pass the series by keyword: seaborn >= 0.12 treats the first positional
    # argument as `data` rather than `x`, so a positional call would change
    # orientation depending on the installed version.
    sns.boxplot(x=cntry_df[column])
plt.tight_layout()
plt.show()

`OBSERVATIONS:`
   - There are outliers in all features. 
   - We cannot cap the outliers in all features because it would lead to losing some countries that are actually in dire need of aid and should be the prime focus.
   - So, I've decided to go with the model which suits the business problem better. That is, not to treat the outliers, and to check the model with different K values to see which one gives a better business outcome.

## 3. Exploratory Data Analysis
- `Visualizing Distribution`

Plotting a pairplot

In [None]:
# Pairwise scatter matrix of the nine numeric indicators, histograms on the diagonal.
numeric_features = ['child_mort', 'exports', 'health', 'imports', 'income',
                    'inflation', 'life_expec', 'total_fer', 'gdpp']
num_df = cntry_df[numeric_features]

pair = sns.PairGrid(num_df)
# Same plain scatter on both triangles, only the colour differs.
for draw_triangle, shade in ((pair.map_upper, 'green'), (pair.map_lower, 'red')):
    draw_triangle(sns.regplot, color=shade, fit_reg=False)
pair.map_diag(plt.hist)
plt.show()

In [None]:
# Distribution (histogram + KDE) of every numeric indicator on a 3x3 grid.
features = ['child_mort', 'exports', 'health', 'imports', 'income',
            'inflation', 'life_expec', 'total_fer', 'gdpp']

plt.figure(figsize=(21, 18))
for position, column in enumerate(features, start=1):
    ax = plt.subplot(3, 3, position)
    sns.distplot(num_df[column])
    plt.xticks(rotation=20)

### INSIGHT:
   - Majority of the data points are `not normally distributed`. 
   - Variance is not uniform and range is not same for all features.
   - We need to standardize data to overcome all this. Since we need to calculate the Euclidean distance between the data points for clustering, it is important to ensure that the values with high range do not outweigh the values with smaller range. Thus, scaling down all the attributes to a uniform scale is important.

### SCALING THE DATA (STANDARDIZATION) 
    - We will use standardization method for scaling the data.

In [None]:
# Keep only the numeric indicator columns for scaling and clustering.
# Selecting by "is numeric" (rather than "is not object") also excludes text
# columns stored with a dedicated string dtype instead of `object`.
new_cntry_df = cntry_df.select_dtypes(include='number')
new_cntry_df.info()

In [None]:
new_cntry_df.describe()

`Rescaling the data`

In [None]:
# Standardise every numeric feature (zero mean, unit variance).
scaler = StandardScaler()

# fit_transform returns a bare ndarray, so re-attach the labels of the frame
# that was actually scaled instead of recomputing them from cntry_df.
scaled_data = pd.DataFrame(
    scaler.fit_transform(new_cntry_df),
    columns=new_cntry_df.columns,
    index=new_cntry_df.index,
)

scaled_data.head()

### HOPKINS STATISTICS
   - Before applying clustering to a dataframe, we first need to check whether the data has meaningful (not occurring at random) clusters.
   - The process of evaluating data whether clustering can be applied or not is called `Clustering Tendency`.
   - For `clustering tendency`, we use `Hopkins Test` which examines whether the data points differ significantly from uniformly distributed data in the multidimensional space.

In [None]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins_test(X, random_state=None):
    """Estimate the Hopkins statistic (clustering tendency) of ``X``.

    ``m = max(1, int(0.1 * n))`` rows are sampled from ``X`` without
    replacement, and the same number of points is drawn uniformly inside the
    per-feature bounding box of ``X``. With ``u`` the distance from each
    uniform point to its nearest row of ``X`` and ``w`` the distance from each
    sampled row to its nearest *other* row, the statistic is
    ``sum(u) / (sum(u) + sum(w))``.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_rows, n_features)
        Numeric data, normally already scaled.
    random_state : None, int or numpy.random.Generator, optional
        Seed for the sampling. ``None`` (default) draws fresh entropy, so the
        result varies between calls.

    Returns
    -------
    float
        Value in [0, 1]. ``0`` is returned when every sampled distance is zero
        (for example when all rows are identical).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("X must be two-dimensional (rows x features)")
    n, d = data.shape
    if n < 2:
        raise ValueError("X needs at least two rows to measure neighbour distances")

    # At least one probe, otherwise small inputs divide zero by zero.
    m = max(1, int(0.1 * n))
    rng = np.random.default_rng(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(data)

    sampled_rows = rng.choice(n, size=m, replace=False)
    lower, upper = data.min(axis=0), data.max(axis=0)
    uniform_points = rng.uniform(lower, upper, size=(m, d))

    # Uniform points are not part of the data, so their nearest neighbour is
    # the first (and only) column returned.
    u_dist, _ = nbrs.kneighbors(uniform_points, 1, return_distance=True)
    # A sampled row finds itself at distance 0 first; column 1 is the nearest
    # other row.
    w_dist, _ = nbrs.kneighbors(data[sampled_rows], 2, return_distance=True)

    u_sum = float(u_dist[:, 0].sum())
    w_sum = float(w_dist[:, 1].sum())
    total = u_sum + w_sum
    if total == 0 or not np.isfinite(total):
        return 0.0
    return u_sum / total

In [None]:
#Using the Hopkins Statistic function by passing the above dataframe as a parameter
hopkins_test(scaled_data)

### INSIGHT
   - If the value is between {0.01, ...,0.3}, the data is regularly spaced (evenly spread out, with no cluster structure).
   - If the value is around 0.5, it is random.
   - If the value is between {0.7, ..., 0.99}, it has a high tendency to cluster. 

# 4. Model building
## 4.1 Hierarchical Clustering

- We use the hierarchical clustering approach to identify an appropriate number of clusters with a good split in the data
    - Minimum intra-cluster distance and maximum inter-cluster distance

### Single linkage
   - Here, the distance between 2 clusters is defined as the shortest distance between points in the two clusters.

In [None]:
# Single-linkage agglomerative clustering on the scaled features, drawn as a dendrogram.
merging_s = linkage(scaled_data, method="single", metric='euclidean')

plt.figure(figsize=(20, 10))
plt.gca().set(title='Hierarchical Clustering Dendrogram (Single)',
              xlabel='index', ylabel='distance')
dendrogram(merging_s)
plt.show()

### complete linkage :
- Here, the distance between 2 clusters is defined as the maximum distance between any 2 points in the clusters.


In [None]:
# Complete-linkage agglomerative clustering on the scaled features, drawn as a dendrogram.
merging_c = linkage(scaled_data, method="complete", metric='euclidean')

plt.figure(figsize=(18, 8))
plt.gca().set(title='Hierarchical Clustering Dendrogram (Complete)',
              xlabel='index', ylabel='distance')
dendrogram(merging_c)
plt.show()

####  `NOTE`
- From the above two dendrograms, it is clear that `Complete Linkage` has a better cluster formation.
- So, we will use `Complete linkage` output for further analysis. We will build two iterations of clustering
    - one with 3 clusters and another one with 4 clusters (based on the output from Complete Linkage Dendrogram) and analyze the output
    

#### First iteration - Using 3 clusters

In [None]:
cluster_label_3 = cut_tree(merging_c, n_clusters = 3).reshape(-1,)
cluster_label_3

In [None]:
# creating a dataframe with cluster labels on it

cntry_data_h = cntry_df.copy()
cntry_data_h['cluster_label_3'] = cluster_label_3
cntry_data_h.head()

In [None]:
# Spread of the key indicators within each of the 3 hierarchical clusters.
plt.figure(figsize=(20, 5))
for panel, indicator in enumerate(('child_mort', 'gdpp', 'income'), start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x='cluster_label_3', y=indicator, data=cntry_data_h, palette="bright")

plt.show()

In [None]:
# Pairwise scatter of the key indicators, coloured by the 3 hierarchical clusters.
axis_pairs = [('gdpp', 'child_mort'), ('income', 'child_mort'), ('gdpp', 'income')]

plt.figure(figsize=(18, 5))
for panel, (x_col, y_col) in enumerate(axis_pairs, start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(x=x_col, y=y_col, hue='cluster_label_3',
                    data=cntry_data_h, legend='full', palette="bright", s=100, c='lightblue', alpha=.4)
plt.show()

In [None]:
# checking cluster count

cntry_data_h.cluster_label_3.value_counts()

In [None]:
# checking countries in cluster 2 

cntry_data_h[cntry_data_h['cluster_label_3'] == 2]

In [None]:
# checking countries in cluster 1

cntry_data_h[cntry_data_h['cluster_label_3'] == 1]

- Clusters 1 and 2 are the developed and developing countries. So the segmentation is good in the sense that all the under-developed countries fall under cluster 0. We will tune this clustering further.

#### Second iteration - Using 4 clusters

In [None]:
# 4 Clusters
cluster_label_4 = cut_tree(merging_c, n_clusters = 4).reshape(-1,)
cluster_label_4

In [None]:
cntry_data_h['cluster_label_4'] = cluster_label_4

cntry_data_h.head()

In [None]:
# Spread of the key indicators within each of the 4 hierarchical clusters.
plt.figure(figsize=(18, 5))
for panel, indicator in enumerate(('child_mort', 'gdpp', 'income'), start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x='cluster_label_4', y=indicator, data=cntry_data_h, palette="bright")

plt.show()

In [None]:
# Pairwise scatter of the key indicators, coloured by the 4 hierarchical clusters.
axis_pairs = [('gdpp', 'child_mort'), ('income', 'child_mort'), ('gdpp', 'income')]

plt.figure(figsize=(18, 5))
for panel, (x_col, y_col) in enumerate(axis_pairs, start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(x=x_col, y=y_col, hue='cluster_label_4',
                    data=cntry_data_h, legend='full', palette="bright", s=100, c='lightblue', alpha=.4)
plt.show()

In [None]:
cntry_data_h.cluster_label_4.value_counts()

In [None]:
# Checking the countries in Cluster 2 to see which are the countries in that segment.

cntry_data_h[cntry_data_h['cluster_label_4'] == 2]

In [None]:
# Checking the countries in Cluster 3 to see which are the countries in that segment.

cntry_data_h[cntry_data_h['cluster_label_4'] == 3]

In [None]:
# Checking the countries in Cluster 1 to see which are the countries in that segment.

cntry_data_h[cntry_data_h['cluster_label_4'] == 1]

#### INSIGHT
   - From the 2 iterations above, we can conclude that `3 clusters` is ideal for number of clusters.
   - This is because with `4 clusters`, `Nigeria` is split off into a separate cluster of its own, even though Nigeria could be an under-developed country that needs aid given its child mortality rate.
   - So, the ideal number of clusters is `3 clusters`. We will further validate this.

### Interpreting Clusters from the Hierarchical Clustering model

In [None]:
# Per-cluster averages for the 3-cluster hierarchical solution.
# numeric_only=True skips the text `country` column; pandas >= 2.0 no longer
# drops it silently and would raise a TypeError when asked to average it.
cntry_data_h_analysis = cntry_data_h.groupby(['cluster_label_3']).mean(numeric_only=True)
cntry_data_h_analysis

In [None]:
# Creating a new field for count of observations in each cluster
cntry_data_h_analysis['Observations']=cntry_data_h[['cluster_label_3','child_mort']].groupby(['cluster_label_3']).count()
cntry_data_h_analysis

In [None]:
# Creating a new field for proportion of observations in each cluster
cntry_data_h_analysis['Proportion']=round(cntry_data_h_analysis['Observations']/cntry_data_h_analysis['Observations'].sum(),2)
cntry_data_h_analysis

In [None]:
# Bar charts of the per-cluster means (hierarchical, 3 clusters).
cntry_data_h_plot1 = cntry_data_h.groupby('cluster_label_3')[['gdpp', 'income']].mean()
cntry_data_h_plot2 = cntry_data_h.groupby('cluster_label_3')[['child_mort']].mean()

# DataFrame.plot.bar opens a figure of its own unless it is handed an axes, so
# draw on explicit axes; a bare plt.figure() would stay empty and its figsize unused.
fig, (ax_wealth, ax_mortality) = plt.subplots(1, 2, figsize=(10, 5))
cntry_data_h_plot1.plot.bar(ax=ax_wealth, title='Mean gdpp / income per cluster')
cntry_data_h_plot2.plot.bar(ax=ax_mortality, title='Mean child mortality per cluster')
fig.tight_layout()
plt.show()

### Interpretation of Clusters: 
   - `Cluster 0` has the highest average child mortality rate of ~42 when compared to other 2 clusters and `lowest gdpp` and `lowest income` of ~7551 & 12641 respectively.
   - All the above reasons make the `cluster 0` ideal for the requirement. We can also observe that cluster 0 comprises of ~89% of overall countries and has ~148 countries out of 167 total countries. This is a problem.
   - Clustering model is not giving us a good result because ~89% of data is under one cluster and increasing the number of clusters doesn't solve this problem. We will check K-Means clustering.

## Finding the Optimal Number of Clusters
### SSD (Elbow curve)

In [None]:
# elbow-curve/SSD
ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    # Fixed seed and n_init so the curve is the same on every run.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, n_init=10, random_state=335)
    kmeans.fit(scaled_data)

    ssd.append(kmeans.inertia_)

# Plot against the actual cluster counts; plotting `ssd` alone would label the
# x-axis 0..6 and shift the apparent elbow by two.
plt.plot(range_n_clusters, ssd, marker='o')
plt.xlabel('number of clusters (k)')
plt.ylabel('SSD (inertia)')
plt.title('Elbow curve')
plt.show()

### Silhouette Analysis
$$\text{silhouette score} = \frac{p - q}{\max(p, q)}$$
 
- p  is the mean distance to the points in the nearest cluster that the data point is not a part of
- q  is the mean intra-cluster distance to all the points in its own cluster.
- The value of the silhouette score range lies between -1 to 1.
- A score closer to 1 indicates that the data point is very similar to other data points in the cluster,
- A score closer to -1 indicates that the data point is not similar to the data points in its cluster.

In [None]:
# silhouette analysis
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:

    # Fixed seed and n_init so the printed scores are reproducible and do not
    # depend on the library's default number of initialisations.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, n_init=10, random_state=335)
    kmeans.fit(scaled_data)

    cluster_labels = kmeans.labels_

    # mean silhouette coefficient over all rows for this k
    silhouette_avg = silhouette_score(scaled_data, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

#### INSIGHT:
   - From the above validations (Elbow curve and Silhouette analysis), we can notice that 3,4 & 5 are the optimal number of clusters. We will do 3 iterations for 3,4 & 5 clusters.

### 4.2 K-MEANS CLUSTERING 

#### First iteration - 3 Clusters

In [None]:
# K Means with with k=3
kmeans_3 = KMeans(n_clusters=3, max_iter=500, init='k-means++', n_init=10, random_state= 335)
kmeans_3.fit(scaled_data)
kmeans_3.labels_

In [None]:
# Start the K-Means results frame from a copy of cntry_data_h, so it carries the
# country names, the unscaled indicator values and the hierarchical cluster
# labels (cluster_label_3 / cluster_label_4) added earlier.
cntry_data_k = cntry_data_h.copy()
cntry_data_k.head()

In [None]:
# assigning the label
cntry_data_k['cluster_id_3'] = kmeans_3.labels_
cntry_data_k.head()

In [None]:
# Checking the number of countries in each cluster
cntry_data_k.cluster_id_3.value_counts()

In [None]:
# Spread of the key indicators within each K-Means cluster (k=3).
plt.figure(figsize=(20, 5))
for panel, indicator in enumerate(('child_mort', 'gdpp', 'income'), start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x='cluster_id_3', y=indicator,
                data=cntry_data_k, palette="bright")

plt.show()

In [None]:
# Pairwise scatter of the key indicators, coloured by K-Means cluster (k=3).
axis_pairs = [('gdpp', 'child_mort'), ('income', 'child_mort'), ('gdpp', 'income')]

plt.figure(figsize=(20, 5))
for panel, (x_col, y_col) in enumerate(axis_pairs, start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(x=x_col, y=y_col, hue='cluster_id_3',
                    data=cntry_data_k, legend='full', palette="bright", s=100, c='lightblue', alpha=.4)
plt.show()

In [None]:
# Cluster means for k=3, worst-off cluster first (highest child mortality,
# then lowest income and gdpp). numeric_only=True skips the text `country`
# column, which pandas >= 2.0 would otherwise fail to average (TypeError).
cntry_data_k.groupby(['cluster_id_3']).mean(numeric_only=True).sort_values(['child_mort','income','gdpp'],ascending = [False,True,True])

### INSIGHT: 
- `cluster 2` and `cluster 0` seems to have highest child mortality rate and lowest income and lowest GDPP. Checking the countries in both clusters.

In [None]:
# Members of K-Means (k=3) cluster 2, ordered by highest child mortality, then lowest income and gdpp.
in_cluster = cntry_data_k['cluster_id_3'] == 2
(cntry_data_k.loc[in_cluster, ['country', 'child_mort', 'income', 'gdpp']]
 .sort_values(['child_mort', 'income', 'gdpp'], ascending=[False, True, True]))

In [None]:
# Members of K-Means (k=3) cluster 0, ordered by highest child mortality, then lowest income and gdpp.
in_cluster = cntry_data_k['cluster_id_3'] == 0
(cntry_data_k.loc[in_cluster, ['country', 'child_mort', 'income', 'gdpp']]
 .sort_values(['child_mort', 'income', 'gdpp'], ascending=[False, True, True]))

#### Second Iteration - 4 clusters

In [None]:
# K Means with with k=4
kmeans_4 = KMeans(n_clusters=4, max_iter=500, init='k-means++', n_init=10, random_state= 335)
kmeans_4.fit(scaled_data)
kmeans_4.labels_

In [None]:
# assigning the label
cntry_data_k['cluster_id_4'] = kmeans_4.labels_
cntry_data_k.head()

In [None]:
# Checking the number of countries in each cluster

cntry_data_k.cluster_id_4.value_counts()

### Visualizing the clustered data

In [None]:
# Spread of the key indicators within each K-Means cluster (k=4).
plt.figure(figsize=(18, 5))
for panel, indicator in enumerate(('child_mort', 'gdpp', 'income'), start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x='cluster_id_4', y=indicator,
                data=cntry_data_k, palette="bright")

plt.show()

In [None]:
# Pairwise scatter of the key indicators, coloured by K-Means cluster (k=4).
axis_pairs = [('gdpp', 'child_mort'), ('income', 'child_mort'), ('gdpp', 'income')]

plt.figure(figsize=(18, 5))
for panel, (x_col, y_col) in enumerate(axis_pairs, start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(x=x_col, y=y_col, hue='cluster_id_4',
                    data=cntry_data_k, legend='full', palette="bright", s=100, c='lightblue', alpha=.4)
plt.show()

In [None]:
# Cluster means for k=4, worst-off cluster first (highest child mortality,
# then lowest income and gdpp). numeric_only=True skips the text `country`
# column, which pandas >= 2.0 would otherwise fail to average (TypeError).
cntry_data_k.groupby(['cluster_id_4']).mean(numeric_only=True).sort_values(['child_mort','income','gdpp'],ascending = [False,True,True])

#### INSIGHT:
   - `Cluster 3` & `Cluster 1` seems to have highest child mortality rate and lowest income and lowest gdpp. Checking the countries in these clusters

In [None]:
# Members of K-Means (k=4) cluster 3, ordered by highest child mortality, then lowest income and gdpp.
in_cluster = cntry_data_k['cluster_id_4'] == 3
(cntry_data_k.loc[in_cluster, ['country', 'child_mort', 'income', 'gdpp']]
 .sort_values(['child_mort', 'income', 'gdpp'], ascending=[False, True, True]))

In [None]:
# Members of K-Means (k=4) cluster 1, ordered by highest child mortality, then lowest income and gdpp.
in_cluster = cntry_data_k['cluster_id_4'] == 1
(cntry_data_k.loc[in_cluster, ['country', 'child_mort', 'income', 'gdpp']]
 .sort_values(['child_mort', 'income', 'gdpp'], ascending=[False, True, True]))

#### Third Iteration - 5 clusters

In [None]:
# K Means with with k=5
kmeans_5 = KMeans(n_clusters=5, max_iter=500, init='k-means++', n_init=10, random_state= 335)
kmeans_5.fit(scaled_data)
kmeans_5.labels_

In [None]:
# assigning the label
cntry_data_k['cluster_id_5'] = kmeans_5.labels_
cntry_data_k.head()

In [None]:
# Checking the number of countries in each cluster

cntry_data_k.cluster_id_5.value_counts()

In [None]:
## Visualizing the Clustered Data

In [None]:
# Spread of the key indicators within each K-Means cluster (k=5).
plt.figure(figsize=(18, 5))
for panel, indicator in enumerate(('child_mort', 'gdpp', 'income'), start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x='cluster_id_5', y=indicator,
                data=cntry_data_k, palette="bright")

plt.show()

In [None]:
# Pairwise scatter of the key indicators, coloured by K-Means cluster (k=5).
axis_pairs = [('gdpp', 'child_mort'), ('income', 'child_mort'), ('gdpp', 'income')]

plt.figure(figsize=(18, 5))
for panel, (x_col, y_col) in enumerate(axis_pairs, start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(x=x_col, y=y_col, hue='cluster_id_5',
                    data=cntry_data_k, legend='full', palette="bright", s=100, c='lightblue', alpha=.4)
plt.show()

In [None]:
# Cluster means for k=5, worst-off cluster first (highest child mortality,
# then lowest income and gdpp). numeric_only=True skips the text `country`
# column, which pandas >= 2.0 would otherwise fail to average (TypeError).
cntry_data_k.groupby(['cluster_id_5']).mean(numeric_only=True).sort_values(['child_mort','income','gdpp'],ascending = [False,True,True])

#### INSIGHT: 
- Cluster 3 & Cluster 1 seem to have the highest child mortality and the lowest income & GDPP. Let's check the countries in these clusters to see their outcome.

In [None]:
# Checking the countries in cluster 4 (cluster_id_5 == 4), ordered by highest
# child mortality, then lowest income and gdpp.
# NOTE(review): the markdown insight before this cell names clusters 3 and 1,
# while the filter selects id 4 -- confirm which cluster id is intended.

cntry_data_k[['country','child_mort','income','gdpp']][cntry_data_k['cluster_id_5'] == 4].sort_values(['child_mort','income','gdpp'],ascending = [False,True,True])

In [None]:
# Checking the countries in cluster 0 (cluster_id_5 == 0), ordered by highest
# child mortality, then lowest income and gdpp.
# NOTE(review): the markdown insight for k=5 names clusters 3 and 1, while the
# filter selects id 0 -- confirm which cluster id is intended.

cntry_data_k[['country','child_mort','income','gdpp']][cntry_data_k['cluster_id_5'] == 0].sort_values(['child_mort','income','gdpp'],ascending = [False,True,True])

### Choosing the model

From the above 3 Iterations of K-Means, we could see that using 3 Clusters provided a better output in terms of a balanced cluster size. So we will consider the 'K-Means with 3 Clusters' as our FINAL MODEL

### Interpreting the Clusters from K-Means model

In [None]:
# New dataframe for group by & analysis: per-cluster means of the final
# K-Means (k=3) model, worst-off cluster first.
# numeric_only=True skips the text `country` column, which pandas >= 2.0 would
# otherwise fail to average (TypeError).
cntry_data_k_analysis = (
    cntry_data_k.groupby(['cluster_id_3'])
    .mean(numeric_only=True)
    .sort_values(['child_mort', 'income', 'gdpp'], ascending=[False, True, True])
)
cntry_data_k_analysis

In [None]:
# Creating a new field for count of observations in each cluster

cntry_data_k_analysis['Observations']=cntry_data_k[['cluster_id_3','child_mort']].groupby(['cluster_id_3']).count()
cntry_data_k_analysis

In [None]:
# Creating a new field for proportion of observations in each cluster

cntry_data_k_analysis['Proportion']=round(cntry_data_k_analysis['Observations']/cntry_data_k_analysis['Observations'].sum(),2)


#Summary View
cntry_data_k_analysis[['child_mort','income','gdpp','Observations','Proportion']]

In [None]:
# Bar charts of the per-cluster means for the final K-Means (k=3) model.
cntry_data_k_plot1 = cntry_data_k.groupby('cluster_id_3')[['gdpp', 'income']].mean()
cntry_data_k_plot2 = cntry_data_k.groupby('cluster_id_3')[['child_mort']].mean()

# DataFrame.plot.bar opens a figure of its own unless it is handed an axes, so
# draw on explicit axes; a bare plt.figure() would stay empty and its figsize unused.
fig, (ax_wealth, ax_mortality) = plt.subplots(1, 2, figsize=(8, 5))
cntry_data_k_plot1.plot.bar(ax=ax_wealth, title='Mean gdpp / income per cluster')
cntry_data_k_plot2.plot.bar(ax=ax_mortality, title='Mean child mortality per cluster')
fig.tight_layout()
plt.show()

### Renaming the Clusters

In [None]:
# Replace the numeric K-Means (k=3) ids on the index with descriptive names,
# then move the index back into a regular `cluster_id_3` column.
# NOTE(review): the id -> name mapping is hard-coded; confirm it still matches
# the cluster means whenever the model or its random_state changes.
# NOTE(review): the frame is rebound to its renamed self, so this cell looks
# non-idempotent -- rebuild cntry_data_k_analysis before running it again.
cntry_data_k_analysis = cntry_data_k_analysis.rename({
    2: 'Under_Developed Countries',
    1: 'Developed Countries',
    0: 'Developing Countries'}).reset_index()

In [None]:
cntry_data_k_analysis[['cluster_id_3','child_mort','income','gdpp','Observations','Proportion']]

In [None]:
# NOTE(review): in this frame `cluster_id_4` is the average of the k=4 label ids
# inside each k=3 cluster (it was averaged together with the indicators), not a
# cluster identifier. This cell looks like a leftover copy of the preceding
# summary cell -- confirm and remove, or build a dedicated k=4 summary instead.
cntry_data_k_analysis[['cluster_id_4','child_mort','income','gdpp','Observations','Proportion']]

### Analysing the 'Under Developed Countries' Cluster

In [None]:
# Countries with cluster_id_3 == 2 (the cluster renamed 'Under_Developed Countries'),
# saved to a new dataframe; this forms our final cluster data.

final_cluster = cntry_data_k[cntry_data_k['cluster_id_3'] == 2].copy()
# Keep only the country name and the three indicators used for the decision.
final_cluster = final_cluster[['country', 'child_mort', 'income', 'gdpp']]


In [None]:
final_cluster.sort_values([ 'gdpp', 'income','child_mort'], ascending=[True, True, False])

### Decision Making on the final approach

In [None]:
final_cluster.describe()

In [None]:
# First filter: countries from the full dataset whose gdpp is at most 932.
# NOTE(review): 932 is hard-coded -- presumably read off final_cluster.describe(); confirm.
Under_Developed_Countries = cntry_df[cntry_df['gdpp'] <= 932].copy()
# Order by lowest gdpp, then lowest income, then highest child mortality.
Under_Developed_Countries=Under_Developed_Countries.sort_values([ 'gdpp', 'income','child_mort'], ascending=[True, True, False])

In [None]:
Under_Developed_Countries[['country', 'child_mort', 'gdpp', 'income']]

In [None]:
# Second filter: narrow the gdpp short-list to countries whose income is at most 1860.
# NOTE(review): 1860 is hard-coded -- presumably read off final_cluster.describe(); confirm.
Under_Developed_Countries = Under_Developed_Countries[Under_Developed_Countries['income'] <= 1860]
# Order by lowest gdpp, then lowest income, then highest child mortality.
Under_Developed_Countries=Under_Developed_Countries.sort_values([ 'gdpp', 'income','child_mort'], ascending=[True, True, False])

In [None]:
Under_Developed_Countries[['country', 'child_mort', 'gdpp', 'income']]

In [None]:
# Third filter: narrow the existing short-list (the same variable is overwritten,
# no new dataframe is created) to countries whose child mortality is at least 90.
# NOTE(review): 90 is hard-coded -- presumably the cluster median; confirm against final_cluster.describe().

Under_Developed_Countries = Under_Developed_Countries[Under_Developed_Countries['child_mort'] >= 90]
# Order by lowest gdpp, then lowest income, then highest child mortality.
Under_Developed_Countries=Under_Developed_Countries.sort_values([ 'gdpp', 'income','child_mort'], ascending=[True, True, False])

In [None]:
Under_Developed_Countries[['country', 'child_mort', 'gdpp', 'income']]

### Descriptive Statistics of Cluster 'Under_Developed_Countries'

In [None]:
# top 10 underdeveloped countries list to a new df

UDC_top_10=Under_Developed_Countries[['country', 'gdpp', 'income', 'child_mort']].head(10).copy()

In [None]:
#Final countries list

UDC_top_10

In [None]:
UDC_top_10.describe()

## Univariate Analysis of Cluster 'Under_Developed_Countries' (Top 10)

In [None]:
# Distribution of each key indicator across the 10 short-listed countries.
features = ['gdpp','income','child_mort']

# One row with one panel per feature; a 3x3 grid would leave six of its nine
# panels (and most of the figure) empty.
plt.figure(figsize=(21, 6))
for position, column in enumerate(features, start=1):
    ax = plt.subplot(1, len(features), position)
    sns.distplot(UDC_top_10[column])
    plt.xticks(rotation=20)
plt.show()

### Bivariate Analysis of Cluster 'Under_Developed_Countries' (Top 10)

In [None]:
# Pairwise scatter of the key indicators for the 10 short-listed countries, one colour per country.
axis_pairs = [('gdpp', 'child_mort'), ('gdpp', 'income'), ('income', 'child_mort')]

plt.figure(figsize=(18, 10))
for panel, (x_col, y_col) in enumerate(axis_pairs, start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(x=x_col, y=y_col, hue='country',
                    data=UDC_top_10, legend='full', palette="bright", s=300, c='lightblue')
plt.show()

## Conclusion & Recommendation
- We performed CLUSTERING on the socio-economic data provided for various countries to identify countries most eligible for Financial Aid from the NGO. 
- Based on Clustering Analysis, below are the top 10 countries under our 'Under Developed Countries' cluster which are in dire need of the Financial Aid. This output is purely based on the dataset provided and various analytical methodologies performed.

In [None]:
#TOP 10 COUNTRIES recommended for Financial Aid

UDC_top_10