### - Clustering Assignment

#### - Import the necessary libraries

#### - Step 1: Reading and understanding the data

In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Show every column and row, and never truncate cell contents, when a frame is displayed.
# `None` is the supported "no limit" value for max_colwidth; the legacy -1 sentinel
# has been deprecated since pandas 1.0 and is not accepted by current releases.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
from math import isnan

from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:
#Reading the Countries file

df_country = pd.read_csv('Country-data.csv')
df_country.head()

In [None]:
#Reading the data dictionary

country_dict = pd.read_csv('data-dictionary+.csv')
country_dict.head(10)

In [None]:
#Checking the shape of the data

df_country.shape

In [None]:
#Checking the statistics of the numerical values

df_country.describe()

In [None]:
#Checking the data types

df_country.info()

#### - Step 2: Data Cleaning

In [None]:
#Calculating the missing values

df_country.isnull().sum()

##### - No missing values were found in any of the columns
##### - Datatypes also all seem to be consistent
##### - No cleaning seems to be required as the data seems to be good enough to proceed further

#### - Step 3: Data Visualization

In [None]:
#Understanding the correlation of the various factors in the dataset

plt.figure(figsize = (15,10))

# Correlate numeric columns only: 'country' is an identifier column, and
# DataFrame.corr() does not drop non-numeric columns by default in pandas 2.x.
sns.heatmap(df_country.select_dtypes(include = 'number').corr(), annot = True)
plt.show()

##### - gdpp and income are highly correlated
##### - Exports and imports are also highly correlated
##### - life expectancy and income are highly correlated
##### - total fertility and life expectancy are inversely correlated

In [None]:
sns.pairplot(df_country)
plt.show()

#### - Step 4: Preparing the data

##### - Some column values such as imports, exports and health spend are percentage values and thus aren't very useful. Therefore we'll need to convert them into absolute values to further deduce them.

In [None]:
#Converting the imports, exports and health columns from a percentage of gdpp to absolute values

for pct_col in ('imports', 'exports', 'health'):
    # value = percentage * gdpp / 100 ; gdpp itself is left untouched
    df_country[pct_col] = df_country[pct_col] * df_country['gdpp'] / 100

In [None]:
df_country.head()

In [None]:
#Dropping the country, before proceeding towards re-scaling the features

df_country_drop = df_country.copy()
country = df_country_drop.pop('country')

In [None]:
df_country_drop.head()

In [None]:
# Standardise every feature of the country-less frame with StandardScaler

scaler = StandardScaler().fit(df_country_drop)
df_country_scaled = scaler.transform(df_country_drop)

In [None]:
#As seen in the below array the features have now been scaled

df_country_scaled

#### - Step 5: Principal Component Analysis or PCA

In [None]:
#PCA is being done to remove redundancies and the attributes that are highly correlated

pca = PCA(svd_solver = 'randomized', random_state = 42)

In [None]:
pca.fit(df_country_scaled)

In [None]:
#As we can see components for PCA have been created

pca.components_

In [None]:
#Checking the variance ratio for PCA

pca.explained_variance_ratio_

In [None]:
#Barplot of the explained variance ratio of each PCA component

variance_ratios = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratios) + 1)  # components are numbered from 1

plt.bar(component_numbers, variance_ratios)
plt.xlabel('Components of PCA')
plt.ylabel('Variance Ratio')
plt.show()

##### - As seen above, component one has a variance of almost 0.6
##### - The second component variance is almost 20%

In [None]:
#Tabulating the loadings of each original attribute on the first three PCA components

col_type = df_country.columns.drop('country').tolist()

loadings = {'Attribute': col_type}
for comp_idx in range(3):
    loadings['Comp_{}'.format(comp_idx + 1)] = pca.components_[comp_idx]

attributes_pca = pd.DataFrame(loadings)

In [None]:
attributes_pca

In [None]:
#Plotting the attribute loadings on Comp_1 against Comp_2

sns.pairplot(
    data = attributes_pca,
    x_vars = ['Comp_1'],
    y_vars = ['Comp_2'],
    hue = 'Attribute',
    height = 8,
)
plt.xlabel('Component 1')
plt.ylabel('Component 2')

# Label every point with the attribute it represents
label_points = zip(attributes_pca['Attribute'], attributes_pca['Comp_1'], attributes_pca['Comp_2'])
for label, x_pos, y_pos in label_points:
    plt.annotate(label, (x_pos, y_pos))

plt.show()

##### - Child mortality and total fertility are very well explained by Comp_1
##### - gdpp, health, income and life expectancy are well explained by Comp_2

In [None]:
#Plotting the attribute loadings on Comp_1 against Comp_3

sns.pairplot(
    data = attributes_pca,
    x_vars = ['Comp_1'],
    y_vars = ['Comp_3'],
    hue = 'Attribute',
    height = 8,
)
plt.xlabel('Component 1')
plt.ylabel('Component 3')

# Label every point with the attribute it represents
label_points = zip(attributes_pca['Attribute'], attributes_pca['Comp_1'], attributes_pca['Comp_3'])
for label, x_pos, y_pos in label_points:
    plt.annotate(label, (x_pos, y_pos))

plt.show()

##### - Inflation is best explained by Comp_3
##### - As we can see from the above plots, more than 90% of the variance is explained well by the three components. We will build the dataframe with these components

In [None]:
#Based on self done research, we have understood that incremental PCA grants better efficiency.

incre_pca = IncrementalPCA(n_components = 3)

In [None]:
#Now we'll fit the incremental pca on the scaled df

df_incre_pca = incre_pca.fit_transform(df_country_scaled)
df_incre_pca

In [None]:
#Wrapping the transformed array in a dataframe and putting the country names back in front

component_names = ['Comp_1','Comp_2','Comp_3']
df_pca = pd.DataFrame(data = df_incre_pca, columns = component_names)

# Column-wise concat aligns on the index of `country` and `df_pca`
frames_to_join = [country, df_pca]
df_pca_final = pd.concat(frames_to_join, axis = 1)
df_pca_final.head()

In [None]:
#Checking for dependency between the principal components

plt.figure(figsize = (10,5))
# Correlate only the component columns; 'country' is a label column and
# DataFrame.corr() does not drop non-numeric columns by default in pandas 2.x.
sns.heatmap(df_pca_final[['Comp_1','Comp_2','Comp_3']].corr(), annot = True)
plt.show()

##### - As we can see above the correlation is almost non-existent

In [None]:
#Scatter plots of each pair of components to check the spread of the data

plt.figure(figsize = (10,5))

# (x, y) component pairs, drawn left to right
axis_pairs = [('Comp_1', 'Comp_2'), ('Comp_1', 'Comp_3'), ('Comp_3', 'Comp_2')]
for position, (x_col, y_col) in enumerate(axis_pairs, start = 1):
    plt.subplot(1, 3, position)
    sns.scatterplot(data = df_pca_final, x = x_col, y = y_col)

plt.show()

In [None]:
#Outlier Analysis: boxplot of the three components before any treatment

outliers = ['Comp_1','Comp_2','Comp_3']

# This changes the default figure size for every later plot as well
plt.rcParams['figure.figsize'] = [10,8]

component_values = df_pca_final[outliers]
sns.boxplot(data = component_values)
plt.title('Distribution of outlier variables')
plt.xlabel('PC Components')
plt.ylabel('Range')
plt.show()

In [None]:
# Statistical outlier treatment for the principal components.
# For each component in turn, keep only the rows lying between the 5th and
# 95th percentiles (bounds inclusive). The filters are applied sequentially,
# so the bounds for Comp_2 and Comp_3 are computed on the rows that survived
# the earlier steps.

for component in ['Comp_1', 'Comp_2', 'Comp_3']:
    lower_bound = df_pca_final[component].quantile(0.05)
    upper_bound = df_pca_final[component].quantile(0.95)
    within_bounds = df_pca_final[component].between(lower_bound, upper_bound)
    df_pca_final = df_pca_final[within_bounds]

In [None]:
# Boxplot of the three components again, after the outlier treatment

outliers = ['Comp_1','Comp_2','Comp_3']

# This changes the default figure size for every later plot as well
plt.rcParams['figure.figsize'] = [10,8]

component_values = df_pca_final[outliers]
sns.boxplot(data = component_values)
plt.title('Distribution of outlier variables')
plt.xlabel('PC Components')
plt.ylabel('Range')
plt.show()

In [None]:
#Resetting the index after outlier removal

df_pca_final = df_pca_final.reset_index(drop = True)
df_pca_final_data = df_pca_final.drop(['country'],axis=1)
df_pca_final.head()

#### - Step 6: Hopkins Statistic Test

In [None]:
#We can use this test to determine if the data is good enough for clustering

def hopkins(X):
    """Estimate the Hopkins statistic of ``X`` to gauge its clustering tendency.

    ``m = max(1, int(0.1 * n))`` rows of ``X`` are sampled without replacement
    and compared against ``m`` points drawn uniformly from the bounding box of
    ``X``. For every point the distance to its second-nearest row of ``X`` is
    recorded (``ujd`` for the uniform points, ``wjd`` for the sampled rows) and
    the statistic is ``sum(ujd) / (sum(ujd) + sum(wjd))``.

    The result is random: neither the row sample nor the uniform draws are
    seeded here.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix, one row per observation (accessed through
        ``.values`` and ``.iloc``).

    Returns
    -------
    float
        The Hopkins statistic; values close to 1 are conventionally read as a
        strong clustering tendency. Returns 0 if the ratio is NaN, after
        printing the collected distances for inspection.
    """
    d = X.shape[1]
    n = len(X)
    # Sample roughly 10% of the rows, but always at least one so that the
    # ratio below is defined for small inputs.
    m = max(1, int(0.1*n))
    nbrs = NearestNeighbors(n_neighbors = 1).fit(X.values)

    rand_X = sample(range(0,n,1), m)

    # Bounding box of the data; loop-invariant, so it is computed once.
    lower = np.amin(X.values, axis=0)
    upper = np.amax(X.values, axis=0)

    ujd = []
    wjd = []
    for j in range(0,m):
        # NOTE(review): index [1] takes the second-nearest neighbour for the
        # uniform point too, although that point is not a row of X - confirm
        # whether the nearest neighbour ([0]) was intended here.
        u_dist, _ = nbrs.kneighbors(uniform(lower, upper, d).reshape(1, -1), 2, return_distance=True)
        ujd.append(u_dist[0][1])
        # For a sampled row the nearest neighbour is the row itself, so the
        # second-nearest is used.
        w_dist, _ = nbrs.kneighbors(X.iloc[rand_X[j]].values.reshape(1, -1), 2, return_distance=True)
        wjd.append(w_dist[0][1])

    # Aggregate over ALL m sample pairs; computing and returning inside the
    # loop would base the statistic on a single pair only.
    HS = sum(ujd) / (sum(ujd) + sum(wjd))
    if isnan(HS):
        print(ujd, wjd)
        HS = 0

    return HS

In [None]:
#Hopkins score for the data

hopkins(df_pca_final_data)

#### - Step 7: Building the model

In [None]:
#We will first start with using K-means clustering
#Elbow curve method is to be used here to help attain the optimal value for k

cluster_counts = list(range(1,8))

# Sum of squared distances (inertia) of the fitted model for every candidate k
ssd = []
for num_clusters in cluster_counts:
    model_clus = KMeans(n_clusters = num_clusters, max_iter=50,random_state= 100)
    model_clus.fit(df_pca_final_data)
    ssd.append(model_clus.inertia_)

# Plot against the actual k values; plotting `ssd` alone would label the
# x-axis 0..6 and shift the apparent elbow by one.
plt.plot(cluster_counts, ssd, marker = 'o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of squared distances')
plt.show()

##### - As we can see from the above elbow curve, the number of clusters should be 4 or 5

In [None]:
#Silhouette score analysis as a second opinion on the ideal number of clusters

range_n_clusters = list(range(2, 9))

for k in range_n_clusters:

    # fit k-means for this candidate k and take the resulting labels
    kmeans = KMeans(n_clusters=k, max_iter=50, random_state=100).fit(df_pca_final_data)
    cluster_labels = kmeans.labels_

    # average silhouette over all samples
    silhouette_avg = silhouette_score(df_pca_final_data, cluster_labels)

    print(f'If number of clusters = {k}, the silhouette score will be {silhouette_avg}')

In [None]:
#Let's see what happens when we change the number of clusters to 5

cluster5 = KMeans(n_clusters = 5, max_iter = 50, random_state = 100)
cluster5.fit(df_pca_final_data)

In [None]:
#Labels for cluster5

cluster5.labels_

In [None]:
#Assigning labels

df_pca_final['Cluster_Id'] = cluster5.labels_
df_pca_final.head()

In [None]:
#Finding out the number of countries in each cluster

df_pca_final['Cluster_Id'].value_counts()

##### - Each of the clusters seem to have a good number of countries present

In [None]:
#Scatter plots of Comp_1 against Comp_2 and Comp_3, coloured by K-means cluster

fig, axes = plt.subplots(1,2, figsize = (15,8))

for panel, y_component in zip(axes, ['Comp_2', 'Comp_3']):
    sns.scatterplot(
        x = 'Comp_1',
        y = y_component,
        hue = 'Cluster_Id',
        data = df_pca_final,
        ax = panel,
        palette = 'Set1',
    )

plt.show()

##### - As we can see above we have run into the same issue as with k=4. But since we have another segment, we can proceed with 5 clusters.

In [None]:
#Let's try to visualise the data on the original attributes

# Attach the cluster labels to the original attributes (inner join on country,
# so countries removed as outliers are not present in the result)
df_merge = pd.merge(df_country, df_pca_final, on = 'country')

attribute_cols = ['child_mort','exports','imports','health','income','inflation','life_expec','total_fer','gdpp']
df_merge_col = df_merge[['country'] + attribute_cols + ['Cluster_Id']]

# Mean of every attribute per cluster, computed in a single grouped aggregation.
# The cluster ids come from the data itself rather than from a hard-coded list.
df_concat = df_merge_col.groupby('Cluster_Id')[attribute_cols].mean().reset_index()
df_concat.columns = ["Cluster_Id", "Child_Mortality", "Exports", "Imports","Health_Spending","Income","Inflation","Life_Expectancy","Total_Fertility","GDPpcapita"]
df_concat.head()

In [None]:
df_merge_col.head()

In [None]:
#Scatter plots of the original attributes, coloured by K-means cluster

fig, axes = plt.subplots(1,3, figsize = (15,10))

# (x, y) attribute pairs, drawn left to right
attribute_pairs = [('income', 'child_mort'), ('gdpp', 'income'), ('child_mort', 'gdpp')]
for panel, (x_attr, y_attr) in zip(axes, attribute_pairs):
    sns.scatterplot(
        x = x_attr,
        y = y_attr,
        hue = 'Cluster_Id',
        data = df_merge_col,
        legend = 'full',
        palette = "Set1",
        ax = panel,
    )

plt.show()

In [None]:
#Boxplots of selected original attributes per K-means cluster

fig, axes = plt.subplots(2,2, figsize = (15,10))

# Panels are filled row by row: top-left, top-right, bottom-left, bottom-right
boxplot_attributes = ['child_mort', 'income', 'inflation', 'gdpp']
for panel, attribute in zip(axes.ravel(), boxplot_attributes):
    sns.boxplot(x = 'Cluster_Id', y = attribute, data = df_merge_col, ax = panel)

plt.show()

##### - Child mortality is highest in clusters 1 and 4. They will require aid.
##### - A high income and high gdpp are indicators of a well developed country.
##### - As seen above countries in clusters 1 and 4 have the lowest income and gdpp and will therefore require monetary assistance.

In [None]:
#Let's take a look at the countries in cluster 1

df_merge_col[df_merge_col['Cluster_Id']==1]

In [None]:
#Let's take a look at the countries in cluster 4

df_merge_col[df_merge_col['Cluster_Id']==4]

In [None]:
#Let's try hierarchical clustering and see if the results differ
#As we know there are two types of hierarchical clustering, divisive and agglomerative.

df_pca_final_data.head()

In [None]:
#Let's begin with single linkage

merging = linkage(df_pca_final_data, method = 'single', metric = 'euclidean')
dendrogram(merging)
plt.show()

In [None]:
#We can also try complete linkage

merging = linkage(df_pca_final_data, method = 'complete', metric = 'euclidean')
dendrogram(merging)
plt.show()

In [None]:
df_pca_hclus = df_pca_final.copy()
df_pca_hclus = df_pca_hclus.drop('Cluster_Id', axis = 1)
df_pca_hclus.head()

In [None]:
#To get a better cluster formation, we cut the tree so that it yields 4 clusters.
#Note: the cut is specified by cluster count (n_clusters = 4), not by a height.
#`merging` is whichever linkage was computed last (complete linkage if the cells are run in order).

# Flatten the cut_tree output into a 1-D series of cluster labels
cluster_cut = pd.Series(cut_tree(merging, n_clusters = 4).reshape(-1,))
# Column-wise concat aligns on the index of both objects
df_hclus = pd.concat([df_pca_hclus, cluster_cut], axis = 1)
# Name the appended label column 'Cluster_Id'
df_hclus.columns = ['country', 'Comp_1', 'Comp_2','Comp_3','Cluster_Id']
df_hclus.head()

In [None]:
#Scatter plots of Comp_1 against Comp_2 and Comp_3, coloured by hierarchical cluster

fig, axes = plt.subplots(1,2, figsize = (10,5))

for panel, y_component in zip(axes, ['Comp_2', 'Comp_3']):
    sns.scatterplot(
        x = 'Comp_1',
        y = y_component,
        hue = 'Cluster_Id',
        data = df_hclus,
        ax = panel,
        palette = 'Set1',
    )

plt.show()

##### - As seen above the fourth cluster has not properly formed in the first plot

In [None]:
#Merging the original dataframe with the dataframe containing the PCA components and hierarchical cluster labels

df_merge_hclus = pd.merge(df_country, df_hclus, on = 'country')
# Slice the hierarchical merge (df_merge_hclus). Slicing df_merge here would
# carry over the K-means Cluster_Id and make the plots below show K-means clusters.
df_merge_hclus_col = df_merge_hclus[['country','child_mort','exports','imports','health','income','inflation','life_expec','total_fer','gdpp','Cluster_Id']]


In [None]:
df_merge_hclus_col.head()

In [None]:
#Scatter plots of the original attributes, coloured by the Cluster_Id in df_merge_hclus_col

fig, axes = plt.subplots(1,3, figsize = (15,10))

# (x, y) attribute pairs, drawn left to right
attribute_pairs = [('income', 'child_mort'), ('gdpp', 'income'), ('child_mort', 'gdpp')]
for panel, (x_attr, y_attr) in zip(axes, attribute_pairs):
    sns.scatterplot(
        x = x_attr,
        y = y_attr,
        hue = 'Cluster_Id',
        data = df_merge_hclus_col,
        legend = 'full',
        palette = "Set1",
        ax = panel,
    )

plt.show()

##### - After analyzing using both methods. We can see that the clusters have formed better using K-means. Using which we'll obtain the final list of countries.

#### - Step 8: Final Analysis

##### - As seen above we have identified that cluster 1 and 4 are in need of aid

In [None]:
df_clus1 = df_merge_col[df_merge_col['Cluster_Id'] == 1]

In [None]:
df_clus4 = df_merge_col[df_merge_col['Cluster_Id'] == 4]

In [None]:
#Arriving at the list of countries which require aid

# Stack the two cluster subsets row-wise, keeping their original index labels.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_append = pd.concat([df_clus1, df_clus4])

In [None]:
df_append.head()

In [None]:
#Analysing the numerical columns within the dataframe

df_append.describe()

In [None]:
#The describe() output above gives a mean child_mort of 52 for the selected clusters
#Keep every country whose child_mort is strictly greater than that mean, 52.

high_child_mort = df_country['child_mort'].gt(52)
df_final_list = df_country[high_child_mort]
df_final_list.shape

In [None]:
#Checking the demographic again

df_final_list.describe()

In [None]:
#The describe() output above gives a mean income of 3855; keep the countries at or below it

low_income = df_final_list['income'].le(3855)
df_final_list1 = df_final_list[low_income]
df_final_list1.shape

In [None]:
#Checking the demographic once again

df_final_list1.describe()

In [None]:
#The describe() output above gives a mean gdpp of 833; keep the countries at or below it

low_gdpp = df_final_list1['gdpp'].le(833)
df_final_list2 = df_final_list1[low_gdpp]
df_final_list2.shape

In [None]:
#Hence we arrive at the final list of countries which require aid based on the selected socio-economic factors

df_final_list2['country']

In [None]:
#Bar chart of child mortality for the countries which need aid, highest first

child_mort_by_country = df_final_list2.groupby(['country'])['child_mort'].mean()
df_list_cm = child_mort_by_country.sort_values(ascending = False).to_frame()

df_list_cm.plot.bar()
plt.title('Aid based on child mortality by country')
plt.xlabel('Country', fontweight = 'bold')
plt.ylabel('Child Mortality', fontweight = 'bold', fontsize = 10)

plt.show()

In [None]:
#Bar chart of per capita income for the countries which need aid, highest first

income_by_country = df_final_list2.groupby(['country'])['income'].mean()
df_list_cm = income_by_country.sort_values(ascending = False).to_frame()

df_list_cm.plot.bar()
plt.title('Aid based on per capita income by country')
plt.xlabel('Country', fontweight = 'bold')
plt.ylabel('Per capita income', fontweight = 'bold', fontsize = 10)

plt.show()

In [None]:
#Bar chart of gdpp for the countries which need aid, highest first

gdpp_by_country = df_final_list2.groupby(['country'])['gdpp'].mean()
df_list_cm = gdpp_by_country.sort_values(ascending = False).to_frame()

df_list_cm.plot.bar()
plt.title('Aid based on gdpp by country')
plt.xlabel('Country', fontweight = 'bold')
plt.ylabel('gdpp', fontweight = 'bold', fontsize = 10)

plt.show()

#### - Conclusion

##### - As can be seen above, PCA was done to get rid of redundant variables, after which we clustered the countries based on the PCA components. We also verified socio-economic factors such as Child Mortality, Income and GDPP. These factors play a vital role in determining the development within the country. Clusters were built using this information. Using these clusters we were able to derive the final list of countries.

In [None]:
#Final list of countries that require aid

df_final_list2.reset_index(drop = True)['country']