In [1]:
pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.6


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.graph_objs as go
import plotly.figure_factory as ff
import umap # use 'pip install umap-learn' or 'conda install -c conda-forge umap-learn'
from sklearn.preprocessing import LabelEncoder





In [3]:
 #Importing dataset and examining it
dataset = pd.read_csv("/content/soci_econ_country_profiles.csv")
print(dataset.head())


   Unnamed: 0    country         Region  Surface area (km2)  \
0           0  Argentina   SouthAmerica             2780400   
1           1  Australia        Oceania             7692060   
2           2    Austria  WesternEurope               83871   
3           3    Belarus  EasternEurope              207600   
4           4    Belgium  WesternEurope               30528   

   Population in thousands (2017)  Population density (per km2, 2017)  \
0                           44271                                16.2   
1                           24451                                 3.2   
2                            8736                               106.0   
3                            9468                                46.7   
4                           11429                               377.5   

   Sex ratio (m per 100 f, 2017)  \
0                           95.9   
1                           99.3   
2                           96.2   
3                           87.0   
4 

In [4]:
dataset.duplicated()
print(dataset.shape)
print(dataset.info())
print(dataset.describe())


(66, 96)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 96 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Unnamed: 0                                                   66 non-null     int64  
 1   country                                                      66 non-null     object 
 2   Region                                                       66 non-null     object 
 3   Surface area (km2)                                           66 non-null     int64  
 4   Population in thousands (2017)                               66 non-null     int64  
 5   Population density (per km2, 2017)                           66 non-null     float64
 6   Sex ratio (m per 100 f, 2017)                                66 non-null     float64
 7   GDP: Gross domestic product (million current US$)            66 non-null 

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 96 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Unnamed: 0                                                   66 non-null     int64  
 1   country                                                      66 non-null     object 
 2   Region                                                       66 non-null     object 
 3   Surface area (km2)                                           66 non-null     int64  
 4   Population in thousands (2017)                               66 non-null     int64  
 5   Population density (per km2, 2017)                           66 non-null     float64
 6   Sex ratio (m per 100 f, 2017)                                66 non-null     float64
 7   GDP: Gross domestic product (million current US$)            66 non-null     int64

In [6]:
from sklearn.preprocessing import LabelEncoder
cul = ['country',
 'Region',
 'Economy: Agriculture (% of GVA)',
 'Labour force participation (female/male pop. %)',
 'Population growth rate (average annual %)',
 'Urban population growth rate (average annual %)',
 'Population age distribution (0-14 / 60+ years, %)',
 'International migrant stock (000/% of total pop.)',
 'Refugees and others of concern to UNHCR (in thousands)',
 'Health: Physicians (per 1000 pop.)',
 'Education: Government expenditure (% of GDP)',
 'Education: Primary gross enrol. ratio (f/m per 100 pop.)',
 'Education: Secondary gross enrol. ratio (f/m per 100 pop.)',
 'Education: Tertiary gross enrol. ratio (f/m per 100 pop.)',
 'Forested area (% of land area)',
 'Energy supply per capita (Gigajoules)',
 'Pop. using improved drinking water (urban/rural, %)',
 'Pop. using improved sanitation facilities (urban/rural, %)']
for i in cul:
  dataset[i]= LabelEncoder().fit_transform(dataset[i])

In [7]:
null_values = dataset.isnull().sum()
null_values = null_values[null_values > 0]
null_values.sort_values(ascending=False)

Taxes on income, profits and capital gains (% of revenue)      9
Tax revenue (% of GDP)                                         8
Air transport, freight (million ton-km)                        8
Air transport, passengers carried                              5
Commercial service exports (current US$)                       3
Commercial service imports (current US$)                       3
Current account balance (% of GDP)                             3
Military expenditure (% of GDP)                                3
Human capital index (HCI) (scale 0-1)                          2
Inflation, consumer prices (annual %)                          2
Adjusted net national income per capita (constant 2010 US$)    2
Consumer price index (2010 = 100)                              2
High-technology exports (current US$)                          1
Imports of goods and services (% of GDP)                       1
Gross domestic savings (% of GDP)                              1
Current health expenditur

In [8]:
# Function to fill missing values with median
def fill_with_median(dataset):
    for column in dataset.columns:
        dataset[column].fillna(dataset[column].median(), inplace=True)
    return dataset

# Apply the function to the dataset
data_filled = fill_with_median(dataset)

# Verify that there are no more missing values
print(data_filled.isnull().sum().sum())

# Display the first few rows of the cleaned dataset
data_filled.head()

0


Unnamed: 0.1,Unnamed: 0,country,Region,Surface area (km2),Population in thousands (2017),"Population density (per km2, 2017)","Sex ratio (m per 100 f, 2017)",GDP: Gross domestic product (million current US$),"GDP growth rate (annual %, const. 2005 prices)",GDP per capita (current US$),...,"Inflation, consumer prices (annual %)","Life expectancy at birth, female (years)","Life expectancy at birth, male (years)","Life expectancy at birth, total (years)",Military expenditure (% of GDP),"Population, female","Population, male",Tax revenue (% of GDP),"Taxes on income, profits and capital gains (% of revenue)",Urban population (% of total population)_y
0,0,0,8,2780400,44271,16.2,95.9,632343,2.4,14564.5,...,1.961454,79.726,72.924,76.372,0.856138,22572521.0,21472290.0,10.955501,12.929913,91.749
1,1,1,6,7692060,24451,3.2,99.3,1230859,2.4,51352.2,...,1.948647,84.6,80.5,82.5,2.007966,12349632.0,12252228.0,21.915859,64.110306,85.904
2,2,2,13,83871,8736,106.0,96.2,376967,1.0,44117.7,...,2.081269,84.0,79.4,81.643902,0.756179,4478340.0,4319226.0,25.355237,27.024073,58.094
3,3,3,2,207600,9468,46.7,87.0,54609,-3.9,5750.8,...,6.031837,79.2,69.3,74.129268,1.162417,5077542.0,4420722.0,13.019006,2.933101,78.134
4,4,4,13,30528,11429,377.5,97.3,455107,1.5,40277.8,...,2.125971,83.9,79.2,81.492683,0.910371,5766141.0,5609017.0,23.399721,33.727746,97.961


In [9]:
null_values = dataset.isnull().sum()
null_values = null_values[null_values > 0]
null_values.sort_values(ascending=False)


Series([], dtype: int64)

In [10]:
# Defining feature set
X = dataset.drop(['Region','country'], axis = 1) # Features
print(type(X))
print(X.shape)

<class 'pandas.core.frame.DataFrame'>
(66, 94)


In [11]:
# Normalizing numerical features so that each feature has mean 0 and variance 1
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

In [31]:
# Implementing UMAP to visualize dataset
u = umap.UMAP(n_components = 2, n_neighbors=4, min_dist=0.03)
x_umap = u.fit_transform(X_scaled)

regions=list(dataset['Region'])
data = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=None, colorscale='Rainbow', opacity=0.5),
                                text=[f'Region: {a}' for a in regions],
                                hoverinfo='text')]

layout = go.Layout(title = 'UMAP Region Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
fig.show()

In [34]:
# Labelling clusters using KMeans
kmeans = KMeans(n_clusters = 3)
kmeans.fit(x_umap)

labels = list(kmeans.labels_)
data = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=kmeans.labels_, colorscale='Rainbow', opacity=0.5),
                                text=[f'Region: {a}<br>Label: {b}' for a,b in list(zip(regions,labels))],
                                hoverinfo='text')]

layout = go.Layout(title = 'UMAP Kmeans Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
fig.show()

dataset['Label'] = kmeans.labels_
dataset.to_csv("ClusteredRegions.csv", index=False)
print(dataset.Label.value_counts())





Label
0    26
1    21
2    19
Name: count, dtype: int64


In [36]:
# Implementing UMAP to visualize dataset
u = umap.UMAP(n_components = 2, n_neighbors=4, min_dist=0.03)
x_umap = u.fit_transform(X_scaled)

country =list(dataset['country'])
data = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=None, colorscale='Rainbow', opacity=0.5),
                                text=[f'country: {a}' for a in country],
                                hoverinfo='text')]

layout = go.Layout(title = 'UMAP Country Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
fig.show()

In [37]:
# Labelling clusters using KMeans
kmeans = KMeans(n_clusters = 3)
kmeans.fit(x_umap)

labels = list(kmeans.labels_)
data = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=kmeans.labels_, colorscale='Rainbow', opacity=0.5),
                                text=[f'country: {a}<br>Label: {b}' for a,b in list(zip(regions,labels))],
                                hoverinfo='text')]

layout = go.Layout(title = 'UMAP Kameans Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
fig.show()

dataset['Label'] = kmeans.labels_
dataset.to_csv("Clusteredcountry.csv", index=False)
print(dataset.Label.value_counts())





Label
1    26
2    21
0    19
Name: count, dtype: int64
