In [1]:
import sqlite3
import matplotlib.pyplot as plt 
import pandas as pd           # Data Manipulation
import numpy as np            # Linear Algebra
import seaborn as sns         # Data Visualization
import plotly.graph_objs as go
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler  # For scaling dataset
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation #For clustering
from sklearn.mixture import GaussianMixture #For GMM clustering

## Data Overview

In [30]:
# Load the first sheet of the cleaned workbook into a DataFrame.
df = pd.read_excel("Datasets_cleaned.xlsx", 0)

# Snapshot of the column labels as loaded, taken before any column is dropped.
cols = df.columns

# Coerce every column after the first to a (downcast) float dtype;
# values that cannot be parsed as numbers become NaN.
numeric_cols = cols[1:]
df[numeric_cols] = df[numeric_cols].apply(
    pd.to_numeric, downcast='float', errors='coerce'
)

# Data Cleaning

## Columns: remove three indicator columns from the frame.
dropped_indicators = ["happy_planet_index", "world_happiness", "education_expenditure"]
df = df.drop(columns=dropped_indicators)

## Rows: keep only rows whose non-null count reaches 70% of the remaining columns.
min_non_null = len(df.columns) * 0.7
df = df.dropna(thresh=min_non_null)

# Display the labels captured at load time (this list still contains the dropped columns).
cols

Index(['indicator', 'population', 'gdp_per_cap', 'gdp ', 'gdp_growth',
       'gdp_per_km2', 'land_area', 'population_density', 'total_foreign_born',
       'per_foreign_born', 'population_living_abroad_diaspora',
       'population_living_abroad', 'happy_planet_index', 'world_happiness',
       'human_development_index', 'health_expenditure',
       'education_expenditure',
       'sustainable_economic_development_assessment(seda)', 'unemployment',
       'political_stability&absence_of_violence', 'control_of_corruption',
       'judicial_effectiveness_score', 'government_integrity_score',
       'property_rights_score', 'tax_burden_score',
       'overall_economic_freedom_score', 'financial_freedom_score'],
      dtype='object')

In [16]:
# Show the current contents of df.
df

Unnamed: 0,indicator,population,gdp_per_cap,gdp,gdp_growth,gdp_per_km2,land_area,population_density,total_foreign_born,per_foreign_born,...,sustainable_economic_development_assessment(seda),unemployment,political_stability&absence_of_violence,control_of_corruption,judicial_effectiveness_score,government_integrity_score,property_rights_score,tax_burden_score,overall_economic_freedom_score,financial_freedom_score
0,Albania,2900000.0,34.200001,11840.200195,1.5,1.248266e+06,27398.0,104.870697,57616.0,3.100000,...,53.099998,8.500000,-2.78,-1.52,28.200001,26.200001,17.900000,91.800003,51.299999,10.0
1,Armenia,3000000.0,25.799999,8621.000000,2.6,9.147963e+05,28203.0,102.931152,191199.0,10.600000,...,51.700001,16.299999,0.40,-0.42,25.400000,39.900002,54.099998,85.099998,64.500000,70.0
2,Bahamas,400000.0,9.000000,24555.199219,3.7,8.991009e+05,10010.0,39.496605,59306.0,16.299999,...,60.400002,11.200000,-0.96,-0.61,35.200001,29.000000,27.799999,74.000000,44.700001,30.0
4,Barbados,300000.0,4.800000,17100.199219,3.0,1.116279e+07,430.0,664.462769,34475.0,11.300000,...,60.799999,6.600000,-0.29,-1.41,25.400000,18.900000,36.000000,82.400002,48.599998,40.0
6,Bhutan,800000.0,6.500000,8227.400391,1.2,1.692973e+05,38394.0,21.187658,51106.0,6.700000,...,47.700001,6.600000,0.18,-0.26,44.500000,32.599998,40.799999,65.699997,52.299999,60.0
7,Bosnia and Herzegovina,3900000.0,42.200001,10957.500000,3.0,8.244281e+05,51187.0,68.496429,34803.0,0.600000,...,50.200001,16.799999,-0.71,-0.56,47.400002,40.500000,55.299999,84.699997,68.699997,70.0
8,Botswana,2200000.0,36.700001,17041.599609,2.5,6.475747e+04,566730.0,4.043656,160644.0,7.200000,...,44.099998,5.700000,0.90,1.80,93.400002,77.400002,78.699997,63.000000,80.900002,90.0
9,Brunei Darussalam,400000.0,32.500000,76884.000000,0.9,6.172840e+06,5265.0,81.346680,102733.0,49.299999,...,72.500000,6.100000,1.04,1.53,80.900002,73.500000,83.500000,49.900002,71.800003,70.0
10,Cabo Verde,500000.0,3.500000,6662.000000,1.1,8.678403e+05,4033.0,135.580154,14924.0,3.000000,...,,5.100000,-0.76,-0.88,36.799999,39.900002,53.599998,87.500000,64.300003,60.0
11,Central African Republic,4900000.0,3.200000,651.900024,0.5,5.136568e+03,622984.0,7.478699,81598.0,2.900000,...,16.100000,15.300000,1.00,1.17,53.500000,50.900002,46.500000,96.500000,63.299999,60.0


## Data Standardization

In [31]:
# Standardize (zero mean, unit variance) every column after the first one,
# which holds the 'indicator' label. Rows with missing values are kept,
# so NaN cells remain NaN in the result.
std = df.copy()

# Take the column list from `std` itself. The earlier version read it from
# `df_norm`, a name that is never defined in this notebook, so the cell raised
# NameError when run on a fresh kernel.
feature_cols = std.columns[1:]
std[feature_cols] = preprocessing.scale(std[feature_cols])

std


Data with input dtype float32 were all converted to float64 by the scale function.



Unnamed: 0,indicator,population,gdp_per_cap,gdp,gdp_growth,gdp_per_km2,land_area,population_density,total_foreign_born,per_foreign_born,...,sustainable_economic_development_assessment(seda),unemployment,political_stability&absence_of_violence,control_of_corruption,judicial_effectiveness_score,government_integrity_score,property_rights_score,tax_burden_score,overall_economic_freedom_score,financial_freedom_score
0,Albania,0.63585,-0.11936,-0.37373,-0.29113,-0.29179,-0.33699,-0.21429,-0.36407,-0.55749,...,0.00530,-0.08313,-2.90097,-1.45325,-0.87710,-0.86532,-1.62354,1.39438,-0.80261,-2.05991
1,Armenia,0.69973,-0.23733,-0.50812,0.03260,-0.33113,-0.33393,-0.22146,-0.10859,-0.04866,...,-0.05565,1.15195,0.54263,-0.38341,-1.01667,-0.08500,0.24279,0.85723,0.46420,1.01812
2,Bahamas,-0.96115,-0.47327,0.15711,0.35633,-0.33298,-0.40312,-0.45593,-0.36083,0.33806,...,0.32311,0.34440,-0.93010,-0.56820,-0.52816,-0.70584,-1.11314,-0.03269,-1.43602,-1.03390
4,Barbados,-1.02503,-0.53226,-0.15413,0.15032,0.87802,-0.43955,1.85411,-0.40832,-0.00117,...,0.34052,-0.38398,-0.20457,-1.34626,-1.01667,-1.28112,-0.69038,0.64076,-1.06173,-0.52090
6,Bhutan,-0.70563,-0.50838,-0.52456,-0.37942,-0.41909,-0.29518,-0.52360,-0.37652,-0.31325,...,-0.22979,-0.38398,0.30439,-0.22780,-0.06457,-0.50079,-0.44291,-0.69812,-0.70664,0.50511
7,Bosnia and Herzegovina,1.27465,-0.00700,-0.41058,0.15032,-0.34179,-0.24653,-0.34874,-0.40769,-0.72711,...,-0.12095,1.23112,-0.65938,-0.51957,0.07999,-0.05082,0.30466,0.82516,0.86728,1.01812
8,Botswana,0.18869,-0.08424,-0.15658,0.00317,-0.43143,1.71406,-0.58697,-0.16703,-0.27933,...,-0.38652,-0.52649,1.08408,1.77572,2.37300,2.05093,1.51107,-0.91458,2.03813,2.04413
9,Brunei Darussalam,-0.96115,-0.14323,2.34176,-0.46771,0.28926,-0.42116,-0.30124,-0.27778,2.57694,...,0.84988,-0.46315,1.23568,1.51312,1.74990,1.82879,1.75854,-1.96484,1.16479,1.01812
10,Cabo Verde,-0.89727,-0.55052,-0.58991,-0.40885,-0.33667,-0.42585,-0.10078,-0.44571,-0.56428,...,,-0.62150,-0.71352,-0.83079,-0.44840,-0.08500,0.21701,1.04964,0.44501,0.50511
11,Central African Republic,1.91345,-0.55473,-0.84082,-0.58543,-0.43846,1.92799,-0.57427,-0.31820,-0.57106,...,-1.60550,0.99361,1.19237,1.16299,0.38406,0.54154,-0.14904,1.77119,0.34904,0.50511


In [32]:
# Render floats with a thousands separator and five decimal places from here on.
pd.set_option('display.float_format', '{:,.5f}'.format)

# Summary statistics of the standardized frame.
std.describe()

Unnamed: 0,population,gdp_per_cap,gdp,gdp_growth,gdp_per_km2,land_area,population_density,total_foreign_born,per_foreign_born,population_living_abroad_diaspora,...,sustainable_economic_development_assessment(seda),unemployment,political_stability&absence_of_violence,control_of_corruption,judicial_effectiveness_score,government_integrity_score,property_rights_score,tax_burden_score,overall_economic_freedom_score,financial_freedom_score
count,65.0,65.0,65.0,65.0,65.0,65.0,65.0,64.0,64.0,64.0,...,46.0,64.0,64.0,64.0,65.0,65.0,65.0,65.0,65.0,65.0
mean,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,...,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0
std,1.00778,1.00778,1.00778,1.00778,1.00778,1.00778,1.00778,1.00791,1.00791,1.00791,...,1.01105,1.00791,1.00791,1.00791,1.00778,1.00778,1.00778,1.00778,1.00778,1.00778
min,-1.15279,-0.59686,-0.84082,-4.32306,-0.43846,-0.44005,-0.5946,-0.46968,-0.76103,-0.78808,...,-5.11444,-1.38155,-2.90097,-1.75474,-1.78433,-1.34947,-1.63901,-2.64631,-2.66445,-2.05991
25%,-0.89727,-0.55052,-0.63619,-0.40885,-0.41186,-0.42169,-0.48721,-0.44627,-0.5999,-0.63302,...,-0.32448,-0.64129,-0.65938,-0.62169,-0.8771,-0.79128,-0.71615,-0.38544,-0.70664,-0.5209
50%,-0.32235,-0.34126,-0.37373,0.00317,-0.34316,-0.35303,-0.32941,-0.34889,-0.31325,-0.42974,...,0.08584,-0.28898,0.14196,-0.23752,-0.06457,-0.26157,-0.04077,0.13568,0.03233,-0.00789
75%,0.69973,-0.05896,0.20531,0.53291,-0.16237,-0.17923,0.01963,-0.10795,0.12774,0.22297,...,0.56037,0.38002,0.8323,0.75451,0.57348,0.54154,0.65524,0.68887,0.75212,0.50511
max,1.91345,4.02372,4.46156,2.26929,6.02048,5.46691,4.77406,5.00718,4.23914,3.99662,...,1.33748,3.27375,1.39811,2.1842,2.373,2.75721,2.0421,2.04378,2.03813,2.04413


In [26]:

# Work on a copy of df restricted to rows that have no missing values at all.
norm = df.copy().dropna(how='any')

# Every column after the first is rescaled; the first column is left untouched.
scaled_cols = norm.columns[1:]

# A MinMaxScaler with default settings maps each column onto the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler()
norm[scaled_cols] = min_max_scaler.fit_transform(norm[scaled_cols])

# Summary statistics of the min-max scaled frame.
norm.describe()


Data with input dtype float32 were all converted to float64 by MinMaxScaler.



Unnamed: 0,population,gdp_per_cap,gdp,gdp_growth,gdp_per_km2,land_area,population_density,total_foreign_born,per_foreign_born,population_living_abroad_diaspora,...,sustainable_economic_development_assessment(seda),unemployment,political_stability&absence_of_violence,control_of_corruption,judicial_effectiveness_score,government_integrity_score,property_rights_score,tax_burden_score,overall_economic_freedom_score,financial_freedom_score
count,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,...,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0
mean,0.490057,0.180552,0.204601,0.718936,0.085202,0.107009,0.097422,0.118834,0.186536,0.213065,...,0.794626,0.365948,0.696222,0.426263,0.403259,0.308568,0.426184,0.567769,0.541048,0.494318
std,0.331587,0.248691,0.212073,0.176539,0.181336,0.199879,0.177515,0.214127,0.230302,0.2342,...,0.159813,0.228084,0.24114,0.268899,0.255032,0.25583,0.281367,0.241001,0.22122,0.249934
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.145833,0.042161,0.067264,0.66206,0.002797,0.007134,0.010693,0.01302,0.036635,0.032937,...,0.752193,0.225636,0.572727,0.264815,0.184652,0.117892,0.207283,0.4451,0.397449,0.34375
50%,0.489583,0.099391,0.142815,0.726131,0.015649,0.029878,0.040014,0.042516,0.11194,0.112273,...,0.820175,0.309322,0.72987,0.354321,0.396882,0.217753,0.406162,0.591652,0.530612,0.5
75%,0.796875,0.206012,0.246222,0.816583,0.066456,0.103292,0.084484,0.087179,0.220149,0.287824,...,0.880229,0.476695,0.883766,0.645679,0.558153,0.363731,0.613796,0.749093,0.728061,0.65625
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
