## What is the percentage of the population that stays in HDB?

In [1]:
import pandas as pd
import numpy as np
import random
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import timeit

pd.set_option('display.max_rows', 60000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 200)
pd.options.display.float_format = '{:,.2f}'.format

import pygal
from IPython.display import SVG, display, HTML
from pygal.style import RedBlueStyle

In [2]:
# Source CSVs (paths are relative to the notebook's working directory).
file_1='datasets/estimated-resident-population-living-in-hdb-flats/estimated-resident-population-in-hdb-flats-by-town.csv'
file_2='datasets/resident-population-by-ethnicity-gender-and-age-group/singapore-residents-by-ethnic-group-and-sex-end-june-annual.csv'

df1 = pd.read_csv(file_1)
df2 = pd.read_csv(file_2)

# --- Residents living in HDB flats, per year ---
# Drop the 'Total' rows so they are not counted a second time when the towns are summed.
df1 = df1[df1['town_or_estate'] != 'Total'].reset_index(drop=True)
# numeric_only=True keeps the text column `town_or_estate` out of the sum. Without it,
# pandas 2.x concatenates the strings and keeps the column, so the frame comes back
# with three columns and the two-name assignment below fails with a length mismatch.
df1_grpby = (
    df1.groupby('financial_year')
       .sum(numeric_only=True)
       .reset_index(drop=False)
)
df1_grpby.columns = ['year','hdb_pop']

# --- Total residents in SG, per year ---
# Assumption carried by this analysis: everyone not counted in hdb_pop lives in
# private (or other) housing.
df2 = df2[df2['level_1'] =='Total Residents'].copy().reset_index(drop=True)
# 'na' entries are turned into '0' so the column can be cast to int.
df2['value'] = df2['value'].str.replace('na','0').astype(int)
# NOTE(review): this adds up every 'Total Residents' row of a year. If the rows of a
# year include overlapping sub-totals, total_pop is overstated and hdb_% understated
# -- confirm against the published resident totals.
df2_grpby = df2.groupby('year')['value'].sum().reset_index(drop=False)
df2_grpby.columns = ['year','total_pop']

# Inner join: only years present in both sources are kept.
df = pd.merge(df1_grpby, df2_grpby, how='inner', on='year')
df['hdb_%'] = (df['hdb_pop']/df['total_pop']) * 100  # share of residents living in HDB flats
# Years are stored as strings because gauge_chart looks them up by their string form.
df['year'] = df['year'].astype(str)

In [3]:
# (rows, columns) of the per-town HDB data after the 'Total' rows were dropped.
# The groupby in the previous cell collapses these rows into one row per financial year.
df1.shape

(312, 3)

In [4]:
# (rows, columns) of the 'Total Residents' rows kept from the population file.
# The groupby in the previous cell collapses these rows into one row per year.
df2.shape

(1512, 4)

## EDA

In [5]:
df.info()  # summary of the combined (merged) dataset: column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   year       12 non-null     object 
 1   hdb_pop    12 non-null     int64  
 2   total_pop  12 non-null     int32  
 3   hdb_%      12 non-null     float64
dtypes: float64(1), int32(1), int64(1), object(1)
memory usage: 432.0+ bytes


In [6]:
df.shape  # (rows, columns) of the merged dataset

(12, 4)

In [7]:
df.columns  # column names of the merged dataset

Index(['year', 'hdb_pop', 'total_pop', 'hdb_%'], dtype='object')

In [8]:
df.index  # row labels of the merged dataset

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [9]:
df.isnull().sum()  # number of missing values per column; all zeros means there are no NaNs

year         0
hdb_pop      0
total_pop    0
hdb_%        0
dtype: int64

In [10]:
df.describe()  # descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,hdb_pop,total_pop,hdb_%
count,12.0,12.0,12.0
mean,3183725.0,4842498.83,65.91
std,72913.28,307108.03,2.84
min,3020100.0,4365064.0,60.6
25%,3149150.0,4612423.25,64.18
50%,3215200.0,4820747.0,66.7
75%,3239000.0,5063019.0,68.28
max,3249900.0,5347551.0,69.19


In [11]:
# Display the merged table: one row per year with hdb_pop, total_pop and hdb_%.
df

Unnamed: 0,year,hdb_pop,total_pop,hdb_%
0,2008,3020100,4365064,69.19
1,2009,3094100,4497097,68.8
2,2010,3125900,4569563,68.41
3,2011,3156900,4626710,68.23
4,2012,3165900,4703162,67.31
5,2013,3213000,4778993,67.23
6,2014,3217400,4862501,66.17
7,2015,3234800,4949465,65.36
8,2016,3249900,5038475,64.5
9,2017,3247500,5136651,63.22


## Plotting the donut chart

In [12]:
def gauge_chart(x=2015, y=2019, filename='3_donut_charts.svg'):
    """Write an SVG with one solid gauge ("donut") per year showing the HDB share.

    Reads the notebook-level ``df`` (``year`` stored as strings, ``hdb_%`` as a
    percentage) and renders the chart to ``filename``. Nothing is returned.

    Parameters
    ----------
    x : int, default 2015
        First year to plot (inclusive).
    y : int, default 2019
        Last year to plot (inclusive).
    filename : str, default '3_donut_charts.svg'
        Path of the SVG file to write.

    Raises
    ------
    IndexError
        If one or more requested years have no row in ``df``. The type matches
        what the bare ``.values[0]`` lookup raised before; the message now names
        the missing years.
    """
    # df['year'] holds strings, so the requested years are compared as strings.
    years = [str(year) for year in np.arange(x, y + 1, 1)]

    data = []      # (year label, hdb_% rounded to 2 decimals)
    missing = []   # requested years that have no row in df
    for year in years:
        matches = df.loc[df['year'] == year, 'hdb_%']
        if matches.empty:
            missing.append(year)
            continue
        data.append((year, round(matches.iloc[0], 2)))

    if missing:
        # Fail before rendering so a partial chart is never written.
        raise IndexError(
            'No hdb_% data for year(s): {}. Available years: {}'.format(
                ', '.join(missing), ', '.join(df['year'].tolist())))

    chart = pygal.SolidGauge(inner_radius=0.70)
    chart.title = 'Population staying in Private vs Public Housing by %'
    percent_formatter = lambda value: '{:.10g}%'.format(value)
    for year, hdb_pct in data:
        # max_value=100 makes each gauge read as a percentage of the whole population.
        chart.add(year,
                  [{'value': hdb_pct, 'max_value': 100, 'label': 'Population staying in HDB'}],
                  formatter=percent_formatter)
    chart.render_to_file(filename)

In [13]:
# Render the chart for the default range of years (2015 to 2019).
gauge_chart()
# gauge_chart(2008, 2013)  # any range from 2008 to 2019 may be selected instead

In [14]:
# Kept for reference: an extra gauge entry for residents NOT staying in HDB, in case it is needed later.
# {'value': 100 - data[x][1], 'max_value': data[x][2],'label': 'Population staying in Private',
#                      'color': 'grey'}