# Importing Libraries

In [None]:
library(tidyverse)
library(clValid)
library(factoextra)


# Global Functions

In [None]:
# Return the most frequent value of `x` (the statistical mode).
# When several values share the highest count, the one that occurs first
# in `x` is returned, because which.max() picks the first maximum.
Mode <- function(x) {
  distinct_values <- unique(x)
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}

# Loading Data

In [None]:
# Read the indicator table from the working directory.
soce <- read.csv(file = "Book1.csv")

In [None]:
# Preview the first six rows of the raw table.
head(soce, n = 6L)

# Exploratory Data Analysis

In [None]:
# Drop the District column; State and the indicator columns remain.
soce_1 <- select(soce, -District)

In [None]:
# Average every remaining column within each State.
soce_1agg <- aggregate(
  . ~ State,
  data = soce_1,
  FUN = mean
)

In [None]:
# Preview the state-level averages.
head(soce_1agg, n = 6L)

In [None]:
# State-wise mean IMR as one coloured bar per State, with x labels rotated
# 90 degrees and a black horizontal reference line at 34.
pl <- ggplot(soce_1agg, aes(x = State, y = IMR)) +
  geom_bar(aes(fill = factor(State)), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_hline(yintercept = 34, color = "Black", size = 1)
pl

### 34 is the Mean Infant Mortality Rate in India for the year 2016, highlighted by the horizontal line in the plot.

In [None]:
# State-wise mean DR as one coloured bar per State, with x labels rotated
# 90 degrees and a black horizontal reference line.
# The reference line is drawn at 51, the national figure quoted in the note
# that accompanies this plot; the line was previously drawn at 54, which
# contradicted that note.
# NOTE(review): confirm 51 against the data source before publishing.
pl1 <- ggplot(soce_1agg, aes(x = State, y = DR)) +
  geom_bar(aes(fill = factor(State)), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_hline(yintercept = 51, color = "Black", size = 1)
pl1

### 51 is the Mean Dependency Ratio in India for the year 2018, highlighted by the horizontal line in the plot.

In [None]:
# State-wise mean U5MR as one coloured bar per State, with x labels rotated
# 90 degrees and a black horizontal reference line at 43.
pl2 <- ggplot(soce_1agg, aes(x = State, y = U5MR)) +
  geom_bar(aes(fill = factor(State)), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_hline(yintercept = 43, color = "Black", size = 1)
pl2

### 43 is the Mean Under Five Mortality Rate in India for the year 2015, highlighted by the horizontal line in the plot.

In [None]:
# State-wise mean SRB as one coloured bar per State, with x labels rotated
# 90 degrees and a black horizontal reference line at 900.
pl3 <- ggplot(soce_1agg, aes(x = State, y = SRB)) +
  geom_bar(aes(fill = factor(State)), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_hline(yintercept = 900, color = "Black", size = 1)
pl3

### 900 is the mean Sex Ratio at Birth in India for the years 2013-15, highlighted by the horizontal line in the plot.

In [None]:
# State-wise mean NNMR as one coloured bar per State, with x labels rotated
# 90 degrees and a black horizontal reference line at 28.
pl4 <- ggplot(soce_1agg, aes(x = State, y = NNMR)) +
  geom_bar(aes(fill = factor(State)), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_hline(yintercept = 28, color = "Black", size = 1)
pl4

### 28 is the Total Neonatal Mortality Rate in India for the year 2013, highlighted by the horizontal line in the plot.

In [None]:
# State-wise mean TFR as one coloured bar per State, with x labels rotated
# 90 degrees and a black horizontal reference line at 2.33.
pl5 <- ggplot(soce_1agg, aes(x = State, y = TFR)) +
  geom_bar(aes(fill = factor(State)), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_hline(yintercept = 2.33, color = "Black", size = 1)
pl5

### 2.33 is the total fertility rate(per woman) in India for the year 2016, highlighted by the horizontal line in the plot.

In [None]:
# State-wise mean CL as one coloured bar per State, with x labels rotated
# 90 degrees and a black horizontal reference line at 2.3.
pl6 <- ggplot(soce_1agg, aes(x = State, y = CL)) +
  geom_bar(aes(fill = factor(State)), stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_hline(yintercept = 2.3, color = "Black", size = 1)
pl6

### 2.3% is the average percentage of children aged 5-14 years currently engaged in work, highlighted by the horizontal line in the plot.

# Data Preparation 

In [None]:
# Remove the two label columns (State, District) so that only the
# indicator columns are passed on to scaling and clustering.
soce1 <- select(soce, -c(State, District))

In [None]:
# Preview the feature table.
head(soce1, n = 6L)

# Standardised Table

In [None]:
# Standardise each column: subtract its mean and divide by its standard
# deviation (the defaults of scale(), spelled out here).
soce1_sd <- scale(soce1, center = TRUE, scale = TRUE)

In [None]:
# Preview the standardised matrix.
head(soce1_sd, n = 6L)

# Determining optimal number of clusters

### Elbow Curve Method

In [None]:
# Elbow curve (total within-cluster sum of squares against k).
# Computed on the standardised matrix `soce1_sd`, the same data the final
# kmeans() fits use; on the raw `soce1` table the distances would be driven
# by whichever indicators have the largest numeric scale.
fviz_nbclust(soce1_sd, hcut, method = "wss")

In [None]:
#r_sq <- rnorm(20)
#for(number in 1:20){
#    clus <- kmeans(soce1, centers = number, nstart = 50)
#    r_sq[number] <- clus$betweenss/clus$totss
#}
#plot(r_sq, ylim = c(1,0))

### Silhouette Method

In [None]:
# Average silhouette width against k.
# Computed on the standardised matrix `soce1_sd`, the same data the final
# kmeans() fits use; on the raw `soce1` table the distances would be driven
# by whichever indicators have the largest numeric scale.
fviz_nbclust(soce1_sd, hcut, method = "silhouette")

In [None]:
#soce_dist <- dist(soce1_sd)

In [None]:
#avg_sil <- function(k){
 #  km.res <- kmeans(soce1_sd, centers = k, nstart = 25 )
  #  ss <- silhouette(km.res$cluster, dist(soce1_sd))
  #  mean(ss[, 3])
#}

In [None]:
#k.values <- 2:15
#avg_sil_values <- map_dbl(k.values, avg_sil)
#plot(k.values, avg_sil_values, type = "b", pch = 19, frame = FALSE, xlab = "Number of clusters K", ylab = "Average Silhouettes")


### Cluster Validation

In [None]:
# Internal and stability validation of k-means for k = 2..10.
# The standardised data is validated (not the raw `soce1` table) so that the
# choice of k is made on the same data the final kmeans() fits use.
# as.data.frame() keeps the input a data frame, as it was before.
intern <- clValid(
  as.data.frame(soce1_sd),
  2:10,
  maxitems = nrow(soce1_sd),
  clMethods = c("kmeans"),
  validation = c("internal", "stability")
)
summary(intern)

In [None]:
# Same validation for hierarchical clustering; note that this overwrites
# the k-means result stored in `intern`.
intern <- clValid(
  as.data.frame(soce1_sd),
  2:10,
  maxitems = nrow(soce1_sd),
  clMethods = c("hierarchical"),
  validation = c("internal", "stability")
)
summary(intern)

#### Optimal number of clusters is 2, which will divide the data set into HIGH PRIORITY DISTRICTS AND LOW PRIORITY DISTRICTS

#### Observing the elbow curve, we can furthermore analyse the given data set for 4, 5 and 6 clusters. 

# Applying K-Means Algorithm for centers = 2, 4, 5, 6

In [None]:
# Fix the random seed so the random k-means starts below are reproducible.
set.seed(101)

In [None]:
# k = 2 on the standardised data, best of 20 random starts.
clus2 <- kmeans(x = soce1_sd, centers = 2, nstart = 20, iter.max = 50)

In [None]:
# Number of rows assigned to each cluster.
clus2[["size"]]

In [None]:
# k = 4, best of 50 random starts.
clus4 <- kmeans(x = soce1_sd, centers = 4, nstart = 50, iter.max = 50)

In [None]:
clus4[["size"]]

In [None]:
# k = 5, best of 50 random starts.
clus5 <- kmeans(x = soce1_sd, centers = 5, nstart = 50, iter.max = 50)

In [None]:
clus5[["size"]]

In [None]:
# k = 6, best of 50 random starts.
clus6 <- kmeans(x = soce1_sd, centers = 6, nstart = 50, iter.max = 50)

In [None]:
clus6[["size"]]

## Binding clusters obtained for various centers to original data set

In [None]:
# Append the k = 2 cluster labels to the original table.
soce_2km <- cbind(soce, clus2$cluster)


In [None]:
# Rename the appended column. It is always the last one, so it is located
# with ncol() rather than a hard-coded position; a fixed index would rename
# the wrong column if the CSV ever had a different number of columns.
colnames(soce_2km)[ncol(soce_2km)] <- "ClusterID"

In [None]:
soce_2km

In [None]:
# Save the labelled table for k = 2.
write.csv(soce_2km, file = "Soce2.csv")

In [None]:
# Same steps for k = 4.
soce_4km <- cbind(soce, clus4$cluster)

In [None]:
colnames(soce_4km)[ncol(soce_4km)] <- "ClusterID"

In [None]:
write.csv(soce_4km, file = "Soce4.csv")

In [None]:
# Same steps for k = 5.
soce_5km <- cbind(soce, clus5$cluster)

In [None]:
colnames(soce_5km)[ncol(soce_5km)] <- "ClusterID"

In [None]:
write.csv(soce_5km, file = "Soce5.csv")

In [None]:
# Same steps for k = 6.
soce_6km <- cbind(soce, clus6$cluster)

In [None]:
colnames(soce_6km)[ncol(soce_6km)] <- "ClusterID"

In [None]:
write.csv(soce_6km, file = "Soce6.csv")

In [None]:
# Preview each labelled table.
head(soce_2km)
head(soce_4km)
head(soce_5km)
head(soce_6km)

# Creating Cluster Subsets

### Centers = 2

In [None]:
# Rows of the k = 2 table that were assigned to cluster 1.
Clus1_km <- subset(soce_2km, subset = ClusterID == 1)
head(Clus1_km, n = 6L)

In [None]:
# Rows of the k = 2 table that were assigned to cluster 2.
Clus2_km <- subset(soce_2km, subset = ClusterID == 2)
head(Clus2_km, n = 6L)

### Centers = 4

In [None]:
# One subset per cluster of the k = 4 solution.
Clus1_km4 <- subset(soce_4km, subset = ClusterID == 1)

In [None]:
Clus2_km4 <- subset(soce_4km, subset = ClusterID == 2)

In [None]:
Clus3_km4 <- subset(soce_4km, subset = ClusterID == 3)

In [None]:
Clus4_km4 <- subset(soce_4km, subset = ClusterID == 4)

### Centers = 5

In [None]:
# One subset per cluster of the k = 5 solution.
Clus1_km5 <- subset(soce_5km, subset = ClusterID == 1)

In [None]:
Clus2_km5 <- subset(soce_5km, subset = ClusterID == 2)

In [None]:
Clus3_km5 <- subset(soce_5km, subset = ClusterID == 3)

In [None]:
Clus4_km5 <- subset(soce_5km, subset = ClusterID == 4)

In [None]:
Clus5_km5 <- subset(soce_5km, subset = ClusterID == 5)

### Centers = 6

In [None]:
# One subset per cluster of the k = 6 solution.
Clus1_km6 <- subset(soce_6km, subset = ClusterID == 1)

In [None]:
Clus2_km6 <- subset(soce_6km, subset = ClusterID == 2)

In [None]:
Clus3_km6 <- subset(soce_6km, subset = ClusterID == 3)

In [None]:
Clus4_km6 <- subset(soce_6km, subset = ClusterID == 4)

In [None]:
Clus5_km6 <- subset(soce_6km, subset = ClusterID == 5)

In [None]:
Clus6_km6 <- subset(soce_6km, subset = ClusterID == 6)

# Analysing Clusters (Centers = 2)

### Calculating Frequencies of various states

In [None]:
# Count rows per State within each cluster of the k = 2 solution.
# State is converted to a factor over every State found in `soce`, so each
# table has the same entries in the same sorted order, with 0 for a State
# that is absent from a cluster. Without fixed levels, table() on a character
# column drops absent States and the positional look-ups below would
# silently read the count of a different State.
state_levels <- sort(unique(as.character(soce$State)))
l1 <- lapply(
  list(Clus1_km, Clus2_km),
  function(cluster_df) table(factor(cluster_df$State, levels = state_levels))
)

In [None]:
# One count per cluster for each State; positions follow `state_levels`.
# NOTE(review): assumes the data holds exactly these nine States, sorted in
# this order - confirm against Book1.csv.
Assam <- vapply(l1, function(x) x[[1]], integer(1))
Bihar <- vapply(l1, function(x) x[[2]], integer(1))
Chhattisgarh <- vapply(l1, function(x) x[[3]], integer(1))
Jharkhand <- vapply(l1, function(x) x[[4]], integer(1))
Madhya_Pradesh <- vapply(l1, function(x) x[[5]], integer(1))
Odisha <- vapply(l1, function(x) x[[6]], integer(1))
Rajasthan <- vapply(l1, function(x) x[[7]], integer(1))
Uttar_Pradesh <- vapply(l1, function(x) x[[8]], integer(1))
Uttarakhand <- vapply(l1, function(x) x[[9]], integer(1))

## General Summary

In [None]:
# Group the k = 2 labelled table by cluster.
km1 <- soce_2km %>% group_by(ClusterID)

In [None]:
# Cluster-wise mean of every indicator.
tab <- km1 %>%
  summarise(
    Mean_DR = mean(DR),
    Mean_NNMR = mean(NNMR),
    Mean_CSP = mean(CSP),
    Mean_CL = mean(CL),
    Mean_CFI = mean(CFI),
    Mean_IMR = mean(IMR),
    Mean_SRB = mean(SRB),
    Mean_U5MR = mean(U5MR),
    Mean_LR = mean(LR),
    Mean_TFR = mean(TFR),
    Mean_SD = mean(SD)
  )

In [None]:
# Append the per-State counts as extra columns, one row per cluster.
tab <- cbind(
  tab,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab

## Parameter Wise Summary

### Dependency Ratio

In [None]:
# Per-cluster mean, mode, median and range (max - min) of DR.
km1 <- soce_2km %>% group_by(ClusterID)
tab1 <- km1 %>%
  summarise(
    Mean_DR = mean(DR),
    Mode_DR = Mode(DR),
    Median_DR = median(DR),
    Range_DR = diff(range(DR))
  )

In [None]:
# Append the per-State counts as extra columns.
tab1 <- cbind(
  tab1,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab1

### Literacy Rate

In [None]:
# Per-cluster mean, mode, median and range (max - min) of LR.
km2 <- soce_2km %>% group_by(ClusterID)
tab2 <- km2 %>%
  summarise(
    Mean_LR = mean(LR),
    Mode_LR = Mode(LR),
    Median_LR = median(LR),
    Range_LR = diff(range(LR))
  )

In [None]:
tab2 <- cbind(
  tab2,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab2

### Children Currently in School (%)

In [None]:
# Per-cluster mean, mode, median and range (max - min) of CSP.
km3 <- soce_2km %>% group_by(ClusterID)
tab3 <- km3 %>%
  summarise(
    Mean_CSP = mean(CSP),
    Mode_CSP = Mode(CSP),
    Median_CSP = median(CSP),
    Range_CSP = diff(range(CSP))
  )

In [None]:
tab3 <- cbind(
  tab3,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab3

### Children below working age currently enrolled in work (%)

In [None]:
# Per-cluster mean, mode, median and range (max - min) of CL.
km4 <- soce_2km %>% group_by(ClusterID)
tab4 <- km4 %>%
  summarise(
    Mean_CL = mean(CL),
    Mode_CL = Mode(CL),
    Median_CL = median(CL),
    Range_CL = diff(range(CL))
  )

In [None]:
tab4 <- cbind(
  tab4,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab4

### Neonatal Mortality Rate

In [None]:
# Per-cluster mean, mode, median and range (max - min) of NNMR.
km5 <- soce_2km %>% group_by(ClusterID)
tab5 <- km5 %>%
  summarise(
    Mean_NNMR = mean(NNMR),
    Mode_NNMR = Mode(NNMR),
    Median_NNMR = median(NNMR),
    Range_NNMR = diff(range(NNMR))
  )

In [None]:
tab5 <- cbind(
  tab5,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab5

### Infant Mortality Rate

In [None]:
# Per-cluster mean, mode, median and range (max - min) of IMR.
km6 <- soce_2km %>% group_by(ClusterID)
tab6 <- km6 %>%
  summarise(
    Mean_IMR = mean(IMR),
    Mode_IMR = Mode(IMR),
    Median_IMR = median(IMR),
    Range_IMR = diff(range(IMR))
  )

In [None]:
tab6 <- cbind(
  tab6,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab6

### Sex Ratio at Birth

In [None]:
# Per-cluster mean, mode, median and range (max - min) of SRB.
km7 <- soce_2km %>% group_by(ClusterID)
tab7 <- km7 %>%
  summarise(
    Mean_SRB = mean(SRB),
    Mode_SRB = Mode(SRB),
    Median_SRB = median(SRB),
    Range_SRB = diff(range(SRB))
  )

In [None]:
tab7 <- cbind(
  tab7,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab7

### Under Five Mortality Rate

In [None]:
# Per-cluster mean, mode, median and range (max - min) of U5MR.
km8 <- soce_2km %>% group_by(ClusterID)
tab8 <- km8 %>%
  summarise(
    Mean_U5MR = mean(U5MR),
    Mode_U5MR = Mode(U5MR),
    Median_U5MR = median(U5MR),
    Range_U5MR = diff(range(U5MR))
  )

In [None]:
tab8 <- cbind(
  tab8,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab8

### Total Fertility Rate

In [None]:
# Per-cluster mean, mode, median and range (max - min) of TFR.
km9 <- soce_2km %>% group_by(ClusterID)
tab9 <- km9 %>%
  summarise(
    Mean_TFR = mean(TFR),
    Mode_TFR = Mode(TFR),
    Median_TFR = median(TFR),
    Range_TFR = diff(range(TFR))
  )

In [None]:
tab9 <- cbind(
  tab9,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab9

### Number of Safe Deliveries

In [None]:
# Per-cluster mean, mode, median and range (max - min) of SD.
km10 <- soce_2km %>% group_by(ClusterID)
tab10 <- km10 %>%
  summarise(
    Mean_SD = mean(SD),
    Mode_SD = Mode(SD),
    Median_SD = median(SD),
    Range_SD = diff(range(SD))
  )

In [None]:
tab10 <- cbind(
  tab10,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab10

### Children Fully Immunised

In [None]:
# Per-cluster mean, mode, median and range (max - min) of CFI.
# Columns are emitted in the order Mean, Mode, Median, Range so that this
# table lines up with the other parameter-wise summary tables.
km11 <- group_by(soce_2km, ClusterID)
tab11 <- summarise(
  km11,
  Mean_CFI = mean(CFI),
  Mode_CFI = Mode(CFI),
  Median_CFI = median(CFI),
  Range_CFI = max(CFI) - min(CFI)
)

In [None]:
# Append the per-State counts as extra columns.
tab11 <- cbind(tab11, Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh, Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand)

In [None]:
tab11

# Comparing Cluster solutions with original High Priority Districts(HPD)

### Loading High Priority Districts Data

In [None]:
# Read the high priority districts table from the working directory.
hpd <- read.csv(file = "highprioritydistricts.csv")

In [None]:
# Preview the first ten rows.
head(hpd, n = 10)

### Vectorising districts in HPD dataset

In [None]:
# District column of the HPD table as a plain vector, and its length.
dist_hpd <- as.vector(x = hpd$District)
length(dist_hpd)

### Vectorising Districts obtained in Cluster 2

In [None]:
# Districts in cluster 2 of the k = 2 solution.
dist_km <- as.vector(x = Clus2_km$District)

In [None]:
# Districts in cluster 2 of the k = 4 solution.
dist_km1 <- as.vector(x = Clus2_km4$District)

In [None]:
# Districts in cluster 3 of the k = 4 solution.
dist_km2 <- as.vector(x = Clus3_km4$District)

### Matching Districts in HPD and Cluster 2

In [None]:
# For each HPD district, the position of its name among the cluster 2
# districts (k = 2), or NA when the name is not found there.
m <- match(x = dist_hpd, table = dist_km)
m


In [None]:
# One entry per HPD district, matched or not.
length(m)

### Checking Percentage of Matched Districts

#### Number of unmatched districts : 16

#### Total Number of districts : 89

#### Percentage of matched districts

In [None]:
# Percentage of HPD districts whose name was found among the cluster 2
# districts. It is computed from the match result `m` (NA marks an unmatched
# district) instead of hand-entered counts, so it stays correct when the
# clustering or the input files change.
mean(!is.na(m)) * 100

#### Approx. 82 % of the original HPD districts were found among the districts obtained in Cluster 2

# Analysing Clusters (Centers = 4)

### Calculating Frequencies of various States

In [None]:
# Count rows per State within each cluster of the k = 4 solution.
# State is converted to a factor over every State found in `soce`, so each
# table has the same entries in the same sorted order, with 0 for a State
# that is absent from a cluster. Without fixed levels, table() on a character
# column drops absent States and the positional look-ups below would
# silently read the count of a different State.
state_levels <- sort(unique(as.character(soce$State)))
l2 <- lapply(
  list(Clus1_km4, Clus2_km4, Clus3_km4, Clus4_km4),
  function(cluster_df) table(factor(cluster_df$State, levels = state_levels))
)

In [None]:
# One count per cluster for each State; positions follow `state_levels`.
# NOTE(review): assumes the data holds exactly these nine States, sorted in
# this order - confirm against Book1.csv.
Assam <- vapply(l2, function(x) x[[1]], integer(1))
Bihar <- vapply(l2, function(x) x[[2]], integer(1))
Chhattisgarh <- vapply(l2, function(x) x[[3]], integer(1))
Jharkhand <- vapply(l2, function(x) x[[4]], integer(1))
Madhya_Pradesh <- vapply(l2, function(x) x[[5]], integer(1))
Odisha <- vapply(l2, function(x) x[[6]], integer(1))
Rajasthan <- vapply(l2, function(x) x[[7]], integer(1))
Uttar_Pradesh <- vapply(l2, function(x) x[[8]], integer(1))
Uttarakhand <- vapply(l2, function(x) x[[9]], integer(1))

### General Summary

In [None]:
# Group the k = 4 labelled table by cluster.
km_1 <- soce_4km %>% group_by(ClusterID)

In [None]:
# Cluster-wise mean of every indicator.
tab_1 <- km_1 %>%
  summarise(
    Mean_DR = mean(DR),
    Mean_NNMR = mean(NNMR),
    Mean_CSP = mean(CSP),
    Mean_CL = mean(CL),
    Mean_CFI = mean(CFI),
    Mean_IMR = mean(IMR),
    Mean_SRB = mean(SRB),
    Mean_U5MR = mean(U5MR),
    Mean_LR = mean(LR),
    Mean_TFR = mean(TFR),
    Mean_SD = mean(SD)
  )

In [None]:
# Append the per-State counts as extra columns, one row per cluster.
tab_1 <- cbind(
  tab_1,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab_1

# Analysing Clusters (Centers = 5)

### Calculating Frequencies of States

In [None]:
# Count rows per State within each cluster of the k = 5 solution.
# State is converted to a factor over every State found in `soce`, so each
# table has the same entries in the same sorted order, with 0 for a State
# that is absent from a cluster. Without fixed levels, table() on a character
# column drops absent States and the positional look-ups below would
# silently read the count of a different State.
state_levels <- sort(unique(as.character(soce$State)))
l3 <- lapply(
  list(Clus1_km5, Clus2_km5, Clus3_km5, Clus4_km5, Clus5_km5),
  function(cluster_df) table(factor(cluster_df$State, levels = state_levels))
)


In [None]:
# One count per cluster for each State; positions follow `state_levels`.
# NOTE(review): assumes the data holds exactly these nine States, sorted in
# this order - confirm against Book1.csv.
Assam <- vapply(l3, function(x) x[[1]], integer(1))
Bihar <- vapply(l3, function(x) x[[2]], integer(1))
Chhattisgarh <- vapply(l3, function(x) x[[3]], integer(1))
Jharkhand <- vapply(l3, function(x) x[[4]], integer(1))
Madhya_Pradesh <- vapply(l3, function(x) x[[5]], integer(1))
Odisha <- vapply(l3, function(x) x[[6]], integer(1))
Rajasthan <- vapply(l3, function(x) x[[7]], integer(1))
Uttar_Pradesh <- vapply(l3, function(x) x[[8]], integer(1))
Uttarakhand <- vapply(l3, function(x) x[[9]], integer(1))

In [None]:
# Group the k = 5 labelled table by cluster.
km_2 <- soce_5km %>% group_by(ClusterID)

### General Summary

In [None]:
# Cluster-wise mean of every indicator.
tab_2 <- km_2 %>%
  summarise(
    Mean_DR = mean(DR),
    Mean_NNMR = mean(NNMR),
    Mean_CSP = mean(CSP),
    Mean_CL = mean(CL),
    Mean_CFI = mean(CFI),
    Mean_IMR = mean(IMR),
    Mean_SRB = mean(SRB),
    Mean_U5MR = mean(U5MR),
    Mean_LR = mean(LR),
    Mean_TFR = mean(TFR),
    Mean_SD = mean(SD)
  )

In [None]:
# Append the per-State counts as extra columns, one row per cluster.
tab_2 <- cbind(
  tab_2,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab_2

# Analysing Clusters (centers = 6)

### Calculating Frequencies of States

In [None]:
# Count rows per State within each cluster of the k = 6 solution.
# State is converted to a factor over every State found in `soce`, so each
# table has the same entries in the same sorted order, with 0 for a State
# that is absent from a cluster. Without fixed levels, table() on a character
# column drops absent States and the positional look-ups below would
# silently read the count of a different State.
state_levels <- sort(unique(as.character(soce$State)))
l4 <- lapply(
  list(Clus1_km6, Clus2_km6, Clus3_km6, Clus4_km6, Clus5_km6, Clus6_km6),
  function(cluster_df) table(factor(cluster_df$State, levels = state_levels))
)

In [None]:
# One count per cluster for each State; positions follow `state_levels`.
# NOTE(review): assumes the data holds exactly these nine States, sorted in
# this order - confirm against Book1.csv.
Assam <- vapply(l4, function(x) x[[1]], integer(1))
Bihar <- vapply(l4, function(x) x[[2]], integer(1))
Chhattisgarh <- vapply(l4, function(x) x[[3]], integer(1))
Jharkhand <- vapply(l4, function(x) x[[4]], integer(1))
Madhya_Pradesh <- vapply(l4, function(x) x[[5]], integer(1))
Odisha <- vapply(l4, function(x) x[[6]], integer(1))
Rajasthan <- vapply(l4, function(x) x[[7]], integer(1))
Uttar_Pradesh <- vapply(l4, function(x) x[[8]], integer(1))
Uttarakhand <- vapply(l4, function(x) x[[9]], integer(1))

### General Summary

In [None]:
# Group the k = 6 labelled table by cluster.
km_3 <- soce_6km %>% group_by(ClusterID)

In [None]:
# Cluster-wise mean of every indicator.
tab_3 <- km_3 %>%
  summarise(
    Mean_DR = mean(DR),
    Mean_NNMR = mean(NNMR),
    Mean_CSP = mean(CSP),
    Mean_CL = mean(CL),
    Mean_CFI = mean(CFI),
    Mean_IMR = mean(IMR),
    Mean_SRB = mean(SRB),
    Mean_U5MR = mean(U5MR),
    Mean_LR = mean(LR),
    Mean_TFR = mean(TFR),
    Mean_SD = mean(SD)
  )

In [None]:
# Append the per-State counts as extra columns, one row per cluster.
tab_3 <- cbind(
  tab_3,
  Assam, Bihar, Chhattisgarh, Jharkhand, Madhya_Pradesh,
  Odisha, Rajasthan, Uttar_Pradesh, Uttarakhand
)

In [None]:
tab_3