Importing required packages

In [None]:
from pandas import DataFrame
from pandas import ExcelWriter
from pandas import ExcelFile

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import scipy as sp
import math

from sklearn.decomposition import PCA
from scipy.linalg import svd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_samples

Load data and EDA

In [None]:
# Read the divorce-predictors CSV into a DataFrame and preview its first rows.
DIVORCE_CSV_PATH = "../input/divorce-predictors-data-set-csv/divorce-csv.csv"
divorce = pd.read_csv(DIVORCE_CSV_PATH)
divorce.head()

In [None]:
# Columns 0..53 are used as the feature matrix; column 54 is used as the label.
divorce_data = divorce.values[:,0:54]
Y = divorce.values[:,54]
divorce_df = DataFrame(divorce_data)
cov = divorce_df.cov()
corr = divorce_df.corr()

# Render the correlation matrix as a colour-graded table with two decimals.
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# an explicit format string gives the same display on old and new releases.
cmap=sns.diverging_palette(250, 5, as_cmap=True)
corr.style.background_gradient(cmap=cmap, axis=1)\
.set_properties(**{'max-width': '80px', 'font-size': '10pt'}).format("{:.2f}")

Most attributes are very highly correlated with each other, which suggests that they convey
similar information. For example, Atr 5, Atr 8 and Atr 9 have a strong correlation value of 0.92.

Atr 5: The time I spent with my wife is special for us. Atr 8: I enjoy our holidays with my wife. Atr 9: I
enjoy traveling with my wife.
It's clear these are all providing very similar information.

Attributes shown in blue shades are not strongly correlated with most other attributes. Among
themselves, however, we can notice strong correlation. For example, Atr 42 and Atr 43 are not
correlated with other attributes, but between them there is a fairly strong correlation of 0.72.

Atr 42: When I argue with my spouse, I only go out and I don't say a word. Atr 43: I mostly stay silent
to calm the environment a little bit. Atr 44: Sometimes I think it's good for me to leave home for a
while.

This is strong evidence that a few principal components dominate, so we can transform this dataset
to a much smaller dimension.

In [None]:
# The correlation matrix is symmetric, so use eigh(): it always returns real
# eigenvalues/eigenvectors, whereas eig() gives no ordering guarantee and can
# return complex values.  eigh() sorts eigenvalues in ascending order, so
# reverse both outputs: the cumulative plot below, and any code that takes the
# leading columns of `v`, need the largest components first.
w, v = np.linalg.eigh(np.asarray(corr))
w, v = w[::-1], v[:, ::-1]

# Cumulative fraction of the eigenvalue total covered by the first k components.
plt.bar(np.arange(len(w)), np.cumsum(w)/np.sum(w))

The first 2 PCs capture around 80% of the variance, and the individual contribution of every other
component is very small, so the remaining components are not considered.

In [None]:
# LDA yields at most (n_classes - 1) discriminant axes.  The label appears to
# be binary (the notebook splits samples on Y == 1 vs. Y != 1), which leaves a
# single axis; requesting 2 raises ValueError on scikit-learn >= 0.23, while
# older releases only warned and fell back to 1.
lda = LinearDiscriminantAnalysis(n_components = 1)

# Heatmap of the raw feature matrix.
plt.figure(figsize=(20,10))
ax = sns.heatmap(divorce_data,cmap='BuPu')

In [None]:
# Fit LDA on the first 30 feature columns, project those same columns onto the
# discriminant space, and show the projection as a heatmap.
lda_inputs = divorce_data[:, 0:30]
lda.fit(lda_inputs, Y)
X_r1 = lda.transform(lda_inputs)
sns.heatmap(X_r1, cmap='BuPu')

In [None]:
# Plot each sample's first discriminant coordinate against its row index,
# coloured by label.  The index range is taken from the data itself instead of
# a hard-coded 170, so the cell keeps working if the row count changes.
sns.scatterplot(x = np.arange(X_r1.shape[0]), y = X_r1[:,0],hue = Y)

In [None]:
# Project every sample onto the first two eigenvector columns of `v`.
# NOTE(review): `v` is derived from the correlation matrix in an earlier cell,
# while the data projected here is neither centred nor scaled - confirm that
# this is intended.
proj_data_1 = np.dot(divorce_data,v[:,0:2])

# seaborn >= 0.12 accepts x/y only as keyword arguments (positional use raises
# TypeError); keywords work on older releases as well.
sns.scatterplot(x=proj_data_1[:,0], y=proj_data_1[:,1], hue = Y)
plt.show()  # close this figure so the per-class plot below is not drawn on top of it

# Split the projected samples by label value.
divorced = proj_data_1[Y==1]
together = proj_data_1[Y!=1]
print(divorced.shape)
print(together.shape)

# Both subsets on one set of axes (each call gets its own default colour).
sns.scatterplot(x=divorced[:,0], y=divorced[:,1])
sns.scatterplot(x=together[:,0], y=together[:,1])

plt.figure(figsize=(8,4))
plt.scatter(proj_data_1[:,0],proj_data_1[:,1],c = Y)
plt.show()

In [None]:
from sklearn.neighbors import NearestNeighbors 
from random import sample 
from numpy.random import uniform 
from math import isnan
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic (clustering tendency) of ``X``.

    ``p = max(1, int(0.1 * n))`` rows of ``X`` are sampled without replacement
    and compared with ``p`` probe points drawn uniformly from the axis-aligned
    bounding box of ``X``::

        H = sum(u) / (sum(u) + sum(w))

    where ``u`` are the distances from each uniform probe to its nearest row
    of ``X`` and ``w`` are the distances from each sampled row to its nearest
    *other* row.  With this definition, values around 0.5 are what uniformly
    scattered data produces, and values approaching 1 indicate clustered data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to evaluate; at least two rows are required.
    random_state : int or None, optional
        Seed for the random sampling.  ``None`` (the default) gives a
        different draw on every call.

    Returns
    -------
    float
        The Hopkins statistic in [0, 1], or 0 when it is undefined because
        every measured distance is zero (e.g. all rows identical).

    Raises
    ------
    ValueError
        If ``X`` has fewer than two rows.
    """
    X = np.asarray(X)
    n, d = X.shape
    if n < 2:
        raise ValueError("hopkins() needs at least two samples")

    # Use 10% of the rows, but never zero: with p == 0 both sums would be
    # empty and the ratio below would be 0/0.
    p = max(1, int(0.1 * n))
    rng = np.random.RandomState(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(X)

    # Uniform probes are not members of X, so their nearest neighbour is the
    # first (index 0) result.
    probes = rng.uniform(np.amin(X, axis=0), np.amax(X, axis=0), size=(p, d))
    u_dist, _ = nbrs.kneighbors(probes, 1, return_distance=True)

    # Sampled rows ARE members of X: the first result is the row itself at
    # distance 0, so the nearest other row is the second (index 1) result.
    rows = rng.choice(n, size=p, replace=False)
    w_dist, _ = nbrs.kneighbors(X[rows], 2, return_distance=True)

    u_total = u_dist[:, 0].sum()
    w_total = w_dist[:, 1].sum()
    denominator = u_total + w_total
    if denominator == 0:
        # Degenerate data: the statistic is undefined, report 0.
        return 0.0
    return u_total / denominator

# Print the Hopkins statistic for the full projection and for each label subset.
for sample_set in (proj_data_1, divorced, together):
    print(hopkins(sample_set))

In [None]:
# Elbow analysis: fit K-Means on the divorced subset for k = 2..19 and record
# the negated KMeans.score (the SSE) at index k of `score`.
random_state = 0
score = np.zeros(20)
candidate_ks = range(2, 20)
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(divorced)
    score[k] = -kmeans.score(divorced)
    print("SSE Score for k=",k,":", round(score[k],2))

# Line plus markers for the SSE curve over the evaluated k values.
sse_curve = score[2:20]
plt.plot(candidate_ks, sse_curve)
plt.scatter(candidate_ks, sse_curve)
plt.show()

There is a bend at k = 4, which means the drop in SSE is minimal after k = 4. Hence we use k = 4 as the number of clusters.

In [None]:
# K-Means with 4 clusters on the divorced subset; points coloured by cluster.
n_clusters, random_state = 4, 3
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
y_pred = kmeans.fit_predict(divorced)

plt.title('K-Means cluster labels')
plt.scatter(divorced[:, 0], divorced[:, 1], c=y_pred)
plt.show()

For both the Divorced and Together datasets, the clusters do not look distinct enough. I will try other clustering techniques or reduce the cluster count to improve this.


In [None]:
# Agglomerative clustering of the divorced subset using complete linkage.
complete_linkage = AgglomerativeClustering(n_clusters=n_clusters, linkage="complete")
y_pred = complete_linkage.fit_predict(divorced)

plt.title('Complete link cluster labels')
plt.scatter(divorced[:, 0], divorced[:, 1], c=y_pred)
plt.show()

In [None]:
# Agglomerative clustering of the divorced subset using average linkage.
average_linkage = AgglomerativeClustering(n_clusters=n_clusters, linkage="average")
y_pred = average_linkage.fit_predict(divorced)

plt.title('Average link cluster labels')
plt.scatter(divorced[:, 0], divorced[:, 1], c=y_pred)
plt.show()

In [None]:
# Spectral clustering (RBF affinity, 4 clusters) of the divorced subset.
spectral = SpectralClustering(
    n_clusters=4,
    affinity='rbf',
    n_neighbors=10,
    random_state=random_state,
)
y_pred = spectral.fit_predict(divorced)

plt.title('Spectral link cluster labels')
plt.scatter(divorced[:, 0], divorced[:, 1], c=y_pred)
plt.show()

In [None]:
def silhouette(X,labels):
    """Draw a silhouette plot with one horizontal band per cluster.

    Each band holds the sorted per-sample silhouette coefficients of one
    cluster and is annotated with that cluster's label.  Bands are built from
    the label values actually present in ``labels``, so labelings that are
    not exactly 0..k-1 (for example a -1 "noise" label) are drawn correctly.

    Parameters
    ----------
    X : array-like
        Samples, passed through to ``silhouette_samples``.
    labels : array-like
        Cluster label of each sample in ``X``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    labels = np.asarray(labels)
    cluster_ids = np.unique(labels)
    n_clusters = cluster_ids.size
    sample_silhouette_values = silhouette_samples(X, labels)

    y_lower = 10
    for position, cluster_id in enumerate(cluster_ids):
        cluster_values = np.sort(sample_silhouette_values[labels == cluster_id])
        size_cluster = cluster_values.shape[0]
        y_upper = y_lower + size_cluster
        color = cm.nipy_spectral(float(position) / n_clusters)
        plt.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Label the band with its cluster label at the band's vertical middle.
        plt.text(-0.05, y_lower + 0.5 * size_cluster, str(cluster_id))
        # Start the next band above this one, leaving a 10-row gap.
        y_lower = y_upper + 10

    plt.title("Silhouette plot for the various clusters.")
    plt.xlabel("Silhouette coefficient values")
    plt.ylabel("Cluster label")
    plt.show()
    
# Silhouette plot of the divorced subset for the labels currently held in y_pred.
silhouette(X=divorced, labels=y_pred)

In [None]:
n_clusters = 4

# Complete-linkage agglomerative clustering of the "together" subset.
complete_linkage = AgglomerativeClustering(linkage="complete", n_clusters=n_clusters)
y_pred = complete_linkage.fit_predict(together)

plt.scatter(together[:, 0], together[:, 1], c=y_pred)
plt.title('Complete link cluster labels')
plt.show()

# Average-linkage agglomerative clustering of the same subset.
average_linkage = AgglomerativeClustering(linkage="average", n_clusters=n_clusters)
y_pred = average_linkage.fit_predict(together)

# The title names the linkage that produced these labels.
plt.scatter(together[:, 0], together[:, 1], c=y_pred)
plt.title('Average link cluster labels')
plt.show()

# Silhouette plot for the average-linkage labels (the last value assigned to y_pred).
silhouette(together,y_pred)

In [None]:
# Spectral clustering (RBF affinity, 4 clusters) of the "together" subset,
# followed by a scatter of the assignments and their silhouette plot.
spectral = SpectralClustering(
    n_clusters=4,
    affinity='rbf',
    n_neighbors=10,
    random_state=random_state,
)
y_pred = spectral.fit_predict(together)

plt.title('Spectral link cluster labels')
plt.scatter(together[:, 0], together[:, 1], c=y_pred)
plt.show()

silhouette(together, y_pred)

In [None]:
knn = KNeighborsClassifier(n_neighbors = 5)

# 3-fold cross-validated accuracy of k-NN on the "together" subset against the
# cluster labels in y_pred.  Printed explicitly: a bare expression in the
# middle of a cell is never displayed.
knn_scores = cross_val_score(knn,together,y_pred,cv=3,scoring='accuracy')
print(knn_scores.mean(),knn_scores.std())

# Plot styling shared by the decision-region figures.
scatter_kwargs={'s':120,'edgecolor':None,'alpha':0.7}
contourf_kwargs={'alpha':0.2}
scatter_highlight_kwargs = {'s':120,'label':'Test data','alpha':0.7}

# y_pred is paired with `together` above, so it describes different samples
# and cannot serve as the target for `divorced`.  Derive labels for `divorced`
# with the same spectral-clustering settings and keep them in their own
# variable so y_pred remains valid for `together`.
divorced_labels = SpectralClustering(n_clusters=4,affinity = 'rbf',n_neighbors = 10,
                                     random_state=random_state).fit_predict(divorced)

knn.fit(divorced,divorced_labels)

plot_decision_regions(X=divorced,y=divorced_labels,clf=knn,legend=2,
                      scatter_kwargs = scatter_kwargs,
                      contourf_kwargs = contourf_kwargs,
                     scatter_highlight_kwargs = scatter_highlight_kwargs)

plt.xlabel('Attribute 1')
plt.ylabel('Attribute 2')
plt.title('k-NN')
plt.show()

In [None]:
# Refit k-NN on the "together" subset with the labels in y_pred, then draw its
# decision regions using the shared plot styling.
knn.fit(together, y_pred)

region_style = {
    'legend': 2,
    'scatter_kwargs': scatter_kwargs,
    'contourf_kwargs': contourf_kwargs,
    'scatter_highlight_kwargs': scatter_highlight_kwargs,
}
plot_decision_regions(X=together, y=y_pred, clf=knn, **region_style)

plt.title('k-NN')
plt.xlabel('Attribute 1')
plt.ylabel('Attribute 2')
plt.show()