In [48]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import graphviz
import subprocess
from sklearn import datasets, tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, make_scorer, plot_roc_curve, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from os import system
from IPython.display import Image

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the csv file from the local machine

In [49]:
# Load the Adult dataset and preview only the first rows; a bare `df.head()`
# in the middle of a cell is discarded, so it has to be the last expression.
df = pd.read_csv("../input/adult-data/adult-dataset.csv")
df.head()

# Pre-processing data
We found that many instances contain a question mark (missing data), so we decided to remove these instances from our analysis.

In [50]:
# Treat the ' ?' placeholder as missing, then keep only fully populated rows.
df = df.replace({' ?': np.nan}).dropna(how='any')
df

# Univariate analysis 

In [51]:
# Class balance of the target column.
df['class-label'].value_counts().plot.bar()


In [52]:
# Age distribution in 20 bins.
df.hist("age", bins=20)


In [53]:
# Frequency of each workclass value.
df['workclass'].value_counts().plot.bar()

In [54]:
# Frequency of each education value.
df['education'].value_counts().plot.bar()

In [55]:
# Frequency of each marital-status value.
df['marital-status'].value_counts().plot.bar()

In [56]:
# Frequency of each occupation value.
df['occupation'].value_counts().plot.bar()

In [57]:
# Frequency of each relationship value.
df['relationship'].value_counts().plot.bar()

In [58]:
# Frequency of each race value.
df['race'].value_counts().plot.bar()

In [59]:
# Frequency of each sex value.
df['sex'].value_counts().plot.bar()

Because capital-loss and capital-gain are numeric and most of their values are spread over a wide range, we decided to normalize the data.

In [60]:
# Min-max scale capital-loss into [0, 1].
df["capital-loss"] = (
    (df["capital-loss"] - df["capital-loss"].min())
    / (df["capital-loss"].max() - df["capital-loss"].min())
)

# Evenly spaced edges 0, 0.05, ..., 1 so that every bin has the same width
# (a hand-typed edge list is easy to get wrong, e.g. by omitting 0.2).
bins = np.linspace(0, 1, 21)
plt.hist(df["capital-loss"], bins=bins, edgecolor="k")
plt.xlabel('Capital_Loss')
plt.ylabel('Frequency')
plt.show()

In [61]:
# Min-max scale fnlwgt into [0, 1].
df["fnlwgt"] = (
    (df["fnlwgt"] - df["fnlwgt"].min())
    / (df["fnlwgt"].max() - df["fnlwgt"].min())
)

# Evenly spaced edges 0, 0.05, ..., 1 so that every bin has the same width
# (a hand-typed edge list is easy to get wrong, e.g. by omitting 0.2).
bins = np.linspace(0, 1, 21)
plt.hist(df["fnlwgt"], bins=bins, edgecolor="k")
plt.xticks(bins)
plt.show()

In [62]:
# Frequency of each education-num value.
df['education-num'].value_counts().plot.bar()


In [63]:
# Histogram of weekly working hours in 10-hour bins from 0 to 100.
bins = list(range(0, 101, 10))
plt.hist(df["hours-per-week"], bins=bins, edgecolor="k")
plt.xticks(bins)
plt.show()




In [64]:
# Box plot of age to expose outliers.
df.boxplot(['age'])

In [65]:
# Box plot of fnlwgt to expose outliers.
df.boxplot(['fnlwgt'])

In [66]:
# Box plot of hours-per-week to expose outliers.
df.boxplot(['hours-per-week'])

# Bivariate analysis

In [67]:

plt.figure(figsize=(12, 9))

# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas versions where DataFrame.corr() no longer
# drops string columns silently.
numeric_corr = df.select_dtypes(include="number").corr()
hm = sns.heatmap(numeric_corr, annot = True)

hm.set(title = "Correlation matrix of Adult dataset\n")

plt.show()

# 3.1 Feature Selection

In this model, we used numerical data for clustering and categorical data for evaluation.

In [68]:
# Drop every column that is not used for clustering or evaluation.
unused_columns = [
    "education", "education-num", "marital-status", "occupation",
    "relationship", "race", "sex", "workclass", "capital-loss",
    "native-country", "capital-gain", "hours-per-week",
]
df_new = df.drop(columns=unused_columns)
df_new

In [69]:
# One-hot encode the target and keep the second indicator column as the label.
df_bin = pd.get_dummies(df["class-label"])
# Cast to int explicitly: the indicator dtype differs between pandas versions
# (bool on recent releases), and the label is later used as a list index.
df_new["class-label"] = df_bin.iloc[:, 1].astype(int)
df_new

In [70]:

# First two columns are the clustering features, the third is the binary label.
x_train, y_train = df_new.iloc[:, :2], df_new.iloc[:, 2]
x_train



## 3.2.1 KMeans Algorithm

### 3.2.1.1 Parameter Tuning

Defining proper number of clusters

In [71]:
from sklearn import cluster
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer, InterclusterDistance

# Seed KMeans so the elbow curve (and the k it suggests) is reproducible
# across runs; the seed matches the one used for the fitted models below.
model = cluster.KMeans(random_state=42)
visualizer = KElbowVisualizer(model, k=(4,12))
visualizer.fit(x_train)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure


print ('The is optimal value of k:')
visualizer.elbow_value_


### 3.2.1.2 Evaluation measures

##### 3.2.1.2.1 Internal measures

Plot Silhouette score

In [72]:
# Fit a seeded 7-cluster KMeans through the silhouette visualizer and report
# the average silhouette score as the cell output.
model = cluster.KMeans(n_clusters=7, random_state=42)
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
visualizer.fit(x_train)
visualizer.show()
print ("Average silhouette score is:")
visualizer.silhouette_score_

#### 3.2.1.2.2 External measures

Purity and Entropy

In [73]:
#Entrophy and purity:
from math import log
#class1_pr = np.count_nonzero(y_train.to_numpy())/len(y_train)
#class2_pr = 1-class1_pr

# prob[cluster_no][class]
def compute_entrophy_purity(label, x_train, y_train, no_clusters, verbose=True):
    """Compute the size-weighted entropy and purity of a clustering.

    Each cluster's class distribution is estimated from the binary labels in
    ``y_train``; the per-cluster entropy (base 2) and purity (largest class
    share) are then averaged, weighted by cluster size.

    Parameters
    ----------
    label : sequence of int
        Cluster index of every sample. It is used directly as a list index,
        so a negative value such as -1 addresses slots from the end.
    x_train : object with ``to_numpy()``
        Feature table; only its number of rows is used.
    y_train : object with ``to_numpy()``
        Binary class of every sample (0/1 or bool).
    no_clusters : int
        Number of cluster slots to allocate.
    verbose : bool, default True
        When true, print the two class probabilities of every non-empty
        cluster.

    Returns
    -------
    tuple
        ``(entropy_of_clustering, purity_of_clustering)``.
    """
    n_samples = len(x_train.to_numpy())
    # Cast to int so boolean labels are valid list indices as well.
    classes = y_train.to_numpy().astype(int)

    # class_counts[cluster][class] = number of samples of that class in the cluster
    class_counts = [[0, 0] for _ in range(no_clusters)]
    for i in range(n_samples):
        class_counts[label[i]][classes[i]] += 1

    ent_of_clustering = 0
    purity_of_clustering = 0
    for counts in class_counts:
        size = counts[0] + counts[1]
        if size == 0:
            # An empty slot has no class distribution and carries zero weight.
            continue
        probs = [counts[0] / size, counts[1] / size]
        if verbose:
            print(probs[0], probs[1])
        # A zero probability contributes nothing to the entropy (0 * log 0 := 0).
        cluster_entropy = -sum(p * log(p, 2) for p in probs if p > 0)
        weight = size / n_samples
        ent_of_clustering += cluster_entropy * weight
        # Accumulate over all clusters, not just the last one visited.
        purity_of_clustering += max(probs) * weight
    return ent_of_clustering, purity_of_clustering
# Score the 7-cluster KMeans assignment against the binary class label.
eoc, poc = compute_entrophy_purity(
    label=model.labels_,
    x_train=x_train,
    y_train=y_train,
    no_clusters=7,
)
print("Purity:\n",poc)
print("Entrophy:\n",eoc)

#### 3.2.1.2.3 Discriminative behavior

In [74]:
# Frequency of the "sex" feature in each cluster.
# Work on a copy so the training features are not mutated by the plot helpers.
plot_help = x_train.iloc[:, 0:2].copy()
# Assign the labels positionally: wrapping them in a fresh DataFrame would
# align on index labels, which no longer run 0..n-1 after rows were dropped.
plot_help["cluster"] = model.labels_
plot_help["race"] = df["race"]
plot_help["sex"] = df["sex"]
sns.countplot(x = "cluster", hue="sex", data=plot_help)


In [75]:
# Frequency of the "race" feature in each cluster.
sns.countplot(data=plot_help, x="cluster", hue="race")
# Keep only the first two columns of the training features for later cells.
x_train = x_train.iloc[:, :2]

This section is time-consuming.

### Trying to remove outliers

In [76]:
# Disabled experiment: everything below is a single string literal, so none of
# it runs. The text holds a LocalOutlierFactor-based filter of x_train followed
# by a 7-cluster KMeans silhouette plot on the filtered rows.
"""import numpy as np
from sklearn.neighbors import LocalOutlierFactor
clf = LocalOutlierFactor(n_neighbors=7)
threshold_LOF = 10
clf.fit_predict(x_train)
large_values = abs(clf.negative_outlier_factor_)
print (large_values.min())
x_train_num = x_train.to_numpy()
large_values = (large_values  > threshold_LOF).reshape(len(large_values ),1)
large_values = np.hstack((large_values,large_values))
x_train_num[large_values]=-9999
x_train_num
row_mask = (x_train_num != -9999).all(axis=1)
new_x_train = x_train_num[row_mask,:]

model = cluster.KMeans(7, random_state=42)
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
visualizer.fit(new_x_train)        # Fit the data to the visualizer
visualizer.show()              # Finalize and render the figure
print ("Average silhouette score is:")
visualizer.silhouette_score_"""

### 3.2.1.3 Model interpretability/visualization

Dimension reduction

In [77]:
# Disabled cell: the code below is a string literal and is not executed.
# It would build a Manhattan-distance MDS embedding of the first 5000 rows of
# x_train. NOTE: x_train_num_L1 is assigned only inside this string, so it is
# not defined while this cell stays disabled.
"""from sklearn.manifold import MDS
from matplotlib import pyplot as plt
import sklearn.datasets as dt
import seaborn as sns         
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# Reduce the dimention

x_train_num = x_train.to_numpy()
dist = manhattan_distances(x_train_num[0:5000])
mds = MDS(dissimilarity='precomputed', random_state=0)
# Get the embeddings
x_train_num_L1 = mds.fit_transform(dist)
"""

In [78]:
# Disabled cell: the code below is a string literal and is not executed.
# It would fit a 7-cluster KMeans and scatter-plot the first 5000 labels over
# the 2-D embedding stored in x_train_num_L1.
"""kmeans = cluster.KMeans(7, random_state=42).fit(x_train)
data = np.expand_dims(kmeans.labels_[0:5000], axis=1)
data = np.append(x_train_num_L1, data, axis=1)
df_vis_dts = pd. DataFrame(data, columns=['dim_1', 'dim_2', 'cluster'])
sns.set(rc={'figure.figsize':(11.7,8.27)})

sns.scatterplot(data=df_vis_dts , x="dim_1", y="dim_2", hue="cluster", size="cluster", palette="deep", sizes=(100, 20), legend="full")"""

In [79]:
# Disabled cell: a string literal, so the print call inside it never runs.
"""print(kmeans)"""

## 3.2.2 KMedoids Algorithm

### 3.2.2.1 Parameter Tuning

In [80]:
# Import KMedoids by name instead of rebinding `cluster`: importing
# `sklearn.cluster` and `sklearn_extra.cluster` under the same name makes the
# second silently shadow the first.
from sklearn_extra.cluster import KMedoids

from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer, InterclusterDistance

# Seeded so the elbow curve (and the k it suggests) is reproducible.
model = KMedoids(random_state=42)
visualizer = KElbowVisualizer(model, k=(4,12))
visualizer.fit(x_train)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure


print ('The is optimal value of k:')
visualizer.elbow_value_

### 3.2.2.2 Evaluation measures

##### 3.2.2.2.1 Internal measures

In [None]:
from sklearn_extra import cluster

# Fit a seeded 5-cluster KMedoids through the silhouette visualizer and report
# the average silhouette score as the cell output.
model = cluster.KMedoids(n_clusters=5, random_state=42)
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
visualizer.fit(x_train)
visualizer.show()
print ("Average silhouette score is:")
visualizer.silhouette_score_

#### 3.2.2.2.2 External measures

In [None]:
# Score the 5-cluster KMedoids assignment against the binary class label.
eoc, poc = compute_entrophy_purity(
    label=model.labels_,
    x_train=x_train,
    y_train=y_train,
    no_clusters=5,
)
print("Purity:",poc)
print("Entrophy:",eoc)

#### 3.2.2.2.3 Discriminative behavior

In [None]:
# Frequency of the "sex" feature in each cluster.
# Work on a copy so the training features are not mutated by the plot helpers.
plot_help = x_train.iloc[:, 0:2].copy()
# Assign the labels positionally: wrapping them in a fresh DataFrame would
# align on index labels, which no longer run 0..n-1 after rows were dropped.
plot_help["cluster"] = model.labels_
plot_help["race"] = df["race"]
plot_help["sex"] = df["sex"]
sns.countplot(x = "cluster", hue="sex", data=plot_help)

In [None]:
# Frequency of the "race" feature in each cluster.
sns.countplot(data=plot_help, x="cluster", hue="race")
# Keep only the first two columns of the training features for later cells.
x_train = x_train.iloc[:, :2]

### 3.2.2.3 Model interpretability/visualization

In [None]:
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import manhattan_distances
from sklearn_extra import cluster

from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer, InterclusterDistance


kmedoids = cluster.KMedoids(5, random_state=42).fit(x_train)

# Build the 2-D embedding here so the cell does not depend on a variable that
# is only assigned inside a disabled cell. Only the first 5000 rows are
# embedded because MDS works on a full pairwise distance matrix.
dist = manhattan_distances(x_train.to_numpy()[0:5000])
x_train_num_L1 = MDS(dissimilarity='precomputed', random_state=0).fit_transform(dist)

# Attach the cluster label of the same 5000 rows as a third column.
data_medo = np.expand_dims(kmedoids.labels_[0:5000], axis=1)
data_medo = np.append(x_train_num_L1, data_medo, axis=1)
df_vis_dts = pd. DataFrame(data_medo, columns=['dim_1', 'dim_2', 'cluster'])
sns.set(rc={'figure.figsize':(11.7,8.27)})

sns.scatterplot(data=df_vis_dts , x="dim_1", y="dim_2", hue="cluster", size="cluster", palette="deep", sizes=(100, 20), legend="full")

## 3.2.3 DBSCAN Algorithm

## 3.2.3.1 Parameter Tuning

In [81]:
from sklearn import cluster
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import graphviz
import subprocess
from sklearn import datasets, tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, make_scorer, plot_roc_curve, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from os import system
from IPython.display import Image
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import graphviz
import subprocess
from sklearn import datasets, tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, make_scorer, plot_roc_curve, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from os import system
from IPython.display import Image
from sklearn.neighbors import NearestNeighbors
%matplotlib widget
%matplotlib inline
%matplotlib inline
import mpld3
mpld3.enable_notebook()
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import sklearn.metrics as metrics
from sklearn import cluster 
from yellowbrick.cluster import SilhouetteVisualizer
from pylab import rcParams
from plotnine import *
from sklearn.mixture import GaussianMixture
import pandas as pd

In [82]:
# Rebuild the feature table for DBSCAN, this time without the class label.
non_feature_columns = [
    "education", "class-label", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "workclass", "capital-loss",
    "native-country", "capital-gain", "hours-per-week",
]
df_new = df.drop(columns=non_feature_columns)
df_new

### 3.2.3.1.1 Determining the epsilon value 

In [83]:
from sklearn.neighbors import NearestNeighbors

# Query the 100 nearest neighbours of every training point.
neighbors = NearestNeighbors(n_neighbors=100)
neighbors_fit = neighbors.fit(x_train)
distances, indices = neighbors_fit.kneighbors(x_train)

# Sort each distance column in ascending order and keep column 1 for the plot.
# NOTE(review): only column 1 is plotted although 100 neighbours are queried;
# confirm this is the intended k for the epsilon estimate.
distances = np.sort(distances, axis=0)[:, 1]
plt.plot(distances, color='r', linestyle='dashed', linewidth=1)

# Zoom into the tail of the sorted curve and draw it right-to-left.
plt.xlim([30080, 30170])
plt.gca().invert_xaxis()
plt.xlabel("Data-set")
plt.ylabel("epsilon value")

### 3.2.3.1.2 Determining the optimal minimum sample

In [None]:
# Sweep min_samples for DBSCAN at eps = 0.5 and record the silhouette score
# of every run.
silhouette = []
itr = []
for min_samples in range(40, 800, 20):
    db = cluster.DBSCAN(eps=0.5, min_samples=min_samples).fit(x_train)
    labels = db.labels_
    # silhouette_score needs at least two distinct labels; when a setting
    # collapses everything into one label, record NaN instead of aborting.
    if len(set(labels)) < 2:
        score = np.nan
    else:
        score = metrics.silhouette_score(x_train, labels)
    itr.append(min_samples)
    silhouette.append(score)

In [None]:
# Silhouette score as a function of min_samples, also written to disk.
fig, ax = plt.subplots()
ax.plot(itr, silhouette)
ax.set_xlabel("min sample")
ax.set_ylabel("silhouette score")
fig.savefig("Silhouette Score vs Min Sample")

## 3.2.3.2 DBSCAN implementation

### 3.2.3.2.1.1 DBSCAN modelling with min_sample = 50

In [84]:
# Cluster on the two feature columns only: df_new also receives the
# "assignments" column below, which must not be fed back in as a feature when
# this cell (or a later fit) runs again.
db = DBSCAN(eps=0.5, min_samples=50).fit(df_new[["age", "fnlwgt"]])

labsList = ["Noise"]
labsList = labsList + ["cluster" + str(i) for i in range (1,len(set(db.labels_)))]

df_new["assignments"] = db.labels_

DBSCAN external measures

In [85]:
# Print one row per DBSCAN label: [label, number of samples].
unique, counts = np.unique(db.labels_, return_counts=True)
print(np.column_stack((unique, counts)))

In [86]:
# Allocate one slot per distinct DBSCAN label instead of a hard-coded count.
# compute_entrophy_purity indexes its per-cluster lists with the raw label, so
# the noise label -1 (when present) lands in the last slot.
n_labels = len(set(db.labels_))
eoc, poc = compute_entrophy_purity(db.labels_, x_train, y_train, n_labels)
print("Purity:\n",poc)
print("Entrophy:\n",eoc)

Discriminative Behaviour


In [87]:
# Frequency of the "sex" feature in each DBSCAN cluster.
# Work on a copy so the training features are not mutated by the plot helpers.
plot_help = x_train.iloc[:, 0:2].copy()
# Assign the labels positionally: wrapping them in a fresh DataFrame would
# align on index labels, which no longer run 0..n-1 after rows were dropped.
plot_help["cluster"] = db.labels_
plot_help["race"] = df["race"]
plot_help["sex"] = df["sex"]
sns.countplot(x = "cluster", hue="sex", data=plot_help)

In [88]:
# Frequency of the "race" feature in each DBSCAN cluster.
sns.countplot(data=plot_help, x="cluster", hue="race")

### 3.2.3.2.1.2 DBSCAN visualization for min_sample =50

In [89]:
# Scatter of age vs fnlwgt coloured by the DBSCAN assignment.
dbscan_plot_50 = (
    ggplot(df_new, aes(x="age", y="fnlwgt", color="factor(assignments)"))
    + geom_point()
    + theme_minimal()
    + theme(panel_grid_major=element_blank())
    + labs(title="DBSCAN with eps 0.5 , min_samples = 50")
)
dbscan_plot_50


## 3.2.3.2.1.3 Results for min_sample = 50

In [90]:
# Rows that DBSCAN assigned to a cluster (label >= 0, i.e. not noise).
d1_clustered = df_new.loc[(df_new.assignments >= 0)]
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Score on the feature columns only; df_new also holds the "assignments"
# column, and scoring the labels against themselves would distort the result.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(df_new[["age", "fnlwgt"]], labels))

### 3.2.3.2.2.1 DBSCAN modelling with min_sample = 100

In [91]:
# Cluster on the two feature columns only: by this point df_new may already
# carry an "assignments" column from an earlier fit, and it must not be used
# as a feature.
db = DBSCAN(eps=0.5, min_samples=100).fit(df_new[["age", "fnlwgt"]])

labsList = ["Noise"]
labsList = labsList + ["cluster" + str(i) for i in range (1,len(set(db.labels_)))]

df_new["assignments"] = db.labels_

### 3.2.3.2.2.2 DBSCAN visualization for min_sample =100

In [92]:
# Scatter of age vs fnlwgt coloured by the DBSCAN assignment.
dbscan_plot_100 = (
    ggplot(df_new, aes(x="age", y="fnlwgt", color="factor(assignments)"))
    + geom_point()
    + theme_minimal()
    + theme(panel_grid_major=element_blank())
    + labs(title="DBSCAN with eps 0.5 , min_samples = 100")
)
dbscan_plot_100

## 3.2.3.2.2.3 Results for min_sample = 100

In [None]:
# Rows that DBSCAN assigned to a cluster (label >= 0, i.e. not noise).
d1_clustered = df_new.loc[(df_new.assignments >= 0)]
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Score on the feature columns only; df_new also holds the "assignments"
# column, and scoring the labels against themselves would distort the result.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(df_new[["age", "fnlwgt"]], labels))

### 3.2.3.2.3.1 DBSCAN modelling with min_sample = 300

In [None]:
# Cluster on the two feature columns only: by this point df_new may already
# carry an "assignments" column from an earlier fit, and it must not be used
# as a feature.
db = DBSCAN(eps=0.5, min_samples=300).fit(df_new[["age", "fnlwgt"]])

labsList = ["Noise"]
labsList = labsList + ["cluster" + str(i) for i in range (1,len(set(db.labels_)))]

df_new["assignments"] = db.labels_

### 3.2.3.2.3.2 DBSCAN visualization for min_sample =300

In [None]:
# Scatter of age vs fnlwgt coloured by the DBSCAN assignment.
dbscan_plot_300 = (
    ggplot(df_new, aes(x="age", y="fnlwgt", color="factor(assignments)"))
    + geom_point()
    + theme_minimal()
    + theme(panel_grid_major=element_blank())
    + labs(title="DBSCAN with eps 0.5 , min_samples = 300")
)
dbscan_plot_300

## 3.2.3.2.3.3 Results for min_sample = 300

In [None]:
# Rows that DBSCAN assigned to a cluster (label >= 0, i.e. not noise).
d1_clustered = df_new.loc[(df_new.assignments >= 0)]
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Score on the feature columns only; df_new also holds the "assignments"
# column, and scoring the labels against themselves would distort the result.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(df_new[["age", "fnlwgt"]], labels))