# Project

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import csv
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
import math
%matplotlib inline

In this notebook two different data tables are clustered with different clustering methods and clusters are the evaulated with the Normalized Mutual Information Score. The clusterings are mostly done with the help of the python library sklearn which has been used alot in the course. The order of this document will be:

Data in 'msdata.csv' is clustered with:

    1. K-means clustering
    2. Spectral clustering
    
Then the data in 'genedata.csv' is clustered with:

    1. K-means clustering

In [4]:
msdf = pd.read_csv('msdata.csv')
msdfdata = msdf[msdf.columns[2:]]

1. K-means clustering

In [10]:
mixture = GaussianMixture(n_components=3).fit(msdfdata)
label = mixture.fit_predict(X=msdfdata)
mixtureNorm = normalized_mutual_info_score(msdf['class'], label)
print("Gaussian Mixture with {} clusters Normalized Mutual Information Score: {}".format(5, mixtureNorm))

Gaussian Mixture with 5 clusters Normalized Mutual Information Score: 0.19647120081991729


In [141]:
for i in range(10):
    kmeans = KMeans(n_clusters=3, algorithm="elkan").fit(msdfdata)
    label = kmeans.labels_
    kmeanNorm = normalized_mutual_info_score(msdf['class'], label)
    print("KMeans with {} clusters Normalized Mutual Information Score: {}".format(3, kmeanNorm))


KMeans with 3 clusters Normalized Mutual Information Score: 0.0021235077943874726
KMeans with 3 clusters Normalized Mutual Information Score: 0.2881858402978518
KMeans with 3 clusters Normalized Mutual Information Score: 0.3438910697221467
KMeans with 3 clusters Normalized Mutual Information Score: 0.0021235077943874726
KMeans with 3 clusters Normalized Mutual Information Score: 0.2605956585788048
KMeans with 3 clusters Normalized Mutual Information Score: 0.0685672903239692
KMeans with 3 clusters Normalized Mutual Information Score: 0.0025629996526432964
KMeans with 3 clusters Normalized Mutual Information Score: 0.12225507895618175
KMeans with 3 clusters Normalized Mutual Information Score: 0.07573738871720363
KMeans with 3 clusters Normalized Mutual Information Score: 0.06901569612116151


2. Spectral clustering

In [53]:
clustering = SpectralClustering(n_clusters=3).fit(msdfdata)
label = clustering.labels_
specNorm = normalized_mutual_info_score(msdf['class'], label)
print("Spectral Clustering with {} clusters Normalized Mutual Information Score: {}".format(3, specNorm))



Spectral Clustering with 3 clusters Normalized Mutual Information Score: 0.00198937067956181


In [49]:
brch = Birch(n_clusters=3).fit(msdfdata)
label = brch.labels_
brchNorm = normalized_mutual_info_score(msdf['class'], label)
print("Birch with {} clusters Normalized Mutual Information Score: {}".format(3, brchNorm))

Birch with 3 clusters Normalized Mutual Information Score: 0.27482636965510837


In [156]:
agglo = AgglomerativeClustering(n_clusters=3, linkage="ward").fit(msdfdata)
label = agglo.labels_
aggloNorm = normalized_mutual_info_score(msdf['class'], label)
print("Agglomerative with {} clusters Normalized Mutual Information Score: {}".format(3, aggloNorm))

Agglomerative with 3 clusters Normalized Mutual Information Score: 0.27482636965510837


In [96]:
agglo = AgglomerativeClustering(n_clusters=3, linkage="complete").fit(msdfdata)
label = agglo.labels_
aggloNorm = normalized_mutual_info_score(msdf['class'], label)
print("Agglomerative with {} clusters Normalized Mutual Information Score: {}".format(3, aggloNorm))

Agglomerative with 3 clusters Normalized Mutual Information Score: 0.24888999110524618


In [95]:
agglo = AgglomerativeClustering(n_clusters=3, linkage="average").fit(msdfdata)
label = agglo.labels_
aggloNorm = normalized_mutual_info_score(msdf['class'], label)
print("Agglomerative with {} clusters Normalized Mutual Information Score: {}".format(3, aggloNorm))

Agglomerative with 3 clusters Normalized Mutual Information Score: 0.005381200906424743


In [94]:
agglo = AgglomerativeClustering(n_clusters=3, linkage="single").fit(msdfdata)
label = agglo.labels_
aggloNorm = normalized_mutual_info_score(msdf['class'], label)
print("Agglomerative with {} clusters Normalized Mutual Information Score: {}".format(3, aggloNorm))

Agglomerative with 3 clusters Normalized Mutual Information Score: 0.005381200906424743


Starting to cluster the other data file

In [2]:
gdf = pd.read_csv('genedata.csv')
gdfdata = gdf[gdf.columns[2:]]

In [165]:
for i in range(10):
    kmeans = KMeans(n_clusters=5, algorithm="full").fit(gdfdata)
    label = kmeans.labels_
    kmeanNorm = normalized_mutual_info_score(gdf['class'], label)
    print("KMeans with {} clusters Normalized Mutual Information Score: {}".format(6, kmeanNorm))


KMeans with 6 clusters Normalized Mutual Information Score: 0.8585223963240158
KMeans with 6 clusters Normalized Mutual Information Score: 0.8585223963240157
KMeans with 6 clusters Normalized Mutual Information Score: 0.8562552442666413
KMeans with 6 clusters Normalized Mutual Information Score: 0.8585223963240158
KMeans with 6 clusters Normalized Mutual Information Score: 0.8573817801714915
KMeans with 6 clusters Normalized Mutual Information Score: 0.8562552442666413
KMeans with 6 clusters Normalized Mutual Information Score: 0.8562552442666411
KMeans with 6 clusters Normalized Mutual Information Score: 0.8562552442666413
KMeans with 6 clusters Normalized Mutual Information Score: 0.8562552442666411
KMeans with 6 clusters Normalized Mutual Information Score: 0.8585223963240158


Spectral clustering

In [162]:
clustering = SpectralClustering(n_clusters=5).fit(gdfdata)
label = clustering.labels_
specNorm = normalized_mutual_info_score(gdf['class'], label)
print("Spectral Clustering with {} clusters Normalized Mutual Information Score: {}".format(6, specNorm))



Spectral Clustering with 6 clusters Normalized Mutual Information Score: 0.017081943187279937


In [161]:
brch = Birch(n_clusters=5).fit(gdfdata)
label = brch.labels_
brchNorm = normalized_mutual_info_score(gdf['class'], label)
print("Birch with {} clusters Normalized Mutual Information Score: {}".format(6, brchNorm))

Birch with 6 clusters Normalized Mutual Information Score: 0.8837237641659977


In [160]:
agglo = AgglomerativeClustering(n_clusters=5, linkage="ward").fit(gdfdata)
label = agglo.labels_
aggloNorm = normalized_mutual_info_score(gdf['class'], label)
print("Agglomerative with {} clusters Normalized Mutual Information Score: {}".format(6, aggloNorm))

Agglomerative with 6 clusters Normalized Mutual Information Score: 0.8837237641659977


In [159]:
agglo = AgglomerativeClustering(n_clusters=5, linkage="complete").fit(gdfdata)
label = agglo.labels_
aggloNorm = normalized_mutual_info_score(gdf['class'], label)
print("Agglomerative with {} clusters Normalized Mutual Information Score: {}".format(6, aggloNorm))

Agglomerative with 6 clusters Normalized Mutual Information Score: 0.4273656083972991


In [158]:
agglo = AgglomerativeClustering(n_clusters=5, linkage="average").fit(gdfdata)
label = agglo.labels_
aggloNorm = normalized_mutual_info_score(gdf['class'], label)
print("Agglomerative with {} clusters Normalized Mutual Information Score: {}".format(6, aggloNorm))

Agglomerative with 6 clusters Normalized Mutual Information Score: 0.01318020230023837


In [157]:
agglo = AgglomerativeClustering(n_clusters=5, linkage="single").fit(gdfdata)
label = agglo.labels_
aggloNorm = normalized_mutual_info_score(gdf['class'], label)
print("Agglomerative with {} clusters Normalized Mutual Information Score: {}".format(6, aggloNorm))

Agglomerative with 6 clusters Normalized Mutual Information Score: 0.008870614329632032


In [None]:
mixture = GaussianMixture(n_components=5).fit(gdfdata)
label = mixture.fit_predict(X=gdfdata)
mixtureNorm = normalized_mutual_info_score(gdf['class'], label)
print("Gaussian Mixture with {} clusters Normalized Mutual Information Score: {}".format(5, mixtureNorm))