# Project

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import csv
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import normalized_mutual_info_score
import math
%matplotlib inline

In this notebook two different data tables are clustered with different clustering methods and clusters are the evaulated with the Normalized Mutual Information Score. The clusterings are mostly done with the help of the python library sklearn which has been used alot in the course. The order of this document will be:

Data in 'msdata.csv' is clustered with:

    1. K-means clustering
    2. Spectral clustering
    
Then the data in 'genedata.csv' is clustered with:

    1. K-means clustering

In [6]:
msdf = pd.read_csv('msdata.csv')
msdf

Unnamed: 0,id,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f4991,f4992,f4993,f4994,f4995,f4996,f4997,f4998,f4999,f5000
0,1,1,15.5033,11.8980,9.4663,11.9219,14.3933,12.6991,13.7691,15.1736,...,14.5236,17.3897,16.1978,17.4665,11.7005,12.8821,14.7766,15.8183,13.2718,14.8386
1,2,1,15.5907,14.5124,15.1989,8.7911,14.6605,13.6276,12.7524,15.0325,...,14.5052,17.3300,16.0187,17.6672,11.2964,15.3765,14.5783,16.3226,16.2743,15.2883
2,3,1,15.4491,14.2083,9.6666,12.5831,13.9520,13.3233,14.2603,15.2630,...,14.1587,17.2666,15.9008,18.0802,15.1937,16.2641,14.3947,15.8199,15.5093,14.6663
3,4,1,15.4677,10.6231,11.5586,11.6899,7.4421,12.8914,15.0213,14.8480,...,14.1745,17.2752,15.8610,16.8160,9.0467,14.6076,13.8033,15.9128,13.8341,15.7347
4,5,1,15.2949,14.6039,14.0875,10.4170,10.8258,13.6768,13.6044,14.7807,...,13.9510,17.3293,15.8083,17.6435,10.0007,16.7461,14.4605,15.6075,15.7684,14.7838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,690,3,9.1589,14.0772,13.6372,13.1490,14.0318,13.6903,13.9127,7.9003,...,8.4532,17.3978,10.7727,10.7645,13.8810,14.6866,14.7414,7.7132,15.5579,15.3616
690,691,3,15.1379,14.1653,8.8926,13.2418,15.0677,9.4982,14.3200,8.0398,...,14.4231,10.2822,15.7388,17.0101,13.7257,14.0526,15.1503,15.6570,15.4638,7.5428
691,692,3,15.3464,12.9593,8.1637,5.7031,14.7252,13.2267,7.5769,15.2880,...,14.1774,16.7365,15.8708,17.0907,15.0305,14.9608,14.2541,15.8987,15.2844,10.2789
692,693,3,7.8809,13.3970,12.9687,7.0843,14.8701,13.2468,14.4842,14.7805,...,14.2379,17.0042,15.6800,12.6676,14.0994,14.6514,14.7748,15.8078,7.0376,16.4152


1. K-means clustering

In [4]:
for i in range(1,10):
    kmeans = KMeans(n_clusters=i).fit(msdf)
    label = kmeans.labels_
    kmeanNorm = normalized_mutual_info_score(msdf['class'], label)
    print("KMeans with {} clusters Normalized Mutual Information Score: {}".format(i, kmeanNorm))


msdata
KMeans with 1 clusters Normalized Mutual Information Score: 1.0710532741620512e-16
KMeans with 2 clusters Normalized Mutual Information Score: 0.6305496975170901
KMeans with 3 clusters Normalized Mutual Information Score: 0.5575335174479019
KMeans with 4 clusters Normalized Mutual Information Score: 0.6307148475085809
KMeans with 5 clusters Normalized Mutual Information Score: 0.749100407201445
KMeans with 6 clusters Normalized Mutual Information Score: 0.7011317589025333
KMeans with 7 clusters Normalized Mutual Information Score: 0.6730561949577666
KMeans with 8 clusters Normalized Mutual Information Score: 0.5871206406984811
KMeans with 9 clusters Normalized Mutual Information Score: 0.6300858836197982


2. Spectral clustering

In [8]:
for i in range(1,10):
    clustering = SpectralClustering(n_clusters=i).fit(msdf)
    label = clustering.labels_
    specNorm = normalized_mutual_info_score(msdf['class'], label)
    print("Spectral Clustering with {} clusters Normalized Mutual Information Score: {}".format(i, specNorm))



Spectral Clustering with 1 clusters Normalized Mutual Information Score: 1.0710532741620512e-16
Spectral Clustering with 2 clusters Normalized Mutual Information Score: 0.0021203589729356103




Spectral Clustering with 3 clusters Normalized Mutual Information Score: 0.0010626176004380234
Spectral Clustering with 4 clusters Normalized Mutual Information Score: 0.006624651205189287




Spectral Clustering with 5 clusters Normalized Mutual Information Score: 0.007905271981584984
Spectral Clustering with 6 clusters Normalized Mutual Information Score: 0.01001046356768751




Spectral Clustering with 7 clusters Normalized Mutual Information Score: 0.012421259939541894
Spectral Clustering with 8 clusters Normalized Mutual Information Score: 0.004220733168514145
Spectral Clustering with 9 clusters Normalized Mutual Information Score: 0.01241841184823881




Starting to cluster the other data file

In [9]:
gdf = pd.read_csv('genedata.csv')
gdf

Unnamed: 0,id,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f6991,f6992,f6993,f6994,f6995,f6996,f6997,f6998,f6999,f7000
0,1,5,6.9339,9.6526,11.1247,4.5114,7.3911,9.9115,7.3242,7.0316,...,8.5022,9.5293,12.8665,8.1185,6.3778,8.9000,10.3001,7.5826,7.3468,7.9920
1,2,4,9.0120,10.0063,9.8172,6.9005,6.8289,9.6244,7.3552,4.6378,...,10.4605,10.3219,10.9588,10.5984,6.5823,7.9658,10.1417,6.4959,6.1276,6.0310
2,3,5,7.7850,9.1760,9.9648,3.8345,8.2786,11.0152,7.4537,0.7966,...,4.9642,10.3124,11.2672,9.3884,2.3373,6.8622,9.7874,5.8419,4.9812,7.4597
3,4,5,7.2457,9.2069,10.4146,3.4857,7.9823,10.4864,8.9805,4.0114,...,8.5397,9.7009,11.6935,9.0250,4.4665,7.7356,9.5898,6.0307,4.2164,7.2438
4,5,1,7.6318,9.1694,10.1147,5.0248,7.9659,8.4597,6.1713,4.8222,...,9.8154,9.5333,11.4868,9.3767,8.1257,8.4863,8.9712,7.1976,4.9203,6.1073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,791,1,7.6485,9.7275,9.8213,3.4958,8.5245,8.2773,5.6539,3.7812,...,7.2857,10.4410,11.2979,9.2816,9.4766,8.2306,9.3093,5.8308,6.3864,6.3137
791,792,4,6.7876,8.4118,10.7032,6.6506,8.2790,8.5663,6.8840,8.1681,...,6.3245,9.0097,12.0085,9.1466,5.8634,8.4454,10.4765,8.5514,6.2146,5.8634
792,793,3,6.6792,9.7643,10.8172,8.2813,7.5902,9.1889,8.1293,8.5268,...,8.7568,9.5140,12.0448,9.3384,9.9151,7.3802,9.7442,11.1931,7.4769,5.7194
793,794,5,7.1960,10.0220,10.7668,5.5300,8.2050,11.0196,7.0622,6.9565,...,9.8346,9.7143,12.4598,9.1618,5.5908,8.0241,9.6336,7.3635,6.4746,7.0729


In [5]:
print("genedata")
for i in range(40,50):
    kmeans = KMeans(n_clusters=i).fit(gdf)
    label = kmeans.labels_
    kmeanNorm = normalized_mutual_info_score(gdf['class'], label)
    print("KMeans with {} clusters Normalized Mutual Information Score: {}".format(i, kmeanNorm))


genedata
KMeans with 40 clusters Normalized Mutual Information Score: 0.4755223895101539
KMeans with 41 clusters Normalized Mutual Information Score: 0.47275288260726517
KMeans with 42 clusters Normalized Mutual Information Score: 0.47395754790930067
KMeans with 43 clusters Normalized Mutual Information Score: 0.48519247948857885
KMeans with 44 clusters Normalized Mutual Information Score: 0.48408446897076124
KMeans with 45 clusters Normalized Mutual Information Score: 0.4720705480380191
KMeans with 46 clusters Normalized Mutual Information Score: 0.4981494161577209
KMeans with 47 clusters Normalized Mutual Information Score: 0.4799453731658008
KMeans with 48 clusters Normalized Mutual Information Score: 0.495336049322506
KMeans with 49 clusters Normalized Mutual Information Score: 0.4684189514074314


Spectral clustering

In [10]:
for i in range(1,10):
    clustering = SpectralClustering(n_clusters=i).fit(gdf)
    label = clustering.labels_
    specNorm = normalized_mutual_info_score(gdf['class'], label)
    print("Spectral Clustering with {} clusters Normalized Mutual Information Score: {}".format(i, specNorm))



Spectral Clustering with 1 clusters Normalized Mutual Information Score: 7.708720110735765e-16




Spectral Clustering with 2 clusters Normalized Mutual Information Score: 0.006655415898158779




Spectral Clustering with 3 clusters Normalized Mutual Information Score: 0.009400233485121142




Spectral Clustering with 4 clusters Normalized Mutual Information Score: 0.008013921806764855




Spectral Clustering with 5 clusters Normalized Mutual Information Score: 0.007264472596428789




Spectral Clustering with 6 clusters Normalized Mutual Information Score: 0.020248095617786843




Spectral Clustering with 7 clusters Normalized Mutual Information Score: 0.012554102285501635




Spectral Clustering with 8 clusters Normalized Mutual Information Score: 0.013202676024435782
Spectral Clustering with 9 clusters Normalized Mutual Information Score: 0.009827632165922721


