In [1]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_score
from sklearn.cluster import SpectralClustering

In [11]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import fowlkes_mallows_score

In [2]:
import sys
sys.path.append('../../src/GLFMpython/')
import GLFM

In [3]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [4]:
import numpy as np
import pandas as pd

In [12]:
np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target

estimators = [('k_means_iris_8', KMeans(n_clusters=8)),
              ('k_means_iris_3', KMeans(n_clusters=3)),
              ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1,
                                               init='random'))]

titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization']
for name, est in estimators:
    est.fit(X)
    labels = est.labels_
    print(name)
    print("homogeneity_score: " + str(homogeneity_score(y, labels)))
    print("adjusted_rand_score: " + str(adjusted_rand_score(y, labels)))
    print("adjusted_mutual_info_score: " + str(adjusted_mutual_info_score(y, labels)))
    print("fowlkes_mallows_score: " + str(fowlkes_mallows_score(y, labels)))

k_means_iris_8
homogeneity_score: 0.9255604738929597
adjusted_rand_score: 0.4556052836500101
adjusted_mutual_info_score: 0.49759750257088975
fowlkes_mallows_score: 0.6156750941383347
k_means_iris_3
homogeneity_score: 0.7514854021988338
adjusted_rand_score: 0.7302382722834697
adjusted_mutual_info_score: 0.7483723933229484
fowlkes_mallows_score: 0.8208080729114153
k_means_iris_bad_init
homogeneity_score: 0.7364192881252849
adjusted_rand_score: 0.7163421126838475
adjusted_mutual_info_score: 0.7331180735280008
fowlkes_mallows_score: 0.8112427991975698


In [13]:
X = iris.data
y = iris.target

data = {}
data['C'] = 'pppp'
data['X'] = X
hidden = dict()
N = len(X)
hidden['Z'] = np.random.randint(0,2,size=(N,2)).astype('float64')
params = dict()
params['alpha'] = 2   # concentration parameter for the IBP
params['Niter'] = 100 # number of algorithm iterations
params['maxK'] = 10
params['verbose'] = 0 #do not show messages
hidden = GLFM.infer(data, hidden,params)
z = hidden['Z']




estimators = [('k_means_iris_8', KMeans(n_clusters=8)),
              ('k_means_iris_3', KMeans(n_clusters=3)),
              ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1))]
titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization']
for name, est in estimators:
    est.fit(z)
    labels = est.labels_
    print(name)
    print("homogeneity_score: " + str(homogeneity_score(y, labels)))
    print("adjusted_rand_score: " + str(adjusted_rand_score(y, labels)))
    print("adjusted_mutual_info_score: " + str(adjusted_mutual_info_score(y, labels)))
    print("fowlkes_mallows_score: " + str(fowlkes_mallows_score(y, labels)))
    

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=4, Kest=8, maxR=1
k_means_iris_8
homogeneity_score: 0.6365886462832168
adjusted_rand_score: 0.44277230955889707
adjusted_mutual_info_score: 0.4660208602870075
fowlkes_mallows_score: 0.6146986215688501
k_means_iris_3
homogeneity_score: 0.4255205924478095
adjusted_rand_score: 0.36973857568823243
adjusted_mutual_info_score: 0.4183154489169639
fowlkes_mallows_score: 0.5853528746746595
k_means_iris_bad_init
homogeneity_score: 0.6297211322109244
adjusted_rand_score: 0.6081729102324068
adjusted_mutual_info_score: 0.6250770799545919
fowlkes_mallows_score: 0.7424945859967156


# Then Wine Data

In [14]:
wine = datasets.load_wine()
X = wine.data
y = wine.target


estimators = [('k_means_iris_8', KMeans(n_clusters=8)),
              ('k_means_iris_3', KMeans(n_clusters=3)),
              ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1))]

for name, est in estimators:
    est.fit(X)
    labels = est.labels_
    print(name)
    print("homogeneity_score: " + str(homogeneity_score(y, labels)))
    print("adjusted_rand_score: " + str(adjusted_rand_score(y, labels)))
    print("adjusted_mutual_info_score: " + str(adjusted_mutual_info_score(y, labels)))
    print("fowlkes_mallows_score: " + str(fowlkes_mallows_score(y, labels)))

k_means_iris_8
homogeneity_score: 0.5040755938331212
adjusted_rand_score: 0.2017221013734161
adjusted_mutual_info_score: 0.2580996575353887
fowlkes_mallows_score: 0.3943122118545804
k_means_iris_3
homogeneity_score: 0.42881231997856467
adjusted_rand_score: 0.37111371823084754
adjusted_mutual_info_score: 0.4226314226240145
fowlkes_mallows_score: 0.5835370218944976
k_means_iris_bad_init
homogeneity_score: 0.42881231997856467
adjusted_rand_score: 0.37111371823084754
adjusted_mutual_info_score: 0.4226314226240146
fowlkes_mallows_score: 0.5835370218944976


In [15]:
X = wine.data
y = wine.target

data = {}
data['C'] = 'ppppppppppppp'
data['X'] = X
hidden = dict()
N = len(X)
hidden['Z'] = np.random.randint(0,2,size=(N,2)).astype('float64')
params = dict()
params['alpha'] = 2   # concentration parameter for the IBP
params['Niter'] = 100 # number of algorithm iterations
params['maxK'] = 10
params['verbose'] = 0 #do not show messages
hidden = GLFM.infer(data, hidden,params)
z = hidden['Z']

estimators = [('iris_8', KMeans(n_clusters=8)),
              ('iris_3', KMeans(n_clusters=3)),
              ('iris_bad_init', KMeans(n_clusters=3, n_init=1))]
for name, est in estimators:
    est.fit(z)
    labels = est.labels_
    print(name)
    print("homogeneity_score: " + str(homogeneity_score(y, labels)))
    print("adjusted_rand_score: " + str(adjusted_rand_score(y, labels)))
    print("adjusted_mutual_info_score: " + str(adjusted_mutual_info_score(y, labels)))
    print("fowlkes_mallows_score: " + str(fowlkes_mallows_score(y, labels)))

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=13, Kest=8, maxR=1
iris_8
homogeneity_score: 0.7806207663551691
adjusted_rand_score: 0.6850929441052168
adjusted_mutual_info_score: 0.6414744874016439
fowlkes_mallows_score: 0.7877403056010569
iris_3
homogeneity_score: 0.7131822988727058
adjusted_rand_score: 0.6862588490855763
adjusted_mutual_info_score: 0.7101332540399043
fowlkes_mallows_score: 0.7931266087329674
iris_bad_init
homogeneity_score: 0.33331673985824306
adjusted_rand_score: 0.31676195809814844
adjusted_mutual_info_score: 0.3259731950172763
fowlkes_mallows_score: 0.6236857125796222


# covtype

In [16]:
kdd = pd.read_csv('kddcup.data_10_percent_corrected.csv', header = None, names = ['duration','protocol_type','service',
'flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised',
'root_shell','su_attempted','num_root','num_file_creations','num_shells','num_access_files','num_outbound_cmds',
'is_host_login','is_guest_login','count','srv_count','serror_rate','srv_serror_rate','rerror_rate',
'srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count',
'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','target'])

In [17]:
kdd.head().iloc[:,:15]

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted
0,0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0
1,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0
2,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0
3,0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0
4,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0


In [18]:
kdd.head().iloc[:,15:30]

Unnamed: 0,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate
0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0
1,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0
2,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0
3,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0
4,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
kdd.head().iloc[:,30:]

Unnamed: 0,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target
0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [20]:
kdd.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,...,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0
mean,47.979302,3025.61,868.5324,4.5e-05,0.006433,1.4e-05,0.034519,0.000152,0.148247,0.010212,...,232.470778,188.66567,0.75378,0.030906,0.601935,0.006684,0.176754,0.176443,0.058118,0.057412
std,707.746472,988218.1,33040.0,0.006673,0.134805,0.00551,0.782103,0.01552,0.355345,1.798326,...,64.74538,106.040437,0.410781,0.109259,0.481309,0.042133,0.380593,0.380919,0.23059,0.23014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,46.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,693375600.0,5155468.0,1.0,3.0,3.0,30.0,5.0,1.0,884.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
kdd['protocol_type'].unique()

array(['tcp', 'udp', 'icmp'], dtype=object)

In [22]:
kdd['service'].unique()

array(['http', 'smtp', 'finger', 'domain_u', 'auth', 'telnet', 'ftp',
       'eco_i', 'ntp_u', 'ecr_i', 'other', 'private', 'pop_3', 'ftp_data',
       'rje', 'time', 'mtp', 'link', 'remote_job', 'gopher', 'ssh',
       'name', 'whois', 'domain', 'login', 'imap4', 'daytime', 'ctf',
       'nntp', 'shell', 'IRC', 'nnsp', 'http_443', 'exec', 'printer',
       'efs', 'courier', 'uucp', 'klogin', 'kshell', 'echo', 'discard',
       'systat', 'supdup', 'iso_tsap', 'hostnames', 'csnet_ns', 'pop_2',
       'sunrpc', 'uucp_path', 'netbios_ns', 'netbios_ssn', 'netbios_dgm',
       'sql_net', 'vmnet', 'bgp', 'Z39_50', 'ldap', 'netstat', 'urh_i',
       'X11', 'urp_i', 'pm_dump', 'tftp_u', 'tim_i', 'red_i'],
      dtype=object)

In [23]:
kdd['flag'].unique()

array(['SF', 'S1', 'REJ', 'S2', 'S0', 'S3', 'RSTO', 'RSTR', 'RSTOS0',
       'OTH', 'SH'], dtype=object)

In [24]:
kdd['target'].unique()

array(['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.',
       'smurf.', 'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.',
       'ipsweep.', 'land.', 'ftp_write.', 'back.', 'imap.', 'satan.',
       'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
       'spy.', 'rootkit.'], dtype=object)

In [25]:
len(kdd['target'].unique())

23

In [26]:
kdd = kdd.sample(frac=1).reset_index(drop=True)

In [27]:
kdd_cut = kdd.iloc[:10000,:].copy()

In [28]:
kdd_cut['protocol_type'].replace(kdd['protocol_type'].unique(),
                                 [i for i in range(1, len(kdd_cut['protocol_type'].unique()) + 1)], inplace = True)

In [29]:
kdd_cut['service'].replace(kdd_cut['service'].unique(),
                           [i for i in range(1, len(kdd_cut['service'].unique()) + 1)], inplace = True)

In [30]:
kdd_cut['flag'].replace(kdd_cut['flag'].unique(),
                           [i for i in range(1, len(kdd_cut['flag'].unique()) + 1)], inplace = True)

In [31]:
kdd_cut['target'].replace(kdd_cut['target'].unique(),
                           [i for i in range(1, len(kdd_cut['target'].unique()) + 1)], inplace = True)

In [32]:
kdd_cut.shape

(10000, 42)

In [33]:
x = kdd_cut.iloc[:,:-1]

In [34]:
y = kdd_cut.iloc[:,-1]

In [35]:
len(kdd_cut.iloc[:,-1].unique())

14

In [36]:
X = x.values

In [37]:
y = y.values

In [38]:
estimators = [('k_means_iris_15', KMeans(n_clusters=15)),
              ('k_means_iris_20', KMeans(n_clusters=20)),
              ('k_means_iris_bad_init', KMeans(n_clusters=15, n_init=1))]

for name, est in estimators:
    est.fit(X)
    labels = est.labels_
    print(name)
    print("homogeneity_score: " + str(homogeneity_score(y, labels)))
    print("adjusted_rand_score: " + str(adjusted_rand_score(y, labels)))
    print("adjusted_mutual_info_score: " + str(adjusted_mutual_info_score(y, labels)))
    print("fowlkes_mallows_score: " + str(fowlkes_mallows_score(y, labels)))

k_means_iris_15
homogeneity_score: 0.6568901391298708
adjusted_rand_score: 0.7999842190373041
adjusted_mutual_info_score: 0.6556581727650675
fowlkes_mallows_score: 0.8893107765458023
k_means_iris_20
homogeneity_score: 0.6834635230731902
adjusted_rand_score: 0.8124942904903939
adjusted_mutual_info_score: 0.6794915229842153
fowlkes_mallows_score: 0.8944356188516799
k_means_iris_bad_init
homogeneity_score: 0.6594556760185442
adjusted_rand_score: 0.8015174856745026
adjusted_mutual_info_score: 0.6582380693566509
fowlkes_mallows_score: 0.8898776538323688


In [39]:
data = {}
data['C'] = 'ncccnnnnnnnnnnnnnnnnnnnnpppppppnnpppppppp'
data['X'] = X
hidden = dict()
N = len(X)
hidden['Z'] = np.random.randint(0,2,size=(N,2)).astype('float64')
params = dict()
params['alpha'] = 2   # concentration parameter for the IBP
params['Niter'] = 100 # number of algorithm iterations
params['maxK'] = 10
params['verbose'] = 0 #do not show messages
hidden = GLFM.infer(data, hidden,params)
z = hidden['Z']

estimators = [('iris_8', KMeans(n_clusters=15)),
              ('iris_3', KMeans(n_clusters=20)),
              ('iris_bad_init', KMeans(n_clusters=15, n_init=1))]
for name, est in estimators:
    est.fit(z)
    labels = est.labels_
    print(name)
    print("homogeneity_score: " + str(homogeneity_score(y, labels)))
    print("adjusted_rand_score: " + str(adjusted_rand_score(y, labels)))
    print("adjusted_mutual_info_score: " + str(adjusted_mutual_info_score(y, labels)))
    print("fowlkes_mallows_score: " + str(fowlkes_mallows_score(y, labels)))

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=41, Kest=8, maxR=57
iris_8
homogeneity_score: 0.8572432431816553
adjusted_rand_score: 0.8380695247476126
adjusted_mutual_info_score: 0.5464399907277137
fowlkes_mallows_score: 0.9027255694926491
iris_3
homogeneity_score: 0.8870963049481544
adjusted_rand_score: 0.8496892280541309
adjusted_mutual_info_score: 0.5359912637475869
fowlkes_mallows_score: 0.9099503673752537
iris_bad_init
homogeneity_score: 0.7999607643709444
adjusted_rand_score: 0.8325252052998763
adjusted_mutual_info_score: 0.520236126122152
fowlkes_mallows_score: 0.89933617860153
