In [1]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_score
from sklearn.cluster import SpectralClustering

In [2]:
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import fowlkes_mallows_score

In [3]:
import sys
sys.path.append('../../src/GLFMpython/')
import GLFM

In [4]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [5]:
import numpy as np
import pandas as pd

In [6]:
import time
import datetime
import math
import csv
import random
import string

import train # import art_train
import test #import art_test

In [7]:
from kmodes.kmodes import KModes

In [8]:
def clustering_score(X, y, n, C):
    #X is the data, y is target, n is the number of classes, C is the types of X features.
    np.random.seed(5)


    k_means = KMeans(n_clusters=n)
    k_means.fit(X)
    labels = k_means.labels_
    
    k_means_score = [homogeneity_score(y, labels),adjusted_rand_score(y, labels),
                         adjusted_mutual_info_score(y, labels),fowlkes_mallows_score(y, labels)]
    data = {}
    data['C'] = C
    data['X'] = X
    hidden = dict()
    N = len(X)
    hidden['Z'] = np.random.randint(0,2,size=(N,2)).astype('float64')
    params = dict()
    params['alpha'] = 2   # concentration parameter for the IBP
    params['Niter'] = 100 # number of algorithm iterations
    params['maxK'] = 10
    params['verbose'] = 0 #do not show messages
    hidden = GLFM.infer(data, hidden,params)
    z = hidden['Z']


    k_modes = KModes(n_clusters=n, init='Huang', n_init=5, verbose=1)

    k_modes.fit(z)
    labels = k_modes.labels_

    k_modes_score = [homogeneity_score(y, labels),adjusted_rand_score(y, labels),
                                 adjusted_mutual_info_score(y, labels),fowlkes_mallows_score(y, labels)]


    r = 0.9
    Tmatrix = train.art_train(z,rho=r) #,beta=0.000001,alpha=1.0,nep=1)
    T = test.art_test(z,Tmatrix,rho=r) #,beta=0.000001,alpha=1.0,nep=1)
    labels = T['Template']

    Art_score = [homogeneity_score(y, labels),adjusted_rand_score(y, labels),
                                 adjusted_mutual_info_score(y, labels),fowlkes_mallows_score(y, labels)]

    table = pd.DataFrame(data={'k_means':k_means_score,'k_modes + GLFM' : k_modes_score,
                                'Art + GLFM':Art_score}, columns = ['k_means','k_modes + GLFM','Art + GLFM'],
                          index = ['homogeneity','adjusted_rand','adjusted_mutual_info','fowlkes_mallows'])
    return table



In [9]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [10]:
table_iris = clustering_score(X,y,3,'pppp')

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=4, Kest=8, maxR=1
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 101.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 93.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 42.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 17, cost: 73.0
Run 4, iteration: 2/100, moves: 0, cost: 73.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 42.0
Best run was number 3


In [11]:
table_iris

Unnamed: 0,k_means,k_modes + GLFM,Art + GLFM
homogeneity,0.751485,0.438703,0.689271
adjusted_rand,0.730238,0.396603,0.457327
adjusted_mutual_info,0.748372,0.431673,0.455505
fowlkes_mallows,0.820808,0.597999,0.61712


# Then Wine Data

In [12]:
wine = datasets.load_wine()
X = wine.data
y = wine.target

In [13]:
table_wine = clustering_score(X,y,3,'ppppppppppppp')

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=13, Kest=8, maxR=1
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 1, cost: 75.0
Run 1, iteration: 2/100, moves: 0, cost: 75.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 1, cost: 75.0
Run 2, iteration: 2/100, moves: 0, cost: 75.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 48, cost: 22.0
Run 3, iteration: 2/100, moves: 3, cost: 22.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 17.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 50, cost: 22.0
Run 5, iteration: 2/100, moves: 1, cost: 22.0
Best run was number 4


In [14]:
table_wine

Unnamed: 0,k_means,k_modes + GLFM,Art + GLFM
homogeneity,0.428812,0.432348,0.485538
adjusted_rand,0.371114,0.395561,0.379785
adjusted_mutual_info,0.422631,0.426009,0.450858
fowlkes_mallows,0.583537,0.661489,0.637345


# kdd

In [15]:
kdd = pd.read_csv('kddcup.data_10_percent_corrected.csv', header = None, names = ['duration','protocol_type','service',
'flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised',
'root_shell','su_attempted','num_root','num_file_creations','num_shells','num_access_files','num_outbound_cmds',
'is_host_login','is_guest_login','count','srv_count','serror_rate','srv_serror_rate','rerror_rate',
'srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count',
'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',
'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','target'])

In [16]:
kdd = kdd.sample(frac=1).reset_index(drop=True)

In [17]:
kdd_cut = kdd.iloc[:10000,:].copy()

In [18]:
kdd_cut['protocol_type'].replace(kdd['protocol_type'].unique(),
                                 [i for i in range(1, len(kdd_cut['protocol_type'].unique()) + 1)], inplace = True)

In [19]:
kdd_cut['service'].replace(kdd_cut['service'].unique(),
                           [i for i in range(1, len(kdd_cut['service'].unique()) + 1)], inplace = True)

In [20]:
kdd_cut['flag'].replace(kdd_cut['flag'].unique(),
                           [i for i in range(1, len(kdd_cut['flag'].unique()) + 1)], inplace = True)

In [21]:
kdd_cut['target'].replace(kdd_cut['target'].unique(),
                           [i for i in range(1, len(kdd_cut['target'].unique()) + 1)], inplace = True)

In [22]:
X = kdd_cut.iloc[:,:-1]

In [23]:
y = kdd_cut.iloc[:,-1]

In [24]:
X = X.values

In [25]:
y = y.values

In [26]:
table_kdd = clustering_score(X,y,14,'ncccnnnnnnnnnnnnnnnnnnnnpppppppnnpppppppp')

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=41, Kest=8, maxR=52
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 269, cost: 506.0
Run 1, iteration: 2/100, moves: 1, cost: 506.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 677.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 245, cost: 1446.0
Run 3, iteration: 2/100, moves: 0, cost: 1446.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 3, cost: 974.0
Run 4, iteration: 2/100, moves: 0, cost: 974.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 1207.0
Best run was number 1


In [27]:
table_kdd

Unnamed: 0,k_means,k_modes + GLFM,Art + GLFM
homogeneity,0.645238,0.656114,0.721916
adjusted_rand,0.792445,0.369759,0.38011
adjusted_mutual_info,0.644008,0.384299,0.392812
fowlkes_mallows,0.885765,0.57583,0.583114


In [28]:
covtype = pd.read_csv('covtype.data.csv',header = None)

In [29]:
covtype.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [30]:
covtype = covtype.sample(frac=1).reset_index(drop=True)

In [31]:
covtype_cut = covtype.iloc[:10000,:].copy()

In [32]:
covtype_cut.iloc[:,-1].unique()

array([2, 1, 3, 6, 7, 5, 4])

In [33]:
covtype_cut.iloc[545,:]

0     3237
1       36
2       11
3       30
4        1
5     2068
6      219
7      215
8      130
9      201
10       0
11       0
12       1
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
30       0
31       0
32       0
33       0
34       0
35       0
36       0
37       0
38       0
39       0
40       0
41       0
42       0
43       0
44       0
45       0
46       0
47       0
48       1
49       0
50       0
51       0
52       0
53       0
54       7
Name: 545, dtype: int64

In [34]:
X = covtype_cut.iloc[:,:-1].values.astype('float64')

In [35]:
y = covtype_cut.iloc[:,-1].values

In [38]:
C = []
for i  in range(0,6):
    C.append('p')
C.append('nnnp')

for i in range(10,54):
    C.append('o')

In [39]:
''.join(C)

'ppppppnnnpoooooooooooooooooooooooooooooooooooooooooooo'

In [40]:
C = ''.join(C)

In [41]:
C

'ppppppnnnpoooooooooooooooooooooooooooooooooooooooooooo'

In [42]:
table_covtype = clustering_score(X,y,7,C)

In C++: transforming input data...
done


Entering C++: Running Inference Routine...


Back to Python: OK

B_out[D,Kest,maxR] where D=54, Kest=8, maxR=2
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 14, cost: 3892.0
Run 1, iteration: 2/100, moves: 0, cost: 3892.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 2100, cost: 879.0
Run 2, iteration: 2/100, moves: 0, cost: 879.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 39, cost: 1741.0
Run 3, iteration: 2/100, moves: 0, cost: 1741.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 171, cost: 1808.0
Run 4, iteration: 2/100, moves: 0, cost: 1808.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 251, cost: 875.0
Run 5, it

In [43]:
table_covtype

Unnamed: 0,k_means,k_modes + GLFM,Art + GLFM
homogeneity,0.091532,0.031678,0.155231
adjusted_rand,-0.005636,0.016709,0.060596
adjusted_mutual_info,0.060384,0.024425,0.099817
fowlkes_mallows,0.257579,0.323648,0.331799
