In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize
datasetFolder = "isc_datasets"
def loadData_txt(filename,delimiter):
    filePath = os.path.join(datasetFolder,filename + ".txt")
    return pd.read_table(filePath,delimiter = delimiter)
dataset = loadData_txt("transfusionData",',')
dataset.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [2]:
transfusionData = dataset.to_numpy()
transfusionData = np.delete(transfusionData,-1,1)
print(transfusionData)


[[    2    50 12500    98]
 [    0    13  3250    28]
 [    1    16  4000    35]
 ...
 [   23     3   750    62]
 [   39     1   250    39]
 [   72     1   250    72]]


In [3]:
#returns -> [[clusters in a 1D subspace -> each cluster [no of datapoint in a cluster] ] , [clusters in a 1D subspac] , ...]
def find1DSubspaceClusters(eps,minpts,dataSet):
    one_D_subspaces = dataSet.T
    one_D_clusters = []
    result = []
    for i in one_D_subspaces:
        i = i.reshape(-1,1)
        clustering = DBSCAN(eps = eps,min_samples = minpts).fit(i)
        one_D_clusters.append(clustering)
    for i in one_D_clusters:
        points = i.labels_
        LabelsNum = max(points) + 2
        esets=[[] for i in range(LabelsNum)]
        for j in range(len(points)):
            esets[points[j] + 1].append(j)
        result.append(esets)
    return result          

clusters = find1DSubspaceClusters(1,5,transfusionData)     
print("no of 1D subspaces :",len(clusters),"\n")
print("1D subspaces and their clusters :","\n")
for i in clusters:
    print("no of clusters: ",len(i),"\n")
    for j in i:
        print(j,"\n")

no of 1D subspaces : 4 

1D subspaces and their clusters : 

no of clusters:  3 

[475, 495, 496, 497, 498, 499, 726, 746, 747] 

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,

In [4]:
from scikit_roughsets.rs_reduction import RoughSetsReducer

#P and Q are subspaces
#this method finds P-lower Approximation of Qi(ith cluster in Q subspace)
def findLowApprx(P,Q,i,dataset):

    reducer = RoughSetsReducer()
    return reducer.rslower(Q[i],P,dataset)
    
    ''''
    result = []
    for j in range(len(P)):
        if set(P[j]).issubset(set(Q[i])):
             result = P[j]
    return result
    '''


#calculates Attribute dependency measure between P and Q subspaces
def attrDependency(P,Q,DB_size):
    AllOfLowApprxs = 0
    for i in range(len(Q)):
        AllOfLowApprxs += len(findLowApprx(P,Q,i))
    return AllOfLowApprxs / DB_size
        
def isInterestingSubset(P,Q,DB_size,gamma):
    return attrDependency(P,Q,DB_size) < gamma

# for example eligible subspace would be 2D and we're looking for the best 1D subspace
# eligible subspace is [p,q] whereas p and q are subspaces that are interesting subset together
# HAS TO WORK FOR DIFFERENT DIMENSIONS ALSO
# returns list of dimensions that are best subspace together
def findBestSubspace(eligibleSubspace):
    elementsUncovered = np.zeros(len(eligibleSubspace))
    for i in range(len(eligibleSubspace)):
        testSubspc = np.delete(eligibleSubspace,i,0)
        uncoveredElements = 0
        for j in testSubspc:
            uncoveredElements += len(j[0])
        elementsUncovered [i] = uncoveredElements
    elementsUncoveredList = elementsUncovered.tolist()
    bestSubspaceIndex = elementsUncoveredList.index(max(elementsUncoveredList))
    return [i for i in range(len(eligibleSubspace)) if i != bestSubspaceIndex]

#returns 2D subspace made of two 1D subspaces p and q
#Should also work for other dimensions
#data has to be numpy array in (records,dimensions) shape
#dimList in the [p,q] in which p and q are the # of dimensions we're working on
def getSubspace(dimList,data):
    return data[:,dimList]

#returns ndarray of point numbers in their cluster after the clustering result based on DBSCAN method
#clstrObj is the result of clustering 
#clstrPointsList is the cluster in which the number of data points included in the cluster are stored
def clstrToPointsList(clstrObj,clstrPointsList):
    labels = clstrObj.labels_
    labelMx = max(labels) + 2
    clstrs = [[] for i in range(labelMx)]
    for i in range(len(labels)):
        clstrs[labels[i] + 1].append(clstrPointsList[i])
    return clstrs

# This will partition the clusters in best subspace using DBSCAN based on distances in the 2D subspace(Or the higher one)
# Best subspace is 1D and eligible subspace is 2D
# Eligible subspace is [p,q] whereas p and q are subspaces that are interesting subset together
# Data has to be numpy array in (records,dimensions) shape
# DimList in the [p,q] in which p and q are the # of dimensions we're working on
# DONT FORGET in the result cluster, the first subList is for the "outlier"s so if it's empty it means we dont have any outliers
def partitionBestSubspace(eps,minpts,bestSubspaceClustered,dimList,data):
    rawEligible2dSubspace = getSubspace(dimList,data)
    # the 0th index of Result is for the outliers
    Result = [[]]
    for clstr in bestSubspaceClustered:
        clstrInHighD = np.array([rawEligible2dSubspace[i] for i in clstr])
        rsltForClstr  = DBSCAN(eps = eps, min_samples = minpts).fit(clstrInHighD)
        rsltForClstr_in_points = clstrToPointsList(rsltForClstr,clstr)
        for newClstr in rsltForClstr_in_points:
            if rsltForClstr_in_points.index(newClstr) == 0 and len(newClstr) > 0:
                #then it's an 'outlier'. what should we do with it ? add it to the 0th index of Result
                for pnt in newClstr:
                    Result[0].append(pnt)
            elif len(newClstr) != 0 :
                Result.append(newClstr)
    return Result

#used as one of the base cases for clustering more than 1D->2D dimensions
# Clusters -> [[for 2D clusters],[for 3D clusters],...]: each [] for clusters in the specific dimension in which -> [[subspaces],[clusters]]
# threshold: int : if the number of points in a cluster is less it is Not considered Dense
# dim: int > 1 : in which dimension we're looking for dense clusters
#another base case is to see we have no other eligible subspace
def denseCluster_found(clusters,threshold,dim):
    pass

one_D_clusters = find1DSubspaceClusters(1,5,transfusionData)
p = one_D_clusters[2]
q = one_D_clusters[0]
i = 1
db_size = len(transfusionData)
print(findLowApprx(p,q,i,one_D_clusters))
print(attrDependency(p,q,db_size))
print(findBestSubspace([p,q]))
partitionBestSubspace(2,4,p,[0,2],transfusionData)

AttributeError: 'list' object has no attribute 'ndim'

In [None]:
#Time for the Algorithm itself
def ISC(eps,minPts,gamma,Data):
    one_D_clusters = find1DSubspaceClusters(eps,minPts,Data)
    db_size = len(Data)
    eligible_Subspaces = []
    
    # in clustered_Subspaces we're gone store High dimensional (>1) subspaces that we just clustered
    # the first (in future) element would be the 2D clustering result based on ISC algorithm
    # e.g in the 2D clstr part we have to specify the dims so the elements
    clustered_Subspaces= [[] for i in range(len(one_D_clusters) - 1)]
    
    for attr_i in range(len(one_D_clusters)):
        for attr_j in range(len(one_D_clusters)):
            if attr_i != attr_j:
                if isInterestingSubset(one_D_clusters[attr_i],one_D_clusters[attr_j],db_size,gamma):
                    eligible_Subspaces.append([attr_i,attr_j])
    #1D to 2D process:
    for eligible_Subspace in eligible_Subspaces:
        bestSubspace = findBestSubspace([one_D_clusters[i] for i in eligible_Subspace])
        HD_ClusteredPoints = partitionBestSubspace(eps,minPts,one_D_clusters[bestSubspace[0]],eligible_Subspace,Data)
        clustered_Subspaces[0].append([eligible_Subspace,HD_ClusteredPoints]) 
    
        
    
ISC(1,2,0.4,transfusionData)

[[[0, 1], [[0, 9, 341, 504, 505, 4, 10, 105, 106, 115, 188, 241, 261, 276, 278, 359, 363, 400, 439, 444, 475, 495, 499, 501, 528, 546, 593, 633, 677, 680, 726, 747, 502, 517], [1, 2, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 107, 108, 109, 110, 111, 112, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 139, 140, 141, 142, 143, 144, 145, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 177, 178, 179, 180, 182, 183, 186, 187, 189, 190, 191, 192, 194, 195, 197, 200, 201, 203, 206, 207, 210, 211, 212, 213, 214

  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
  arr = asarray(arr)
