In [1]:
import numpy as np
import pandas as pd
import random
import math

In [2]:
class Input_Datas(object):
    def __init__(self, subsample_size, data_path, bin1, alpha):
        # 子样本大小
        self.subsample_size = subsample_size
        self.Address = data_path
        self.bin1 = bin1
        self.alpha = alpha
        
    def discretization_features(self, data, bin1):
        '''
        连续值离散化
        '''
        cols = data.columns
        rst = data.copy()
        for col in cols:
            feature_data = rst[col].values
            x_max = np.max(feature_data)
            x_min = np.min(feature_data)
            scale = (x_max - x_min)/bin1
            scales = [x_min + n * scale for n in range(1,bin1)]
            scales.insert(0,x_min)
            scales.append(x_max + 1)
            temp = 0
            for i in range(len(scales) - 1):
                min_v = scales[i]
                max_v = scales[i + 1]
                rst[col][(rst[col] >= min_v) & (rst[col] < max_v)] = temp
                temp += 1
        return rst
        
    def calc_ent(self, x):
        '''
        计算信息熵
        '''
        x_value_list = set([x[i] for i in range(x.shape[0])])
        ent = 0.0
        for x_value in x_value_list:
            try:
                p = float(x[x == x_value].shape[0]) / x.shape[0]
                if p != 0:
                    logp = np.log2(p)
                    ent -= p * logp
                else:
                    continue
            except:
                continue
        return ent
    
    def select_features(self, data):
        '''
        根据信息熵选择满足alpha条件的特征
        '''
        cols = data.columns
        rst = list()
        dict_ent = dict()
        max_ent = -1.
        for col in cols:
            temp_values = data[col].values
            ent = self.calc_ent(temp_values)
            dict_ent[col] = ent
            if ent > max_ent:
                max_ent = ent
        for col in dict_ent:
            ent = dict_ent[col]
            if ent > 0 and ent < self.alpha * max_ent:
                rst.append(col)
        return rst
        
    def  Input(self):
        #导入数据，并将数据转换成list格式
        self.Initial_Datas = pd.read_csv(self.Address).loc[:, select_cols]
        self.Initial_Datas = self.discretization_features(self.Initial_Datas, self.bin1)
        select_features = self.select_features(self.Initial_Datas)
        self.Initial_Datas = pd.read_csv(self.Address).loc[:, select_features]
        # self.Sample（总样本）
        self.Sample = self.Initial_Datas.values
        self.Sample = np.delete(self.Sample, 0, axis=1)
        self.Sample = list(self.Sample)
        self.length = len(self.Sample)
        if self.subsample_size >= self.length:
            self.subsample_size = self.length
        self.ranges = list(range(self.length))

    def Subsample(self):
        #从Input处理后的数据中，选取子样本
        if self.subsample_size >= len(self.ranges):
            self.subsample_size = len(self.ranges)
        self.random_datas = random.sample(self.ranges, self.subsample_size)
        self.subsample = []
        for temp in self.random_datas:
            self.subsample.append(temp)
        for temp in self.random_datas:
            self.ranges.remove(temp)

In [3]:
class Select_Attribute(object):
    def __init__(self, sample):
        self.Sample = sample

    def random_attribute(self):
        #随机挑选一个属性
        length = len(self.Sample[0])
        ranges = list(range(length))
        self.random_attribute_datas = random.sample(ranges, 1)
        
    def calculate_divide_point_and_path_length(self, Sample):
        '''
        计算划分点与路径长度
        '''
        sort_sample = sorted(Sample)
        self.attribute_value = 0.
        max_span = -1.
        for i in range(len(sort_sample) - 1):
            min_v = sort_sample[i]
            max_v = sort_sample[i + 1]
            mid = (min_v + max_v) / 2.
            span = max_v - min_v
            if span > max_span:
                max_span = span
                self.attribute_value = mid
        try:
            self.path_length = 1 - (max_span / (max(Sample) - min(Sample)))
        except:
            self.path_length = 1

    def random_values(self, Sample):
        print(Sample)
        i = 0
        max = self.Sample[Sample[0]][self.random_attribute_datas[0]]
        min = max
        while i < len(Sample):
            if self.Sample[Sample[i]][self.random_attribute_datas[0]] > max:
                max = self.Sample[Sample[i]][self.random_attribute_datas[0]]

            if self.Sample[Sample[i]][self.random_attribute_datas[0]] < min:
                min = self.Sample[Sample[i]][self.random_attribute_datas[0]]
            i += 1

        self.attribute_value = max - random.random() * (max - min)

In [4]:
class ITree(object):
    #建立孤立树
    def __init__(self, depth, subsample, Sample):
        self.root = subsample
        self.depth = depth
        self.Sample = Sample

    def itree(self):
        #建立孤立树
        attribute = 0
        depth = 0
        self.Tree_1 = []
        self.Tree = [[self.root, 0, attribute]]
        while self.Tree and (depth <= self.depth):
            self.lift = []
            self.right = []
            root, depth, attribute = self.Tree.pop(0)
            set_attribute = Select_Attribute(self.Sample)
            set_attribute.random_attribute()
            attribute = set_attribute.random_attribute_datas[0]
            #set_attribute.random_values(root)
            set_attribute.calculate_divide_point_and_path_length(root)
            attribute_value = set_attribute.attribute_value
            
            i = 0
            while i < len(self.Sample[0]):
                j = 0
                while j < len(self.root):
                    if self.Sample[self.root[0]][i] == self.Sample[self.root[j]][i]:
                        self.judge = True
                    else:
                        self.judge = False
                        break
                    j += 1
                if self.judge == False:
                    break
                i += 1

            i = 0
            while i < len(root):
                if (len(root) == 1)or(depth == self.depth - 1)or(self.judge):
                    self.Tree_1.append([root, depth+set_attribute.path_length])
                    break

                if self.Sample[root[i]][attribute] < attribute_value:
                    self.lift.append(root[i])
                else:
                    self.right.append(root[i])
                i += 1
            depth += 1
            if not(self.lift == []):
                self.Tree.append([self.lift, depth, attribute])
            if not(self.right == []):
                self.Tree.append([self.right, depth, attribute])


    def prediction(self):
        #计算每一个数据的路径长度
        self.cn = 2*(math.log(len(self.root) - 1, math.e) + 0.5772156649) - (2*(len(self.root) - 1)/(len(self.root)))

        # 对子样本从小到大进行排序
        self.original = sorted(self.root)
        self.path = []
        i = 0
        while i < len(self.original):
            self.path.append(0)
            i += 1
        i = 0
        while i < len(self.original):
            j = 0
            while j < len(self.Tree_1):
                k = 0
                while k < len(self.Tree_1[j][0]):
                    if self.Tree_1[j][0][k] == self.original[i]:
                        self.path[i] = self.Tree_1[j][1]
                    k += 1
                j += 1
            i += 1

In [5]:
class IForest(object):
    #建立多棵树，并求出每一个数据的异常分数
    def __init__(self, Number, subsmaple_size, max_depth, data_path, alpha, bin1):
        self.data_path = data_path
        #孤立树数量
        self.number = Number
        self.subsample_size = subsmaple_size
        #树最大高度
        self.max_depth = max_depth
        self.alpha = alpha
        self.bin1 = bin1
    
    def Build_Forest(self):
        #建立孤立森林
        ranges =[0]
        self.scores_1 = []
        self.index = []
        
        example_a = Input_Datas(self.subsample_size, self.data_path, self.bin1, self.alpha)
        example_a.Input()
        while ranges:
            example_a.Subsample()
            ranges = example_a.ranges
            example_b = ITree(self.max_depth, example_a.subsample, example_a.Sample)
            # 平均路径长度
            level_path = []
            # 异常分数
            score = []
            i = 0
            j = 0
            while j < example_a.subsample_size:
                level_path.append(0)
                score.append(0)
                j += 1

            while i < self.number:
                example_b.itree()
                example_b.prediction()
                k = 0
                while k < example_a.subsample_size:
                    level_path[k] = level_path[k] + example_b.path[k]
                    k += 1
                i += 1
            k = 0
            while k < example_a.subsample_size:
                level_path[k] = level_path[k]/self.number
                score[k] = 2**(-level_path[k]/example_b.cn)
                k += 1

            for temp in score:
                self.scores_1.append(temp)
            for temp in example_b.original:
                self.index.append(temp)

        a = self.sort_1(self.scores_1, self.index)
        #print(a)
        i = 0
        sum = 0
        while i < len(a):
            if a[i] > 0.40:
                sum += 1
            i += 1

        #print(sum/len(a))
        return a


    def sort_1(self, scores_1, index):
        #排序
        i = 0
        score = []
        while i < len(index):
            score.append(0)
            i += 1

        i = 0
        while i < len(index):
            result = index[i]
            score[result] = scores_1[i]
            i += 1
        return score

In [6]:
select_cols = ['mchnt_cd_id', 'mchnt_net_id',
       'cen_betweenness', 'degree_centrality', 'closeness_centrality',
       'current_flow_closeness_centrality', 'information_centrality',
       'load_centrality', 'harmonic_centrality',
       'approximate_current_flow_betweenness_centrality', 'density_factor',
       'count', 'count_N', 'per_amt', 'large_amt', 'cv_amt',
       'amt_d1_d2', 'cnt_d3_d4', 'sd_d5_d6', 'time1_cnt_w1_w2',
       'time2_cnt_w1_w2', 'time_cnt_per_N2', 'time_cnt_per_N2_2',
       'time_cnt_per_N2_3', 'time_cnt_per_N2_4', 'time_cnt_per_N2_5',
       'time_cnt_per_N2_6', 'entropy', 'count_payee', 'sum_amt',
       'large_cnt_per', 'large_amt_per', 'sum_amt_a', 'count_a', 'sum_large',
       'large_max', 'sd_amt', 'mean_amt', 'amt_d1', 'amt_d2', 'cnt_d1',
       'cnt_d2', 'amt_d3', 'amt_d4', 'cnt_d3', 'cnt_d4', 'time1_cnt_w1',
       'time1_cnt_w2', 'time1_amt_w1', 'time1_amt_w2', 'time2_cnt_w1',
       'time2_cnt_w2', 'time2_amt_w1', 'time2_amt_w2', 'sd_d5', 'sd_d6',
       'time_amt_N2_t', 'time_cnt_N2_t', 'time_amt_N2_t2', 'time_cnt_N2_t2',
       'time_amt_N2_t3', 'time_cnt_N2_t3', 'time_amt_N2_t4', 'time_cnt_N2_t4',
       'time_amt_N2_t5', 'time_cnt_N2_t5', 'time_amt_N2_t6', 'time_cnt_N2_t6',
       'time_amt_N2_all', 'time_cnt_N2_all', 'sum_amt_w', 'count_w', '0', '1',
       '2', '3', '4', '5', '6', '7', '8', '9']

In [7]:
forest = IForest(256, 50, 15, "mchnt_deepwalk.csv", 0.8, 10)
scores = forest.Build_Forest()
print(scores)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rst[col][(rst[col] >= min_v) & (rst[col] < max_v)] = temp


[0.24019515873101624, 0.24027763365406274, 0.24562737822021835, 0.2657318729088929, 0.375217909655347, 0.2355930920252138, 0.2445296245157515, 0.2346546311646773, 0.26697076751557314, 0.28971763301180004, 0.24428984948214053, 0.23259878307036017, 0.24074526566536197, 0.23595449328344328, 0.24949587374958845, 0.2563123650216291, 0.2533934133566305, 0.2586201933900829, 0.23492319167183578, 0.23617713742622826, 0.23586854279596128, 0.25534276579888077, 0.24916360512443322, 0.4295034628118683, 0.2371112128731303, 0.23917512894018703, 0.23499033293894608, 0.23453711044165568, 0.23539207247614682, 0.232225350716042, 0.24150303856224248, 0.23848921594513067, 0.23969720808614314, 0.2500845033250229, 0.2376803600147569, 0.23971421756778408, 0.23727218569274086, 0.24612123732770924, 0.24929632668588603, 0.28598466660047284, 0.2380483304039427, 0.23257118657953416, 0.2784561060057321, 0.24992884817194083, 0.24232575026232886, 0.2486476205637534, 0.22614786499868844, 0.24668695233788349, 0.2452352

In [8]:
def change_score_to_label(percentage, scores):
    '''
    将score转换为label
    '''
    sort_scores = sorted(scores)
    index = int(len(scores) * (1. - percentage))
    threshold = sort_scores[index]
    rst = np.array(scores)
    rst[rst < threshold] = 0
    rst[rst >= threshold] = 1
    return rst

In [9]:
labels = change_score_to_label(0.2, scores)
labels

array([0., 0., 0., ..., 1., 0., 0.])

In [10]:
len(labels[labels == 1]) / len(labels)

0.20046439628482973