In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
import copy 
import re
from sklearn.cluster import DBSCAN
from sklearn import preprocessing
from sklearn import metrics,datasets,manifold

# 6TOPSIS算法
## 6.1源数据准备

In [5]:
# Source data: for now the taxi data is meant to serve as the example; later this
# can be switched to an analysis over all of the data.
# NOTE(review): the frame built below is a small hand-written 4-row table, not
# taxi data - confirm which input is intended.
testdata = pd.DataFrame({'col1':[1 , 3 , 8 , 8] , 'col2':[101,102,103,104]})
testdata

Unnamed: 0,col1,col2
0,1,101
1,3,102
2,8,103
3,8,104


## 6.2指标正向化
本节定义将各类指标正向化的函数。其实在本分析中，不做正向化也是可以的：我并不要求指标越大越好或越小越好，最终只会以出租车的 score 范围作为阈值的标准。

In [6]:
# Cost-type (smaller-is-better) indicator -> benefit-type (larger-is-better) indicator
def dataDirection_1(datas):
    """Flip a cost-type indicator so that larger values become better.

    Each value is replaced by ``max(datas) - value``: the largest input maps
    to 0 and the smallest input maps to the largest output.

    Parameters
    ----------
    datas : array-like supporting ``scalar - datas`` (e.g. ndarray, Series)
        Indicator values.

    Returns
    -------
    Same container type as produced by ``np.max(datas) - datas``.
    """
    largest = np.max(datas)
    return largest - datas

# Intermediate-type (closest-to-a-target-is-best) indicator -> benefit-type indicator
def dataDirection_2(datas, x_best):
    """Convert an intermediate-type indicator to a benefit-type one.

    Applies ``1 - |x - x_best| / M`` with ``M = max|x - x_best|``, so a value
    equal to ``x_best`` scores 1 and the value farthest from it scores 0.

    Parameters
    ----------
    datas : ndarray or pandas Series
        Indicator values (must support ``datas - x_best``).
    x_best : number
        The ideal value of the indicator.

    Returns
    -------
    Same container type as ``datas`` with float scores in ``[0, 1]``.

    Notes
    -----
    If every value already equals ``x_best`` then ``M`` is 0; instead of
    producing NaN from 0/0, every value is given the top score 1.0.
    """
    deviation = np.abs(datas - x_best)   # distance of each value from the ideal
    max_deviation = np.max(deviation)
    if max_deviation == 0:
        # All deviations are 0, so this yields 1.0 everywhere while keeping
        # the input's container type (and index, for a Series).
        return 1.0 - deviation
    return 1 - deviation / max_deviation
    
# Interval-type (best inside [x_min, x_max]) indicator -> benefit-type indicator
def dataDirection_3(datas, x_min, x_max):
    """Convert an interval-type indicator to a benefit-type one.

    Values inside ``[x_min, x_max]`` score 1. Values outside lose score in
    proportion to their distance from the nearest interval edge, scaled by
    ``M = max(x_min - min(datas), max(datas) - x_max)``.

    Parameters
    ----------
    datas : iterable of numbers (e.g. ndarray, Series)
        Indicator values.
    x_min, x_max : number
        Lower and upper edge of the ideal interval.

    Returns
    -------
    numpy.ndarray
        One score per input value, in input order.
    """
    widest_gap = max(x_min - np.min(datas), np.max(datas) - x_max)

    def _score_one(value):
        # Inside the ideal interval: full score.
        if x_min <= value <= x_max:
            return 1
        # Below the interval: penalise by the distance to the lower edge.
        if value < x_min:
            return 1 - (x_min - value) / widest_gap
        # Otherwise penalise by the distance to the upper edge.
        return 1 - (value - x_max) / widest_gap

    return np.array([_score_one(value) for value in datas])

In [7]:
# Display the current contents of the table.
testdata

Unnamed: 0,col1,col2
0,1,101
1,3,102
2,8,103
3,8,104


In [8]:
# Convert col1 with the intermediate-type transform, using 5 as the best value.
# NOTE: this overwrites testdata['col1'], so re-running this cell transforms the
# already-transformed values and produces a different result.
testdata['col1'] = dataDirection_2(testdata['col1'] , 5)
testdata

Unnamed: 0,col1,col2
0,0.0,101
1,0.5,102
2,0.25,103
3,0.25,104


## 6.3正向化矩阵标准化(去除量纲影响)

In [9]:
def Standard(datas):
    """Normalise every column to unit Euclidean length (removes units of measure).

    Each entry is divided by the square root of the sum of squares of its
    column: ``z_ij = x_ij / sqrt(sum_i x_ij ** 2)``.

    Parameters
    ----------
    datas : array-like, shape (n_samples, n_indicators)
        Positivised indicator matrix (ndarray, DataFrame, nested list, ...).

    Returns
    -------
    numpy.ndarray of float
        A new normalised matrix. The input is left untouched, so the calling
        cell can be re-run safely.

    Notes
    -----
    * The result is always float, so integer input is no longer truncated
      when the quotients are stored.
    * A column consisting only of zeros has norm 0; it is returned as zeros
      rather than NaN.
    """
    values = np.array(datas, dtype=float)          # float copy; never alias the input
    column_norms = np.sqrt(np.sum(np.square(values), axis=0))
    # Dividing an all-zero column by 1 keeps it at zero and avoids 0/0.
    safe_norms = np.where(column_norms == 0, 1.0, column_norms)
    return values / safe_norms

In [10]:
# Normalise the columns of the table; the result is kept as a NumPy array.
# NOTE: testdata.values contains every column currently in testdata, so once the
# 'score' column has been added further down, re-running this cell would feed
# that column into Standard as well.
sta_data = Standard(testdata.values)
sta_data

array([[0.        , 0.49265362],
       [0.81649658, 0.49753138],
       [0.40824829, 0.50240914],
       [0.40824829, 0.5072869 ]])

## 6.4计算得分并归一化

In [11]:
def Score(sta_data):
    """Compute normalised TOPSIS scores for a standardised indicator matrix.

    For each row the Euclidean distance to the ideal point Z+ (column-wise
    maxima) and to the anti-ideal point Z- (column-wise minima) is computed.
    The relative closeness ``D- / (D+ + D-)`` is then rescaled so that the
    scores of all rows sum to 1.

    Parameters
    ----------
    sta_data : ndarray or DataFrame, shape (n_samples, n_indicators)
        Standardised, positivised indicator matrix.

    Returns
    -------
    One score per row (ndarray for ndarray input), summing to 1.

    Notes
    -----
    When there is a single row, or all rows are identical, Z+ and Z- coincide
    and every distance is 0. The rows are then indistinguishable, so each one
    receives an equal share ``1 / n_samples`` instead of NaN from 0/0.
    """
    z_max = np.amax(sta_data, axis=0)   # ideal point Z+
    z_min = np.amin(sta_data, axis=0)   # anti-ideal point Z-

    # Distance of every sample to Z+ and to Z-.
    dist_to_best = np.sqrt(np.sum(np.square(z_max - sta_data), axis=1))
    dist_to_worst = np.sqrt(np.sum(np.square(z_min - sta_data), axis=1))

    total_dist = dist_to_worst + dist_to_best
    if np.all(total_dist == 0):
        # Degenerate input: give every row the same closeness (keeps the
        # container type) so the normalisation below yields equal shares.
        closeness = total_dist + 1.0
    else:
        closeness = dist_to_worst / total_dist

    return closeness / np.sum(closeness)   # rescale so the scores sum to 1

In [13]:
# Compute the score of every row of the standardised matrix.
sco = Score(sta_data)
sco

array([0.        , 0.49697729, 0.25148445, 0.25153826])

## 6.5将计算得到的得分与源数据一起进行整理，形成dataframe

In [14]:
# Attach the scores to the table as a new 'score' column and preview the first rows.
# NOTE: this adds a column to testdata itself, so any earlier cell that reads
# testdata.values will also see 'score' if it is re-run afterwards.
testdata['score'] = sco
testdata.head(5)

Unnamed: 0,col1,col2,score
0,0.0,101,0.0
1,0.5,102,0.496977
2,0.25,103,0.251484
3,0.25,104,0.251538
