# 公共数据

## 题目初始数据

In [3]:
import pandas as pd

In [4]:
df1 = pd.read_excel('dataset/附件一：已结束项目任务数据.xls')
task_location_values = df1[['任务gps经度','任务gps 纬度']].values
task_price_values = df1['任务标价'].values

In [5]:
df2 = pd.read_excel('dataset/附件二：会员信息数据-分列.xlsx')
vip_location_values = df2[['会员位置经度','会员位置纬度']].values
vip_lim_values = df2['预订任务限额'].values
vip_credit_values = df2['信誉值'].values

In [6]:
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']   #解决中文显示问题
plt.rcParams['axes.unicode_minus'] = False    # 解决中文显示问题

In [7]:
import numpy as np
def get_euclidean_dis(x1, y1, x2, y2):
    x_squ = (x1 - x2)**2
    y_squ = (y1 - y2)**2
    dis = np.sqrt(x_squ + y_squ) * 111 * 1000
    return dis

## 计算得到的数据

### 与城市中心的距离

In [6]:
from sklearn.cluster import KMeans
import numpy as np
cluster = KMeans(n_clusters=4,random_state=0).fit(task_location_values)
y_pred = cluster.labels_
centroid = cluster.cluster_centers_
task_dis_to_city_center = np.zeros(len(df1))
for i in range(len(task_location_values)):
    center_loc = centroid[y_pred[i]]
    task_dis_to_city_center[i] = get_euclidean_dis(task_location_values[i,0],
                                                   task_location_values[i,1],
                                                   center_loc[0],
                                                   center_loc[1])
pd.DataFrame(task_dis_to_city_center).describe()

Unnamed: 0,0
count,835.0
mean,17783.392481
std,9654.577805
min,396.992079
25%,11623.089207
50%,16396.632905
75%,22787.597732
max,94366.816502


### 附近任务数

In [7]:
import numpy as np
task_num = len(df1)
task_around_task_count = np.zeros(task_num)
for i in range(task_num):
    for j in range(task_num):
        if i == j:
            continue
        if get_euclidean_dis(task_location_values[i,0],task_location_values[i,1],
                             task_location_values[j,0],task_location_values[j,1],)<=2000:
            task_around_task_count[i] += 1

### 附近会员数及平均完成能力

In [11]:
def entropy_weight(data):
    # 假设data已经正向化过了
    data = np.array(data)
    # 归一化（可能已经归约过了）
    data = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
    # 概率矩阵
    p = data / data.sum(axis=0)
    # 信息熵
    E = - np.nansum(p * np.log(np.where(p==0,1,p)), axis=0) / np.log(len(data[0]))
    # 熵权
    return (1 - E) / (1 - E).sum()

In [12]:
vip_lim_credit = np.hstack((vip_lim_values.reshape(-1,1),vip_credit_values.reshape(-1,1)))
vip_cap_w = entropy_weight(vip_lim_credit)
print(vip_cap_w)
vip_lim_credit_con = (vip_lim_credit - vip_lim_credit.min(axis=0)) / (vip_lim_credit.max(axis=0) - vip_lim_credit.min(axis=0))
vip_capbility = (vip_cap_w * vip_lim_credit_con).sum(axis=1)
print(vip_lim_credit_con)
print(vip_capbility)

[0.61346148 0.38653852]
[[4.89177489e-01 1.00000000e+00]
 [7.01298701e-01 5.57764693e-01]
 [5.97402597e-01 4.11089860e-01]
 ...
 [0.00000000e+00 8.97093300e-08]
 [0.00000000e+00 5.14725664e-08]
 [0.00000000e+00 0.00000000e+00]]
[6.86630069e-01 6.45817277e-01 5.25385547e-01 ... 3.46761120e-08
 1.98961298e-08 0.00000000e+00]


In [13]:
import numpy as np
task_num = len(df1)
vip_num = len(df2)
task_around_vip_count = np.zeros(task_num)
task_around_vip_capbility_sum = np.zeros(task_num)
for i in range(task_num):
    for j in range(vip_num):
        if get_euclidean_dis(task_location_values[i,0],task_location_values[i,1],
                             vip_location_values[j,0],vip_location_values[j,1]) <= 2000:
            task_around_vip_count[i] += 1
            task_around_vip_capbility_sum[i] += vip_capbility[i]
task_around_vip_capbility_ave = task_around_vip_capbility_sum / np.where(task_around_vip_count==0,
                                                              1,task_around_vip_count)