# rbf, tensorflow相似度计算

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

# Switch TensorFlow to eager mode; later cells iterate over tensors and convert
# them with np.array()/float(), which needs concrete values.
tf.enable_eager_execution()

# Notebook sanity check: evaluates to whether eager mode is active.
tf.executing_eagerly()

### Fill NaNs with column means and scale the whole dataset first, then extract the label -1 and label 1 samples

# Load the raw training frame. Later cells read its "label" column and drop
# "id", "label" and "date", so those columns are expected to be present.
data_train = pd.read_pickle("/home/mountain/atec/data/raw_pickle/train")

def data_fillna_mean(data):
    """Fill missing feature values in place with each column's mean.

    The bookkeeping columns "id", "label" and "date" are left untouched.
    The name of every processed column is printed as a progress indicator.

    Args:
        data: pandas DataFrame; modified in place.

    Returns:
        None.
    """
    for feature in data.columns:
        if feature in ("id", "label", "date"):
            continue
        # print() with a single argument behaves the same on Python 2 and 3.
        print(feature)
        data[feature] = data[feature].fillna(data[feature].mean())

# Impute the training frame in place (every column except id/label/date).
data_fillna_mean(data_train)

def data_scale(data_train_feature):
    """Min-max scale every feature column in place and store it as float32.

    Each column other than "id", "label" and "date" is fitted and transformed
    independently with its own MinMaxScaler. The name of every processed
    column is printed as a progress indicator.

    Args:
        data_train_feature: pandas DataFrame; modified in place.

    Returns:
        None.
    """
    for feature in data_train_feature.columns:
        if feature in ("id", "label", "date"):
            continue
        # print() with a single argument behaves the same on Python 2 and 3.
        print(feature)
        column = data_train_feature[feature].values.reshape(-1, 1)
        scaled = MinMaxScaler().fit_transform(column)
        # Assign a flat array rather than a freshly indexed DataFrame: pandas
        # aligns a DataFrame/Series on index labels when setting a column,
        # which would turn rows into NaN whenever the frame's index is not
        # the default 0..n-1.
        data_train_feature[feature] = scaled.ravel().astype(np.float32)

# Min-max scale the training frame in place (every column except id/label/date).
data_scale(data_train)

# Columns that identify or label a row; they are removed from the feature matrices.
non_feature_columns = ["id", "label", "date"]

# Feature-only frames for the label 1 rows and the label -1 rows.
is_label_pos = data_train["label"] == 1
data_train_pos_1 = data_train[is_label_pos].drop(labels=non_feature_columns, axis=1)
is_label_neg = data_train["label"] == -1
data_train_neg_1 = data_train[is_label_neg].drop(labels=non_feature_columns, axis=1)

# Cache both splits on disk ...
data_train_pos_1.to_pickle("../data/raw_pickle_split/train_feature_pos_1")
data_train_neg_1.to_pickle("../data/raw_pickle_split/train_feature_neg_1")

# ... and read them back (presumably so the notebook can be resumed from here).
data_train_pos_1 = pd.read_pickle("../data/raw_pickle_split/train_feature_pos_1")
data_train_neg_1 = pd.read_pickle("../data/raw_pickle_split/train_feature_neg_1")

### 计算欧氏距离

# Pairwise squared Euclidean distances between every label -1 row (A) and
# every label 1 row (B), using ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2.
A = tf.constant(data_train_neg_1.values)
B = tf.constant(data_train_pos_1.values)

# Squared row norms as column vectors, shape (rows, 1).
A_Squ_A_Sum = tf.reshape(tf.reduce_sum(A * A, 1), [-1, 1])
B_Squ_B_Sum = tf.reshape(tf.reduce_sum(B * B, 1), [-1, 1])

# Dist holds *squared* distances. Rounding in the expansion can leave tiny
# negative entries for (near-)identical rows; clamp them at zero so the RBF
# similarity derived from them never exceeds 1.
Dist = tf.maximum(
    A_Squ_A_Sum - 2 * tf.matmul(A, tf.transpose(B)) + tf.transpose(B_Squ_B_Sum),
    0.0
)

# Nearest label 1 row for each label -1 row: its squared distance and its
# position within B.
Dist_Min = tf.reduce_min(Dist, axis=1)
Dist_Min_Index = tf.argmin(Dist, axis=1).numpy()

# Plain Python floats, one per label -1 row.
Dist_Min_New = Dist_Min.numpy().tolist()

# Translate positions within B back to the row labels of data_train_pos_1.
Dist_Min_Index_New = [int(row_label) for row_label in data_train_pos_1.index[Dist_Min_Index]]

### 再变成rbf

import math

# Turn each squared Euclidean distance d into an RBF similarity exp(-d / 2).
data_sim = [math.exp(-0.5 * squared_dist) for squared_dist in Dist_Min_New]

### Build the result table

# One [neg_index, pos_index, sim] record per label -1 row.
res = [
    [data_train_neg_1.index[row], Dist_Min_Index_New[row], similarity]
    for row, similarity in enumerate(data_sim)
]

# The index columns refer to row labels of the original, complete dataset.
res = pd.DataFrame(res, columns=["neg_index", "pos_index", "rbf_sim"])

res.to_pickle("../data/rbf_sim")

res = pd.read_pickle("../data/rbf_sim")

# Notebook output: number of pairs whose similarity is below 0.1.
(res["rbf_sim"] < 0.1).sum()

### 验证

# Spot check: recompute the RBF similarity of one (label -1, label 1) pair
# with SciPy so it can be compared with the value stored in `res`.
neg_row = data_train_neg_1.loc[2331]
pos_row = data_train_pos_1.loc[270516]

from scipy import spatial
d = spatial.distance.euclidean(neg_row, pos_row)
# d is the plain distance here, so it is squared to match the squared
# distances used above. print() with one argument works on Python 2 and 3.
print(math.exp(-1.0 / 2 * math.pow(d, 2)))

# 用tensorflow计算相似度

## 计算欧氏距离，采用分治

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm

tf.enable_eager_execution()

In [None]:
tf.executing_eagerly()

In [None]:
# Load two cached feature frames. Judging by the file names these hold the
# label -1 rows and the label 0/1 rows respectively — TODO confirm.
train_feature_neg_1 = pd.read_pickle("../data/tmp/train_feature_-1")
train_feature_0_1 = pd.read_pickle("../data/tmp/train_feature_0_1")

### 数据分治

In [None]:
# Row boundaries that cut train_feature_0_1 into chunks of 10000 rows; the
# stop value is pushed one chunk past the row count so the tail is covered.
chunk_rows = 10000
split_list = np.arange(0, train_feature_0_1.shape[0] + chunk_rows, chunk_rows)

### 分别计算欧氏距离，取最小值，再合并

In [None]:
# Per chunk: the minimum distance for every label -1 row, and the row label
# of the train_feature_0_1 row that achieves it.
Dist_Min_All = []
Dist_Min_Index_All = []

# The label -1 matrix is identical for every chunk, so build it (and its
# squared row norms) once instead of once per iteration.
A = tf.constant(train_feature_neg_1.values)
A_Squ_A_Sum = tf.reshape(tf.reduce_sum(A * A, 1), [-1, 1])

for i in tqdm(range(len(split_list) - 1)):
    train_feature_0_1_split = train_feature_0_1.iloc[split_list[i]:split_list[i + 1]]

    B = tf.constant(train_feature_0_1_split.values)
    B_Squ_B_Sum = tf.reshape(tf.reduce_sum(B * B, 1), [-1, 1])

    # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2. Rounding can make this slightly
    # negative for (near-)identical rows, and tf.sqrt of a negative number is
    # NaN, so clamp at zero before taking the root.
    Dist_Squ = A_Squ_A_Sum - 2 * tf.matmul(A, tf.transpose(B)) + tf.transpose(B_Squ_B_Sum)
    Dist = tf.sqrt(tf.maximum(Dist_Squ, 0.0))

    Dist_Min = tf.reduce_min(Dist, axis=1)
    Dist_Min_Index = tf.argmin(Dist, axis=1).numpy()

    # Plain Python floats, one per label -1 row.
    Dist_Min_New = Dist_Min.numpy().tolist()

    # Translate positions within the chunk back to the frame's row labels.
    Dist_Min_Index_New = [int(row_label) for row_label in train_feature_0_1_split.index[Dist_Min_Index]]

    Dist_Min_All.append(Dist_Min_New)
    Dist_Min_Index_All.append(Dist_Min_Index_New)

### 合并之后，再取最小值，注意index的变化


In [None]:
# One row per chunk, one column per label -1 row.
data_dist = pd.DataFrame(data=Dist_Min_All)

# Column-wise reduction (axis 0 is the default): the smallest distance over
# all chunks, and the chunk number in which it occurs.
data_dist_min = data_dist.min()
data_dist_min_index = data_dist.idxmin()

# Same layout as data_dist, but holding the matching row labels.
data_dist_index = pd.DataFrame(data=Dist_Min_Index_All)

### index转换

In [None]:
# Index translation: for every column, look up the row label that the
# winning chunk recorded for it.
data_dist_min_index_new = []
for column in data_dist_min_index.index:
    # print() with one argument works on Python 2 and 3.
    print(column)
    winning_chunk = data_dist_min_index[column]
    data_dist_min_index_new.append(data_dist_index[column][winning_chunk])

In [None]:
# Convert each minimum Euclidean distance into a similarity, 1 / (1 + distance).
data_sim = [1.0 / (dist + 1.0) for dist in data_dist_min]

### 生成结果

In [None]:
# One [neg_index, pos_index, sim] record per entry of data_sim, matched by position.
res = [
    [train_feature_neg_1.index[row], data_dist_min_index_new[row], similarity]
    for row, similarity in enumerate(data_sim)
]

In [None]:
# The index columns refer to row labels of the original, complete dataset.
res = pd.DataFrame(res, columns=["neg_index", "pos_index", "euc_sim"])

In [None]:
# Persist the similarity table as a pickle file.
res.to_pickle("../data/euc_similarity")

### 验证

In [None]:
from scipy import spatial
# NOTE(review): neg_row and pos_row are not assigned anywhere in this section
# of the notebook; the only earlier assignment is in the RBF verification
# cell, so they must already exist in the session — TODO confirm.
d = spatial.distance.euclidean(neg_row, pos_row)

# 传统方法计算相似度（卡死）

In [None]:
# coding: utf-8
import pandas as pd
import numpy as np
from scipy import spatial
import logging
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', filename='./log/euc_dist.log', level=logging.INFO)

data_train_feature_neg_1 = pd.read_pickle("../data/train_feature_-1")
data_train_feature_0_1 = pd.read_pickle("../data/train_feature_0_1")
data_train_label = pd.read_pickle("../data/train_label")

# Brute force: for every row of data_train_feature_neg_1, scan all rows of
# data_train_feature_0_1 for the nearest one (Euclidean distance).
# Result rows are collected in a list and stacked once at the end; calling
# np.append per row would copy the whole result array on every iteration.
res_rows = []

for neg_row_index, neg_row in data_train_feature_neg_1.iterrows():

    logging.info("    neg_row_index: %s", neg_row_index)

    # Smallest distance seen so far and the row label that produced it.
    euc_dist = float("inf")
    opt_index = 0
    for pos_row_index, pos_row in data_train_feature_0_1.iterrows():

        # Progress marker in the log every 100000 row labels.
        if pos_row_index % 100000 == 0:
            logging.info("    pos_row_index: %s", pos_row_index)

        d = spatial.distance.euclidean(neg_row, pos_row)
        if d < euc_dist:
            euc_dist = d
            opt_index = pos_row_index
    # Use the smallest distance found, not `d`, which is merely the distance
    # to the last row visited by the inner loop.
    euc_dist_sim = 1.0 / (1.0 + euc_dist)
    opt_index_label = data_train_label["label"][opt_index]
    res_rows.append([neg_row_index, opt_index, euc_dist_sim, opt_index_label])

# Columns: neg_row_index, opt_index, euc_dist_sim, opt_index_label.
res_ndarr = np.array(res_rows) if res_rows else np.empty((0, 4))

res_ndarr.dump("../data/similarity")