In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

## 报告数据标准化

In [2]:
data = json.load(open('data/conference/conference.json', encoding='utf-8', mode='r'))
reports = []
halls = set()
for report in data:
    title = report['title']
    institutions = report['institutions']
    if title and institutions:
        reports.append(report)
        halls.add(report['hall'])

hall_2_id = {h:i for i, h in enumerate(sorted(halls))}
id_2_hall = {i:h for h, i in hall_2_id.items()}
print(len(reports), reports[0], halls)
print(hall_2_id)

481 {'date': '2021-10-16 星期六', 'hall': '黄龙水晶宫', 'host': '陆锋', 'topic': '09:10-10:40 | 特邀报告', 'time': '09:10-09:40', 'title': '全球位置服务网关键技术与研究进展', 'reporters': '龚健雅 院士', 'institutions': '武汉大学'} {'分会场十', '黄龙水晶宫', '分会场十二', '分会场十一', '分会场三', '分会场七', '分会场四', '分会场十四', '分会场六', '分会场九', '分会场十三', '分会场五', '分会场八', '分会场二', '分会场一'}
{'分会场一': 0, '分会场七': 1, '分会场三': 2, '分会场九': 3, '分会场二': 4, '分会场五': 5, '分会场八': 6, '分会场六': 7, '分会场十': 8, '分会场十一': 9, '分会场十三': 10, '分会场十二': 11, '分会场十四': 12, '分会场四': 13, '黄龙水晶宫': 14}


### 报告时间与会场标准化
需要提一下，有重复报告，这里我们采用去重的方法解决

In [3]:
from datetime import datetime
time_format = '%Y-%m-%d %H:%M'
base_time = datetime(2021, 10, 15)
new_data = []
start_time_set = set()
title_set = set()
for report in reports:
    title = report['title']
    if title in title_set: 
        print(title)
        continue
    title_set.add(title)
    date = report['date']
    time = report['time']
    hall = report['hall']
    topic = report['topic']
    topic = topic.split('|')[-1].split('：')[-1].strip()
    date = date.split(' ')[0]
    start, end = time.split('-')
    start_time = date + ' ' + start # datetime.strptime(date + ' ' + start, time_format)
    end_time = date + ' ' + end
    s_time = datetime.strptime(start_time, time_format) - base_time
    e_time = datetime.strptime(end_time, time_format) -  base_time
    hall_id = hall_2_id[hall]
    report['date'] = date
    report['start_time'] = int(s_time.total_seconds())
    report['end_time'] = int(e_time.total_seconds())
    report['hall_id'] = hall_id
    report['topic'] = topic
    start_time_set.add(start_time)
    new_data.append(report)
json.dump(new_data, open('data/conference/standard_conference_data.json', 'w', encoding='utf-8'), ensure_ascii=False)
print(new_data[0])
print(len(title_set), len(reports))

康巴藏区藏传佛教寺院的空间布局特征及其影响因素
基于GPS轨迹的西藏骑行游客时空行为分析
基于轨迹大数据的游客行为研究：总结、局限与展望
{'date': '2021-10-16', 'hall': '黄龙水晶宫', 'host': '陆锋', 'topic': '特邀报告', 'time': '09:10-09:40', 'title': '全球位置服务网关键技术与研究进展', 'reporters': '龚健雅 院士', 'institutions': '武汉大学', 'start_time': 119400, 'end_time': 121200, 'hall_id': 14}
478 481


#### 按时间段分组

In [4]:
min_delta = 600
time_spans = []
sorted_data = sorted(new_data, key=lambda x: x['start_time']) # 按开始时间排序
t = sorted_data[0]['start_time']
group = 0
group_dict = defaultdict(lambda: list())
# group_dict[group] = []
topic_2_reports = defaultdict(lambda: list())
for i, report in enumerate(sorted_data):
    s_time = int(report['start_time'])
    e_time = int(report['end_time'])
    topic = report['topic']
    delta = s_time - t

    if delta > min_delta:
        group += 1
        t = s_time
    group_dict[group].append(i)   
    report['time_group'] = group
    report['id'] = i
    topic_2_reports[topic].append((i, report['title']))
print(group)
json.dump(sorted_data, open(f'data/conference/grouped_conference_data_{min_delta}.json', 'w', encoding='utf-8'), ensure_ascii=False)
    

47


最长组

In [5]:
group_max_len = max([len(g) for k, g in group_dict.items()])
print(group_max_len)

23


### 生成测试数据

In [6]:
testing_keywords = ['时空大数据', '人工智能', '粤港澳大湾区', '空间可达性', '光学遥感',  
                    '轨迹数据挖掘', '时空模拟与预测', '地理时空建模', '遥感变化检测与地图更新', '地理信息共享与互操作'] # 五个抽取的关键词，五个专题名
with open('data/text_match/predict.jl', 'w', encoding='utf-8') as writer:
    for keyword in testing_keywords:
        for report in sorted_data:
            writer.write(json.dumps({'title': report['title'], 'keywords': [keyword]}, ensure_ascii=False))
            writer.write('\n')

### 预测数据读取

In [39]:
prob_matrix = np.zeros((len(sorted_data), len(testing_keywords)), dtype=float)
with open('outputs/text_match/predictions/bert_bert_wwm_ext_2_prediction_text_matching.jl', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        d_json = json.loads(line.strip())
        prob = float(d_json['prob'])
        col = i // len(sorted_data)
        row = i - col * len(sorted_data)
        prob_matrix[row, col] = prob
    assert i == len(sorted_data) * len(testing_keywords) - 1

#### 查看与关键词最匹配的标题

In [49]:
ith = 9
keyword = testing_keywords[ith]
probs = prob_matrix[:, ith]
ranks = np.argsort(-probs) # 由于默认使用升序，所以使用负号
print(keyword, ':')
for i, r in enumerate(ranks[:30]): # 看前十个
    report = sorted_data[r]
    print(report['title'], probs[r])

print('gold reports:')
gold_reports = topic_2_reports.get(keyword, [])
for i, report in gold_reports:
    print(report)

地理信息共享与互操作 :
地理信息共享新模式：从分析就绪、AI就绪到决策就绪 12.150110244750977
地理分析模型的共享与重用方法研究 4.1197638511657715
支持地理信息服务链可视化构建与执行的BPMN扩展框架模型设计 3.643040657043457
地理空间模型数据自动匹配研究：地理空间数据—模型共享的新范式（线上） 3.2376365661621094
智慧城市背景下的地理信息知识组织与重用 3.184375762939453
NSFC信息地理学的机遇与挑战 3.0859227180480957
一种顾及QoS的地理信息服务关系网络模型 2.878934621810913
一种多元地理信息处理服务拓展的框架构建 2.776102304458618
不同可持续发展目标情景下地理智能建模 2.673800468444824
地理大数据挖掘的方法体系及主要进展 2.5489742755889893
地图语言的延伸与多模态地理信息融合表达 2.5290110111236572
多源空间信息同化与共享服务关键技术研究 2.485527753829956
一种互信息领域自适应网络用于高分辨率语义分割 2.378546953201294
从地理空间到神经空间：类脑导航研究进展 2.3710427284240723
地理时空认知与医学内镜手术 2.3696205615997314
地理认知与GIS创新 2.3609416484832764
地理大数据支持下的人群动态多尺度制图方法研究 2.2649688720703125
WebGL技术在虚拟地理环境构建中的应用研究 2.1219546794891357
利用地理大数据刻画空间交互模式 1.9393177032470703
自然与人为过程耦合的地理模拟平台搭建及应用 1.7854048013687134
Urban Expansion and Drying Climate in Urban Agglomerations of China 1.7411761283874512
明清江南市镇体系演化的若干地理特征 1.7225759029388428
Beyond absolute space: An exploration of relative and relational space in S

## 概率转移矩阵中参数的确定

### 空间距离因素

In [50]:
area1 = {'分会场一', '分会场二', '分会场三', '分会场四', '黄龙水晶宫'}
area2 = {'分会场五', '分会场六', '分会场七', '分会场八', '分会场九'}
area3 = {'分会场十', '分会场十一', '分会场十二', '分会场十三', '分会场十四'}
distance_dict = {
    '11': 2, '22': 3, '33': 3,
    '12': 10, '21': 10, '13': 15, '31':15,
    '23': 20, '32': 20
}
distance_matrix = np.zeros((len(hall_2_id), len(hall_2_id)), dtype=float)
hall_2_area = {h: 1 for h in area1}
hall_2_area.update({h: 2 for h in area2})
hall_2_area.update({h: 3 for h in area3})

for i in range(len(hall_2_id)):
    from_hall = id_2_hall[i]
    from_area = hall_2_area[from_hall]
    distance_matrix[i, i] = 1
    for j in range(i):
        to_hall = id_2_hall[j]
        to_area = hall_2_area[to_hall]
        dis = distance_dict[str(from_area)+str(to_area)]
        distance_matrix[i, j] = distance_matrix[j, i] = dis
print(distance_matrix)

[[ 1. 10.  2. 10.  2. 10. 10. 10. 15. 15. 15. 15. 15.  2.  2.]
 [10.  1. 10.  3. 10.  3.  3.  3. 20. 20. 20. 20. 20. 10. 10.]
 [ 2. 10.  1. 10.  2. 10. 10. 10. 15. 15. 15. 15. 15.  2.  2.]
 [10.  3. 10.  1. 10.  3.  3.  3. 20. 20. 20. 20. 20. 10. 10.]
 [ 2. 10.  2. 10.  1. 10. 10. 10. 15. 15. 15. 15. 15.  2.  2.]
 [10.  3. 10.  3. 10.  1.  3.  3. 20. 20. 20. 20. 20. 10. 10.]
 [10.  3. 10.  3. 10.  3.  1.  3. 20. 20. 20. 20. 20. 10. 10.]
 [10.  3. 10.  3. 10.  3.  3.  1. 20. 20. 20. 20. 20. 10. 10.]
 [15. 20. 15. 20. 15. 20. 20. 20.  1.  3.  3.  3.  3. 15. 15.]
 [15. 20. 15. 20. 15. 20. 20. 20.  3.  1.  3.  3.  3. 15. 15.]
 [15. 20. 15. 20. 15. 20. 20. 20.  3.  3.  1.  3.  3. 15. 15.]
 [15. 20. 15. 20. 15. 20. 20. 20.  3.  3.  3.  1.  3. 15. 15.]
 [15. 20. 15. 20. 15. 20. 20. 20.  3.  3.  3.  3.  1. 15. 15.]
 [ 2. 10.  2. 10.  2. 10. 10. 10. 15. 15. 15. 15. 15.  1.  2.]
 [ 2. 10.  2. 10.  2. 10. 10. 10. 15. 15. 15. 15. 15.  2.  1.]]


### 时间间隔因素

In [None]:
len(start_time_set)
transfer_matrices = []
for i in range(len(group_dict)-1):
    transfer_mat = np.zeros((group_max_len, group_max_len), dtype=float)
    from_group = group_dict[i]
    to_group = group_dict[i+1]
    for i, from_r in enumerate(from_group):
        from_hall_id = sorted_data[from_r]['hall_id']
        for j, to_r in enumerate(to_group):
            to_hall_id = sorted_data[to_r]['hall_id']
            distance = distance_matrix[from_hall_id, to_hall_id]
            transfer_mat[i, j] = 1 / distance # 距离的倒数
    transfer_matrices.append(transfer_mat)
print(transfer_matrices[9])

In [56]:
len(transfer_matrices)

47

### 条件随机场

In [4]:
from typing import Tuple, List
def step(mu_prev: np.ndarray,
         emission_probs: np.ndarray,
         transition_probs: np.ndarray,
         observed_state: int) -> Tuple[np.ndarray, np.ndarray]:
    """Runs one step of the Viterbi algorithm.
    
    Args:
        mu_prev: probability distribution with shape (num_hidden),
            the previous mu
        emission_probs: the emission probability matrix (num_hidden,
            num_observed)
        transition_probs: the transition probability matrix, with
            shape (num_hidden, num_hidden)
        observed_state: the observed state at the current step
    
    Returns:
        - the mu for the next step
        - the maximizing previous state, before the current state,
          as an int array with shape (num_hidden)
    """
    
    pre_max = mu_prev * transition_probs.T
    max_prev_states = np.argmax(pre_max, axis=1)
    max_vals = pre_max[np.arange(len(max_prev_states)), max_prev_states]
    mu_new = max_vals * emission_probs[:, observed_state]
    
    return mu_new, max_prev_states


def viterbi(emission_probs: np.ndarray,
            transition_probs: np.ndarray) -> Tuple[List[int], float]:
    """Runs the Viterbi algorithm to get the most likely state sequence.
    
    Args:
        emission_probs: the emission probability matrix (num_hidden,
            steps)
        transition_probs: the transition probability matrix, with
            shape (steps-1, num_hidden, num_hidden)
    
    Returns:
        - the most likely series of states
        - the joint probability of that series of states and the observed
    """
    
    # Runs the forward pass, storing the most likely previous state.
    assert emission_probs.shape[-1] - 1 == transition_probs.shape[0]
    mu = emission_probs[:, 0]
    all_prev_states = []
    for t in range(1, emission_probs.shape[-1]):
        mu, prevs = step(mu, emission_probs, transition_probs[t-1], t)
        all_prev_states.append(prevs)
    
    # Traces backwards to get the maximum likelihood sequence.
    state = np.argmax(mu)
    sequence_prob = mu[state]
    state_sequence = [state]
    for prev_states in all_prev_states[::-1]:
        state = prev_states[state]
        state_sequence.append(state)
    
    return state_sequence[::-1], sequence_prob

In [None]:
group_dict

In [75]:
kw_i = 6 # 第几个 testing_keywords
steps = len(group_dict)
num_hidden = group_max_len
transition_probs = np.array(transfer_matrices, dtype=float) # np.random.rand(steps-1, num_hidden, num_hidden)
emission_probs = np.full((num_hidden, steps), -1000, dtype=float) # (num_hidden, steps)
probs = prob_matrix[:, kw_i]
for g_id in range(len(group_dict)):
    group = group_dict[g_id]
    for ind, report_id in enumerate(group):
        emission_probs[ind, g_id] = probs[report_id]

seq, prob = viterbi(emission_probs, transition_probs)
print(seq, prob)

[0, 0, 0, 0, 0, 0, 0, 9, 10, 4, 1, 3, 9, 3, 6, 5, 2, 8, 8, 7, 3, 12, 4, 6, 6, 0, 19, 0, 0, 0, 4, 0, 11, 7, 9, 11, 14, 6, 4, 12, 1, 4, 0, 0, 0, 0, 0, 0] 1.6875989651340056e-11


#### 展示推荐序列

In [76]:
print(testing_keywords[kw_i], ':')
for i_group, ind in enumerate(seq):
    report_id = group_dict[i_group][ind]
    selected_report = sorted_data[report_id]
    title = selected_report['title']
    date, time = selected_report['date'], selected_report['time']
    emi_prob = prob_matrix[report_id, kw_i]
    if emi_prob > 3:
        print(i_group, date+' '+time , title, emi_prob)

时空模拟与预测 :
7 2021-10-16 13:30-13:50 多粒度时空对象建模理论研究 3.290858507156372
8 2021-10-16 13:50-14:10 历史-现势一体的时空大数据管理 3.0428085327148438
9 2021-10-16 14:00-14:15 基于轨迹数据的城市内涝交通暴露性时空模拟 7.124446868896484
10 2021-10-16 14:15-14:30 基于眼动数据的人类空间导航行为模式识别与预测 3.5094242095947266
18 2021-10-16 16:30-16:50 时空信息预测：从线性假设到不变性假设 15.986931800842285
20 2021-10-16 16:50-17:05 两种时空模拟方法—贝叶斯最大熵和集成嵌套拉普拉斯—在江苏省土壤有机质制图中的应用 7.694623947143555
23 2021-10-16 17:35-17:50 城市洪涝灾害脆弱性的综合量化模型与时空模拟 10.590811729431152
26 2021-10-17 8:10-8:20 登革热时空预报模型 3.7622668743133545
30 2021-10-17 9:00-9:15 面向粤港澳大湾区的时空数据分析与综合模拟 3.81354022026062
31 2021-10-17 9:10-9:20 武汉新冠肺炎治愈患者时空行为分析 3.1768651008605957
38 2021-10-17 11:06-11:14 基于出租车轨迹数据的居民区房价分析与预测 4.899379253387451
39 2021-10-17 11:30-11:38 基于轨迹数据的城市交通信息转向级挖掘与预测 4.563422203063965


## 基于字符串的文本相似度计算

- The Dice similarity score is defined as twice the shared information (intersection) divided by sum of cardinalities. For two sets X and Y, the Dice similarity score is:
$$
dice(X, Y) = \frac{2 * |X \cap Y|}{|X| + |Y|}
$$

- The overlap coefficient is a similarity measure related to the Jaccard measure that measures the overlap between two sets, and is defined as the size of the intersection divided by the smaller of the size of the two sets. For two sets X and Y, the overlap coefficient is:
$$
overlap\_coefficient(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)}
$$

Note, however, that certain measures such as affine gap, Monge-Elkan, Needleman-Wunsch, Smith-Waterman and Soft TF/IDF do not have a get_sim_score method, because there is no straightforward way to normalize the raw scores of these measures into similarity scores in [0,1]

In [11]:
import py_stringmatching.similarity_measure as SM

In [31]:
text1 = '轨迹数据'
text2 = '出租车轨迹数据'
list1 = list(text1)
list2 = list(text2)

cos = SM.cosine.Cosine()
raw_s = cos.get_raw_score(list1, list2)
sim_s = cos.get_sim_score(list1, list2)
print('Cosine:', raw_s, sim_s)

bd = SM.bag_distance.BagDistance()
raw_s = bd.get_raw_score(text1, text2)
sim_s = bd.get_sim_score(text1, text2)
print('BagDistance:', raw_s, sim_s)

dice = SM.dice.Dice()
raw_s = dice.get_raw_score(list1, list2)
sim_s = dice.get_sim_score(list1, list2)
print('Dice:', raw_s, sim_s)

gj = SM.generalized_jaccard.GeneralizedJaccard()
raw_s = gj.get_raw_score(list1, list2)
sim_s = gj.get_sim_score(list1, list2)
print('GeneralizedJaccard:', raw_s, sim_s)

jac = SM.jaccard.Jaccard()
raw_s = jac.get_raw_score(list1, list2)
sim_s = jac.get_sim_score(list1, list2)
print('Jaccard:', raw_s, sim_s)

jaro = SM.jaro.Jaro()
raw_s = jaro.get_raw_score(text1, text2)
sim_s = jaro.get_sim_score(text1, text2)
print('Jaro:', raw_s, sim_s)

lev = SM.levenshtein.Levenshtein()
raw_s = lev.get_raw_score(text1, text2)
sim_s = lev.get_sim_score(text1, text2)
print('Levenshtein:', raw_s, sim_s)

me = SM.monge_elkan.MongeElkan()
raw_s = me.get_raw_score(list1, list2)
sim_s = raw_s # me.get_sim_score(list1, list2)
print('MongeElkan:', raw_s, sim_s)

nw = SM.needleman_wunsch.NeedlemanWunsch()
raw_s = nw.get_raw_score(text1, text2)
sim_s = raw_s # me.get_sim_score(list1, list2)
print('NeedlemanWunsch:', raw_s, sim_s)

oc = SM.overlap_coefficient.OverlapCoefficient()
raw_s = oc.get_raw_score(list1, list2)
sim_s = oc.get_sim_score(list1, list2)
print('OverlapCoefficient:', raw_s, sim_s)

s = SM.partial_ratio.PartialRatio()
raw_s = s.get_raw_score(text1, text2)
sim_s = s.get_sim_score(text1, text2)
print('PartialRatio:', raw_s, sim_s)

# s = SM.partial_token_sort.PartialTokenSort()
# raw_s = s.get_raw_score(text1, text2)
# sim_s = s.get_sim_score(text1, text2)
# print('PartialTokenSort:', raw_s, sim_s)

# s = SM.ratio.Ratio()
# raw_s = s.get_raw_score(text1, text2)
# sim_s = s.get_sim_score(text1, text2)
# print('PartialRatio:', raw_s, sim_s)

sw = SM.smith_waterman.SmithWaterman()
raw_s = sw.get_raw_score(text1, text2)
sim_s = raw_s # sw.get_sim_score(text1, text2)
print('SmithWaterman:', raw_s, sim_s)

soft_tfidf = SM.soft_tfidf.SoftTfIdf()
raw_s = soft_tfidf.get_raw_score(list1, list2)
sim_s = raw_s
print('SoftTfIdf:', raw_s, sim_s)

tfidf = SM.tfidf.TfIdf()
raw_s = tfidf.get_raw_score(list1, list2)
sim_s = tfidf.get_raw_score(list1, list2)
print('TfIdf:', raw_s, sim_s)

# s = SM.token_sort.TokenSort()
# raw_s = s.get_raw_score(text1, text2)
# sim_s = s.get_sim_score(text1, text2)
# print('TokenSort:', raw_s, sim_s)

tvi = SM.tversky_index.TverskyIndex()
raw_s = tvi.get_raw_score(list1, list2)
sim_s = tvi.get_raw_score(list1, list2)
print('TverskyIndex:', raw_s, sim_s)

Cosine: 0.7559289460184544 0.7559289460184544
BagDistance: 3 0.5714285714285714
Dice: 0.7272727272727273 0.7272727272727273
GeneralizedJaccard: 0.5714285714285714 0.5714285714285714
Jaccard: 0.5714285714285714 0.5714285714285714
Jaro: 0 0
Levenshtein: 3 0.5714285714285714
MongeElkan: 1.0 1.0
NeedlemanWunsch: 1.0 1.0
OverlapCoefficient: 1.0 1.0
PartialRatio: 100 1.0
SmithWaterman: 4.0 4.0
SoftTfIdf: 0.5 0.5
TfIdf: 0.0 0.0
TverskyIndex: 0.7272727272727273 0.7272727272727273


In [12]:
str_smi_model = SM.overlap_coefficient.OverlapCoefficient()
title_list = list(title_set)
for i, keyword in enumerate(testing_keywords):
    pairs = []
    for j, title in enumerate(title_list):
        score = str_smi_model.get_sim_score(list(keyword), list(title))
        pairs.append({'index': j, 'score': score})
        #
    pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
    for pair in pairs[:10]:
        print('{} \t {} \t Score: {:.4f}'.format(keyword, title_list[pair['index']], pair['score']))

时空大数据 	 基于手机大数据的城市居民时空停留模式研究 	 Score: 1.0000
时空大数据 	 基于时空大数据的粤港澳大湾区城市群综合决策和协同服务研究框架 	 Score: 1.0000
时空大数据 	 面向粤港澳大湾区的时空数据分析与综合模拟 	 Score: 1.0000
时空大数据 	 多源时空大数据驱动的COVID-19风险建模与制图 	 Score: 1.0000
时空大数据 	 城市群时空多尺度大数据的认知计算 	 Score: 1.0000
时空大数据 	 大数据时空化的基本问题与挑战 	 Score: 1.0000
时空大数据 	 历史-现势一体的时空大数据管理 	 Score: 1.0000
时空大数据 	 多源时空数据支撑下的城市人-车协同疏散 	 Score: 0.8000
时空大数据 	 基于大数据的中国创业情感的空间差异及其对创业率的影响 	 Score: 0.8000
时空大数据 	 多源、多尺度极地海洋环境数据动态粒子模型构建及时空过程分析方法研究 	 Score: 0.8000
人工智能 	 人工智能驱动的领域知识建模与深度共享服务 	 Score: 1.0000
人工智能 	 一种非刚性网格标识的复杂场景多无人机智能协同路径规划方法 	 Score: 0.7500
人工智能 	 面向AI的多源时空遥感影像智能处理及其应用 	 Score: 0.5000
人工智能 	 个体轨迹数据支持下的基于空间显式智能体模型模拟COVID-19在城市内部的传播 	 Score: 0.5000
人工智能 	 构建“智能”的时空实体——行为与认知建模 	 Score: 0.5000
人工智能 	 多光谱LiDAR点云智能分类 	 Score: 0.5000
人工智能 	 大数据-小样本背景下的遥感图像智能识别 	 Score: 0.5000
人工智能 	 不同可持续发展目标情景下地理智能建模 	 Score: 0.5000
人工智能 	 基于多时相卫星影像的地块与作物智能化识别 	 Score: 0.5000
人工智能 	 室内点云智能处理 	 Score: 0.5000
粤港澳大湾区 	 面向粤港澳大湾区的土地利用-人口-GDP多要素协同模拟模型 	 Score: 1.0000
粤港澳大湾区 	 基于GEE与长时序遥感影像的粤港澳大湾区红

## 基于词向量的文本相似度计算

In [10]:
from text2vec import Similarity
title_list = list(title_set)
# embedding_type: ['w2v', 'sbert']
sim_model = Similarity(similarity_type='cosine', embedding_type='w2v')

for i, keyword in enumerate(testing_keywords):
    pairs = []
    for j, title in enumerate(title_list):
        score = sim_model.get_score(keyword, title)
        pairs.append({'index': j, 'score': score})
        #
    pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
    for pair in pairs[:10]:
        print('{} \t {} \t Score: {:.4f}'.format(keyword, title_list[pair['index']], pair['score']))

2022-02-08 12:28:40.703 | DEBUG    | text2vec.word2vec:__init__:81 - Load w2v from w2v-light-tencent-chinese, spend 1.20 sec
2022-02-08 12:28:40.705 | DEBUG    | text2vec.word2vec:__init__:85 - Word count: 143613, emb size: 200
2022-02-08 12:28:40.706 | DEBUG    | text2vec.word2vec:__init__:86 - Set stopwords: ['--', '?', 'ZT', 'ZZ', "a's", 'able', 'about', 'above', 'according', 'accordingly'], count: 1396


时空大数据 	 大数据空间回归方法：从GWR到GNNWR 	 Score: 0.8894
时空大数据 	 大数据时空化的基本问题与挑战 	 Score: 0.8713
时空大数据 	 历史-现势一体的时空大数据管理 	 Score: 0.8578
时空大数据 	 基于最邻近时空距离的LUCC时空模式分析 	 Score: 0.8406
时空大数据 	 全空间技术在数据资产管理中的应用 	 Score: 0.8403
时空大数据 	 面向AI的多源时空遥感影像智能处理及其应用 	 Score: 0.8381
时空大数据 	 城市群时空多尺度大数据的认知计算 	 Score: 0.8310
时空大数据 	 遥感数据空时谱信息融合重建及应用 	 Score: 0.8259
时空大数据 	 多源时空数据支撑下的城市人-车协同疏散 	 Score: 0.8256
时空大数据 	 城市群时空数据感知、融合与质量分析 	 Score: 0.8231
人工智能 	 人工智能驱动的领域知识建模与深度共享服务 	 Score: 0.8557
人工智能 	 人群活动与城市空间交互模式的研究 	 Score: 0.8209
人工智能 	 一种非刚性网格标识的复杂场景多无人机智能协同路径规划方法 	 Score: 0.8192
人工智能 	 基于OCEAN大五人格的私家车轨迹画像研究 	 Score: 0.8161
人工智能 	 智慧城市背景下的地理信息知识组织与重用 	 Score: 0.8121
人工智能 	 从学术地图发布平台到古籍智慧大数据平台 	 Score: 0.8119
人工智能 	 新冠疫情不同阶段下北京市人群职住活动变化分析 	 Score: 0.8068
人工智能 	 不同可持续发展目标情景下地理智能建模 	 Score: 0.8068
人工智能 	 面向可持续发展目标的人类数字足迹挖掘研究 	 Score: 0.8067
人工智能 	 青藏高原城市生态系统中人类活动影响异质性测度方法研究 	 Score: 0.8054
粤港澳大湾区 	 新时期粤港澳大湾区城市群协同发展的内涵与综合测度 	 Score: 0.8269
粤港澳大湾区 	 粤港澳大湾区城市群时空演化分异特征分析 	 Score: 0.8187
粤港澳大湾区 	 “灵魂”引领下粤港澳大湾区智慧城市群的构建 	