In [3]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

## 报告数据标准化

In [14]:
# Load the raw conference programme. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open.
with open('data/conference/conference.json', mode='r', encoding='utf-8') as conference_file:
    data = json.load(conference_file)

# Keep only the entries that have both a non-empty title and a non-empty
# institution, and collect every hall that hosts at least one such entry.
reports = []
halls = set()
for report in data:
    title = report['title']
    institutions = report['institutions']
    if title and institutions:
        reports.append(report)
        halls.add(report['hall'])

# Ids are assigned in sorted-name order so the mapping is deterministic
# (iteration order of a set of strings is not stable between runs).
hall_2_id = {h: i for i, h in enumerate(sorted(halls))}
id_2_hall = {i: h for h, i in hall_2_id.items()}
print(len(reports), reports[0], halls)
print(hall_2_id)

481 {'date': '2021-10-16 星期六', 'hall': '黄龙水晶宫', 'host': '陆锋', 'topic': '09:10-10:40 | 特邀报告', 'time': '09:10-09:40', 'title': '全球位置服务网关键技术与研究进展', 'reporters': '龚健雅 院士', 'institutions': '武汉大学'} {'分会场六', '分会场十一', '分会场五', '分会场十二', '分会场十三', '分会场三', '分会场十', '分会场二', '分会场七', '分会场四', '分会场八', '分会场一', '分会场十四', '黄龙水晶宫', '分会场九'}
{'分会场一': 0, '分会场七': 1, '分会场三': 2, '分会场九': 3, '分会场二': 4, '分会场五': 5, '分会场八': 6, '分会场六': 7, '分会场十': 8, '分会场十一': 9, '分会场十三': 10, '分会场十二': 11, '分会场十四': 12, '分会场四': 13, '黄龙水晶宫': 14}


### 报告时间与会场标准化

In [47]:
from datetime import datetime
time_format = '%Y-%m-%d %H:%M'
# Reference instant: start/end times are stored as whole seconds elapsed
# since 2021-10-15 00:00.
base_time = datetime(2021, 10, 15)
new_data = []
start_time_set = set()
for report in reports:
    # 'date' is the calendar date followed by a space-separated suffix
    # (a weekday name in the sample record); only the leading part is parsed.
    date = report['date'].split(' ')[0]
    # 'time' is a 'HH:MM-HH:MM' span.
    start, end = report['time'].split('-')
    start_time = date + ' ' + start
    end_time = date + ' ' + end
    s_time = datetime.strptime(start_time, time_format) - base_time
    e_time = datetime.strptime(end_time, time_format) - base_time
    # Resolve the hall before touching the record so a lookup failure does not
    # leave a half-updated report behind.
    hall_id = hall_2_id[report['hall']]
    # The report dicts are updated in place, so `reports` and `new_data`
    # reference the same enriched objects.
    report['start_time'] = int(s_time.total_seconds())
    report['end_time'] = int(e_time.total_seconds())
    report['hall_id'] = hall_id
    start_time_set.add(start_time)
    new_data.append(report)

# Write through a context manager so the file is flushed and closed
# deterministically before any later cell reads it back.
with open('data/conference/standard_conference_data.json', 'w', encoding='utf-8') as standard_file:
    json.dump(new_data, standard_file, ensure_ascii=False)
print(new_data[0])

{'date': '2021-10-16 星期六', 'hall': '黄龙水晶宫', 'host': '陆锋', 'topic': '09:10-10:40 | 特邀报告', 'time': '09:10-09:40', 'title': '全球位置服务网关键技术与研究进展', 'reporters': '龚健雅 院士', 'institutions': '武汉大学', 'start_time': 119400, 'end_time': 121200, 'hall_id': 14}


#### 按时间段分组

In [66]:
# Reports whose start time lies within `min_delta` (same unit as 'start_time')
# of the first report of the current group are placed in that group.
min_delta = 600
time_spans = []
sorted_data = sorted(new_data, key=lambda x: x['start_time'])  # sort by start time

group = 0
group_start = sorted_data[0]['start_time']  # start time of the current group's first report
# group index -> list of positions in `sorted_data`
group_dict = {group: []}
for i, report in enumerate(sorted_data):
    s_time = int(report['start_time'])
    if s_time - group_start > min_delta:
        # Too far from the group's anchor: open a new group anchored here.
        group += 1
        group_start = s_time
        group_dict[group] = []
    group_dict[group].append(i)
    report['time_group'] = group
# Index of the last group (the number of groups is group + 1).
print(group)

# Write through a context manager so the file is flushed and closed
# deterministically.
with open(f'data/conference/grouped_conference_data_{min_delta}.json', 'w', encoding='utf-8') as grouped_file:
    json.dump(sorted_data, grouped_file, ensure_ascii=False)
    

47


最长组

In [65]:
# Number of reports in the largest time group.
group_max_len = max(len(members) for members in group_dict.values())
print(group_max_len)

23


## 概率转移矩阵中参数的确定

### 空间距离因素

In [68]:
# Halls are partitioned into three areas; distances are defined per area pair.
area1 = {'分会场一', '分会场二', '分会场三', '分会场四', '黄龙水晶宫'}
area2 = {'分会场五', '分会场六', '分会场七', '分会场八', '分会场九'}
area3 = {'分会场十', '分会场十一', '分会场十二', '分会场十三', '分会场十四'}
# Key is '<from area><to area>'; value is the distance between two distinct
# halls located in those areas.
distance_dict = {
    '11': 2, '22': 3, '33': 3,
    '12': 10, '21': 10, '13': 15, '31':15,
    '23': 20, '32': 20
}

# hall name -> area number (1, 2 or 3)
hall_2_area = {}
for area_no, area_halls in enumerate((area1, area2, area3), start=1):
    for hall_name in area_halls:
        hall_2_area[hall_name] = area_no

n_halls = len(hall_2_id)
distance_matrix = np.zeros((n_halls, n_halls), dtype=float)
for row in range(n_halls):
    row_area = hall_2_area[id_2_hall[row]]
    # A hall's distance to itself is 1.
    distance_matrix[row, row] = 1
    # Fill the lower triangle and mirror each value into the upper triangle.
    for col in range(row):
        col_area = hall_2_area[id_2_hall[col]]
        pair_distance = distance_dict[f'{row_area}{col_area}']
        distance_matrix[row, col] = pair_distance
        distance_matrix[col, row] = pair_distance
print(distance_matrix)

[[ 1. 10.  2. 10.  2. 10. 10. 10. 15. 15. 15. 15. 15.  2.  2.]
 [10.  1. 10.  3. 10.  3.  3.  3. 20. 20. 20. 20. 20. 10. 10.]
 [ 2. 10.  1. 10.  2. 10. 10. 10. 15. 15. 15. 15. 15.  2.  2.]
 [10.  3. 10.  1. 10.  3.  3.  3. 20. 20. 20. 20. 20. 10. 10.]
 [ 2. 10.  2. 10.  1. 10. 10. 10. 15. 15. 15. 15. 15.  2.  2.]
 [10.  3. 10.  3. 10.  1.  3.  3. 20. 20. 20. 20. 20. 10. 10.]
 [10.  3. 10.  3. 10.  3.  1.  3. 20. 20. 20. 20. 20. 10. 10.]
 [10.  3. 10.  3. 10.  3.  3.  1. 20. 20. 20. 20. 20. 10. 10.]
 [15. 20. 15. 20. 15. 20. 20. 20.  1.  3.  3.  3.  3. 15. 15.]
 [15. 20. 15. 20. 15. 20. 20. 20.  3.  1.  3.  3.  3. 15. 15.]
 [15. 20. 15. 20. 15. 20. 20. 20.  3.  3.  1.  3.  3. 15. 15.]
 [15. 20. 15. 20. 15. 20. 20. 20.  3.  3.  3.  1.  3. 15. 15.]
 [15. 20. 15. 20. 15. 20. 20. 20.  3.  3.  3.  3.  1. 15. 15.]
 [ 2. 10.  2. 10.  2. 10. 10. 10. 15. 15. 15. 15. 15.  1.  2.]
 [ 2. 10.  2. 10.  2. 10. 10. 10. 15. 15. 15. 15. 15.  2.  1.]]


### 时间间隔因素

In [None]:
# Build one transfer matrix per pair of consecutive time groups.
# Entry [i, j] is the reciprocal of the distance between the hall of the i-th
# report in the earlier group and the hall of the j-th report in the later
# group. Every matrix is group_max_len x group_max_len; rows/columns beyond
# the actual group sizes stay zero.
transfer_matrices = []
for group_idx in range(len(group_dict) - 1):
    transfer_mat = np.zeros((group_max_len, group_max_len), dtype=float)
    from_group = group_dict[group_idx]
    to_group = group_dict[group_idx + 1]
    # Distinct names for the group index and the row index: the original
    # reused `i` for both, which only worked because the group lookups above
    # happen before the inner loop rebinds it.
    for i, from_r in enumerate(from_group):
        from_hall_id = sorted_data[from_r]['hall_id']
        for j, to_r in enumerate(to_group):
            to_hall_id = sorted_data[to_r]['hall_id']
            distance = distance_matrix[from_hall_id, to_hall_id]
            transfer_mat[i, j] = 1 / distance  # reciprocal of the distance
    transfer_matrices.append(transfer_mat)
# Show one sample matrix (transition from group 9 to group 10).
print(transfer_matrices[9])

### 条件随机场

In [None]:
from typing import Tuple, List
def step(mu_prev: np.ndarray,
         emission_probs: np.ndarray,
         transition_probs: np.ndarray,
         observed_state: int) -> Tuple[np.ndarray, np.ndarray]:
    """Advance the Viterbi recursion by a single observation.

    Args:
        mu_prev: best-path scores from the previous step, shape
            (num_hidden)
        emission_probs: emission probability matrix, shape (num_hidden,
            num_observed)
        transition_probs: transition probability matrix, shape
            (num_hidden, num_hidden)
        observed_state: index of the state observed at the current step

    Returns:
        - the updated best-path scores, shape (num_hidden)
        - for every current state, the index of the previous state that
          maximizes the score, as an int array with shape (num_hidden)
    """
    # candidate_scores[cur, prev] = mu_prev[prev] * transition_probs[prev, cur]
    candidate_scores = transition_probs.T * mu_prev
    best_prev_states = np.argmax(candidate_scores, axis=1)
    best_scores = np.max(candidate_scores, axis=1)

    # Weight each best incoming score by the emission of the observation.
    emitted = emission_probs[:, observed_state]
    return best_scores * emitted, best_prev_states


def viterbi(emission_probs: np.ndarray,
            transition_probs: np.ndarray,
            start_probs: np.ndarray,
            observed_states: List[int]) -> Tuple[List[int], float]:
    """Runs the Viterbi algorithm to get the most likely state sequence.

    Probabilities are multiplied directly (no log-space), so the returned
    joint probability shrinks with every additional observation.

    Args:
        emission_probs: the emission probability matrix (num_hidden,
            num_observed)
        transition_probs: the transition probability matrix, with
            shape (num_hidden, num_hidden)
        start_probs: the initial probabilities for each state, with shape
            (num_hidden)
        observed_states: the observed states at each step; must contain
            at least one observation

    Returns:
        - the most likely series of states, as plain Python ints
        - the joint probability of that series of states and the
          observations, as a plain Python float

    Raises:
        IndexError: if `observed_states` is empty.
    """

    # Runs the forward pass, storing the most likely previous state.
    mu = start_probs * emission_probs[:, observed_states[0]]
    all_prev_states = []
    for observed_state in observed_states[1:]:
        mu, prevs = step(mu, emission_probs, transition_probs, observed_state)
        all_prev_states.append(prevs)

    # Traces backwards to get the maximum likelihood sequence.
    # NumPy scalars are converted to native int/float so the result matches
    # the annotated return type and can be passed to json.dump unchanged.
    state = int(np.argmax(mu))
    sequence_prob = float(mu[state])
    state_sequence = [state]
    for prev_states in all_prev_states[::-1]:
        state = int(prev_states[state])
        state_sequence.append(state)

    return state_sequence[::-1], sequence_prob