In [1]:
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt
from utils import *
import random

In [2]:
from sklearn.decomposition import PCA
#TSNE
from sklearn.manifold import TSNE
# import umap.umap_ as umap

In [3]:
import matplotlib
matplotlib.rcParams['font.sans-serif']=['SimSun','KaiTi','SimHei','FangSong']   # fonts used to display Chinese text
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus']=False 

from matplotlib.lines import Line2D

In [4]:
import re

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
def get_topic_label(topic_code, topic_lv=1):
    """Translate a topic code into its Chinese display label.

    Parameters
    ----------
    topic_code : str
        Topic code such as ``'B1'`` or ``'NULL'``.
    topic_lv : int, default 1
        ``1`` returns the level-I category: the whole code is tried first,
        then its first character. ``2`` returns the level-II category, looked
        up by the whole code.

    Returns
    -------
    str
        The matching label, or ``'空'`` when the code is unknown, is not a
        string, or ``topic_lv`` is neither 1 nor 2.
    """
    topic_I_mapper = {'A':'援助信息','B':'救援需求信息','C':'事件基本信息','D':'建筑设施及公共事业','E':'应急行动进展','F':'预警与建议','G':'公众舆情','H':'其他','NULL':'NULL'}
    topic_II_mapper = {'B1':'一般救援需求','B2':'生活物资需求','B3':'寻人需求','B4':'紧急转移需求',}

    # Missing values (None / NaN) cannot be sliced or matched; treat them as unknown.
    if not isinstance(topic_code, str):
        return '空'

    if topic_lv == 1:
        # Try the full code first: a first-character-only lookup can never
        # reach the multi-character 'NULL' key.
        if topic_code in topic_I_mapper:
            return topic_I_mapper[topic_code]
        return topic_I_mapper.get(topic_code[:1], '空')
    if topic_lv == 2:
        return topic_II_mapper.get(topic_code, '空')
    return '空'

def get_topic_code(topics_dict, topics_mapper=None):
    """Return the mapped code of the highest-scoring topic in ``topics_dict``.

    Parameters
    ----------
    topics_dict : dict or None
        Maps a topic key to a score. Empty or non-dict values (``None``, NaN)
        fall back to the key ``'X'``.
    topics_mapper : dict, optional
        Maps a topic key to a sequence whose second element is the code to
        return. When omitted, the mapping is read once from
        ``./data/results/topics_mapping0312.json`` and reused on later calls,
        so applying this function row by row does not re-read the file.

    Returns
    -------
    The second element of ``topics_mapper[<best key>]``.

    Raises
    ------
    KeyError
        If the selected key (or ``'X'``) is missing from ``topics_mapper``.
    """
    if topics_mapper is None:
        # Cache on the function object: the relative path is resolved once,
        # on the first call that needs the file.
        topics_mapper = getattr(get_topic_code, '_topics_mapper_cache', None)
        if topics_mapper is None:
            topics_mapper = open_json('./data/results/topics_mapping0312.json')
            get_topic_code._topics_mapper_cache = topics_mapper

    topic_code = 'X'
    # NaN cells are truthy but have no items; only a real, non-empty dict counts.
    if isinstance(topics_dict, dict) and topics_dict:
        # max() keeps the first key among ties, like the original stable
        # descending sort did.
        topic_code = max(topics_dict, key=topics_dict.get)
    return topics_mapper[topic_code][1]

def mapping_phases(date,phases_mapper):
    """Assign a date to one of the phases '0'..'3' by comparing end dates.

    Only the first ten characters of ``str(date)`` are used, and they are
    compared as strings with the second element of each phase entry. Phase
    start dates are never consulted, so anything up to the end of phase '0'
    is reported as '0'. Values after the end of phase '3' yield '-1'.
    """
    day = str(date)[:10]
    for phase_id in ('0', '1', '2', '3'):
        phase_end = phases_mapper[phase_id][1]
        if day <= phase_end:
            return phase_id
    return '-1'
    
def get_sentiment_label(sentiments, label_mapper):
    """Return the label of the highest-scoring sentiment class.

    Parameters
    ----------
    sentiments : sequence of numbers, or a missing value
        Per-class scores. ``None``, scalars (including NaN) and empty
        sequences are treated as missing.
    label_mapper : mapping
        Maps the integer position of the best score to its label.

    Returns
    -------
    The mapped label, or ``'NULL'`` when no scores are available.
    """
    if sentiments is None:
        return 'NULL'
    scores = np.asarray(sentiments)
    # A 0-d array means a scalar such as NaN was passed. np.argmax would
    # return 0 for it and silently label the row as class 0.
    if scores.ndim == 0 or scores.size == 0:
        return 'NULL'
    return label_mapper[int(np.argmax(scores))]

def refine_topic(content):
    """Derive a demand category from keywords found in a post's text.

    The rules are checked in order and the first match wins. Returns ``None``
    when ``content`` is not a string or no keyword is present.
    """
    if not isinstance(content, str):
        return None

    # (regex pattern, demand label), listed by priority.
    keyword_rules = (
        ("求救|被困", "紧急转移需求"),
        ("急需", "生活物资需求"),
        ("求助", "一般救援需求"),
        ("搜救|搜寻|寻人|失联", "寻人需求"),
    )
    for pattern, demand in keyword_rules:
        if re.search(pattern, content):
            return demand
    return None

In [7]:
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
# The relative './data/...' paths used below presumably resolve against this workspace.
set_workspace('/Users/roccc/Desktop/Projects/HenanFlood')

workspace has been set at /Users/roccc/Desktop/Projects/HenanFlood.


## load and prepare data 

### load embeddings

In [8]:
# Load the embeddings file. `open_json` is not defined in this notebook
# (presumably provided by `from utils import *`).
emb = open_json('./data/emb_gatne/filtered_cleaned_emb_Kqv0ZBxGy_2.json')

In [9]:
# Drop every entry whose value is the string '0'.
emb = {node_id: vector for node_id, vector in emb.items() if vector != '0'}

In [10]:
# One row per key of `emb`; the values land in a single column labelled 0.
emb_df = pd.Series(emb).to_frame(name=0)

In [11]:
# Remove rows that contain a missing value.
emb_df = emb_df.dropna()

### labels

In [12]:
# Load the per-node attributes (sentiment, topic, label_type are read from it below).
nodes_labels = open_json('./data/results/nodes_with_attributes.json')

In [13]:
# The loaded value is decoded a second time (the file apparently stores a
# JSON-encoded string). The type guard keeps this cell idempotent: re-running
# it once `nodes_labels` is already decoded is a no-op instead of a TypeError.
if isinstance(nodes_labels, (str, bytes, bytearray)):
    nodes_labels = json.loads(nodes_labels)

In [14]:
# orient='index': each top-level key of `nodes_labels` becomes a row label.
nodes_labels_df = pd.DataFrame.from_dict(nodes_labels,orient='index')

In [15]:
contents_list = open_json('./data/cleaned_data/weibos_content_1124.json')
# weibo_id -> content lookup; a later record with the same id replaces an earlier one.
contents = {post.get('weibo_id'): post.get('content') for post in contents_list}

In [16]:
# Phase id -> [start date, end date] as 'YYYY-MM-DD' strings.
# `mapping_phases` only reads the end date (index 1) of each entry.
phases = {
    '0': ['2021-07-10', '2021-07-19'],
    '1': ['2021-07-20', '2021-07-24'],
    '2': ['2021-07-25', '2021-07-30'],
    '3': ['2021-07-31', '2021-08-31']
}

### merge into samples

In [17]:
# Attach the post text by looking each row label up in the `contents` dict.
emb_df['content'] = emb_df.index.map(contents)

In [18]:
# Look up each row label in the 'sentiment' column of `nodes_labels_df`.
emb_df['sentiment_raw'] = emb_df.index.map(nodes_labels_df['sentiment'])

In [19]:
# Look up each row label in the 'topic' column of `nodes_labels_df`.
emb_df['topic_raw'] = emb_df.index.map(nodes_labels_df['topic'])

In [20]:
# Look up each row label in the 'label_type' column of `nodes_labels_df`.
emb_df['label_type'] = emb_df.index.map(nodes_labels_df['label_type'])

#### merge date and phase

In [21]:
# `get_date_info` is not defined in this notebook (presumably from `utils`);
# its result is used below as a row-label -> date lookup.
time_info = get_date_info()

In [22]:
# Attach a date to every row by looking its label up in `time_info`.
emb_df['date'] = emb_df.index.map(time_info)

In [23]:
# Bucket every date into a phase id; `phases` is forwarded as a keyword argument.
emb_df['phase'] = emb_df['date'].apply(mapping_phases, phases_mapper=phases)

### Extract topic and sentiment labels

#### sentiments

In [24]:
# Sentiment label -> class index.
sentiment_label_mapping = {'愤怒':0,'积极':1,'中性':2,'惊奇':3,'恐惧':4,'悲伤':5}
# Inverse lookup: class index -> sentiment label.
label_mapping_r = {index: label for label, index in sentiment_label_mapping.items()}

In [25]:
# Convert each raw sentiment value to a label via the index -> label mapping.
emb_df['sentiment'] = emb_df['sentiment_raw'].apply(get_sentiment_label, label_mapper=label_mapping_r)

#### topics

In [26]:
# Reduce each raw topic value to a single topic code.
emb_df['topic_code'] = emb_df['topic_raw'].apply(get_topic_code)

In [27]:
# emb_df['topic_code'].value_counts()
# emb_df.drop(['topic_I','topic_II'],axis=1, inplace=True)

In [28]:
# Level-I label (looked up by the first character of the code) and
# level-II label (looked up by the full code) for every row.
emb_df['topic_I'] = emb_df.topic_code.apply(get_topic_label, topic_lv=1)
emb_df['topic_II'] = emb_df.topic_code.apply(get_topic_label, topic_lv=2)

In [29]:
emb_df.topic_II.value_counts()

空         884
一般救援需求     45
紧急转移需求     39
寻人需求       17
生活物资需求     12
Name: topic_II, dtype: int64

In [30]:
# Prefer the keyword-based category from `refine_topic`; keep the existing
# label where it finds none. Each row is classified once, and the columns are
# addressed by name rather than by position.
emb_df['topic_II'] = emb_df['content'].map(refine_topic).fillna(emb_df['topic_II'])

In [31]:
emb_df.topic_II.value_counts()

空         557
紧急转移需求    261
一般救援需求     90
寻人需求       61
生活物资需求     28
Name: topic_II, dtype: int64

In [32]:
# Keep only the rows whose level-II label contains "需求",
# i.e. everything except the '空' rows.
emb_df_demand_refine = emb_df[emb_df.topic_II.str.contains("需求")]

## 降维并可视化emb

定义降维函数，注意调参
1. TSNE
    - 先用 PCA 降维，否则 t-SNE 会很慢
    - n_components: 2 or 3
    - perplexity: balances local versus global structure
    - n_iter: make sure it is large enough for the optimisation to converge
2. UMAP
    - n_components: 2 or 3
    - n_neighbors: balances local versus global structure
    - min_dist: in [0, 1]; minimum distance between points in the low-dimensional embedding
    

In [33]:
def embeddings_by_dimension(emb_df,dimension):
    """Stack the vectors stored in one DataFrame column into a 2-D array.

    Parameters
    ----------
    emb_df : pandas.DataFrame
        Frame whose column ``dimension`` holds one vector per row.
    dimension : hashable
        Label of the column to read.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(emb_df), length)``, where ``length`` is the
        length of the first row's vector. Cells that are missing, unsized or
        of a different length are replaced by a zero vector.

    Notes
    -----
    ``emb_df`` is left untouched: the zero-filling happens on a local copy of
    the values. An empty frame raises ``IndexError`` because there is no
    first vector to size the output from.
    """
    column = emb_df[dimension]
    # .iloc is explicit positional access; a plain `column[0]` on a
    # label-indexed Series relies on a deprecated positional fallback.
    length = len(column.iloc[0])

    def _fit(vector):
        # Return `vector` if it has the expected length, else a zero vector.
        try:
            usable = len(vector) == length
        except TypeError:  # None, NaN and other unsized cells
            usable = False
        return vector if usable else [0] * length

    rows = [_fit(vector) for vector in column]
    return np.array(rows).reshape((len(rows), length))

In [34]:
def tsne(x,init_dim=10,random_state=42,n_components=2,n_iter=3000,perplexity=15):
    """Reduce ``x`` with PCA, then embed the result with t-SNE.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
    init_dim : int, default 10
        Number of PCA components computed before t-SNE. An integer value is
        capped at ``min(n_samples, n_features)``, the most PCA can return.
    random_state : int, default 42
        Seed used for both PCA and t-SNE.
    n_components : int, default 2
        Dimension of the t-SNE output.
    n_iter : int, default 3000
        Number of t-SNE optimisation iterations.
    perplexity : float, default 15
        t-SNE perplexity.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_components)

    Prints the elapsed wall-clock time.
    """
    import inspect

    start = time.time()

    if isinstance(init_dim, int):
        init_dim = min(init_dim, *np.shape(x))
    pca_result = PCA(n_components=init_dim, random_state=random_state).fit_transform(x)

    # scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and 1.7 removed the
    # old name; pick whichever keyword the installed version accepts.
    tsne_params = inspect.signature(TSNE.__init__).parameters
    iter_kwarg = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'
    reduced = TSNE(
        random_state=random_state,
        n_components=n_components,
        verbose=0,
        perplexity=perplexity,
        **{iter_kwarg: n_iter},
    ).fit_transform(pca_result)

    print('Duration: {} seconds'.format(time.time() - start))
    return reduced

In [35]:
def umap_(x,random_state=42,n_components=2,n_neighbors=100,min_dist=0.01):
    """Embed ``x`` with UMAP and return the low-dimensional coordinates.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
    random_state : int, default 42
    n_components : int, default 2
    n_neighbors : int, default 100
    min_dist : float, default 0.01

    Returns
    -------
    The result of ``UMAP(...).fit_transform(x)``.

    Raises
    ------
    ImportError
        If the ``umap`` package is not installed.

    Prints the elapsed wall-clock time.
    """
    # The notebook-level `import umap.umap_ as umap` is commented out, so the
    # name `umap` would be undefined here; import it lazily instead.
    import umap.umap_ as umap

    start = time.time()
    reducer = umap.UMAP(random_state=random_state,n_components=n_components, n_neighbors=n_neighbors,min_dist=min_dist)
    embedding = reducer.fit_transform(x)
    print('Duration: {} seconds'.format(time.time() - start))
    return embedding

In [1]:
def visualize_demand(emb_df,x_type,method='tsne',n_neighbors=20,date='',random_state=42):
    """Project demand-post embeddings to 2-D, save the coordinates and plot them.

    Parameters
    ----------
    emb_df : pandas.DataFrame
        Must contain the embedding column ``x_type`` plus the ``topic_II`` and
        ``sentiment`` columns.
    x_type : hashable
        Label of the column of ``emb_df`` that holds the embedding vectors.
    method : {'tsne', 'umap'}, default 'tsne'
        Dimensionality-reduction backend.
    n_neighbors : int, default 20
        Forwarded as ``perplexity`` to ``tsne`` or as ``n_neighbors`` to ``umap_``.
    date : str, default ''
        Free-form tag, used only in the log line and the figure file name.
    random_state : int, default 42
        Seed forwarded to the reducer; also part of the figure file name.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'tsne'`` nor ``'umap'``.

    Notes
    -----
    The CSV file name contains neither ``date`` nor ``random_state``, so calls
    that share ``x_type``, ``method`` and ``n_neighbors`` overwrite one file.
    """
    import os

    # data preparation
    print(f"Job: {x_type}_{method}_{n_neighbors}_{date}...")
    x = embeddings_by_dimension(emb_df,x_type)

    # reduce dimension
    if method == 'tsne':
        x_ = tsne(x,perplexity=n_neighbors,random_state=random_state)
    elif method == 'umap':
        x_ = umap_(x,n_neighbors=n_neighbors,random_state=random_state)
    else:
        raise ValueError(f"method must be 'tsne' or 'umap', got {method!r}")

    csv_path = f'./data/results/dim_reduction/emb_2_{x_type}_{method}_{n_neighbors}.csv'
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    emb_2 = pd.concat([pd.DataFrame(x_,index=emb_df.index),emb_df[['topic_II','sentiment']]],axis=1)
    emb_2.to_csv(csv_path, index=True, header=True)

    # visualize: one marker shape and colour per demand category
    marker_map = {'一般救援需求':'o','生活物资需求':'s','寻人需求':'p','紧急转移需求':'v'}
    label_cmap = {'一般救援需求':'orange','生活物资需求':'green','寻人需求':'purple','紧急转移需求':'red'}
    label = 'topic_II'

    fig, ax = plt.subplots()
    for demand_type, marker in marker_map.items():
        # Plain boolean array, so the numpy indexing below never depends on the index labels.
        idx = (emb_df['topic_II'] == demand_type).to_numpy()
        ax.scatter(x_[idx,0], x_[idx,1], marker=marker, c=label_cmap[demand_type],
                   s=40, linewidths=0.1, edgecolors='black',alpha=0.5)

    handles = [Line2D([0], [0], marker='o', color='white', markerfacecolor=v, label=k, markersize=4) for k, v in label_cmap.items()]
    ax.legend(title='', handles=handles, bbox_to_anchor=(1, 1), loc='upper left',prop={'size': 6})

    fig_path = f'./data/results/dim_reduction_demand/demand_{x_type}_{label}_{len(x)}_{method}_{n_neighbors}_{date}_{random_state}.png'
    os.makedirs(os.path.dirname(fig_path), exist_ok=True)
    # Save before showing, so the file never depends on what the backend does
    # to the figure on show(). bbox_inches='tight' keeps the legend, which sits
    # outside the axes, inside the saved image.
    fig.savefig(fig_path,dpi=600,bbox_inches='tight')
    plt.show()
    plt.close(fig)

### 需求的网络演化 

In [39]:

# Sweep configuration consumed by the loop in the "demand" cell.
emb_ep = [0,]  # embedding column(s) of the frame, passed as `x_type`
neighbors = [10,]  # passed as `n_neighbors` (the t-SNE perplexity)
dates = ['2021-07-20','2021-07-21','2021-07-22','2021-07-23','2021-07-24']  # cumulative cut-off dates

#### demand

In [63]:
# For every (embedding column, neighbourhood size, cut-off date) combination,
# plot the demand posts dated from 2021-07-20 up to and including the cut-off.
for col in emb_ep:
    for neighbor in neighbors:
        for date in dates:   
            con = (emb_df_demand_refine['date']>= '2021-07-20') & (emb_df_demand_refine['date'] <= date)
            print(f"col:{col}, #neighbors:{neighbor}, date:{date}, #nodes:{sum(con)}")
            visualize_demand(
                emb_df_demand_refine[con],
                col,
                method='tsne',
                n_neighbors=neighbor,
                date=f'{date}_40',
                random_state=42,
            )
            # NOTE(review): this `break` leaves the date loop after the first
            # cut-off, so only dates[0] is rendered per (col, neighbor).
            # Confirm it is intentional and not a debugging leftover.
            break

col:0, #neighbors:10, date:2021-07-20, #nodes:61
Job: 0_tsne_10_2021-07-20_40...
Duration: 0.3057851791381836 seconds
