In [1]:
import pandas as pd
import numpy as np

In [2]:
# parameters
early_stop_threshold = 0.18
p = 0.8
q = 2.5
l = 20
r = 5

In [3]:
data = pd.read_csv('../data/preliminary/cleaned_data.csv')
vocab = pd.read_csv('../data/preliminary/vocab.txt')

hash_map = dict()
for index, row in vocab.iterrows():
    hash_map[row['word']] = index
vocab = vocab['word'].values

In [4]:
# function: construct early stopwords list
def construct_early_stopwords():
    mid_map = np.zeros((vocab.shape[0], 5))
    for index, row in data.iterrows():
        if isinstance(row['words'], str):
            word_list = row['words'].split('/')
            score = row['Score']
            for i in range(len(word_list)):
                word = hash_map[word_list[i]]
                mid_map[word][score-1] += 1
    early_stopwords = [vocab[index] for index in np.where((mid_map.var(axis = 1) < early_stop_threshold) == True)[0].tolist()]
    return early_stopwords, mid_map

In [5]:
def construct_adajcent_matrix():
    A = np.zeros((vocab.shape[0], vocab.shape[0]))
    for index, row in data.iterrows():
        if isinstance(row['words'], str):
            word_list = row['words'].split('/')
            for i in range(len(word_list) - 1):
                word1 = hash_map[word_list[i]]
                word2 = hash_map[word_list[i+1]]
                if word1 != word2:
                    A[word1][word2] += 1
                    A[word2][word1] += 1
                #else:
                #    A[word1][word2] += 1
    return A

In [6]:
graph = construct_adajcent_matrix()

In [7]:
early_stopwords, mid_map = construct_early_stopwords()
len(early_stopwords)

39307

In [8]:
early_stopwords[0:10]

['刁民', '省略', '花太多', '16.1', '挂牌', '660', '外圈', '猜测', '宰人', '如图所示']

In [9]:
np.where(mid_map[hash_map['挂牌']] != 0)

(array([0, 1, 3, 4]),)

In [10]:
print(np.max(graph[3]))
print(graph[3][45])
np.where(graph[2] != 0)[0]

452.0
452.0


array([    0,     1,     3, ..., 87511, 87512, 87661])

In [11]:
i = 3
print(vocab[i])
graph[i][i]

景区


0.0

In [33]:
def find_next_ver(graph,cur_ver, last_cur):
    if last_cur == -1:
        return np.where(graph[cur_ver] == np.max(graph[cur_ver]))[0][0]
    
    cur_ver_neighbors = np.where(graph[cur_ver] != 0)[0]
    last_ver_neighbors = np.where(graph[last_cur] != 0)[0]
    max_edge = -1
    max_ver = -1
    for ver in cur_ver_neighbors:
        weight = graph[cur_ver][ver]
        if ver == last_cur:
            weight = float(weight) / p
        elif ver not in last_ver_neighbors:
            weight = float(weight) / q
        if max_edge < weight:
            max_edge = weight
            max_ver = ver
    return max_ver
        

In [34]:
import time
def random_walk(graph):
    Sequences = []
    vertex_num = graph.shape[0]
    for vertex in range(vertex_num):
        print(vertex)
        word = vocab[vertex]
        if word in early_stopwords:
            continue
        for walk_iter in range(r):
            sequence = [word]
            for step_iter in range(l):
                cur_ver = hash_map[sequence[-1]]
                last_ver = hash_map[sequence[-2]] if step_iter >= 1 else -1
                
                next_ver = find_next_ver(graph, cur_ver, last_ver)
                
                if next_ver in early_stopwords:
                    break
                else:
                    sequence.append(vocab[next_ver])
            Sequences.append('/'.join(sequence))
    return Sequences

In [36]:
graph = np.array([[0,1,1], [1,0,1], [1,1,0]])
begin = time.time()
sequences = random_walk(graph)
end = time.time()
print(end - begin)

0
1
2
38.57426977157593


In [19]:
sequences


['好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好',
 '好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好',
 '好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好',
 '好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好',
 '好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好',
 '不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错',
 '不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错',
 '不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错',
 '不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错',
 '不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错',
 '地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方',
 '地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方',
 '地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方',
 '地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方',
 '地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方/好/地方']

In [26]:
df = pd.DataFrame(sequences, columns = ['sequence'])

In [27]:
df.shape

(15, 1)

In [28]:
df.head()

Unnamed: 0,sequenc
0,好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好...
1,好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好...
2,好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好...
3,好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好...
4,好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好/不错/好...
