In [1]:
import numpy as np
import networkx as nx
from os.path import join

# import utils
from glimpse_attention_model import GlimpseAttentionModel
import logging

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import numpy as np
import collections

# from sklearn.metrics import roc_auc_score
# from sklearn.metrics import roc_curve, auc
# from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize
from scipy.stats import rankdata


def _retype(y_prob, y):
    if not isinstance(y, (collections.Sequence, np.ndarray)):
        y_prob = [y_prob]
        y = [y]
    y_prob = np.array(y_prob)
    y = np.array(y)

    return y_prob, y


def _binarize(y, n_classes=None):
    return label_binarize(y, classes=range(n_classes))


def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(y_prob, y, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    predicted = [np.argsort(p_)[-k:][::-1] for p_ in y_prob]
    actual = [[y_] for y_ in y]
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])


def mean_rank(y_prob, y):
    ranks = []
    n_classes = y_prob.shape[1]
    for p_, y_ in zip(y_prob, y):
        ranks += [n_classes - rankdata(p_, method='max')[y_]]

    return sum(ranks) / float(len(ranks))


def hits_k(y_prob, y, k=10):
    acc = []
    for p_, y_ in zip(y_prob, y):
        top_k = p_.argsort()[-k:][::-1]
        acc += [1. if y_ in top_k else 0.]
    return sum(acc) / len(acc)


# def roc_auc(y_prob, y):
#     y = _binarize(y, n_classes=y_prob.shape[1])
#     fpr, tpr, _ = roc_curve(y.ravel(), y_prob.ravel())
#     return auc(fpr, tpr)


# def log_prob(y_prob, y):
#     scores = []
#     for p_, y_ in zip(y_prob, y):
#         assert abs(np.sum(p_) - 1) < 1e-8
#         scores += [-math.log(p_[y_]) + 1e-8]
#         print p_, y_

#     return sum(scores) / len(scores)


def portfolio(y_prob, y, k_list=None):
    y_prob, y = _retype(y_prob, y)
    # scores = {'auc': roc_auc(y_prob, y)}
    # scores = {'mean-rank:': mean_rank(y_prob, y)}
    scores = {}
    for k in k_list:
        scores['hits@' + str(k)] = hits_k(y_prob, y, k=k)
        scores['map@' + str(k)] = mapk(y_prob, y, k=k)

    return scores

In [4]:
def write_seen_nodes(data_path, seq_len):
    seen_nodes = []
    with open(join(data_path, 'train.txt'), 'r') as read_file:
        for i, line in enumerate(read_file):
            query, cascade = line.strip().split(' ', 1)
            sequence = cascade.split(' ')[::3]
            seen_nodes.extend(sequence)
            
    with open(join(data_path, 'test.txt'), 'r') as read_file:
        for i, line in enumerate(read_file):
            query, cascade = line.strip().split(' ', 1)
            sequence = cascade.split(' ')[::3]
            seen_nodes.extend(sequence)
    seen_nodes = set(seen_nodes)
    with open(join(data_path, 'seen_nodes.txt'), 'w+') as write_file:
        for node in seen_nodes:
            write_file.write(node + '\n')
#     print(len(seen_nodes))

In [5]:
import datetime
def process_timestamps(timestamps):
#     print(len(timestamps))
    lst2 = []
    for i in range(0, len(timestamps), 2):
        lst2.append(timestamps[i] +" " +timestamps[i+1])
#     print(lst2)
    diff =[]
    for t in range(0,len(lst2)-1):
        format = '%Y-%m-%d %H:%M:%S'
        timeA  = datetime.datetime.strptime(lst2[t+1], format)
        timeB  = datetime.datetime.strptime(lst2[t], format)
        
        diff.append(((timeA-timeB).total_seconds()/60.0))
    
    
#     arr = np.asarray(lst2)
# #     diff = list(np.diff(arr))
#     print(diff)
#     diff = [d / 60.0 for d in diff]
#     format = '%Y-%m-%d %H:%M:%S'
#     timeA  = datetime.datetime.strptime(t1, format)
#     timeB  = datetime.datetime.strptime(tweet_time[k2_int], format)
#     ((timeA-timeB).total_seconds()/60.0)
    for i in range(0, len(diff)-1):
        if diff[i] == diff[i+1]:
            for j in range(0, len(diff)):
                diff[j] += 0.25
#     print(len(diff))
    return diff

In [3]:
# len(node_index)
# load_instances('data/twitter','train',node_index,100,limit=-1)

In [6]:
def process_cascade(cascade, timestamps, testing=False):
    size = len(cascade)
    examples = []
    for i, node in enumerate(cascade):
        if i == size - 1 and not testing:
            return examples
        if i < size - 1 and testing:
            continue
        prefix_c = cascade[: i + 1]
        prefix_t = timestamps[: i + 1]
        # predecessors = set(network[node]) & set(prefix_c)
        # others = set(prefix_c).difference(set(predecessors))

        '''if i == 0:
            times.extend([0.0])
        else:
            # print(i)
            times.extend([(timestamps[i-1] - timestamps[i])])'''

        if not testing:
            label_n = cascade[i+1]
            label_t = timestamps[i+1]
        else:
            label_n = None
            label_t = None

        example = {'sequence': prefix_c, 'time': prefix_t,
                   'label_n': label_n, 'label_t': label_t}

        if not testing:
            examples.append(example)
        else:
            return example


In [44]:
node_index=load_graph('data/twitter')

In [7]:
def load_graph(data_path):
    node_file = join(data_path, 'seen_nodes.txt')
    with open(node_file, 'r') as f:
        seen_nodes = [int(x.strip()) for x in f]
    

    # builds node index
    node_index = {v: i for i, v in enumerate(seen_nodes)}
#     print(node_index)
    return node_index

In [7]:
print(node_index)

NameError: name 'node_index' is not defined

In [8]:
def load_instances(data_path, file_type, node_index, seq_len, limit, ratio=1.0, testing=False):
    print(len(node_index))
    max_diff = 0
    pkl_path = join(data_path, file_type + '.pkl')
#     if isfile(pkl_path):
#         instances = pickle.load(open(pkl_path, 'rb'))
#     else:
    file_name = join(data_path, file_type + '.txt')
    instances = []
    with open(file_name, 'r') as read_file:
        for i, line in enumerate(read_file):
            query, cascade = line.strip().split(' ', 1)
            cascade_nodes = list(map(int, cascade.split(' ')[::3]))
            cascade_times2 = list(cascade.split(' ')[2::3])
            cascade_times1 = list(cascade.split(' ')[1::3])
            cascade_times = (np.array([[i, j] for i, j in zip(cascade_times1, cascade_times2)]).ravel())

            if seq_len is not None:
                cascade_nodes = cascade_nodes[:seq_len+1]
                cascade_times = cascade_times[:(seq_len*2)+2]
#                 print("before",len(cascade_nodes),len(cascade_times))
                if len(cascade_nodes) == (len(cascade_times)/2):
                    cascade_nodes.pop()
                cascade_times = process_timestamps(cascade_times)
#                 print("after",len(cascade_nodes),len(cascade_times))
                assert len(cascade_nodes) == len(cascade_times)
                
            cascade_nodes = [node_index[x] for x in cascade_nodes]
            if not cascade_nodes or not cascade_times:
                continue
            max_diff = max(max_diff, max(cascade_times))
            ins = process_cascade(cascade_nodes, cascade_times, testing)
            instances.extend(ins)
            
            if limit is not None and i == limit:
                break
        # pickle.dump(instances, open(pkl_path, 'wb+'))
    total_samples = len(instances)
    indices = np.random.choice(total_samples, int(total_samples * ratio), replace=False)
#     print(indices)
    sampled_instances = [instances[i] for i in indices]
#     print(sampled_instances)
    return sampled_instances, max_diff

In [11]:
import pickle
data=pickle.load(open("/home/sakshi18013/Documents/codes/final_data_deep_diffuse_sorted_jamia.pkl","rb"))

In [13]:
import random
random.shuffle(data)

train_data = data[:70]
test_data = data[30:]

In [15]:
with open('twitter/train.txt', 'w') as f:
    
    for data in train_data:
#         print(*data)
        for d in data:
            f.write(d+" ")
        f.write("\n")
        

FileNotFoundError: [Errno 2] No such file or directory: 'twitter/train.txt'

In [97]:
with open('test.txt', 'w') as f:
    
    for data in test_data:
#         print(*data)
        for d in data:
            f.write(d)
        f.write("\n")
        

In [27]:
test_data=[]
for i in range(len(data)):
    print((data[i]))
    if i in test_set:
        
        print(i,data[i])
        test_data.append(data[i])
        break

NameError: name 'data' is not defined

In [None]:
import numpy as np
import networkx as nx
from os.path import join

import utils
from glimpse_attention_model import GlimpseAttentionModel
import logging
train_len = 5000


options = utils.load_params()
__processor__ = options['cell_type']
# model_type = options['cell_type']
handler = logging.FileHandler('{}-{}.log'.format(__processor__, options['dataset_name']), 'w')
log = logging.getLogger(__processor__)
log.addHandler(handler)
log.setLevel(logging.DEBUG)

data_path = join(options['dataset_name'])
write_seen_nodes(join(options['dataset_name']), 30)
node_index = utils.load_graph(data_path)
options['node_size'] = len(node_index)
# print(nx.info(G))
train_instances, max_diff_train = load_instances(data_path, 'train', node_index, options['seq_len'],
                                                       limit=-1)
test_instances, max_diff_test = load_instances(data_path, 'test', node_index, options['seq_len'],
                                                     limit=-1)
options['max_diff'] = max_diff_train
# print(len(train_instances), len(test_instances))
options['n_train'] = len(train_instances)
# print(len(train_instances))
train_loader = utils.Loader(train_instances, options)
test_loader = utils.Loader(test_instances, options)
print(len(train_loader),len(test_loader))


2444
2444
114 83636
114




Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





In [3]:

log.info('running glimpse attention model')
log.info('using attention:' + str(options['use_attention']))
log.info(options)
glimpse_ins = GlimpseAttentionModel(options, options['use_attention'], options['n_train'])

glimpse_ins.run_model(train_loader, test_loader, options)
