In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.preprocessing import Normalizer
from gensim import corpora, models, matutils
from gensim.models.ldamulticore import LdaMulticore

#!pip uninstall cdlib
#!pip install numpy

In [2]:
from input_values import (TV_SHOW, PRE_PROCESSED_FILE_NAME, LDA_FILE_NAME, 
                          OUT_DIR, GAMMA, TOLERANCE, ITERATIONS, NUM_TOPICS)

In [3]:
TV_SHOW, PRE_PROCESSED_FILE_NAME, LDA_FILE_NAME, OUT_DIR

('YouNetflix_100', 'YouNetflix_100', 'YouNetflix_100', '../tvshows/output/')

In [4]:
# Load the user ids that accompany the pre-processed tweets; the file is
# '|'-delimited and only the `userid` column is kept.
with open(OUT_DIR + PRE_PROCESSED_FILE_NAME + '_preprocessed_tweets_with_userid.csv', 'r') as tweets_file:
    df = pd.read_csv(
        tweets_file,
        names=['userid', 'tweets'],
        usecols=['userid'],
        delimiter='|',
    )
# Disabled tweet-text handling, kept for reference:
#df.tweets = df.tweets.apply(lambda x: literal_eval(x))
#df['tweets'] = df.tweets.str.replace(r'\W+',' ')
# Convert the tweet_doc into tweet_tokens and remove non_alphanumeric strings in the tokens
#df['tweet_tokens'] = df['tweets'].apply(lambda x: x.split())
#%%

In [5]:
len(df)

0

In [6]:
def plot_graph(G, title="Retweet Network drawn from 200 random nodes"):
    """Draw ``G`` with a spring layout and show the figure.

    Parameters
    ----------
    G : networkx graph
        Graph to draw.
    title : str, optional
        Figure title. Defaults to the title this notebook originally
        hard-coded.

    Notes
    -----
    Sets ``plt.rcParams["figure.figsize"]`` to ``(30, 30)`` as a global side
    effect. Node labels are not drawn.
    """
    # rcParams only apply to figures created afterwards, so the size has to be
    # set before the first drawing call creates the figure.
    plt.rcParams["figure.figsize"] = (30, 30)

    pos = nx.spring_layout(G, k=0.3*1/np.sqrt(len(G.nodes())), iterations=20)
    # `with_labels` is an option of nx.draw_networkx only; the per-element
    # drawing functions do not accept it, so it is not passed here.
    nx.draw_networkx_nodes(G, pos, node_size=50)
    #nx.draw_networkx_labels(G, pos)
    nx.draw_networkx_edges(G, pos, edge_color='black', arrows=True)
    plt.title(title, { 'fontsize': 20 })
    plt.axis('off')
    plt.show()
    
def remove_isolated_nodes(G):
    """Remove isolated nodes and then self-loop edges from ``G`` in place.

    A summary of the graph is printed before and after the clean-up.

    Parameters
    ----------
    G : networkx graph
        Graph to clean. It is mutated; no copy is made.

    Returns
    -------
    The same graph object ``G``.

    Notes
    -----
    Isolated nodes are collected before self-loops are removed, so a node
    whose only edges are self-loops is kept (and ends up with no edges).
    """
    def _summary(graph):
        # nx.info is not available in every networkx release; fall back to
        # the graph's own string form when it is missing.
        info = getattr(nx, 'info', None)
        return info(graph) if info is not None else str(graph)

    print(_summary(G))
    isolated_nodes = list(nx.isolates(G))
    print('\nIsolated nodes: {}\n'.format(len(isolated_nodes)))
    print('removing isolated nodes...\n')
    G.remove_nodes_from(isolated_nodes)

    # Prefer the module-level function; keep the graph method as a fallback
    # for releases that only provide that form.
    if hasattr(nx, 'selfloop_edges'):
        self_loops = nx.selfloop_edges(G)
    else:
        self_loops = G.selfloop_edges()
    # Materialise first: removing edges while iterating a live view of the
    # adjacency structure can fail.
    G.remove_edges_from(list(self_loops))

    print(_summary(G))
    return G

#%%
# Read the show's interaction graph and prune it in one step
# (remove_isolated_nodes prints a before/after summary).
graph = remove_isolated_nodes(
    nx.read_graphml(OUT_DIR + TV_SHOW + '.graphml')
)
#%%

Name: 
Type: DiGraph
Number of nodes: 77389
Number of edges: 101893
Average in degree:   1.3166
Average out degree:   1.3166

Isolated nodes: 7297

removing isolated nodes...

Name: 
Type: DiGraph
Number of nodes: 70092
Number of edges: 101810
Average in degree:   1.4525
Average out degree:   1.4525


In [7]:
def add_weights(graph):
    """Attach a ``'weight'`` attribute to every edge of ``graph``.

    The weight of an edge is the sum of whichever of its ``retweet_count``,
    ``mention_count``, ``reply_count`` and ``quote_count`` attributes are
    present. The graph is updated in place and also returned.
    """
    counted_keys = ['retweet_count', 'mention_count', 'reply_count', 'quote_count']
    edge_weights = {}
    for source, target, edge_data in graph.edges(data=True):
        total = sum(amount for name, amount in edge_data.items() if name in counted_keys)
        edge_weights[(source, target)] = {'weight': total}
    nx.set_edge_attributes(graph, edge_weights)
    return graph

graph = add_weights(graph)
#%%
#%%
# This function takes the LDA topic model and returns the document-topic vectors

In [8]:
# Persist the pruned, weighted graph. The path has no OUT_DIR prefix, so the
# file is written relative to the working directory, not next to the input
# that was read from OUT_DIR + TV_SHOW + '.graphml'.
nx.write_graphml(graph, TV_SHOW+'.graphml')

In [10]:
print(nx.info(graph))

Name: 
Type: DiGraph
Number of nodes: 323078
Number of edges: 755150
Average in degree:   2.3374
Average out degree:   2.3374


In [11]:
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    """Return the row-normalised document-topic matrix of a gensim topic model.

    Parameters
    ----------
    topic_model
        Trained model. Models exposing ``lda_alpha`` / ``lda_beta`` are
        handled separately from models exposing ``num_topics`` and
        ``state.get_lambda()``.
    corpus
        Either a gensim streaming corpus or a sparse term-document matrix
        (as recognised by ``matutils.ismatrix``).
    dictionary
        gensim dictionary; ``token2id`` supplies the term ordering.
    doc_topic_dists : optional
        Pre-computed document-topic weights as a gensim-style list, a sparse
        matrix (topics x documents) or a dense array (documents x topics).
        When omitted they are inferred with ``topic_model.inference``.

    Returns
    -------
    Document-topic weights of shape (documents, topics) in which every row
    has been divided by its row sum.

    Raises
    ------
    AssertionError
        If term frequencies, document lengths or topic counts do not line up
        with the dictionary, corpus or model.
    """
    # Local import: the notebook's import cell does not provide `issparse`.
    from scipy.sparse import issparse

    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    # Terms that never occur get a small positive pseudo-count instead of 0.
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary), 'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus), 'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        # Work on a plain 2-D array and keep the summed axis so the division
        # is applied per row for both ndarray and matrix inputs.
        doc_topic_dists = np.asarray(doc_topic_dists)
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1, keepdims=True)

    assert doc_topic_dists.shape[1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(
        doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    # Only used for the consistency check below; it is not returned.
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]

    return doc_topic_dists

def get_doc_topic_dist(OUT_DIR=OUT_DIR):
    """Load the saved LDA artefacts and return their document-topic matrix.

    The dictionary (``.dict``), corpus (``.mm``) and model (``.lda``) are all
    read from ``OUT_DIR + LDA_FILE_NAME`` and handed to ``_extract_data``.
    """
    artefact_prefix = OUT_DIR + LDA_FILE_NAME
    dictionary = corpora.Dictionary.load(artefact_prefix + '.dict')
    corpus = corpora.MmCorpus(artefact_prefix + '.mm')
    model = LdaMulticore.load(artefact_prefix + '.lda')
    return _extract_data(topic_model=model, dictionary=dictionary, corpus=corpus)
#%%
def get_DT_row_norm(doc_topic_dist):
    """Return ``doc_topic_dist`` L1-normalised along axis 1, as a numpy matrix."""
    return np.asmatrix(normalize(doc_topic_dist, norm='l1', axis=1))

def get_DT_col_norm(doc_topic_dist):
    """Return ``doc_topic_dist`` L1-normalised along axis 0, as a numpy matrix."""
    return np.asmatrix(normalize(doc_topic_dist, norm='l1', axis=0))

def get_sim(DT_row_norm, i, j, k):
    """Similarity of rows ``i`` and ``j`` in column ``k``.

    Computed as ``1 - |DT_row_norm[i, k] - DT_row_norm[j, k]|``.
    """
    value_i = DT_row_norm.item((i, k))
    value_j = DT_row_norm.item(j, k)
    return 1 - abs(value_i - value_j)

def get_weight(nodei, nodej, graph):
    '''Transition weight for the directed edge ``nodei -> nodej``.

    The weight is computed as:

        weight = (weight of edge nodei -> nodej) / (weighted out-degree of nodei)

    Returns 0.0 when there is no such edge, or when the weighted out-degree
    of ``nodei`` is 0 (which would otherwise be a division by zero).
    '''
    # A direct edge already implies a path, so has_edge alone is sufficient
    # and avoids a graph search for every pair of nodes.
    if not graph.has_edge(nodei, nodej):
        return 0.0
    total_out_weight = graph.out_degree(nodei, weight='weight')
    if total_out_weight == 0:
        return 0.0
    return graph.get_edge_data(nodei, nodej)['weight'] / total_out_weight

def get_Pt(DT_row_norm, k, graph, data=df):
    """Build the transition matrix for topic ``k``.

    Entry ``[i][j]`` is ``get_weight(user_i, user_j, graph)`` multiplied by
    ``get_sim(DT_row_norm, i, j, k)`` when both users are nodes of ``graph``,
    and 0.0 otherwise. Users are matched to graph nodes through the string
    form of ``data['userid']``.

    Parameters
    ----------
    DT_row_norm : 2-D array/matrix
        One row per document/user; its row count fixes the matrix size.
    k : int
        Topic (column) index.
    graph : networkx graph
        Interaction graph whose node labels are user-id strings.
    data : DataFrame
        Must provide a ``userid`` column with at least as many rows as
        ``DT_row_norm``. The default is the ``df`` that existed when this
        function was defined.

    Raises
    ------
    IndexError
        If ``data`` has fewer rows than ``DT_row_norm``.
    """
    size = DT_row_norm.shape[0]
    trans_mat = np.zeros((size, size))

    # Stringify ids and look up graph membership once per user instead of
    # once per (i, j) pair.
    user_ids = [str(uid) for uid in data['userid'].iloc[:size]]
    if len(user_ids) < size:
        raise IndexError(
            'data has {} user ids but DT_row_norm has {} rows'.format(len(user_ids), size))
    in_graph = [graph.has_node(uid) for uid in user_ids]

    for i in range(size):
        if not in_graph[i]:
            continue  # row stays all zeros
        for j in range(size):
            if in_graph[j]:
                trans_mat[i][j] = get_weight(user_ids[i], user_ids[j], graph) * get_sim(DT_row_norm, i, j, k)
    return trans_mat


def get_TRt(gamma, trans_mat, Et, iter=1000, tolerance=1e-16):
    """Iterate ``TRt = gamma * trans_mat . TRt + (1 - gamma) * Et`` from ``Et``.

    Parameters
    ----------
    gamma : float
        Weight of the propagated term; ``1 - gamma`` weights ``Et``.
    trans_mat : 2-D array
        Transition matrix applied on the left of the rank vector.
    Et : array/matrix
        Starting vector; it is also the constant term of every update.
    iter : int
        Maximum number of updates.
    tolerance : float
        Iteration stops once the Euclidean distance between two consecutive
        vectors falls below this value (a message is printed when it does).

    Returns
    -------
    The last computed vector, or ``Et`` itself when ``iter <= 0``.
    """
    old_TRt = Et
    TRt = Et  # defined up front so a non-positive `iter` still returns a value
    i = 0
    while i < iter:
        TRt = (gamma*np.dot(trans_mat,old_TRt)) + ((1 - gamma) * Et)
        euclidean_dis = np.linalg.norm(TRt - old_TRt)
        if euclidean_dis < tolerance:
            print('Topic Rank vectors have converged...')
            break
        old_TRt = TRt
        i += 1
    return TRt

def get_TR(DT_row_norm, DT_col_norm, num_topics, gamma, tolerance, graph, data=df, iterations=1000):
    """Compute one rank vector per topic and join them column-wise.

    For each topic ``k`` the transition matrix from ``get_Pt`` and column
    ``k`` of ``DT_col_norm`` are passed to ``get_TRt``.

    Parameters
    ----------
    DT_row_norm, DT_col_norm
        Row- and column-normalised document-topic matrices.
    num_topics : int
        Number of topic columns to process.
    gamma : float
        Forwarded to ``get_TRt``.
    tolerance : float
        Convergence threshold, forwarded to ``get_TRt``.
    graph : networkx graph
        Forwarded to ``get_Pt``.
    data : DataFrame
        Forwarded to ``get_Pt``; defaults to the ``df`` that existed when
        this function was defined.
    iterations : int, optional
        Maximum number of updates per topic, forwarded to ``get_TRt``.

    Returns
    -------
    The per-topic rank vectors concatenated along axis 1.

    Raises
    ------
    ValueError
        From ``np.concatenate`` when ``num_topics`` is 0.
    """
    topic_columns = []
    for k in range(0, num_topics):
        trans_mat = get_Pt(DT_row_norm, k, graph, data)
        Et = DT_col_norm[:,k]
        # Pass the caller's tolerance on; otherwise get_TRt silently uses its
        # own default and the argument has no effect.
        topic_columns.append(
            get_TRt(gamma, trans_mat, Et, iter=iterations, tolerance=tolerance))
    return np.concatenate(topic_columns, axis=1)

def get_TR_sum(TR, samples, num_topics):
    """Sum ``TR[i][j]`` over ``i < num_topics`` for each ``j < samples``.

    Returns the ``samples`` totals as a list sorted in ascending order.
    Note that ``TR`` is indexed as ``TR[topic][sample]`` here.
    """
    totals = []
    for sample in range(samples):
        running = 0
        for topic in range(num_topics):
            running += TR[topic][sample]
        totals.append(running)
    return sorted(totals)

In [12]:
#%%
# Document-topic matrix from the saved LDA artefacts under OUT_DIR.
doc_topic_dist = get_doc_topic_dist(OUT_DIR)
# L1-normalised along axis 1; used for the per-topic similarity between users.
DT_row_norm = get_DT_row_norm(doc_topic_dist)
# L1-normalised along axis 0; its columns are the per-topic start vectors.
DT_col_norm = get_DT_col_norm(doc_topic_dist)
#%%
# Check the transition matrix
#%%

In [13]:
doc_topic_dist.shape

(954, 10)

In [14]:
# Per-topic rank vectors, one column per topic.
# NOTE(review): ITERATIONS is imported from input_values but is not passed
# here, so get_TRt runs with its own default iteration cap.
TR = get_TR(DT_row_norm, DT_col_norm, graph=graph, data=df, 
            num_topics=NUM_TOPICS, gamma=GAMMA, tolerance=TOLERANCE)


Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...


In [15]:
# Sum the rank over all topics for every row, then flatten the nested
# one-element rows produced by .tolist() into a flat list.
row_totals = np.sum(TR, axis=1).tolist()
TR_sum = []
for sublist in row_totals:
    TR_sum.extend(sublist)

#%%

In [None]:
def compute_random_walk_for_each_topic(transition_matrix, state_topic_vector,
                                       damping_factor=0.85, iterations=100, tolerance=1e-6):
    """Repeatedly update a state vector until it stabilises or the cap is hit.

    Each step computes
    ``damping_factor * transition_matrix . state + (1 - damping_factor) * state``.
    The second term uses the current state, not a fixed start vector.

    Parameters
    ----------
    transition_matrix : 2-D array
        Matrix applied on the left of the state vector.
    state_topic_vector : array
        Starting state.
    damping_factor : float
        Weight of the propagated term.
    iterations : int
        Maximum number of updates to perform.
    tolerance : float
        Stop once the Euclidean distance between consecutive states is
        below this value.

    Returns
    -------
    The last computed state, or ``state_topic_vector`` unchanged when
    ``iterations <= 0``.
    """
    # Defined up front so the function still returns when no update runs.
    new_state_topic_vector = state_topic_vector
    while iterations > 0:
        new_state_topic_vector = damping_factor * np.dot(transition_matrix, state_topic_vector) + \
            (1 - damping_factor) * state_topic_vector
        distance = np.linalg.norm(new_state_topic_vector - state_topic_vector)
        if distance < tolerance:
            break
        state_topic_vector = new_state_topic_vector
        # Count down so a walk that never converges still terminates.
        iterations -= 1

    return new_state_topic_vector

In [None]:
len(graph.nodes)

In [None]:
len(df)

In [None]:
TR_sum

In [19]:
df_graph_nodes = pd.DataFrame(data=np.zeros((graph.number_of_nodes(), NUM_TOPICS)), index=graph.nodes())

In [None]:
df_graph_nodes.to_csv(TV_SHOW + '_graph_nodes.csv')

In [16]:
pd.DataFrame(data=doc_topic_dist, index=df.userid).to_csv(TV_SHOW + '_doc_topic_frame.csv')

In [17]:
df_topic_frame = pd.DataFrame(data=TR, index=df.userid.tolist())

In [18]:
df_topic_frame.to_csv(TV_SHOW + '_topic_rank_frame.csv')

In [None]:
# df_graph_nodes contributes NUM_TOPICS placeholder columns that only serve to
# bring in the full node index; drop them by count rather than a literal 10.
final_df = pd.concat([df_graph_nodes, df_topic_frame], axis=1).iloc[:, NUM_TOPICS:]

In [None]:
final_df.fillna(np.float64(0.1))

In [None]:
transition_matrix = nx.google_matrix(graph, nodelist=final_df.index.tolist(), weight='weight')

In [None]:
TV_SHOW

In [None]:
df_graph_nodes

In [20]:
df_graph_nodes.to_csv(TV_SHOW + '_graph_nodes_with_no_isolated_nodes.csv')

In [None]:
sorted(graph.degree(), key=lambda x:x[1], reverse=True)

In [None]:
pd.DataFrame(data=DT_row_norm, index=df.userid.tolist())

In [None]:
set(graph.nodes())

In [None]:
df_topic_frame

In [None]:
# True when every user id in the topic frame is a node of the graph.
# `list in list` would test whether the whole list is a single element, which
# is never the case; ids are compared in string form.
set(map(str, df_topic_frame.index)).issubset(graph.nodes())

In [None]:
df_topic_frame = pd.DataFrame(data=DT_row_norm, index=df.userid.tolist())
df_topic_frame

In [None]:
TR.shape

In [None]:
a = [str(x) for x in df_topic_frame.index]
a

In [None]:
len(set(graph.nodes()) - set(a))

In [None]:
len(set(a))

In [None]:
len(graph.nodes())

In [None]:
final_df.index.dtype

In [None]:
df_topic_frame

In [None]:
set(graph.nodes())

In [None]:
test_graph = remove_isolated_nodes(graph)

In [None]:
len(set(a).intersection(set(graph.nodes())))

In [None]:
len(set(a).intersection(set(test_graph.nodes())))

In [None]:
df_topic_frame

In [None]:
remove_isolated_nodes(graph)

In [None]:
# The lambda had no body (a SyntaxError). Converting ids to strings is assumed
# here because get_Pt matches users to graph nodes via str(userid) -- confirm.
df.userid = df.userid.apply(str)

In [None]:
set(df.userid) - set(graph.nodes())

In [None]:
graph = nx.relabel_nodes(graph, lambda x:int(x))

In [None]:
test_graph = remove_isolated_nodes(graph)

In [None]:
set(df.userid) - set(test_graph.nodes())

In [None]:
df_topic_frame

In [None]:
pd.DataFrame(data=doc_topic_dist)

In [None]:
doc_topic_dist.sum(axis=1)

In [None]:
LDA_FILE_NAME

In [9]:
graph.nodes()

NodeView(('1000010249674395648', '1000282310388932608', '1000290654755450880', '1000304603840241665', '1000600081165438977', '1000799170469486592', '1000995072127549440', '1001111725066870785', '1001196759950004225', '1001225498490560514', '1001670235840176128', '1001787932540317696', '1001826991853596672', '1002184209706954753', '1002941807636779008', '1003291253772038144', '1003437474130128896', '1003721069931483136', '100465577', '1004685427', '1004741645001248768', '1005003261806604288', '1005261587798634496', '1006020949592846336', '1006038096', '1006266788', '1006504926942396417', '1007131977164754945', '1007170742482620416', '100742164', '1007642150178181120', '1007955240916799488', '1007989092599164930', '1008376289500246018', '1008725911544315904', '1008847034546192384', '1009467526797918208', '1009537260805488640', '1009641376794009601', '1009757526', '1009952921125965824', '1010061655235940352', '1010367764182151168', '1010707073758388225', '1011496933444149248', '1011669052