In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.preprocessing import Normalizer
from gensim import corpora, models, matutils
from gensim.models.ldamulticore import LdaMulticore

#!pip uninstall cdlib
!pip install numpy

In [2]:
from input_values import TV_SHOW, PRE_PROCESSED_FILE_NAME, LDA_FILE_NAME, OUT_DIR, GAMMA, TOLERANCE, ITERATIONS, NUM_TOPICS

In [3]:
with open(OUT_DIR + PRE_PROCESSED_FILE_NAME + '_preprocessed_tweets_with_userid.csv', 'r') as infile:
    df = pd.read_csv(infile, names=['userid', 'tweets'], usecols=['userid'], delimiter='|')
    #df.tweets = df.tweets.apply(lambda x: literal_eval(x))
    #df['tweets'] = df.tweets.str.replace(r'\W+',' ')
# Convert the tweet_doc into tweet_tokens and remove non_alphanumeric strings in the tokens
#df['tweet_tokens'] = df['tweets'].apply(lambda x: x.split())
#%%

In [4]:
df.head

<bound method NDFrame.head of           userid
0     1000180003
1      100112839
2      101041090
3     1012608162
4     1014034260
5     1028801107
6     1028822432
7     1035136201
8      103527907
9     1035685855
10     103669987
11    1042909369
12     104291294
13    1050811578
14    1054862096
15    1058504270
16     105873118
17    1063504014
18    1066284529
19    1069878042
20     107342121
21    1073726160
22    1074911185
23     107560430
24     107593866
25    1081348808
26    1081614926
27     108182532
28     108394897
29    1086630528
...          ...
2137   946084788
2138    94862595
2139   955382688
2140    96003412
2141   966598915
2142   967202258
2143   967354854
2144    96736190
2145   967976202
2146   968859452
2147    96907262
2148    97115468
2149   971508163
2150   974980706
2151   975338694
2152   976939640
2153     9782772
2154   980890998
2155   981565674
2156    98205990
2157   984062040
2158    98474093
2159   985042080
2160   985321628
2161    98937081
2

In [5]:
def plot_graph(G):
    pos = nx.spring_layout(G, k=0.3*1/np.sqrt(len(G.nodes())), iterations=20)
    nx.draw_networkx_nodes(G, pos, node_size = 50, with_labels=True)
    #nx.draw_networkx_labels(G, pos)
    nx.draw_networkx_edges(G, pos, with_labels=True, edge_color='black', arrows=True)
    plt.rcParams['figure.figsize'] = [200, 200]
    plt.title("Retweet Network drawn from 200 random nodes", { 'fontsize': 20 })
    plt.axis('off')
    plt.rcParams["figure.figsize"] = (30,30)
    plt.show()
    
def remove_isolated_nodes(G):
    print(nx.info(G))
    isolated_nodes = list(nx.isolates(G))
    print('\nIsolated nodes: {}\n'.format(len(isolated_nodes)))
    print('removing isolated nodes...\n')
    G.remove_nodes_from(isolated_nodes)
    print(nx.info(G))
    return G

#%%
graph = nx.read_graphml(OUT_DIR + TV_SHOW + '.graphml')

graph = remove_isolated_nodes(graph)
#%%

Name: 
Type: DiGraph
Number of nodes: 1969
Number of edges: 457
Average in degree:   0.2321
Average out degree:   0.2321

Isolated nodes: 1592

removing isolated nodes...

Name: 
Type: DiGraph
Number of nodes: 377
Number of edges: 457
Average in degree:   1.2122
Average out degree:   1.2122


In [6]:
def add_weights(graph):
    degree_list = ['retweet_count', 'mention_count', 'reply_count', 'quote_count']
    attrs = {}
    for (node1,node2,*data) in graph.edges(data=True):
        weight = sum([value for key, value in data[0].items() if key in degree_list])
        attrs[(node1, node2)] = {'weight': weight}
    nx.set_edge_attributes(graph, attrs)
    return graph

graph = add_weights(graph)
#%%
#%%
# This functions takes the LDA topic model and returns document topic vectors

In [7]:
OUT_DIR + TV_SHOW + '.graphml'

'../tvshows/output/StrangerThings.graphml'

In [8]:
print(nx.info(graph))


Name: 
Type: DiGraph
Number of nodes: 377
Number of edges: 457
Average in degree:   1.2122
Average out degree:   1.2122


In [9]:
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):

    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary), 'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus), 'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(
        doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]

    return doc_topic_dists

def get_doc_topic_dist(OUT_DIR=OUT_DIR):
    lda_dict = corpora.Dictionary.load(OUT_DIR + LDA_FILE_NAME + '.dict') 
    lda_corpus = corpora.MmCorpus(OUT_DIR + LDA_FILE_NAME + '.mm')
    lda = LdaMulticore.load(OUT_DIR + LDA_FILE_NAME + '.lda')
    return _extract_data(topic_model=lda, dictionary=lda_dict, corpus=lda_corpus)
#%%
def get_DT_row_norm(doc_topic_dist):
    DT_row_norm = np.asmatrix(normalize(doc_topic_dist, axis=1, norm='l1'))
    return DT_row_norm

def get_DT_col_norm(doc_topic_dist):
    DT_col_norm = np.asmatrix(normalize(doc_topic_dist, axis=0, norm='l1'))
    return DT_col_norm

def get_sim(DT_row_norm, i, j, k):
    sim = 1 - abs(DT_row_norm.item((i,k))-DT_row_norm.item(j,k))
    return sim    

def get_weight(nodei, nodej, graph):
    ''' Adds weights to the Transition matrix by accepting two nodes: node1, nodej.
    weight is computed as follows:
    
        weight = (sum of weighted in-degrees of nodej)/(sum of weighted degrees of node1)
        Returns 0.0 if both numerator and denominator of the above expression is 0
    '''
    if nx.has_path(graph, nodei, nodej) and graph.has_edge(nodei, nodej):
        #print(nodei , nodej)
        return (graph.get_edge_data(nodei, nodej)['weight'] / graph.out_degree(nodei, weight='weight'))
    else:
        return 0.0

def get_Pt(DT_row_norm, k, graph, data=df):
    size = DT_row_norm.shape[0]
    trans_mat = np.zeros((size, size))
    for i in range(0, size):
        for j in range(0, size):
            if graph.has_node(str(data['userid'].iloc[i])) and graph.has_node(str(data['userid'].iloc[j])):
                trans_mat[i][j] = get_weight(str(data['userid'].iloc[i]), str(data['userid'].iloc[j]), graph) * get_sim(DT_row_norm, i, j, k)   
            else:
                trans_mat[i][j] = 0.0      
    return trans_mat


def get_TRt(gamma, trans_mat, Et, iter=1000, tolerance=1e-16):
    old_TRt = Et
    i = 0
    while i < iter:
        TRt = (gamma*np.dot(trans_mat,old_TRt)) + ((1 - gamma) * Et)
        euclidean_dis = np.linalg.norm(TRt - old_TRt)
        if euclidean_dis < tolerance: 
            print('Topic Rank vectors have converged...')
            break
        old_TRt = TRt
        i += 1
    return TRt

def get_TR(DT_row_norm, DT_col_norm, num_topics, gamma, tolerance, graph, data=df):
    for k in range(0, num_topics):
        trans_mat = get_Pt(DT_row_norm, k, graph, data)
        Et = DT_col_norm[:,k]
        if k==0: TR = get_TRt(gamma, trans_mat, Et)
        else: TR = np.concatenate((TR, get_TRt(gamma, trans_mat, Et)), axis=1)
    return TR

def get_TR_sum(TR, samples, num_topics):
    TR_sum = [0 for i in range(0, samples)]
    for i in range(0, num_topics):
        for j in range(0, samples):
            TR_sum[j] += TR[i][j]
    TR_sum.sort()
    return TR_sum

In [10]:
#%%
doc_topic_dist = get_doc_topic_dist(OUT_DIR)
DT_row_norm = get_DT_row_norm(doc_topic_dist)
DT_col_norm = get_DT_col_norm(doc_topic_dist)
#%%
# Check the transition matrix
#%%

In [11]:
doc_topic_dist.shape

(2167, 10)

In [12]:
TR = get_TR(DT_row_norm, DT_col_norm, graph=graph, data=df, 
            num_topics=NUM_TOPICS, gamma=GAMMA, tolerance=TOLERANCE)


Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...
Topic Rank vectors have converged...


In [13]:
TR_sum = np.sum(TR, axis=1).tolist()
TR_sum = [item for sublist in TR_sum for item in sublist]

#%%

In [14]:
def compute_random_walk_for_each_topic(transition_matrix, state_topic_vector,
                                       damping_factor=0.85, iterations=100, tolerance=1e-6):
  while iterations > 0:
     new_state_topic_vector = damping_factor * np.dot(transition_matrix, state_topic_vector) + \
                                                         (1 - damping_factor) * state_topic_vector
     distance = np.linalg.norm(new_state_topic_vector - state_topic_vector)
     if distance < tolerance:
         break
     else:
         state_topic_vector = new_state_topic_vector

  return new_state_topic_vector

In [15]:
len(graph.nodes)

377

In [None]:
len(df)

In [None]:
TR_sum

In [16]:
df_graph_nodes = pd.DataFrame(data=np.zeros((graph.number_of_nodes(), NUM_TOPICS)), index=graph.nodes())

In [None]:
df_graph_nodes.to_csv('StrangerThings_graph_nodes.csv')

In [18]:
pd.DataFrame(data=DT_row_norm, index=df.userid.tolist()).to_csv('StrangerThings_topic_frame.csv')

In [None]:
df_topic_frame = pd.DataFrame(data=TR, index=df.userid.tolist())

In [None]:
df_topic_frame.to_csv('StrangerThings_topic_rank_frame.csv')

In [None]:
final_df = pd.concat([df_graph_nodes, df_topic_frame], axis=1).iloc[:,10:]

In [None]:
final_df.fillna(np.float64(0.1))

In [None]:
transition_matrix = nx.google_matrix(graph, nodelist=final_df.index.tolist(), weight='weight')

In [None]:
TV_SHOW

In [None]:
df_graph_nodes

In [17]:
df_graph_nodes.to_csv('StrangerThings_nodes_with_no_isolated_nodes.csv')

In [None]:
sorted(graph.degree(), key=lambda x:x[1], reverse=True)

In [24]:
set(graph.nodes())

{'100112839',
 '103527907',
 '1138293733',
 '1149555042',
 '118171621',
 '1232098286',
 '124184039',
 '1244303484',
 '125253329',
 '1268768983',
 '1270051867',
 '127149523',
 '1317359371',
 '134787107',
 '1353259867',
 '1358192474',
 '1363298984',
 '1373184775',
 '1418942484',
 '1423420471',
 '14260458',
 '1431012283',
 '1447023884',
 '1468725559',
 '147779825',
 '1491480392',
 '149406681',
 '1514224112',
 '151954381',
 '1537186130',
 '155398122',
 '155474970',
 '1557259836',
 '1580414976',
 '1581224048',
 '1627981466',
 '163113197',
 '1634283092',
 '163652800',
 '164996101',
 '1654231795',
 '16837234',
 '1704039948',
 '172355286',
 '1725059330',
 '173507810',
 '175060011',
 '175520856',
 '17580326',
 '17629149',
 '17783278',
 '17960579',
 '18025029',
 '181089044',
 '181574928',
 '181864891',
 '1849649652',
 '187231358',
 '18952713',
 '1912633705',
 '1918161847',
 '194705811',
 '197251803',
 '19870967',
 '20096001',
 '201039253',
 '202662391',
 '208252062',
 '21132805',
 '21539674',
 '

In [None]:
df_topic_frame

In [23]:
len(set(graph.nodes()) - set(df_topic_frame.index))

377

In [20]:
df_topic_frame = pd.DataFrame(data=DT_row_norm, index=df.userid.tolist())
df_topic_frame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1000180003,0.000019,0.937323,0.000019,0.009277,0.000019,0.000019,0.000019,0.000019,0.000019,0.053266
100112839,0.198231,0.471693,0.000007,0.305546,0.000007,0.000007,0.001116,0.001675,0.017944,0.003776
101041090,0.000007,0.067619,0.000007,0.000007,0.000007,0.000007,0.203815,0.000007,0.000007,0.728518
1012608162,0.000007,0.499253,0.000007,0.008003,0.000007,0.000007,0.492695,0.000007,0.000007,0.000007
1014034260,0.040023,0.018980,0.000009,0.000009,0.000009,0.000009,0.749722,0.000009,0.191221,0.000009
1028801107,0.029203,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.970720
1028822432,0.035915,0.843788,0.000024,0.000024,0.000024,0.000024,0.114133,0.000024,0.000024,0.006020
1035136201,0.050214,0.947743,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255,0.000255
103527907,0.000009,0.000009,0.000009,0.790173,0.000009,0.000009,0.209759,0.000009,0.000009,0.000009
1035685855,0.000009,0.999923,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009


In [27]:
TR.shape

(2167, 10)

In [None]:
T