In [None]:
#The motive of this notebook is to generate user embedding using Node2Vec approach. 

In [85]:
import numpy as np
import pandas as pd
import networkx as nx
import random
import re
from gensim.models import Word2Vec

In [36]:
#### Following Functions will extract important users mentioned or indicated to form nodes or vertices of Matrix

#user name are mentioned in URL of the columns "Twitter Reply to","Twitter Retweet of"
def getUserFromUrl(Urlstr):
    #print(Urlstr)
    tok = Urlstr.split('/')
    return tok[3]


#Declaring some global variables
All_User_for_Graph = set()   # Will consider all the users into account if mentioned tweeted retweeted for all the rows
All_OccuranceList = []
BigList = []
#-----

def GetTweetRetweetfromRow(row):
    All_User_for_Graph.add(row['AUTHOR_ID'])
    lis = []
    if(row["Thread Entry Type"] == 'post'):
        return lis
    else:
        ReTweetList = row["Twitter Retweet of"]
        if(len(ReTweetList) >0):
            for reTweet in ReTweetList:
                if(pd.isnull(reTweet)):
                    pass
                else:
                    name = getUserFromUrl(reTweet)
                    lis.append(name)
                    All_User_for_Graph.add(name)
        ReplyList = row['Twitter Reply to']
        if(len(ReplyList) >0):
            for reply in ReplyList:
                if(pd.isnull(reply)):
                    pass
                else:
                    name = getUserFromUrl(reply)
                    lis.append(name)
                    All_User_for_Graph.add(name)
        return list(set(lis))
                
def AllMentionedUser(row):
    All_User_for_Graph.add(row['AUTHOR_ID'])
    lis = []
    tweetList = row["Snippet"]
    for tweet in tweetList:
        lis.extend(re.findall(r"@(\w+)",str(tweet)))
    All_User_for_Graph.update(lis)
    return list(set(lis))

def AllConnections(row):
    conn_lis = []
    conn_lis.append(row['AUTHOR_ID'])
    conn_lis.extend(row['TweetReTweetUser'])
    conn_lis.extend(row['AllMentioneduser'])
    return list(set(conn_lis))

In [37]:
df = pd.read_excel(r'C:\Users\SHAHBAZ\Desktop\study mat\research papers\New folder\New folder\20200507_Cardiology_HCP_Data.xlsx')
Embedding_Column_Name = ["AUTHOR_ID", "AUTHOR_NAME","TRANS_AUTHOR_BIO","Account Type","Location Name","Gender","Mentioned Authors","Tags","Twitter Reply to","Twitter Retweet of","Thread Entry Type","Snippet"]
Main_DF = df[Embedding_Column_Name]
Main_DF = Main_DF.groupby('AUTHOR_ID',as_index= False)["AUTHOR_NAME","TRANS_AUTHOR_BIO","Account Type","Location Name","Gender","Mentioned Authors","Tags","Twitter Reply to","Twitter Retweet of","Thread Entry Type","Snippet"].agg(lambda x: list(x))

In [39]:
#### Making a cleaner dataframe ####
for columName in Embedding_Column_Name[1:]:
    for _,x in Main_DF.iterrows():
        x[columName] = list(set(x[columName]))

In [42]:
#Getting additional user information from the data:
Main_DF['TweetReTweetUser'] = [GetTweetRetweetfromRow(x) for _,x in Main_DF.iterrows()]
Main_DF['AllMentioneduser'] = [AllMentionedUser(x) for _,x in Main_DF.iterrows()]  
Main_DF['AllConnection'] = [AllConnections(x) for _,x in Main_DF.iterrows()] 
for _,x in Main_DF.iterrows():
    All_OccuranceList.extend(x['AllConnection'])
    BigList.append(x['AllConnection'])
All_User = list(All_User_for_Graph)

In [38]:
################# Graph based approach main coding ################

In [69]:
#### declaring some global variables
G = nx.Graph()
p = 1
q = 1 # p,q are the probabilties
alias_nodes = {}
alias_edges = {}


In [60]:
#Main part that generates the walk
def node2vec_walk(walk_length, start_node):
    walk = [start_node]
    while len(walk) < walk_length:
        cur = walk[-1]
        cur_nbrs = sorted(G.neighbors(cur))
        if len(cur_nbrs) > 0:
            if len(walk) == 1:
                walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
            else:
                prev = walk[-2]
                next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],alias_edges[(prev, cur)][1])]
                walk.append(next)
        else:
            break

    return walk

In [66]:
def simulate_walks(num_walks, walk_length):
    walks = []
    nodes = list(G.nodes())
    print ('Walk iteration:')
    for walk_iter in range(num_walks):
        print (str(walk_iter+1), '/', str(num_walks))
        random.shuffle(nodes)
        for node in nodes:
            walks.append(node2vec_walk(walk_length=walk_length, start_node=node))

    return walks

In [61]:
#give probabilty to edges on basis of p and q ( p is for dfs and q is for bfs)
def get_alias_edge(src, dst):
    unnormalized_probs = []
    for dst_nbr in sorted(G.neighbors(dst)):
        if dst_nbr == src:
            unnormalized_probs.append(G[dst][dst_nbr]['weight']/p)
        elif G.has_edge(dst_nbr, src):
            unnormalized_probs.append(G[dst][dst_nbr]['weight'])
        else:
            unnormalized_probs.append(G[dst][dst_nbr]['weight']/q)
    norm_const = sum(unnormalized_probs)
    normalized_probs =  [float(u_prob)/norm_const for u_prob in unnormalized_probs]

    return alias_setup(normalized_probs)

In [62]:
def alias_setup(probs):
    K = len(probs)
    q = np.zeros(K)
    J = np.zeros(K, dtype=np.int)
    smaller = []
    larger = []
    for kk, prob in enumerate(probs):
        q[kk] = K*prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)

    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()
        J[small] = large
        q[large] = q[large] + q[small] - 1.0
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)
    return J, q

In [80]:
#set values to nodes and edges
def preprocess_transition_probs():
    
    for node in G.nodes():
        unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
        norm_const = sum(unnormalized_probs)
        normalized_probs =  [float(u_prob)/norm_const for u_prob in unnormalized_probs]
        alias_nodes[node] = alias_setup(normalized_probs)

    for edge in G.edges():
        alias_edges[edge] = get_alias_edge(edge[0], edge[1])
        alias_edges[(edge[1], edge[0])] = get_alias_edge(edge[1], edge[0])
        

In [64]:
def alias_draw(J, q):
    K = len(J)
    kk = int(np.floor(np.random.rand()*K))
    if np.random.rand() < q[kk]:
        return kk
    else:
        return J[kk]

In [11]:
###### Read table obtain data and prepare the Graph ########

In [44]:
def CreateTargetConnection(authorid):
    totalList= []
    for lis in BigList:
        if(lis.count(authorid) > 0):
            totalList.extend(lis)
    k = list(filter((authorid).__ne__, totalList))
    return k 

In [45]:
#Not all columns are needed to create the Graphs, a source node and target nodes are needed 
New_DF = Main_DF[["AUTHOR_ID","AllConnection"]]
New_DF['Target_Connection'] = ""
New_DF['Target_Connection'] = [CreateTargetConnection(x['AUTHOR_ID']) for _,x in New_DF.iterrows()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [46]:
New_DF.head()

Unnamed: 0,AUTHOR_ID,AllConnection,Target_Connection
0,0moba,"[victorr_ugo, docneto, 0moba]","[victorr_ugo, docneto]"
1,1600ronnell,"[1600ronnell, fox5dc]",[fox5dc]
2,1963Kelli,"[dailysoundnfury, 1963Kelli, koolkaryn]","[dailysoundnfury, koolkaryn]"
3,1EyeSmart1,"[1EyeSmart1, BusyBrain_Very]",[BusyBrain_Very]
4,1HopelesslyHope,"[CNN, 1HopelesslyHope]",[CNN]


In [70]:
G = nx.Graph()
for user in All_User:
    G.add_node(user)
    
for _,x in New_DF.iterrows():
    source = x['AUTHOR_ID']
    lis = x['Target_Connection']
    
    for target in lis:
        G.add_edge(source, target)
        G[source][target]['weight'] = 1

In [79]:
alias_nodes

{}

In [81]:
preprocess_transition_probs()

In [91]:
walks = simulate_walks(10,10)

Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


In [92]:
#skip gram model to generate embedding from walks
model = Word2Vec(window = 10, sg = 1, size = 100, hs = 0,negative = 10,alpha=0.03, min_alpha=0.0007,seed = 14)
#size of 100,200 and 300 dimensions were tested and out of which 100 dimension performed best in MultiView
model.build_vocab(walks, progress_per=2)
model.train(walks, total_examples = model.corpus_count, epochs=20, report_delay=1)

(21445921, 22066600)

In [94]:
New_DF['NetworkEmbedding'] = [model[x['AUTHOR_ID']] for _,x in New_DF.iterrows()]
Node2VecMat = np.zeros(100)
for _,x in New_DF.iterrows():
    Node2VecMat = np.vstack((Node2VecMat,x['NetworkEmbedding']))
Node2VecMat = Node2VecMat[1:,:]

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [97]:
Savefilepath = "C:/Users/SHAHBAZ/ZS UserEmbedding/AutoEncoderInputUserMatrix/"
fileName = Savefilepath + "Node2Vec_100_Matrix.txt"
np.savetxt(fileName,Node2VecMat,fmt='%.8f')