In [None]:
#This Notebook purpose is to generate user representation linearly using SVD on the resultant user matrix 
# then Non linearly using Graph and deep walk method
# The last section also incorporates code incase there is an update required to the user representation formed using non-linear method

In [2]:
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
import networkx as nx
import math
from tqdm import tqdm
from sklearn.decomposition import PCA
from numpy import array
from numpy import diag
from numpy import zeros
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
import warnings

In [3]:
stop_words = set(stopwords.words('english'))
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
#lemmatizer = WordNetLemmatizer()

In [4]:
#### Following Functions will extract important users mentioned or indicated to form nodes or vertices of Matrix

#user name are mentioned in URL of the columns "Twitter Reply to","Twitter Retweet of"
def getUserFromUrl(Urlstr):
    #print(Urlstr)
    tok = Urlstr.split('/')
    return tok[3]


#Declaring some global variables
All_User_for_Graph = set()   # Will consider all the users into account if mentioned tweeted retweeted for all the rows
All_OccuranceList = []
BigList = []
#-----

def GetTweetRetweetfromRow(row):
    All_User_for_Graph.add(row['AUTHOR_ID'])
    lis = []
    if(row["Thread Entry Type"] == 'post'):
        return lis
    else:
        ReTweetList = row["Twitter Retweet of"]
        if(len(ReTweetList) >0):
            for reTweet in ReTweetList:
                if(pd.isnull(reTweet)):
                    pass
                else:
                    name = getUserFromUrl(reTweet)
                    lis.append(name)
                    All_User_for_Graph.add(name)
        ReplyList = row['Twitter Reply to']
        if(len(ReplyList) >0):
            for reply in ReplyList:
                if(pd.isnull(reply)):
                    pass
                else:
                    name = getUserFromUrl(reply)
                    lis.append(name)
                    All_User_for_Graph.add(name)
        return list(set(lis))
                
def AllMentionedUser(row):
    All_User_for_Graph.add(row['AUTHOR_ID'])
    lis = []
    tweetList = row["Snippet"]
    for tweet in tweetList:
        lis.extend(re.findall(r"@(\w+)",str(tweet)))
    All_User_for_Graph.update(lis)
    return list(set(lis))

def AllConnections(row):
    conn_lis = []
    conn_lis.append(row['AUTHOR_ID'])
    conn_lis.extend(row['TweetReTweetUser'])
    conn_lis.extend(row['AllMentioneduser'])
    return list(set(conn_lis))

In [5]:
###### Important using the same PPI score based method as used in co-occurance matrix ---
###--- formula  = log ( (count(w,c) / count(w) * count(c)) * N  )
### N is total Number of users 
def UpdateGraphsonBasisofUsers(user,AllUserList,GraphMatrix):
    total_lis = []
    for lis in BigList:
        if(lis.count(user) >0):
            total_lis.extend(lis)
    k = list(set(total_lis))
    for usr in k:
        if(usr == user):
            pass
        else:
            countwc = total_lis.count(usr)
            countw = All_OccuranceList.count(user)
            countc = All_OccuranceList.count(usr)
            N = len(AllUserList)
            tem = (countwc * N) / (countc * countw)
            score = np.log(tem)
            index_user = AllUserList.index(user)
            index_usr = AllUserList.index(usr)
            GraphMatrix[index_user][index_usr] = score
            GraphMatrix[index_usr][index_user] = score
    return GraphMatrix 

In [8]:
#Read and process the Training data
df = pd.read_excel(r'C:\Users\SHAHBAZ\Desktop\study mat\research papers\New folder\New folder\20200507_Cardiology_HCP_Data.xlsx')
Embedding_Column_Name = ["AUTHOR_ID", "AUTHOR_NAME","TRANS_AUTHOR_BIO","Account Type","Location Name","Gender","Mentioned Authors","Tags","Twitter Reply to","Twitter Retweet of","Thread Entry Type","Snippet"]
Main_DF = df[Embedding_Column_Name]
Main_DF = Main_DF.groupby('AUTHOR_ID',as_index= False)["AUTHOR_NAME","TRANS_AUTHOR_BIO","Account Type","Location Name","Gender","Mentioned Authors","Tags","Twitter Reply to","Twitter Retweet of","Thread Entry Type","Snippet"].agg(lambda x: list(x))

In [9]:
#### Making a cleaner dataframe ####
for columName in Embedding_Column_Name[1:]:
    for _,x in Main_DF.iterrows():
        x[columName] = list(set(x[columName]))

In [10]:
#Getting additional user information from the data:
Main_DF['TweetReTweetUser'] = [GetTweetRetweetfromRow(x) for _,x in Main_DF.iterrows()]
Main_DF['AllMentioneduser'] = [AllMentionedUser(x) for _,x in Main_DF.iterrows()]  
Main_DF['AllConnection'] = [AllConnections(x) for _,x in Main_DF.iterrows()] 
for _,x in Main_DF.iterrows():
    All_OccuranceList.extend(x['AllConnection'])
    BigList.append(x['AllConnection'])

In [12]:
test_df = Main_DF[["AUTHOR_ID","TRANS_AUTHOR_BIO","Snippet","AllConnection"]]
test_df.head()

Unnamed: 0,AUTHOR_ID,TRANS_AUTHOR_BIO,Snippet,AllConnection
0,0moba,[EM Physician | @Dartmouth MD MBA | Errand boy...,[@docneto Ha my brother don’t say this one. Pa...,"[docneto, victorr_ugo, 0moba]"
1,1600ronnell,[RN4L🖤],[RT @fox5dc Coronavirus now killing more Ameri...,"[1600ronnell, fox5dc]"
2,1963Kelli,[Geek Mom/Feminist/Hiker surrounded by fellow ...,[RT @koolkaryn ‘Imagine Having Ability to Do T...,"[dailysoundnfury, 1963Kelli, koolkaryn]"
3,1EyeSmart1,[Male. Physician. Independent Thinker. Pro USA...,[RT @BusyBrain_Very COVID-19 has now surpassed...,"[BusyBrain_Very, 1EyeSmart1]"
4,1HopelesslyHope,"[Makeup and Beauty Fanatic, Lover of Pretty F...",[RT @CNN Pharmaceutical company Novo Nordisk s...,"[1HopelesslyHope, CNN]"


In [13]:
test_df.to_csv('test.csv',header=True,index=False)

In [12]:
####### Creating and Processing Graph Matrix to generate user representation ###############
GraphMatrix = np.zeros((len(All_User_for_Graph),len(All_User_for_Graph)))
All_User = list(All_User_for_Graph)
for i in range(len(All_User)):
    GraphMatrix = UpdateGraphsonBasisofUsers(All_User[i],All_User,GraphMatrix)
    if((i % 500) == 0):
        print("Run epoch " + str(i) + " / " + str(len(All_User)))

In [36]:
#####Applying SVD to reduce dimensions###############
U, s, VT = svd(Graph)
Sigma = zeros((Graph.shape[0], Graph.shape[1]))
Sigma[:Graph.shape[0], :Graph.shape[0]] = diag(s)
# select
n_elements = 250    #Number of dimension of Embedding  tested with 500,375,300 best with 250
Sigma = Sigma[:, :n_elements]
VT = VT[:n_elements, :]
Graph_Embedding = U.dot(Sigma)

In [33]:
########## Alternate fasted dimension reduction using SVD Truncated 
svd = TruncatedSVD(n_components=250)
svd.fit(Graph)
result = svd.transform(Graph)
#print(result)

In [35]:
########Saving Matrix for later use since SVD is time consuming on such huge dimension of Matrix ###########
Savefilepath = "C:/Users/SHAHBAZ/ZS UserEmbedding/AutoEncoderInputUserMatrix/"

fileName = Savefilepath + "GraphEmbedding_UserDoc_Matrix.txt"
np.savetxt(fileName,result,fmt='%.8f') 

fileName = Savefilepath + "Graphs_U_Matrix.txt"
np.savetxt(fileName,U,fmt='%.8f') 

fileName = Savefilepath + "Graphs_Sig_Matrix.txt"
np.savetxt(fileName,s,fmt='%.8f') 

fileName = Savefilepath + "Graphs_VT_Matrix.txt"
np.savetxt(fileName,VT,fmt='%.8f') 

fileName = Savefilepath + "GraphUsers.txt"
np.savetxt(fileName,All_User, delimiter=" ", newline = "\n", fmt="%s") 

In [47]:
############## Implementation of Graphical Neural Network using deep walk approach ##################

In [None]:
#####These methods helps to define key submodule of entire process
#Idea is to generate corpus like structure applying random walks between nodes 
#Passing this corpus to skip gram model to obtain node based embedding

In [9]:
def CreateTargetConnection(authorid):
    totalList= []
    for lis in BigList:
        if(lis.count(authorid) > 0):
            totalList.extend(lis)
    k = list(filter((authorid).__ne__, totalList))
    return k 

def get_randomwalk(node,path_length):
    random_walk = [node]
    for i in range(path_length-1):
        temp = list(G.neighbors(node))
        temp = list(set(temp) - set(random_walk))    
        if len(temp) == 0:
            break

        random_node = random.choice(temp)
        random_walk.append(random_node)
        node = random_node
        
    return random_walk

In [10]:
#Not all columns are needed to create the Graphs, a source node and target nodes are needed 
New_DF = Main_DF[["AUTHOR_ID","AllConnection"]]
New_DF['Target_Connection'] = ""
New_DF['Target_Connection'] = [CreateTargetConnection(x['AUTHOR_ID']) for _,x in New_DF.iterrows()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [15]:
#New_DF.to_csv('Graph_based.csv',index=False,header=True)
All_User = list(All_User_for_Graph)
len(All_User)

11621

In [18]:
### Creating graph and adding nodes###
G = nx.Graph()
for user in All_User:
    G.add_node(user)
    
### Connecting the edges for users who are linked ####
for _,x in New_DF.iterrows():
    source = x['AUTHOR_ID']
    lis = x['Target_Connection']
    
    for target in lis:
        G.add_edge(source, target)

In [17]:
G.edges()

EdgeView([('AustinChiangMD', 'AASLDPresident'), ('AustinChiangMD', 'MamaDoctorJones'), ('AustinChiangMD', 'JohnVargoMD'), ('AustinChiangMD', 'MarkPochapin'), ('AustinChiangMD', 'kmergener'), ('AustinChiangMD', 'willsuh76'), ('AustinChiangMD', 'arghavan_salles'), ('AustinChiangMD', 'acgme'), ('AustinChiangMD', 'aoglasser'), ('AustinChiangMD', 'RealDoctorMike'), ('MDhockey2119', 'Kaimandante'), ('ethanjweiss', 'AmiBhattMD'), ('ethanjweiss', 'DrDave01'), ('ethanjweiss', 'DrRamonIssa'), ('ethanjweiss', 'DrSidMukherjee'), ('ethanjweiss', 'EricTopol'), ('ethanjweiss', 'HamidGhanbariMD'), ('ethanjweiss', 'HeartOTXHeartMD'), ('ethanjweiss', 'JKhambhati'), ('ethanjweiss', 'JakeKushnerMD'), ('ethanjweiss', 'KevinH_PhD'), ('ethanjweiss', 'LisaRosenbaum17'), ('ethanjweiss', 'MikeAlbertMD'), ('ethanjweiss', 'NirUrielMD'), ('ethanjweiss', 'PaulVarosy'), ('ethanjweiss', 'SA_Diabetes'), ('ethanjweiss', 'bschermd'), ('ethanjweiss', 'cadiulus'), ('ethanjweiss', 'davidludwigmd'), ('ethanjweiss', 'daviesb

In [19]:
#### Implementing random walk ####
all_nodes = list(G.nodes())

random_walks = []
for n in tqdm(all_nodes):
    for i in range(10):
        random_walks.append(get_randomwalk(n,10))
warnings.filterwarnings('ignore')

100%|██████████████████████████████████████████████████████████████████████████| 11621/11621 [00:09<00:00, 1272.89it/s]


In [20]:
random_walks

[['AustinChiangMD',
  'RealDoctorMike',
  'willsuh76',
  'jwmhoulton',
  'purviparwani',
  'TKDsosyal',
  'DrLaxmiMehta',
  'VietHeartPA',
  'leenhapong',
  'JJRyanMD'],
 ['AustinChiangMD',
  'arghavan_salles',
  'theheartorg',
  'drstaceyrosen',
  'DNewmanToker',
  'DaktariJay',
  'PushpaShivaram',
  'DrJayMohan',
  'SCMRorg',
  'HeartBobH'],
 ['AustinChiangMD',
  'MarkPochapin',
  'aoglasser',
  'willsuh76',
  'LoggheMD',
  'venkmurthy',
  'JNCjournal',
  'djc795',
  'DrTGupta',
  'sandylewis'],
 ['AustinChiangMD',
  'AASLDPresident',
  'arghavan_salles',
  'jwmhoulton',
  'willsuh76',
  'KaulP',
  'FrederickWelt',
  'HFnursemaghee',
  'BidmcCvi',
  'mangion11'],
 ['AustinChiangMD',
  'AASLDPresident',
  'arghavan_salles',
  'SharonneHayes',
  'angwoyno',
  'DrLaxmiMehta',
  'edwardmillermd',
  'AHAMeetings',
  'AnumSaeedMD',
  'PCIsurgeon'],
 ['AustinChiangMD',
  'MarkPochapin',
  'AASLDPresident',
  'arghavan_salles',
  'SanjayDivakaran',
  'DocStrom',
  'alessia_gimelli',
  'Heart

In [107]:
# train skip-gram (word2vec) model
model = Word2Vec(window = 4, sg = 1, size = 100, hs = 0,negative = 10,alpha=0.03, min_alpha=0.0007,seed = 14)
#size of 100,200 and 300 dimensions were tested and out of which 100 dimension performed best in MultiView
model.build_vocab(random_walks, progress_per=2)
model.train(random_walks, total_examples = model.corpus_count, epochs=20, report_delay=1)

100%|██████████████████████████████████████████████████████████████████████████| 11621/11621 [00:07<00:00, 1474.94it/s]


116210

In [63]:
#Creating Matrix that represent network based representation of the users and saving same for later use 
New_DF['NetworkEmbedding'] = [model[x['AUTHOR_ID']] for _,x in New_DF.iterrows()]
DeepWalkMat = np.zeros(300)
for _,x in New_DF.iterrows():
    DeepWalkMat = np.vstack((DeepWalkMat,x['NetworkEmbedding']))
DeepWalkMat = DeepWalkMat[1:,:]

In [67]:
Savefilepath = "C:/Users/SHAHBAZ/ZS UserEmbedding/AutoEncoderInputUserMatrix/"
fileName = Savefilepath + "DeepWalk_300_Matrix.txt"
np.savetxt(fileName,DeepWalkMat,fmt='%.8f') 

In [36]:
#################################
### Updating the embedding #####

In [125]:
lis_org1_target1 = New_DF['Target_Connection'][420]   #existing user connecting

In [126]:
lis_org1_target1.extend(New_DF['Target_Connection'][270]) # new user connecting made during new window of time

In [128]:
### we will have to re build the graphs again afresh; 
### Creating graph and adding nodes###
G = nx.Graph()
for user in All_User:
    G.add_node(user)
    
for _,x in New_DF.iterrows():
    source = x['AUTHOR_ID']
    lis = x['Target_Connection']
    
    for target in lis:
        G.add_edge(source, target)

In [129]:
for target in lis_org1_target1: # adding updating new edges 
    G.add_edge('BrukeGetachew', target)

In [130]:
random_walks = []
for i in range(10):
    random_walks.append(get_randomwalk('BrukeGetachew',10))

In [132]:
#updating model
model.train(random_walks, total_examples = model.corpus_count, epochs=20, report_delay=1)

(8, 860)

In [133]:
#represent updated embedding
MatDeepWalkUpdate = np.zeros(100)
MatDeepWalkUpdate = np.vstack((MatDeepWalkUpdate,model['BrukeGetachew']))
MatDeepWalkUpdate = MatDeepWalkUpdate[1:,:]