In [1]:
import pandas as pd
import networkx as nx
import community
import json
import datetime
import numpy as np

# Read and Clean Raw JSON File

In [14]:
# Absolute local path to the raw tweet JSON; change this to wherever your copy lives.
TWEETS_JSON_PATH = "/Users/StephenHedden/Downloads/40400-tweets.json"
df = pd.read_json(TWEETS_JSON_PATH)

## Define some functions

In [25]:
def getScreenName(x):
    """Return ``x["screen_name"]``, or ``None`` when it cannot be read.

    Parameters
    ----------
    x : mapping-like
        Object expected to support ``x["screen_name"]``.

    Returns
    -------
    The stored screen name, or ``None`` if the key is missing or ``x`` is
    not subscriptable (for example a NaN placeholder or ``None``).
    """
    try:
        return x["screen_name"]
    except (LookupError, TypeError):
        # LookupError: key absent; TypeError: x cannot be indexed at all.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None

In [26]:
def getOrigScreenName(x):
    """Return ``x["user"]["screen_name"]``, or ``None`` when it cannot be read.

    Parameters
    ----------
    x : mapping-like
        Object expected to hold a nested ``"user"`` mapping with a
        ``"screen_name"`` entry.

    Returns
    -------
    The nested screen name, or ``None`` if either key is missing or any
    level is not subscriptable (for example a NaN placeholder or ``None``).
    """
    try:
        return x["user"]["screen_name"]
    except (LookupError, TypeError):
        # LookupError: a key is absent; TypeError: a level cannot be indexed.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None

In [27]:
def getMentions(x):
    """Return the screen name of the first entry in ``x["user_mentions"]``.

    Parameters
    ----------
    x : mapping-like
        Object expected to hold a ``"user_mentions"`` sequence whose items
        carry a ``"screen_name"`` entry.

    Returns
    -------
    ``x["user_mentions"][0]["screen_name"]``, or ``np.nan`` when there is no
    first mention or the structure cannot be read. NaN (rather than ``None``)
    is returned so the caller can discard these rows with ``dropna()``.
    """
    try:
        return x["user_mentions"][0]["screen_name"]
    except (LookupError, TypeError):
        # LookupError covers a missing key and an empty mention list;
        # TypeError covers a non-subscriptable value such as NaN or None.
        # `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
        return np.nan

## Create edges

In [108]:
# "source": screen name read from each tweet's `user` field.
# "target": screen name nested under `retweeted_status` (None when it cannot be read).
df['source'] = df["user"].apply(getScreenName)
df['target'] = df['retweeted_status'].apply(getOrigScreenName)

In [109]:
# Split the tweets into retweets and non-retweets, based on whether
# `retweeted_status` is populated.
is_retweet = df["retweeted_status"].notna()
rts = df.loc[is_retweet]
nonrts = df.loc[~is_retweet]

# Retweet edges: tweeting account -> account found under `retweeted_status`.
edges1 = pd.DataFrame({
    'source': rts["user"].apply(getScreenName),
    'target': rts['retweeted_status'].apply(getOrigScreenName),
})

In [110]:
# Mention edges for non-retweets: tweeting account -> first mentioned account.
# Rows with a missing endpoint (e.g. no mention at all) are dropped.
edges2 = pd.DataFrame({
    'source': nonrts["user"].apply(getScreenName),
    'target': nonrts['entities'].apply(getMentions),
}).dropna()

In [111]:
# Stack the retweet edges and the mention edges into one edge list.
# No ignore_index is passed, so each row keeps the label it had in its own frame.
edges = pd.concat([edges1,edges2])

In [112]:
# Preview the first rows of the combined edge list
edges.head()

Unnamed: 0,source,target
0,ExpatPip,JuliaHB1
1,debysoto21,JByazminn
2,v_niverse,Variety
3,Merwyn65,akshaykumar
4,marijara1971,MariaFdaCabal


# Create graph and run centrality measures

In [33]:
# Build the graph from the edge list. No `create_using` is passed, so networkx
# picks its default graph class.
G = nx.from_pandas_edgelist(edges, source='source', target='target')

In [34]:
# Partition the graph into communities with python-louvain's `best_partition`.
# A fixed random_state keeps the community assignment (and the numeric group
# ids that end up in the exported nodes) the same from run to run.
partition = community.best_partition(G, random_state=42)

In [35]:
# Reshape the {node: community} mapping into a two-column frame:
# 'index' holds the node, 'value' holds its community id.
community_table = pd.DataFrame([partition]).T.reset_index()
community_table.columns = ['index', 'value']
partition1 = community_table

## Degree Centrality

In [36]:
# Degree centrality for every node, as a frame sorted from highest to lowest
degree_by_node = nx.degree_centrality(G)
dc = (
    pd.DataFrame([degree_by_node.keys(), degree_by_node.values()])
    .T
    .rename(columns={0: 'names', 1: 'values'})
    .sort_values(by='values', ascending=False)
)

# Attach each node's community id (left join keeps every scored node)
dc1 = pd.merge(dc, partition1, how='left', left_on="names", right_on="index")

## Betweenness Centrality

In [37]:
# Approximate betweenness centrality from a sample of pivot nodes.
# The sample is capped at the graph size because asking for more pivots than
# there are nodes makes networkx raise, and the seed is fixed so the sampled
# estimate is repeatable across runs.
bc_pivots = min(1000, G.number_of_nodes())

# Timestamps before and after show how long the computation takes.
print(datetime.datetime.now())
bc = nx.betweenness_centrality(G, k=bc_pivots, seed=42)
print(datetime.datetime.now())

2020-06-07 18:44:37.727894
2020-06-07 18:46:48.177185


In [38]:
# Turn the {node: betweenness} mapping into a frame sorted from highest to
# lowest, then keep a copy on disk. `bc` is rebound from dict to DataFrame here.
bc = (
    pd.DataFrame([bc.keys(), bc.values()])
    .T
    .rename(columns={0: 'names', 1: 'values'})
    .sort_values(by='values', ascending=False)
)
bc.to_csv("bc.csv")

In [39]:
# Attach each node's community id to its betweenness score (left join keeps every scored node)
bc1 = pd.merge(bc,partition1, how='left', left_on="names",right_on="index")

## Eigenvector Centrality

In [40]:
# Eigenvector centrality for every node, as a frame sorted from highest to lowest.
# NOTE(review): G is built from the 'source'/'target' columns only, so no 'freq'
# edge attribute appears to exist; presumably networkx then treats every edge as
# weight 1 - confirm this is the intended (unweighted) behaviour.
eigen_by_node = nx.eigenvector_centrality(G, weight='freq', max_iter=1000)
ec = (
    pd.DataFrame([eigen_by_node.keys(), eigen_by_node.values()])
    .T
    .rename(columns={0: 'names', 1: 'values'})
    .sort_values(by='values', ascending=False)
)

# Attach each node's community id (left join keeps every scored node)
ec1 = pd.merge(ec, partition1, how='left', left_on="names", right_on="index")

# Prepare Data for JS
## Required Functions

In [63]:
def createTopNodesforVisual(df,nameOfFile,head,threshold,minDegree):
    """Select the top-scoring nodes and the links between them.

    Parameters
    ----------
    df : DataFrame with 'names', 'values' (centrality score) and 'value'
        (community id) columns.
    nameOfFile : currently unused; nothing is written to disk here.
    head : cap on rows taken per community, and again on the overall
        selection after sorting by score.
    threshold : rows must have a score strictly above this value.
    minDegree : communities with fewer selected members than this are
        discarded, and the same value is passed on as the minimum node degree
        when the link list is built.

    Returns
    -------
    (nodes, links) : the node table and the source/target link table.

    Note: the link list is built from the notebook-level ``edges`` frame, not
    from an argument.
    """
    # Every non-empty group passes this filter, so in practice it keeps the rows
    # that take part in the groupby (presumably rows with a missing community id
    # fall out here - confirm).
    grouped_names = df.groupby('value')['names'].filter(lambda members: len(members) > 0)
    scored = df.loc[df['names'].isin(grouped_names)]

    # Per-community cut and score threshold, then an overall cut on the sorted result
    candidates = filterByPartitionAndCentrality(scored, head, threshold)
    candidates = candidates.sort_values(by="values", ascending=False).head(head)

    # Discard members of communities that ended up with fewer than minDegree rows
    small_groups = candidates.groupby('value')['names'].filter(lambda members: len(members) < minDegree)
    candidates = candidates.loc[~candidates['names'].isin(small_groups)]

    links = buildNetworkFromData(edges, candidates, minDegree)
    nodes = buildNodesFromLinks(links, scored)
    return (nodes, links)

In [42]:
def filterByPartitionAndCentrality(df,head,centralityThreshold):
    """Keep the leading rows of each community that clear a score threshold.

    Takes the first ``head`` rows of every 'value' group (in the frame's
    current row order), then returns only those whose 'values' entry is
    strictly greater than ``centralityThreshold``.
    """
    leading_rows = df.groupby('value').head(head)
    clears_threshold = leading_rows['values'] > centralityThreshold
    return leading_rows.loc[clears_threshold]

In [43]:
def buildNetworkFromData(network, df, minDegree):
    """Return the links among the selected nodes as a source/target frame.

    Parameters
    ----------
    network : DataFrame with 'source' and 'target' columns (the full edge list).
    df : DataFrame whose 'names' column lists the nodes to keep.
    minDegree : nodes whose degree in the restricted graph is below this value
        are removed (a single pass, not repeated until stable).

    Returns
    -------
    DataFrame with exactly the columns 'source' and 'target'.
    """
    selected = df['names']
    both_selected = network['source'].isin(selected) & network['target'].isin(selected)
    sub_edges = network.loc[both_selected]

    graph = nx.from_pandas_edgelist(sub_edges, 'source', 'target')

    # Degrees are read in full before any node is removed
    low_degree = [node for node, degree in graph.degree() if degree < minDegree]
    graph.remove_nodes_from(low_degree)

    edge_table = nx.to_pandas_edgelist(graph)
    return pd.DataFrame(edge_table[['source', 'target']])

In [44]:
def buildNodesFromLinks(df,centralityData):
    """Build the node table for every endpoint that appears in a link list.

    Parameters
    ----------
    df : DataFrame with 'source' and 'target' columns.
    centralityData : DataFrame with at least 'names', 'values' and 'value'.

    Returns
    -------
    DataFrame with columns 'id' (node name), 'cent' (score) and 'value'
    (community id). Endpoints whose joined row contains any missing value
    are dropped.
    """
    # Unique endpoint names, sources first and then targets
    endpoints = pd.concat([
        df[['source']],
        df[['target']].rename(columns={'target': 'source'}),
    ]).drop_duplicates()

    joined = pd.merge(
        endpoints, centralityData,
        how='left', left_on='source', right_on='names',
    ).dropna()

    node_table = joined[['names', 'values', 'value']].copy()
    node_table.columns = ['id', 'cent', 'value']
    return node_table

In [45]:
def exportData(nodes,network,fileName):
    """Write the node and link tables to ``<fileName>.json``.

    Parameters
    ----------
    nodes : DataFrame; exported as a list of row records under "nodes".
    network : DataFrame; exported as a list of row records under "links".
    fileName : output path without the ".json" suffix, which is appended here.

    The file is written as UTF-8 with non-ASCII characters kept as-is and a
    4-space indent. Each table is serialised once, directly into the file.
    """
    graph = {
        "nodes": nodes.to_dict(orient='records'),
        "links": network.to_dict(orient='records'),
    }
    with open(fileName + ".json", 'w', encoding='utf-8') as f:
        json.dump(graph, f, ensure_ascii=False,indent=4)

## Prepare Data

Create the node and link tables (both are reshaped for the d3 visual in the cells below)

In [83]:
# Arguments: the scored nodes to filter, a file name (currently unused by the
# function), the maximum number of nodes to take from each community, the
# minimum centrality score, and the minimum number of connections.
nodes, net = createTopNodesforVisual(
    df=bc1,
    nameOfFile="test",
    head=10000,
    threshold=0.0001,
    minDegree=0,
)

In [84]:
# Sanity check on the number of selected nodes; it should stay below roughly 2000
nodes.count()

id       1344
cent     1344
value    1344
dtype: int64

In [85]:
# Reshape the node and link tables into the layout the d3 visual expects.
nodes = buildNodesFromLinks(net,bc1)
nodes = nodes.rename(columns={"cent": "betweenness", "value": "group","id":"name"})

# Expose the row labels as a numeric "id" column.
# NOTE(review): these labels are whatever buildNodesFromLinks left behind; if its
# dropna() removed rows they are not contiguous - confirm the d3 code does not
# treat them as array positions.
nodes = nodes.reset_index()
nodes = nodes.rename(columns={"index": "id"})

# Replace screen names in the link table with node ids. The name -> id lookup is
# built once instead of re-filtering `nodes` for every edge endpoint; keeping the
# first row per name mirrors the previous `.values[0]` behaviour. An endpoint
# missing from `nodes` still fails loudly (KeyError).
first_per_name = nodes.drop_duplicates(subset="name")
name_to_id = dict(zip(first_per_name["name"], first_per_name["id"]))
net["source"] = net["source"].apply(lambda name: name_to_id[name])
net["target"] = net["target"].apply(lambda name: name_to_id[name])

## Combine with other centrality measures

In [86]:
# Add each node's eigenvector centrality, discarding the join helper columns
nodes = (
    pd.merge(nodes, ec1, how="left", left_on="name", right_on="names")
    .drop(['names', 'index', 'value'], axis=1)
    .rename(columns={"values": "eigenvector"})
)

In [87]:
# Left-join the degree-centrality table onto the nodes by screen name
nodes = pd.merge(nodes,dc1, how="left",left_on="name",right_on="names")

In [89]:
# Drop the join helper columns, label the merged score as "degree",
# and put the columns into their final export order.
export_columns = ['betweenness', 'degree', 'eigenvector', 'group', 'id', 'name']
nodes = nodes.drop(columns=['names', 'index', 'value']).rename(columns={"values": "degree"})
nodes = nodes[export_columns]

## Add text to each node

In [141]:
def getText1(x,df):
    """Return the text of the first row in which ``x`` appears.

    Parameters
    ----------
    x : value to look for (a screen name in this notebook).
    df : DataFrame with 'source', 'target' and 'text' columns.

    Returns
    -------
    The 'text' of the first row whose 'source' equals ``x``; failing that,
    the 'text' of the first row whose 'target' equals ``x``; ``None`` when
    ``x`` appears in neither column.
    """
    # 'source' takes priority over 'target'. Each column is filtered once, and
    # 'target' is only scanned when 'source' produced no match.
    for column in ("source", "target"):
        matches = df.loc[df[column] == x, "text"]
        if len(matches) > 0:
            return matches.values[0]
    return None

In [142]:
# Give each node a sample tweet: getText1 looks the name up in the tweets frame `df`
nodes["tweet_text"] = nodes["name"].apply(getText1, args=(df,))

In [143]:
# Preview the first rows of the finished node table
nodes.head()

Unnamed: 0,betweenness,degree,eigenvector,group,id,name,tweet_text
0,0.000970877,0.000870249,0.00121886,7,0,yeeyeetweetweet,RT @mckenzieas93V2: Racists seeing ...
1,0.00102053,0.000280725,4.32654e-05,7,1,mckenzieas93V2,RT @mckenzieas93V2: Racists seeing ...
2,0.000395545,0.00011229,0.0200751,7,2,litolkth,RT @JupiterIsPlane1: me when I saw #whitelives...
3,0.00896225,0.00589523,0.014601,7,3,JupiterIsPlane1,RT @JupiterIsPlane1: me when I saw #whitelives...
4,0.00105677,0.00033687,0.0213399,7,4,jaebumjeon,RT @eternal_jungkoo: Kpop stans ready to take ...


In [144]:
# Export the node and link tables; exportData appends ".json", so this writes "test6.json"
exportData(nodes,net,"test6")