# Higgs Boson Tweets PyRaphtory Example Notebook 💥

## Setup environment and download data 💾

Import the dependencies needed to build a graph from your data in PyRaphtory, then download the Higgs retweet dataset to `/tmp/twitter.csv`.

In [1]:
from pathlib import Path
from pyraphtory.context import PyRaphtory
from pyraphtory.vertex import Vertex
from pyraphtory.spouts import FileSpout
from pyraphtory.builder import *
from pyvis.network import Network
import csv

!curl -o /tmp/twitter.csv https://raw.githubusercontent.com/Raphtory/Data/main/higgs-retweet-activity.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 8022k  100 8022k    0     0  4959k      0  0:00:01  0:00:01 --:--:-- 4970k


## Preview data 👀

Preview the retweet twitter data: each line includes the source user A (the retweeter), the destination user B (the user being retweeted) and the time at which the retweet occurs.

In [2]:
!head /tmp/twitter.csv

376989,50329,1341101181
376989,13813,1341101192
453850,8,1341101208
99258,50329,1341101263
75083,84647,1341101732
325821,8,1341102141
104321,238279,1341102794
408376,8,1341102802
247125,463,1341103262
224480,93318,1341103333


## Create a new Raphtory graph 📊

Turn on logs to see what is going on in PyRaphtory. Initialise Raphtory by creating a PyRaphtory object and create your new graph.

In [3]:
# Open a PyRaphtory session with logging enabled and create a new graph in it.
# `pr` is reused further down to look up algorithms and to shut the session down;
# `rg` is the graph that the data is loaded into and queried.
pr = PyRaphtory(logging=True).open()
rg = pr.new_graph()

11:35:18.987 [io-compute-5] INFO  com.raphtory.internals.management.Py4JServer - Starting PythonGatewayServer...
Port: 63480
Secret: 3288ab5cad4a546d561daef9f3c62e179e951d3d73544e95392a061aa96f95ed




11:35:19.591 [Thread-12] INFO  com.raphtory.internals.context.LocalContext$ - Creating Service for 'damp_pear_leech'
11:35:19.603 [io-compute-4] INFO  com.raphtory.internals.management.Prometheus$ - Prometheus started on port /0:0:0:0:0:0:0:0:9999
11:35:20.203 [io-compute-4] INFO  com.raphtory.internals.components.partition.PartitionOrchestrator$ - Creating '1' Partition Managers for 'damp_pear_leech'.
11:35:22.501 [io-compute-1] INFO  com.raphtory.internals.components.partition.PartitionManager - Partition 0: Starting partition manager for 'damp_pear_leech'.


## Ingest the data into a graph 😋

Write a parsing function that turns each line of your CSV file into graph updates, then load the file into the graph through that function.

In [4]:
def parse(graph, tuple: str):
    """Turn one comma-separated line into two vertex updates and one edge update.

    The line is split on commas and every field is stripped of surrounding
    whitespace. Field 0 is the source user, field 1 the target user and
    field 2 the integer timestamp. Both users are added as "User" vertices
    carrying an immutable "name" property, followed by a "Tweet" edge from
    source to target, all at that timestamp.

    Args:
        graph: object providing `assign_id`, `add_vertex` and `add_edge`.
        tuple: one raw line of the input file.
    """
    fields = list(map(str.strip, tuple.split(",")))

    # Ids are assigned before the timestamp is parsed, source first.
    src_name = fields[0]
    src_vertex = graph.assign_id(src_name)
    dst_name = fields[1]
    dst_vertex = graph.assign_id(dst_name)
    event_time = int(fields[2])

    # Source vertex first, then target vertex, then the edge between them.
    for vertex_id, user_name in ((src_vertex, src_name), (dst_vertex, dst_name)):
        graph.add_vertex(
            event_time,
            vertex_id,
            Properties(ImmutableProperty("name", user_name)),
            Type("User"),
        )
    graph.add_edge(event_time, src_vertex, dst_vertex, Type("Tweet"))

# Pair a file spout over the downloaded CSV with a builder wrapping `parse`,
# then load the pair into the graph as a single source.
twitter_spout = FileSpout("/tmp/twitter.csv")
twitter_builder = GraphBuilder(parse)
rg.load(Source(twitter_spout, twitter_builder))

11:35:31.339 [spawner-akka.actor.default-dispatcher-3] INFO  com.raphtory.internals.components.ingestion.IngestionManager - Ingestion Manager for 'damp_pear_leech' establishing new data source


com.raphtory.api.analysis.graphview.DeployedTemporalGraph@6473cce5

11:35:31.957 [io-compute-9] INFO  com.raphtory.spouts.FileSpoutInstance - Spout: Processing file 'twitter.csv' ...
11:35:31.965 [spawner-akka.actor.default-dispatcher-3] INFO  com.raphtory.internals.components.querymanager.QueryManager - Source '0' is blocking analysis for Graph 'damp_pear_leech'


## Collect simple metrics 📈

Select certain metrics to show in your output dataframe. Here we have selected vertex name, degree, out degree and in degree. **Time to finish: ~2 to 3 minutes**

In [5]:
from pyraphtory.graph import Row
# One output row per vertex: its name plus total, outgoing and incoming degree.
df = (
    rg.select(
        lambda v: Row(v.name(), v.degree(), v.out_degree(), v.in_degree())
    )
    .write_to_dataframe(["name", "degree", "out_degree", "in_degree"])
)

11:35:43.664 [io-compute-3] INFO  com.raphtory.api.querytracker.QueryProgressTracker - Job 404837468_2400780139525730329: Starting query progress tracker.
11:35:43.667 [spawner-akka.actor.default-dispatcher-3] INFO  com.raphtory.internals.components.querymanager.QueryManager - Query '404837468_2400780139525730329' currently blocked, waiting for ingestion to complete.
11:38:00.566 [spawner-akka.actor.default-dispatcher-3] INFO  com.raphtory.internals.components.querymanager.QueryManager - Source '0' is unblocking analysis for Graph 'damp_pear_leech' with 1064790 messages sent.
11:38:00.734 [spawner-akka.actor.default-dispatcher-3] INFO  com.raphtory.internals.components.querymanager.QueryManager - Query '404837468_2400780139525730329' received, your job ID is '404837468_2400780139525730329'.
11:38:00.743 [spawner-akka.actor.default-dispatcher-10] INFO  com.raphtory.internals.components.partition.QueryExecutor - 404837468_2400780139525730329_0: Starting QueryExecutor.
11:38:14.576 [spawn

#### Clean the dataframe by dropping the unused `window` column 🧹

In [6]:
# Drop the unused 'window' column. Rebinding (rather than mutating in place)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# no longer raises KeyError because the column is already gone.
df = df.drop(columns=['window'], errors='ignore')

### Preview the dataframe 👀

In [7]:
df

Unnamed: 0,timestamp,name,degree,out_degree,in_degree
0,1341705593,247216,1,1,0
1,1341705593,61013,4,3,1
2,1341705593,161960,1,1,0
3,1341705593,422612,1,1,0
4,1341705593,396362,1,1,0
...,...,...,...,...,...
256486,1341705593,293395,1,1,0
256487,1341705593,30364,5,5,0
256488,1341705593,84292,1,1,0
256489,1341705593,324348,2,0,2


**Sort by highest degree, top 10**

In [8]:
# Ten rows with the largest total degree.
df.sort_values(by='degree', ascending=False).head(10)

Unnamed: 0,timestamp,name,degree,out_degree,in_degree
77232,1341705593,88,14061,3,14060
95981,1341705593,14454,6190,0,6190
120807,1341705593,677,5621,8,5613
142755,1341705593,1988,4336,2,4335
237149,1341705593,349,2803,1,2802
95879,1341705593,283,2039,0,2039
83229,1341705593,3571,1981,1,1980
32393,1341705593,6948,1959,0,1959
240523,1341705593,14572,1692,0,1692
138723,1341705593,68278,1689,0,1689


**Sort by highest in-degree, top 10**

In [9]:
# Ten rows with the largest in-degree.
df.sort_values(by='in_degree', ascending=False).head(10)

Unnamed: 0,timestamp,name,degree,out_degree,in_degree
77232,1341705593,88,14061,3,14060
95981,1341705593,14454,6190,0,6190
120807,1341705593,677,5621,8,5613
142755,1341705593,1988,4336,2,4335
237149,1341705593,349,2803,1,2802
95879,1341705593,283,2039,0,2039
83229,1341705593,3571,1981,1,1980
32393,1341705593,6948,1959,0,1959
240523,1341705593,14572,1692,0,1692
138723,1341705593,68278,1689,0,1689


**Sort by highest out-degree, top 10**

In [10]:
# Ten rows with the largest out-degree.
df.sort_values(by='out_degree', ascending=False).head(10)

Unnamed: 0,timestamp,name,degree,out_degree,in_degree
27504,1341705593,38535,134,134,0
151314,1341705593,181190,84,84,0
199289,1341705593,81405,67,66,1
191563,1341705593,64911,230,49,192
188514,1341705593,54301,49,49,0
156270,1341705593,27705,57,48,11
78066,1341705593,53508,43,42,1
123157,1341705593,232850,41,41,0
6841,1341705593,62391,38,38,0
92951,1341705593,2237,38,38,0


## Run a PageRank algorithm 📑

Run your selected algorithm on your graph; here we run PageRank. Algorithms are obtained from the PyRaphtory object you created at the start. Specify the columns that the algorithm's results are written to, e.g. the additional `prlabel` column in your dataframe. **Time to finish: ~3 to 4 minutes**

In [11]:
# Output column that holds the PageRank result.
cols = ["prlabel"]

# Run PageRank on the perspective at timestamp 1341705593 (with `.past()`
# applied) and collect each vertex name with its result into a dataframe.
df_pagerank = (
    rg.at(1341705593)
    .past()
    .execute(pr.algorithms.generic.centrality.PageRank())
    .write_to_dataframe(["name", *cols])
)

df_pagerank

11:41:58.681 [io-compute-1] INFO  com.raphtory.api.querytracker.QueryProgressTracker - Job PageRank_3498013686461469106: Starting query progress tracker.
11:41:58.697 [spawner-akka.actor.default-dispatcher-6] INFO  com.raphtory.internals.components.querymanager.QueryManager - Query 'PageRank_3498013686461469106' received, your job ID is 'PageRank_3498013686461469106'.
11:41:58.699 [spawner-akka.actor.default-dispatcher-6] INFO  com.raphtory.internals.components.partition.QueryExecutor - PageRank_3498013686461469106_0: Starting QueryExecutor.
11:45:49.953 [spawner-akka.actor.default-dispatcher-6] INFO  com.raphtory.api.querytracker.QueryProgressTracker - Job 'PageRank_3498013686461469106': Perspective '1341705593' finished in 231271 ms.
11:45:49.953 [spawner-akka.actor.default-dispatcher-9] INFO  com.raphtory.internals.components.querymanager.QueryHandler - Job 'PageRank_3498013686461469106': Perspective at Time '1341705593' took 231251 ms to run. 
11:45:49.954 [spawner-akka.actor.defau

Unnamed: 0,timestamp,window,name,prlabel
0,1341705593,,247216,0.410038
1,1341705593,,61013,0.758570
2,1341705593,,161960,0.410038
3,1341705593,,422612,0.410038
4,1341705593,,396362,0.410038
...,...,...,...,...
256486,1341705593,,293395,0.410038
256487,1341705593,,30364,0.410038
256488,1341705593,,84292,0.410038
256489,1341705593,,324348,1.107102


**Clean your dataframe** 🧹

In [12]:
# Drop the empty 'window' column. Rebinding (rather than mutating in place)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# no longer raises KeyError because the column is already gone.
df_pagerank = df_pagerank.drop(columns=['window'], errors='ignore')

In [13]:
df_pagerank

Unnamed: 0,timestamp,name,prlabel
0,1341705593,247216,0.410038
1,1341705593,61013,0.758570
2,1341705593,161960,0.410038
3,1341705593,422612,0.410038
4,1341705593,396362,0.410038
...,...,...,...
256486,1341705593,293395,0.410038
256487,1341705593,30364,0.410038
256488,1341705593,84292,0.410038
256489,1341705593,324348,1.107102


**The ten highest-ranked users**

In [14]:
# Ten rows with the largest PageRank value.
df_pagerank.sort_values(by='prlabel', ascending=False).head(10)

Unnamed: 0,timestamp,name,prlabel
77232,1341705593,88,6512.050333
93521,1341705593,2342,3746.267274
191563,1341705593,64911,2335.452547
2955,1341705593,39420,1885.321866
95981,1341705593,14454,1828.696595
120807,1341705593,677,1790.105521
73101,1341705593,2567,1649.711162
62742,1341705593,134095,1599.569856
116004,1341705593,169287,1593.242617
142755,1341705593,1988,1473.269535


### Run chained algorithms at once

In this example, we chain PageRank, Connected Components and Degree algorithms, running them one after another on the graph. Specify all the columns in the output dataframe, including an output column for each algorithm in the chain. **Time to finish: ~4 minutes**

In [15]:
# Output columns requested from NodeList, one or more per chained algorithm.
cols = ["inDegree", "outDegree", "degree", "prlabel", "cclabel"]

# Chain PageRank, ConnectedComponents and Degree on the perspective at
# timestamp 1341705593, then have NodeList emit the requested columns per vertex.
df_chained = (
    rg.at(1341705593)
    .past()
    .transform(pr.algorithms.generic.centrality.PageRank())
    .transform(pr.algorithms.generic.ConnectedComponents)
    .transform(pr.algorithms.generic.centrality.Degree())
    .execute(pr.algorithms.generic.NodeList(*cols))
    .write_to_dataframe(["name", *cols])
)

11:53:05.077 [io-compute-5] INFO  com.raphtory.api.querytracker.QueryProgressTracker - Job PageRank:ConnectedComponents:Degree:NodeList_9077571109020297891: Starting query progress tracker.
11:53:05.098 [spawner-akka.actor.default-dispatcher-5] INFO  com.raphtory.internals.components.querymanager.QueryManager - Query 'PageRank:ConnectedComponents:Degree:NodeList_9077571109020297891' received, your job ID is 'PageRank:ConnectedComponents:Degree:NodeList_9077571109020297891'.
11:53:05.100 [spawner-akka.actor.default-dispatcher-6] INFO  com.raphtory.internals.components.partition.QueryExecutor - PageRank:ConnectedComponents:Degree:NodeList_9077571109020297891_0: Starting QueryExecutor.
11:57:07.024 [spawner-akka.actor.default-dispatcher-11] INFO  com.raphtory.internals.components.querymanager.QueryHandler - Job 'PageRank:ConnectedComponents:Degree:NodeList_9077571109020297891': Perspective at Time '1341705593' took 241923 ms to run. 
11:57:07.024 [spawner-akka.actor.default-dispatcher-3] 

In [19]:
# Drop the unused 'window' column. Rebinding (rather than mutating in place)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# no longer raises KeyError because the column is already gone.
df_chained = df_chained.drop(columns=['window'], errors='ignore')

In [20]:
df_chained

Unnamed: 0,timestamp,name,inDegree,outDegree,degree,prlabel,cclabel
0,1341705593,247216,0,1,1,0.410038,-9223355950273962294
1,1341705593,61013,1,3,4,0.758570,-9223355950273962294
2,1341705593,161960,0,1,1,0.410038,-9223355950273962294
3,1341705593,422612,0,1,1,0.410038,-9223355950273962294
4,1341705593,396362,0,1,1,0.410038,-9223355950273962294
...,...,...,...,...,...,...,...
256486,1341705593,293395,0,1,1,0.410038,-9223355950273962294
256487,1341705593,30364,0,5,5,0.410038,-9223355950273962294
256488,1341705593,84292,0,1,1,0.410038,-8233873337259817729
256489,1341705593,324348,2,0,2,1.107102,288926933003355592


## Create visualisation by adding nodes 🔎

In [29]:
def visualise(rg, df_chained, timestamp=1341705593):
    """Build a pyvis network whose nodes carry their metrics as hover titles.

    The node list is queried from `rg` at `timestamp`; each node's title is
    built from the matching row of `df_chained`, one "column: value" line
    per column other than 'timestamp', 'name' and 'window'.

    Relies on the notebook-level names `pr` (the open PyRaphtory session,
    used to look up the NodeList algorithm) and `Network` (from pyvis).

    Args:
        rg: graph to query for the node list.
        df_chained: dataframe with a 'name' column plus metric columns.
        timestamp: time at which to query the node list; defaults to the
            value this notebook uses for its other queries.

    Returns:
        The populated `Network`; edges are not added (see the note below).
    """
    # Create network object
    net = Network(notebook=True, height='750px', width='100%', bgcolor='#222222', font_color='white')
    # Set visualisation tool
    net.force_atlas_2based()
    # Get the node list
    df_node_list = (
        rg.at(timestamp)
        .past()
        .execute(pr.algorithms.generic.NodeList())
        .write_to_dataframe(['name'])
    )
    nodes = df_node_list['name'].tolist()

    # Index the metric rows by node name once, instead of rescanning the whole
    # dataframe for every node (the previous nested loop was O(nodes * rows)).
    ignore_items = {'timestamp', 'name', 'window'}
    metric_columns = [column for column in df_chained.columns if column not in ignore_items]
    tooltips = {}
    # to_dict('records') keeps each column's own dtype, so integer metrics are
    # not upcast to float the way per-row Series from iterrows() are.
    for record in df_chained.to_dict('records'):
        name = record['name']
        # Keep only the first row per name so titles stay aligned with nodes.
        if name not in tooltips:
            tooltips[name] = ''.join(f'{column}: {record[column]}\n' for column in metric_columns)

    # Exactly one title per node, in node order; nodes without a metrics row
    # get an empty title rather than shifting every later title by one.
    node_data = [tooltips.get(node_name, '') for node_name in nodes]

    # Add the nodes
    net.add_nodes(nodes, title=node_data)
    # NOTE: adding edges is currently disabled. The disabled code was:
    #     df_edge_list = rg.at(timestamp) \
    #             .past() \
    #             .execute(pr.algorithms.generic.EdgeList()) \
    #             .write_to_dataframe(['from', 'to'])
    #     edges = [[row['from'], row['to']] for i, row in df_edge_list[['from', 'to']].iterrows()]
    #     net.add_edges(edges)
    # Toggle physics
    net.toggle_physics(True)
    return net

In [30]:
net = visualise(rg, df_chained)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:63480)
Traceback (most recent call last):
  File "/Users/rachelchan/miniconda3/envs/test3913/lib/python3.9/site-packages/py4j/java_gateway.py", line 982, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/rachelchan/miniconda3/envs/test3913/lib/python3.9/site-packages/py4j/java_gateway.py", line 1132, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 61] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:63480)
Traceback (most recent call last):
  File "/Users/rachelchan/miniconda3/envs/test3913/lib/python3.9/site-packages/py4j/java_gateway.py", line 982, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an em

RuntimeError: No overloaded implementations matched for at with args=(1341705593,) and kwargs={}

In [25]:
net.show('preview.html')

NameError: name 'net' is not defined

In [None]:
pr.shutdown()