# Higgs Boson Tweets PyRaphtory Example Notebook 💥

## Set up environment and download data 💾

Install and import the dependencies needed to build a graph from your data in PyRaphtory.

In [None]:
pip install pyvis

If you would like to use the full dataset, please uncomment the curl command in the cell below and the preview data cell.

In [None]:
from pathlib import Path
from pyraphtory.context import PyRaphtory
from pyraphtory.vertex import Vertex
from pyraphtory.spouts import FileSpout
from pyraphtory.builder import *
from pyvis.network import Network
import csv

# !curl -o /tmp/twitter.csv https://raw.githubusercontent.com/Raphtory/Data/main/higgs-retweet-activity.csv

## Preview data 👀

Preview the retweet twitter data: each line includes the source user A (the retweeter), the destination user B (the user being retweeted) and the time at which the retweet occurs.

In [None]:
# !head /tmp/twitter.csv

## Create a new Raphtory graph 📊

Turn on logs to see what is going on in PyRaphtory. Initialise Raphtory by creating a PyRaphtory object and create your new graph.

In [None]:
graph = PyRaphtory.new_graph()

## Ingest the data into a graph 😋

Write a parsing method to parse your csv file and ultimately create a graph.

To use the full dataset, point `twitter_spout` at `/tmp/twitter.csv` (uncomment the alternative line in the cell below); otherwise keep `higgstestdata.csv` for testing.

In [None]:
def parse(graph, tuple: str):
    """Turn one comma-separated retweet record into graph updates.

    The line is split on commas and each field is stripped of surrounding
    whitespace. Field 0 is the source user, field 1 the target user and
    field 2 the integer timestamp. Both users are added as "User" vertices
    carrying an immutable "name" property, and a "Tweet" edge is added from
    the source to the target, all at the record's timestamp.

    Args:
        graph: Object exposing ``assign_id``, ``add_vertex`` and ``add_edge``.
        tuple: One raw line of the CSV input.
    """
    fields = list(map(str.strip, tuple.split(",")))

    # IDs are assigned before the timestamp is parsed, source first.
    retweeter = fields[0]
    retweeter_id = graph.assign_id(retweeter)
    retweeted = fields[1]
    retweeted_id = graph.assign_id(retweeted)
    when = int(fields[2])

    # Source vertex first, then target vertex, then the edge between them.
    for vertex_id, label in ((retweeter_id, retweeter), (retweeted_id, retweeted)):
        graph.add_vertex(when, vertex_id, Properties(ImmutableProperty("name", label)), Type("User"))
    graph.add_edge(when, retweeter_id, retweeted_id, Type("Tweet"))

# Wrap `parse` in a GraphBuilder; presumably it is invoked once per line the spout reads (confirm in the PyRaphtory docs).
twitter_builder = GraphBuilder(parse)
# Alternative spout for the full dataset at /tmp/twitter.csv (the target of the commented-out curl command earlier in the notebook).
# twitter_spout = FileSpout("/tmp/twitter.csv")
# Default spout: the small test file; the relative path is presumably resolved against the working directory — TODO confirm.
twitter_spout = FileSpout("higgstestdata.csv")
# Pair the spout with the builder as a Source and load it into the graph.
graph.load(Source(twitter_spout, twitter_builder))

## Collect simple metrics 📈

Select certain metrics to show in your output dataframe. Here we have selected vertex name, degree, out degree and in degree. **Time to finish: ~2 to 3 minutes**

In [None]:
from pyraphtory.graph import Row

# One Row per vertex: its name followed by its degree, out-degree and in-degree.
df = (
    graph
    .select(lambda v: Row(v.name(), v.degree(), v.out_degree(), v.in_degree()))
    .to_df(["name", "degree", "out_degree", "in_degree"])
)

#### Clean the dataframe by dropping the unused `window` column 🧹

In [None]:
df.drop(columns=['window'], inplace=True)

### Preview the dataframe 👀

In [None]:
df

**Sort by highest degree, top 10**

In [None]:
df.sort_values(['degree'], ascending=False)[:10]

**Sort by highest in-degree, top 10**

In [None]:
df.sort_values(['in_degree'], ascending=False)[:10]

**Sort by highest out-degree, top 10**

In [None]:
df.sort_values(['out_degree'], ascending=False)[:10]

# Run a PageRank algorithm 📑

Run your selected algorithm on your graph; here we run PageRank. Algorithms are available from the `PyRaphtory` object (`PyRaphtory.algorithms`). Specify the names of the output columns that will hold the algorithm's results, e.g. the additional `prlabel` column in your dataframe. **Time to finish: ~3 to 4 minutes**

In [None]:
cols = ["prlabel"]

# Run PageRank on the perspective selected by .at(1341705593).past()
# (presumably everything up to that timestamp; confirm in the Raphtory docs)
# and collect the vertex name plus the PageRank output column.
df_pagerank = (
    graph.at(1341705593)
    .past()
    .execute(PyRaphtory.algorithms.generic.centrality.PageRank())
    .to_df(["name", *cols])
)

df_pagerank

**Clean your dataframe** 🧹

In [None]:
df_pagerank.drop(columns=['window'], inplace=True)

In [None]:
df_pagerank

**The ten highest-ranked users**

In [None]:
df_pagerank.sort_values(['prlabel'], ascending=False)[:10]

### Run chained algorithms at once

In this example, we chain PageRank, Connected Components and Degree algorithms, running them one after another on the graph. Specify all the columns in the output dataframe, including an output column for each algorithm in the chain. **Time to finish: ~4 minutes**

In [None]:
cols = ["inDegree", "outDegree", "degree", "prlabel", "cclabel"]

# Apply the three algorithms in sequence, then emit the listed columns for every node.
# NOTE(review): ConnectedComponents is passed without being called, unlike
# PageRank() and Degree(); confirm this is what the API expects.
df_chained = (
    graph.at(1341705593)
    .past()
    .transform(PyRaphtory.algorithms.generic.centrality.PageRank())
    .transform(PyRaphtory.algorithms.generic.ConnectedComponents)
    .transform(PyRaphtory.algorithms.generic.centrality.Degree())
    .execute(PyRaphtory.algorithms.generic.NodeList(*cols))
    .to_df(["name", *cols])
)

In [None]:
df_chained.drop(columns=['window'], inplace=True)

In [None]:
df_chained

## Create visualisation by adding nodes 🔎

In [None]:
def visualise(graph, df_chained):
    """Build a pyvis network whose nodes show the chained-algorithm results on hover.

    The node list is read from ``graph`` and each node gets a hover title built
    from its row in ``df_chained``: one ``column: value`` line per column,
    excluding 'timestamp', 'name' and 'window'.

    Args:
        graph: Graph object to query for the node list.
        df_chained: DataFrame with a 'name' column and one column per
            algorithm output.

    Returns:
        The pyvis ``Network`` with nodes added and physics enabled. Edges are
        not added (the edge code below is commented out).
    """
    # Create network object
    net = Network(notebook=True, height='750px', width='100%', bgcolor='#222222', font_color='white')
    # Set visualisation tool
    net.force_atlas_2based()
    # Get the node list
    df_node_list = graph.at(1341705593) \
                .past() \
                .execute(PyRaphtory.algorithms.generic.NodeList()) \
                .to_df(['name'])

    nodes = df_node_list['name'].tolist()

    # Build a name -> hover-text lookup in a single pass over df_chained instead
    # of rescanning the whole dataframe for every node. `Series.items()` is used
    # because `iteritems()` was removed in pandas 2.0.
    ignore_items = ('timestamp', 'name', 'window')
    titles_by_name = {}
    for _, row in df_chained.iterrows():
        name = row['name']
        if name in titles_by_name:
            # Keep the first row seen for a name so each node gets one title.
            continue
        titles_by_name[name] = ''.join(
            f'{k!s}: {v!s}\n' for k, v in row.items() if k not in ignore_items
        )

    # Exactly one title per node (empty when the node has no row), because
    # add_nodes requires the title list to match the node list in length.
    node_data = [titles_by_name.get(node_name, '') for node_name in nodes]

    # Add the nodes
    net.add_nodes(nodes, title=node_data)
    # Get the edge list
#     df_edge_list = graph.at(1341705593) \
#             .past() \
#             .execute(PyRaphtory.algorithms.generic.EdgeList()) \
#             .write_to_dataframe(['from', 'to'])
#     edges = []
#     for i, row in df_edge_list[['from', 'to']].iterrows():
#         edges.append([row['from'], row['to']])
#     # Add the edges
#     net.add_edges(edges)
    # Toggle physics
    net.toggle_physics(True)
    return net

In [None]:
net = visualise(graph, df_chained)

In [None]:
net.show('preview.html')

In [None]:
# NOTE(review): as written this is an attribute access without a call.
# If `close_graphs` is a method, it needs parentheses to actually run — confirm against the PyRaphtory API.
PyRaphtory.close_graphs