# <center>Graph Search</center>

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Graph Search And Connectivity
<img src="Images/BFS_vs_DFS.png" width="600"/>

In [10]:
graph_example = {0: [1, 2],
                 1: [0, 3],
                 2: [0, 3, 4],
                 3: [1, 2, 4, 5],
                 4: [2, 3, 5],
                 5: [3, 4]}

## Breath First Search (BFS)

### Algorithm description
<img src="Images/BFS_Code.png" width="600"/>

In [34]:
def BFS(graph:list, start_vertex:int)-> list:
    '''
        graph list 
        start_vertex int
    '''
    visited, queue, order = [start_vertex], [start_vertex], [start_vertex]
    index = 1
    layers = {}
    layers[0] = [start_vertex]
    
    while queue:
        v = queue[0]
        vertex = queue.pop(0)
        layers[index] = []
        for w in graph[v]:
            if w not in visited:
                layers[index].append(w)
                visited.append(w)
                order.append(w)
                queue.append(w)
        index +=1 
    print(layers)
    return order

BFS(graph_example, 0)

'''
Queue description
s
a b
b c 
c d
d e
e

'''

{0: [0], 1: [1, 2], 2: [3], 3: [4], 4: [5], 5: [], 6: []}


[0, 1, 2, 3, 4, 5]

### Shortest Paths    
<img src="Images/BFS_ShortestPath.png" width="600"/>

### Connected Components via BFS
<img src="Images/BFS_Connected_Components.png" width="600"/>

## Deep First Search (DFS)

### Algorithm description
<img src="Images/DFS_properties.png" width="600"/>
<img src="Images/DFS_Code.png" width="600"/>

### Strictly Connected Components (SCC)
<img src="Images/DFS_SCC.png" width="600"/>
<img src="Images/DFS_Kosaraju.png" width="600"/>
<img src="Images/DFS_SCC_Code.png" width="600"/>

The file contains the edges of a directed graph. Vertices are labeled as positive integers from 1 to 875714. Every row indicates an edge, the vertex label in first column is the tail and the vertex label in second column is the head (recall the graph is directed, and the edges are directed from the first column vertex to the second column vertex). So for example, the 11th row looks liks : "2 47646". This just means that the vertex with label 2 has an outgoing edge to the vertex with label 47646

Your task is to code up the algorithm from the video lectures for computing strongly connected components (SCCs), and to run this algorithm on the given graph.

Output Format: You should output the sizes of the 5 largest SCCs in the given graph, in decreasing order of sizes, separated by commas (avoid any spaces). So if your algorithm computes the sizes of the five largest SCCs to be 500, 400, 300, 200 and 100, then your answer should be "500,400,300,200,100" (without the quotes). If your algorithm finds less than 5 SCCs, then write 0 for the remaining terms. Thus, if your algorithm computes only 3 SCCs whose sizes are 400, 300, and 100, then your answer should be "400,300,100,0,0" (without the quotes).  (Note also that your answer should not have any spaces in it.)

WARNING: This is the most challenging programming assignment of the course. Because of the size of the graph you may have to manage memory carefully. The best way to do this depends on your programming language and environment, and we strongly suggest that you exchange tips for doing this on the discussion forums.

### Réponse tenté
613052,121,113,76,70

In [1]:
# SCC algorithm with only list and global variables
import os, sys, threading

SCC_file = '../Data/SCC.txt'
filepath = os.path.join(os.getcwd(), SCC_file)
nbr_nodes = 875715

def load_data(filepath:str, nbr_nodes:int)->list:
    '''
        Create graph/graph reverse as lists from the input file.
    '''
    file = open(filepath,"r")
    data = file.readlines()
    G, Grev = [[] for x in range(nbr_nodes)], [[] for x in range(nbr_nodes)]
    for line in data:
        node_start, node_end = list(map(lambda x: int(x),line.split())) 
        G[node_start].append(node_end); Grev[node_end].append(node_start)
    return G, Grev   

def DFSrev(Grev:list, i:int)->None:
    '''
        Apply DFS from a defined node on Grev.
        Give a finishing time to every node of the graph to compute "magical ordering" of the nodes.
    '''
    global t, is_explored, finishing_time
    is_explored[i] = True
    for j in Grev[i]:
        if not is_explored[j]:
            DFSrev(Grev, j)
    t += 1
    finishing_time[i] = t 

def DFSrev_Loop(Grev:list)->None:
    '''
        Loop DFS on every node of the graph Grev.
    '''
    global t, is_explored, finishing_time
    is_explored = [False for x in range(nbr_nodes)]
    t = 0
    for i in range(len(Grev)-1, 0, -1):
        if not is_explored[i]:
            DFSrev(Grev, i)

def DFS(G:list, i:int)->None:
    '''
        Apply DFS from a defined node on G.
        Find the SCCs in the graph and list all the leaders.
    '''
    global is_explored, scc_size
    is_explored[i] = True
    for j in G[i]:
        if is_explored[j]:
            DFS(G, j)
    scc_size += 1                      

def DFS_Loop(G:list)->None:
    '''
        Loop DFS on every node of the graph G.
    '''
    global is_explored, finishing_time, SCCs, scc_size
    is_explored = [False for x in range(nbr_nodes)]
    for i in range(len(G)-1, 0, -1):
        if not is_explored[i]:
            scc_size = 0
            DFS(G, i)
            SCCs.append(scc_size)
            
def Korasaju(filepath:str, nbr_nodes:int)->list:
    '''
        Apply Korasaju's two passes algorithms to find the five biggest Strongly Connected Components(SCC).
    '''
    global finishing_time, SCCs
    finishing_time = [[] for x in range(nbr_nodes)]
    SCCs = []
    G, Grev = load_data(filepath, nbr_nodes)     # 1 - Let Grev = G with all arcs reversed
    DFSrev_Loop(Grev)                            # 2 - Run DFS-Loop on Grev. Give a finishing time to every node of the graph to compute "magical ordering" of the nodes.
    DFS_Loop(G)                                  # 3 - Run DFS-Loop on G. Processinig all nodes by finishing time.
    return sorted(SCCs, reverse=True)[:5]

In [None]:
threading.stack_size(67108864) # 64MB stack
sys.setrecursionlimit(2 ** 20)  # approx 1 million recursions

Korasaju(filepath, nbr_nodes)

Loop 1
Point 875714 t=1
Point 272881 t=100000
Point 681209 t=200000
Point 546135 t=300000
Point 32143 t=400000
Point 32925 t=500000
Point 712167 t=600000
Point 697447 t=700000
Point 437484 t=800000
Loop 2
Point 552145 t=1
Point 645981 t=100000
Point 757394 t=200000
Point 677058 t=300000
Point 262401 t=400000
Point 558375 t=500000
Point 847561 t=600000
Point 643447 t=700000
Point 349298 t=800000
[613052, 121, 113, 76, 70]


In [6]:
from collections import defaultdict
import sys
import threading


class Track():
    """ Use Track() class to record all the information """
    def __init__(self):
        self.visited = set()
        self.current_time = 0
        self.current_source = []
        self.leader = defaultdict(list)
        self.finishing_times = {}

    def addNode(self, node):
        """ Add node to a leader dictionary """
        self.leader[self.current_source].append(node)


def dfs(graph, start, track):
    """ DFS on the graph from start vertex"""
    track.visited.add(start)
    track.addNode(start)
    for v in graph[start]:
        if v not in track.visited:
            dfs(graph, v, track)

    track.current_time += 1
    track.finishing_times[start] = track.current_time


def dfs_loop(graph, nodes, track):
    """ Loop over the nodes on the graph using DFS """
    for node in nodes:
        if node not in track.visited:
            track.current_source = node
            dfs(graph, node, track)


def load_graph(filename):
    """ Load the graph structure from the txt file """
    edges = []
    with open(filename) as f:
        for lines in f:
            line = lines.split()
            edges.append((int(line[0]), int(line[1])))

    nodes = list(set([v for edge in edges for v in edge]))
    graph = {i: [] for i in range(1, len(nodes) + 1)}
    graph_rev = {i: [] for i in range(1, len(nodes) + 1)}
    for edge in edges:
        graph[edge[0]].append(edge[1])
        graph_rev[edge[1]].append(edge[0])

    return graph, graph_rev, nodes


def scc(graph, graph_rev, nodes):
    """ Compute the SCC components """
    track = Track()
    dfs_loop(graph_rev, nodes, track)
    sorted_nodes = sorted(track.finishing_times, key=track.finishing_times.get, reverse=True)
    track = Track()
    dfs_loop(graph, sorted_nodes, track)
    return track


def most_common(leader, x):
    """ Find the top x elements """
    results = [len(v) for k, v in leader.items()] + [0] * x
    return sorted(results, reverse=True)[: x]


def main():
    graph, graph_rev, nodes = load_graph(filepath)
    track = scc(graph, graph_rev, nodes)
    assert most_common(track.leader, 5) == [6, 3, 2, 1, 0]

    graph, graph_rev, nodes = load_graph("SCC.txt")
    track = scc(graph, graph_rev, nodes)
    print(most_common(track.leader, 5))

In [None]:
main()