# Project 3 Neo4j graph on BART model

University of California, Berkeley

Master of Information and Data Science (MIDS) program

w205 - Fundamentals of Data Engineering


# Included Modules and Packages

In [1]:
import math
import numpy as np
import pandas as pd
import psycopg2
import neo4j
import csv
from geographiclib.geodesic import Geodesic

# Supporting code

In [2]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [3]:
session = driver.session(database="neo4j")

In [4]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

In [5]:
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)

In [6]:
def my_neo4j_create_node(station_name):
    "create a node with label Station"
    
    query = """
    
    CREATE (:Station {name: $station_name})
    
    """
    
    session.run(query, station_name=station_name)

In [7]:
def my_neo4j_create_relationship_one_way(from_station, to_station, weight):
    "create a relationship one way between two stations with a weight"
    
    query = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to)
    
    """
    
    session.run(query, from_station=from_station, to_station=to_station, weight=weight)
    

In [8]:
def my_neo4j_create_relationship_two_way(from_station, to_station, weight):
    "create relationships two way between two stations with a weight"
    
    query = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to),
           (to)-[:LINK {weight: $weight}]->(from)
    
    """
    
    session.run(query, from_station=from_station, to_station=to_station, weight=weight)

In [9]:
def my_calculate_box(point, miles):
    "Given a point and miles, calculate the box in form left, right, top, bottom"
    
    geod = Geodesic.WGS84

    kilometers = miles * 1.60934
    meters = kilometers * 1000

    g = geod.Direct(point[0], point[1], 270, meters)
    left = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 90, meters)
    right = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 0, meters)
    top = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 180, meters)
    bottom = (g['lat2'], g['lon2'])
    
    return(left, right, top, bottom)

In [10]:
def my_station_get_zips(station, miles):
    "given a station, pull all zip codes with miles distance, print them, sum the population"
    
    connection.rollback()
    
    query = "select latitude, longitude from stations "
    query += "where station = '" + station + "'"
    
    cursor.execute(query)
    
    connection.rollback()
    
    rows = cursor.fetchall()
    
    for row in rows:
        latitude = row[0]
        longitude = row[1]
        
    point = (latitude, longitude)
        
    (left, right, top, bottom) = my_calculate_box(point, miles)
    
    query = "select zip, population from zip_codes "
    query += " where latitude >= " + str(bottom[0])
    query += " and latitude <= " + str(top [0])
    query += " and longitude >= " + str(left[1])
    query += " and longitude <= " + str(right[1])
    query += " order by 1 "

    cursor.execute(query)
    
    connection.rollback()
    
    rows = cursor.fetchall()
    
    print("\n-------------------------------------------------------------------------------")
    print("  Zip Codes within " + str(miles) + " mile(s) of " + station + " BART Station")
    print("-------------------------------------------------------------------------------\n")
    
    total_population = 0
    
    for row in rows:
        zip = row[0]
        population = row[1]
        print("     zip:", zip, "  population: ", f'{population:10,}')
        total_population += population
        
    
    print("\n-------------------------------------------------------------------------------")
    print("  Total Population: ", f'{total_population:10,}')
    print("-------------------------------------------------------------------------------")

In [11]:
def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"
   
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")


In [12]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [13]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [14]:
cursor = connection.cursor()

# Process Exit Data

In [15]:
connection.rollback()

query = """

drop table if exists capacity;

"""

cursor.execute(query)

connection.commit()

In [16]:
connection.rollback()

query = """

create table capacity (
    weekday date,
    entry_station varchar(32),
    exit_station varchar(32),
    exits_count numeric(32),
    entry_station_full_name varchar(128),
    exit_station_full_name varchar(128)
);

"""

cursor.execute(query)

connection.commit()

In [17]:
def my_read_csv_file(file_name, limit):
    "read the csv file and print only the first limit rows"
    
    csv_file = open(file_name, "r")
    
    csv_data = csv.reader(csv_file)
    
    i = 0
    
    for row in csv_data:
        i += 1
        if i <= limit:
            print(row)
            
    print("\nPrinted ", min(limit, i), "lines of ", i, "total lines.")
    
my_read_csv_file("BART_entry_exist.csv", limit=10)

['weekday', 'entry_station', 'exit_station', 'exits count', 'entry_station_full_name', 'exit_station_full_name']
['2020-01-01 00:00:00', 'RM', 'RM', '19.857142857142900', 'Richmond', 'Richmond']
['2020-01-01 00:00:00', 'EN', 'RM', '119.52380952381000', 'El Cerrito Del Norte', 'Richmond']
['2020-01-01 00:00:00', 'EP', 'RM', '86.61904761904760', 'El Cerrito Plaza', 'Richmond']
['2020-01-01 00:00:00', 'NB', 'RM', '70.23809523809520', 'North Berkeley', 'Richmond']
['2020-01-01 00:00:00', 'BK', 'RM', '389.5238095238100', 'Berkeley', 'Richmond']
['2020-01-01 00:00:00', 'AS', 'RM', '93.47619047619050', 'Ashby', 'Richmond']
['2020-01-01 00:00:00', 'MA', 'RM', '140.0', 'MacArthur', 'Richmond']
['2020-01-01 00:00:00', '19', 'RM', '168.61904761904800', '19th Street Oakland', 'Richmond']
['2020-01-01 00:00:00', '12', 'RM', '183.04761904761900', '12th Street / Oakland City Center', 'Richmond']

Printed  10 lines of  105001 total lines.


In [18]:
connection.rollback()

query = """

copy capacity
from '/user/projects/project-3-gonz-ga-ds/code/BART_entry_exist.csv' delimiter ',' NULL '' csv header;

"""

cursor.execute(query)

connection.commit()

In [19]:
# get capacity df
rollback_before_flag = True
rollback_after_flag = True

query = """

with entry_station_count as (
                            select
                                    avg(capacity.exits_count) as entry_count_month_avg,
                                    entry_station_full_name as station
                            from capacity
                            group by entry_station_full_name
                            ),
    exit_station_count as (
                            select
                                    avg(capacity.exits_count) as exit_count_month_avg,
                                    exit_station_full_name as station
                            from capacity
                            group by exit_station_full_name
                            )               

select 
        entry_station_count.station as station,
        entry_station_count.entry_count_month_avg,
        exit_station_count.exit_count_month_avg,
        (entry_station_count.entry_count_month_avg + exit_station_count.exit_count_month_avg) as total_monthly_capacity
from entry_station_count join exit_station_count on entry_station_count.station = exit_station_count.station
order by station;

"""

capacity_df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)


# Concatenate two df
supplimentary_list = []
def station_name_process(x):
    x = x.strip()
    if "/" in x:
        x1 = x[0:x.find("/")].strip()
        x2 = x[x.find("/")+1:len(x)].strip()
        supplimentary_list.append([x2,x1])
        return x1
    elif x == "Berkeley":
        return "Downtown Berkeley"
    elif x == "Oakland International Airport":
        return "OAK"
    elif x == "19th Street Oakland":
        return "19th Street"
    elif x == "San Francisco International Airport":
        return "SFO"
    else:
        return x

capacity_df["station"] = capacity_df["station"].apply(lambda x:station_name_process(x))


for i in supplimentary_list:
    append_list = capacity_df.loc[capacity_df["station"]==i[1]].values[0]
    append_list[0] = i[0]
    capacity_df.loc[len(capacity_df)+1] = append_list


capacity_df["station_key"] = capacity_df["station"].str.lower()
capacity_df["station_key"] = capacity_df["station_key"].apply(lambda x:x.replace(" ",""))

capacity_df

Unnamed: 0,station,entry_count_month_avg,exit_count_month_avg,total_monthly_capacity,station_key
0,12th Street,67.561905,66.128095,133.69,12thstreet
1,16th Street Mission,78.91381,79.128571,158.042381,16thstreetmission
2,19th Street,61.720952,60.296667,122.017619,19thstreet
3,24th Street Mission,77.336667,76.18619,153.522857,24thstreetmission
4,Antioch,25.208095,25.00619,50.214286,antioch
5,Ashby,27.212857,26.24,53.452857,ashby
6,Balboa Park,64.512857,58.51619,123.029048,balboapark
7,Bayfair,37.126667,37.597619,74.724286,bayfair
8,Downtown Berkeley,63.204286,66.143333,129.347619,downtownberkeley
9,Berryessa,11.439524,11.944286,23.38381,berryessa


## Define Weights Factors

In [20]:
factor_1 = 190
factor_2 = 0.01

# 1.0 Algorithm for Louvain Modularity

In [21]:
my_neo4j_wipe_out_database()

In [22]:
connection.rollback()

query = """

select station
from stations
order by station

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    station = row[0]
    
    my_neo4j_create_node(station)

## Verifying number of nodes and relationships

In [23]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 50
  Relationships: 0
-------------------------


## Looking at query

In [24]:
rollback_before_flag = True
rollback_after_flag = True

query = """

(
select distinct a.station as from_station, b.station as to_station, t.travel_time
from lines a
  join lines b
    on a.line = b.line and b.sequence = (a.sequence + 1)
  join travel_times t
    on (a.station = t.station_1 and b.station = t.station_2)
        or (a.station = t.station_2 and b.station = t.station_1)
where a.station < b.station
)
UNION
(
select distinct b.station as from_station, a.station as to_station, t.travel_time
from lines a
  join lines b
    on a.line = b.line and b.sequence = (a.sequence + 1)
  join travel_times t
    on (a.station = t.station_1 and b.station = t.station_2)
        or (a.station = t.station_2 and b.station = t.station_1)
where a.station > b.station
)
order by 1, 2, 3

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,from_station,to_station,travel_time
0,12th Street,19th Street,120
1,12th Street,Lake Merritt,180
2,12th Street,West Oakland,300
3,16th Street Mission,24th Street Mission,120
4,16th Street Mission,Civic Center,180
5,19th Street,MacArthur,180
6,24th Street Mission,Glen Park,180
7,Antioch,Pittsburg Center,420
8,Ashby,Downtown Berkeley,180
9,Ashby,MacArthur,240


## Creating a relationships

In [25]:
connection.rollback()

query = """

(
select distinct a.station as from_station, b.station as to_station, t.travel_time
from lines a
  join lines b
    on a.line = b.line and b.sequence = (a.sequence + 1)
  join travel_times t
    on (a.station = t.station_1 and b.station = t.station_2)
        or (a.station = t.station_2 and b.station = t.station_1)
where a.station < b.station
)
UNION
(
select distinct b.station as from_station, a.station as to_station, t.travel_time
from lines a
  join lines b
    on a.line = b.line and b.sequence = (a.sequence + 1)
  join travel_times t
    on (a.station = t.station_1 and b.station = t.station_2)
        or (a.station = t.station_2 and b.station = t.station_1)
where a.station > b.station
)
order by 1, 2, 3

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    from_station = row[0]
    to_station = row[1]
    to_station_key = to_station.lower().replace(" ","")
    # find the total_daily_exit_number for to_station
    daily_exit_number = float(capacity_df.loc[capacity_df["station_key"] == to_station_key,"total_monthly_capacity"].values[0])
    travel_time = int(row[2])
    input_weight = (factor_1 * (1/travel_time)) + (factor_2 * daily_exit_number)
    my_neo4j_create_relationship_two_way(from_station, to_station, input_weight)

In [26]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 50
  Relationships: 102
-------------------------


## Louvain Modularity algorithm on simplified graph

In [27]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f24bfc34580>

In [28]:
query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,community,intermediate_community
0,Ashby,2,"[15, 2]"
1,Downtown Berkeley,2,"[15, 2]"
2,El Cerrito Plaza,2,"[38, 2]"
3,El Cerrito del Norte,2,"[38, 2]"
4,North Berkeley,2,"[15, 2]"
5,Richmond,2,"[38, 2]"
6,16th Street Mission,10,"[3, 10]"
7,24th Street Mission,10,"[3, 10]"
8,Civic Center,10,"[10, 10]"
9,Glen Park,10,"[3, 10]"


## How many communities

In [29]:
query = """

CALL gds.louvain.stream('ds_graph')
YIELD communityId
RETURN count(DISTINCT communityId) AS totalCommunities

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,totalCommunities
0,8


## Compute the capacity for each community

In [30]:
# get capacity df
rollback_before_flag = True
rollback_after_flag = True

query = """

with entry_station_count as (
                            select
                                    avg(capacity.exits_count) as entry_count_month_avg,
                                    entry_station_full_name as station
                            from capacity
                            group by entry_station_full_name
                            ),
    exit_station_count as (
                            select
                                    avg(capacity.exits_count) as exit_count_month_avg,
                                    exit_station_full_name as station
                            from capacity
                            group by exit_station_full_name
                            )               

select 
        entry_station_count.station as station,
        entry_station_count.entry_count_month_avg,
        exit_station_count.exit_count_month_avg,
        (entry_station_count.entry_count_month_avg + exit_station_count.exit_count_month_avg) as total_monthly_capacity
from entry_station_count join exit_station_count on entry_station_count.station = exit_station_count.station
order by station;

"""

capacity_df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

# get community df
query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

community_df = my_neo4j_run_query_pandas(query)

# Concatenate two df
supplimentary_list = []
def station_name_process(x):
    x = x.strip()
    if "/" in x:
        x1 = x[0:x.find("/")].strip()
        x2 = x[x.find("/")+1:len(x)].strip()
        supplimentary_list.append([x2,x1])
        return x1
    elif x == "Berkeley":
        return "Downtown Berkeley"
    elif x == "Oakland International Airport":
        return "OAK"
    elif x == "19th Street Oakland":
        return "19th Street"
    elif x == "San Francisco International Airport":
        return "SFO"
    else:
        return x

capacity_df["station"] = capacity_df["station"].apply(lambda x:station_name_process(x))


for i in supplimentary_list:
    append_list = capacity_df.loc[capacity_df["station"]==i[1]].values[0]
    append_list[0] = i[0]
    capacity_df.loc[len(capacity_df)+1] = append_list

community_df["station_key"] = community_df["name"].str.lower()
community_df["station_key"] = community_df["station_key"].apply(lambda x:x.replace(" ",""))
capacity_df["station_key"] = capacity_df["station"].str.lower()
capacity_df["station_key"] = capacity_df["station_key"].apply(lambda x:x.replace(" ",""))

community_capcity_df = pd.merge(left=community_df,right=capacity_df,on="station_key",how="left",indicator=False)

community_capcity_full = community_capcity_df.copy()
community_capcity_full = community_capcity_full.groupby("community").sum()
community_capcity_full = community_capcity_full.sort_values("total_monthly_capacity")
community_capcity_full

Unnamed: 0_level_0,entry_count_month_avg,exit_count_month_avg,total_monthly_capacity
community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
45,140.972857,139.822857,280.795714
31,172.496667,171.447143,343.94381
46,173.267619,171.179524,344.447143
2,228.001905,230.008571,458.010476
14,256.28,244.897619,501.177619
23,258.589524,258.24619,516.835714
10,512.202381,478.041429,990.24381
29,513.484286,561.651905,1075.13619


# 2.0 Algorithm for Finding shortest Path

In [31]:
my_neo4j_wipe_out_database()

In [32]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 0
  Relationships: 0
-------------------------


## Creating a relationships

In [33]:
connection.rollback()

query = """

select station
from stations
order by station

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    station = row[0]
    
    my_neo4j_create_node('depart ' + station)
    my_neo4j_create_node('arrive ' + station)
    

In [34]:
connection.rollback()

query = """

select station, line
from lines
order by station, line

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    station = row[0]
    line = row[1]
    
    depart = 'depart ' + station
    arrive = 'arrive ' + station
    line_station = line + ' ' + station
    
    my_neo4j_create_node(line_station)
    my_neo4j_create_relationship_one_way(depart, line_station, 0)
    my_neo4j_create_relationship_one_way(line_station, arrive, 0)
    

In [35]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 214
  Relationships: 228
-------------------------


In [36]:
connection.rollback()

query = """

select a.station, a.line as from_line, b.line as to_line, s.transfer_time
from lines a
     join lines b
       on a.station = b.station and a.line <> b.line 
     join stations s
       on a.station = s.station
order by 1, 2, 3

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    station = row[0]
    from_line = row[1]
    to_line = row[2]
    transfer_time = int(row[3])
    
    from_station = from_line + ' ' + station
    to_station = to_line + ' ' + station
    
    # === Create Weights
    to_station_key = station.lower().replace(" ","")
    # find the total_daily_exit_number for to_station
    daily_exit_number = float(capacity_df.loc[capacity_df["station_key"] == to_station_key,"total_monthly_capacity"].values[0])
    # compute the weight
    input_weight = (factor_1 * (1/transfer_time)) + (factor_2 * daily_exit_number)
    my_neo4j_create_relationship_one_way(from_station, to_station, input_weight)

In [37]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 214
  Relationships: 436
-------------------------


In [38]:
connection.rollback()

query = """

select a.line, a.station as from_station, b.station as to_station, t.travel_time
from lines a
  join lines b
    on a.line = b.line and b.sequence = (a.sequence + 1)
  join travel_times t
    on (a.station = t.station_1 and b.station = t.station_2)
        or (a.station = t.station_2 and b.station = t.station_1)
order by line, from_station, to_station

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    line = row[0]
    from_station = line + ' ' + row[1]
    to_station = line + ' ' + row[2]
    travel_time = int(row[3])

    # === Create Weights
    to_station_key = row[2].lower().replace(" ","")
    # find the total_daily_exit_number for to_station
    daily_exit_number = float(capacity_df.loc[capacity_df["station_key"] == to_station_key,"total_monthly_capacity"].values[0])
    # compute the weight
    weight = (factor_1 * (1/transfer_time)) + (factor_2 * daily_exit_number)
    my_neo4j_create_relationship_two_way(from_station, to_station, input_weight)
    

In [39]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 214
  Relationships: 652
-------------------------


In [40]:
def my_neo4j_shortest_path(from_station, to_station):
    "given a from station and to station, run and print the shortest path"
    
    query = "CALL gds.graph.drop('ds_graph', false)"
    session.run(query)

    query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
    session.run(query)

    query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    result = session.run(query, source=from_station, target=to_station)
    
#     for r in result:
        
#         total_cost = int(r['totalCost'])
        
#         print("\n--------------------------------")
#         print("   Total Cost: ", total_cost)
#         print("   Minutes: ", round(total_cost / 60.0,1))
#         print("--------------------------------")
        
#         nodes = r['nodes']
#         costs = r['costs']
        
#         i = 0
#         previous = 0
        
#         for n in nodes:
            
#             print(n + ", " + str(int(costs[i]) - previous)  + ", " + str(int(costs[i])))
            
#             previous = int(costs[i])
#             i += 1
    for r in result:
        total_cost = int(r['totalCost'])
    return round(total_cost / 60.0,1)

In [41]:
delivery_station_df = community_capcity_df[['name' , 'community', 'total_monthly_capacity']].groupby(['community']).max()
from_stations = []
to_stations = []
time = []
from_station = delivery_station_df.loc[delivery_station_df['total_monthly_capacity'].idxmax()]['name']
for to_station in delivery_station_df['name'].to_list():
    if from_station != to_station:
        from_stations.append(from_station)
        to_stations.append(to_station)
        time.append(my_neo4j_shortest_path('depart ' + from_station, 'arrive ' + to_station))
delivery_df = pd.DataFrame({'From Station': from_stations, 'To Station': to_stations,
                            'Weight': time })
delivery_df.sort_values(['Weight'])

Unnamed: 0,From Station,To Station,Weight
1,West Oakland,Powell Street,0.1
6,West Oakland,Rockridge,0.1
0,West Oakland,Richmond,0.2
3,West Oakland,West Dublin,0.2
4,West Oakland,Walnut Creek,0.2
2,West Oakland,South San Francisco,0.3
5,West Oakland,Warm Springs,0.3


# 3.0 Alogorithm for Centrality

## Betweenness Centrality

In [42]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f24bfbfc7f0>

In [43]:
query = """

CALL gds.betweenness.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

result = my_neo4j_run_query_pandas(query)
top_10 = result.head(10)
top_10["station_key"] = top_10["name"].apply(lambda x: x[x.find(" ")+1:len(x)].lower().replace(" ",""))
top_10 = pd.merge(left = capacity_df, right = top_10, on="station_key",how="right",indicator=False)
top_10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_10["station_key"] = top_10["name"].apply(lambda x: x[x.find(" ")+1:len(x)].lower().replace(" ",""))


Unnamed: 0,station,entry_count_month_avg,exit_count_month_avg,total_monthly_capacity,station_key,name,betweenness
0,MacArthur,52.001905,51.069524,103.071429,macarthur,yellow MacArthur,5999.809223
1,Rockridge,27.541429,28.32,55.861429,rockridge,yellow Rockridge,5509.0
2,Lake Merritt,41.017619,42.088095,83.105714,lakemerritt,orange Lake Merritt,5155.831877
3,12th Street,67.561905,66.128095,133.69,12thstreet,orange 12th Street,5139.715461
4,Orinda,14.353333,13.802381,28.155714,orinda,yellow Orinda,4997.0
5,19th Street,61.720952,60.296667,122.017619,19thstreet,yellow 19th Street,4820.250748
6,Fruitvale,59.342857,59.308095,118.650952,fruitvale,orange Fruitvale,4641.959661
7,Lafayette,17.65,17.690952,35.340952,lafayette,yellow Lafayette,4469.0
8,12th Street,67.561905,66.128095,133.69,12thstreet,yellow 12th Street,4423.507563
9,Coliseum,39.727619,39.75,79.477619,coliseum,orange Coliseum,4306.942363


## Wasserman and Faust

In [44]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f24bfc83580>

In [45]:
query = """

CALL gds.beta.closeness.stream('ds_graph',
                               {useWassermanFaust: true}
                              )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

result = my_neo4j_run_query_pandas(query)
top_10 = result.head(10)
top_10["station_key"] = top_10["name"].apply(lambda x: x[x.find(" ")+1:len(x)].lower().replace(" ",""))
top_10 = pd.merge(left = capacity_df, right = top_10, on="station_key",how="right",indicator=False)
top_10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_10["station_key"] = top_10["name"].apply(lambda x: x[x.find(" ")+1:len(x)].lower().replace(" ",""))


Unnamed: 0,station,entry_count_month_avg,exit_count_month_avg,total_monthly_capacity,station_key,name,closeness
0,West Oakland,49.066667,45.483333,94.55,westoakland,yellow West Oakland,0.105979
1,West Oakland,49.066667,45.483333,94.55,westoakland,green West Oakland,0.105531
2,West Oakland,49.066667,45.483333,94.55,westoakland,red West Oakland,0.105263
3,West Oakland,49.066667,45.483333,94.55,westoakland,blue West Oakland,0.104821
4,12th Street,67.561905,66.128095,133.69,12thstreet,yellow 12th Street,0.103775
5,12th Street,67.561905,66.128095,133.69,12thstreet,orange 12th Street,0.103174
6,12th Street,67.561905,66.128095,133.69,12thstreet,red 12th Street,0.103089
7,Lake Merritt,41.017619,42.088095,83.105714,lakemerritt,green Lake Merritt,0.102327
8,Embarcadero,186.275238,214.303333,400.578571,embarcadero,yellow Embarcadero,0.101993
9,Lake Merritt,41.017619,42.088095,83.105714,lakemerritt,orange Lake Merritt,0.101993
