# Network creation trials

This notebook focuses on creating a graph from our dataset. After defining the functions we'll use, we show a toy model of a couple of streets, then the network of the entire dataset, and finally the network of the entire dataset minus TANGENZIALI and large adjacent roads.

### Imports and function definitions

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import contextily as cx
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
#retrieve the dataset
# Sets `vehicle_path` to the .shp file, downloading and unpacking the archive
# only when no 'dataset_vehicles' entry exists in the current working directory.
import os
import zipfile
cwd = os.getcwd()
dataname = 'dataset_vehicles'
if os.path.exists(dataname):
    # Already extracted by a previous run: point at the shapefile (relative path).
    vehicle_path = dataname + '/dataset_vehicles/AC_VEI_AC_VEI_SUP_SR.shp'
else:
    # IPython shell escape (only valid inside a notebook): fetch the zipped dataset.
    !wget -nv -O dataset_vehicles.zip https://github.com/RiccardoBasilone/roadnets/raw/master/dataset_vehicles.zip -nc
    zip_filename = 'dataset_vehicles.zip'
    extract_dir = './dataset_vehicles'
    # Unpack into ./dataset_vehicles; the path below expects the archive to
    # contain its own top-level 'dataset_vehicles' folder.
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # Same file as in the branch above, expressed as an absolute path.
    vehicle_path = os.path.join(cwd,'dataset_vehicles/dataset_vehicles/AC_VEI_AC_VEI_SUP_SR.shp')

In [None]:
# Load the shapefile located by the previous cell into a GeoDataFrame.
gdf = gpd.read_file(vehicle_path)

In [None]:
#initial cleaning

# Drop the attribute columns that are not used later on
# ('AC_VEI_LIV' and 'AC_VEI_SED' are deliberately kept).
gdf = gdf.drop(columns=['AC_VEI_FON', 'CLASSREF'])

# Give the remaining key columns short English names.
gdf = gdf.rename(columns={'SUBREGID':'ID', 'NOME': 'NAME', 'AC_VEI_ZON': 'TYPE'})

# TYPE codes starting with 01 are portions of road (e.g not intersections or parking lots);
# TYPE codes starting with 02 are intersections, squares, and roundabouts.
# Everything else is discarded. Tangenziali are NOT removed at this stage.
pattern1 = ('01','02')
keep_rows = gdf.TYPE.str.startswith(pattern1)
gdf = gdf.loc[keep_rows]

In [None]:
#we use openstreetmap crs, because it's a common projected (not geographic) CRS --> useful for calculating distances
# NOTE(review): EPSG:3857 is Web Mercator, in which ground distances are stretched by
# roughly 1/cos(latitude). Buffer distances, lengths and widths computed downstream are
# therefore in projected units rather than true metres — confirm this is acceptable.
OSM_crs = 3857
# Reproject the whole dataset in place.
gdf.to_crs(epsg=OSM_crs, inplace = True)

In [None]:
#divides gdf into intersections and roads

def ints_and_roads(gdf):
    """Split a dataset into its intersections and its roads.

    Rows are classified by the prefix of their ``TYPE`` code:
    codes starting with ``'02'`` are intersections, codes starting with
    ``'01'`` are roads (this includes codes starting with ``'0102'``).

    Parameters
    ----------
    gdf : frame with a string column named ``TYPE``.

    Returns
    -------
    (ints, roads) : the two row subsets of ``gdf``, in that order, keeping
        the original index labels and row order.
    """
    type_code = gdf.TYPE.str
    road_mask = type_code.startswith('01')
    intersection_mask = type_code.startswith('02')
    return gdf.loc[intersection_mask], gdf.loc[road_mask]

In [None]:
#function creates geodataframe with all streets of gdf within distance dist (in meters) of street.
#street is a geodataframe, dist is a positive number, and gdf is the geodataframe dataset.

def within_dist(street, dist, gdf):
    """Return the rows of ``gdf`` lying within ``dist`` of any geometry in ``street``.

    Parameters
    ----------
    street : GeoDataFrame whose geometries are buffered by ``dist``.
    dist : buffer distance, expressed in the units of the layers' CRS.
    gdf : GeoDataFrame to search.

    Returns
    -------
    GeoDataFrame with each matching row of ``gdf`` exactly once and with the
    same columns as ``gdf``. Rows containing any missing value are dropped.
    """
    buffered = street.copy()
    buffered.geometry = buffered.geometry.buffer(dist)
    # Keep only the geometry so the join adds no suffixed attribute columns.
    buffer_only = buffered.filter(['geometry'])

    matches = gdf.sjoin(buffer_only, how='inner', predicate='intersects').dropna()

    # A row hit by several buffers appears once per hit: keep the first occurrence.
    first_occurrence = ~matches.index.duplicated(keep='first')
    # The trailing column is the one appended by the join (index of the buffer hit).
    return matches[first_occurrence].iloc[:, :-1]

In [None]:
#we define a variation of the within_dist function. This one keeps duplicate entries because they are useful for creating the graph later on.

def within_dist_dupes(street, dist, gdf):
    """Return every (row of ``gdf``, geometry of ``street``) pair closer than ``dist``.

    Unlike ``within_dist`` the result is not de-duplicated: a row of ``gdf``
    appears once for each buffered ``street`` geometry it intersects, and the
    ``index_right`` column added by the spatial join (the index of that
    ``street`` geometry) is kept. Those pairs are what the graph construction
    needs.

    Parameters
    ----------
    street : GeoDataFrame whose geometries are buffered by ``dist``.
    dist : buffer distance, expressed in the units of the layers' CRS.
    gdf : GeoDataFrame to search.

    Returns
    -------
    GeoDataFrame of the joined rows. Rows containing any missing value are dropped.
    """
    buffered = street.copy()
    buffered.geometry = buffered.geometry.buffer(dist)
    # Keep only the geometry so the join adds no suffixed attribute columns.
    buffer_only = buffered.filter(['geometry'])
    joined = gdf.sjoin(buffer_only, how='inner', predicate='intersects')
    return joined.dropna()

In [None]:
#calculates width of all entries in gdf, and adds them to a width column. Assumes rectangular equivalent shape for polygons

def calc_widths(gdf):
    """Estimate a length and a width for every polygon of ``gdf``.

    Each polygon is modelled as the rectangle having the same perimeter P and
    area A. The rectangle's sides l and w satisfy ``l + w = P/2`` and
    ``l * w = A``, i.e. they are the two roots of ``x**2 - (P/2)*x + A = 0``.

    Parameters
    ----------
    gdf : GeoDataFrame (anything exposing ``.length`` and ``.area`` per row).

    Returns
    -------
    A copy of ``gdf`` with two columns added (or overwritten if present):
    ``root1`` — the larger root (length), and ``width`` — the smaller root.
    The input frame is left untouched.
    """
    out = gdf.copy()

    semi_perimeter = out.length / 2
    area = out.area

    # Compact shapes have no real rectangular equivalent (negative
    # discriminant). Clipping at zero gives both sides the value P/4, which
    # is the real part of the complex roots.
    discriminant = (semi_perimeter ** 2 - 4 * area).clip(lower=0)
    spread = np.sqrt(discriminant)

    out['root1'] = (semi_perimeter + spread) / 2
    out['width'] = (semi_perimeter - spread) / 2
    return out

In [None]:
#takes gdf, finds adjacent roads and dissolves them into one
def dissolver(gdf):
    """Merge chains of adjacent road polygons into single road polygons.

    Roads whose geometries lie within 0.1 CRS units of each other are treated
    as adjacent. Every connected group of adjacent roads is dissolved into one
    row; intersections are passed through unchanged.

    Parameters
    ----------
    gdf : GeoDataFrame containing roads and intersections (see ``ints_and_roads``).

    Returns
    -------
    GeoDataFrame made of the intersections (``component == -1``) followed by
    the dissolved roads (``component`` = label of their group). The index is
    not reset.
    """
    ints, roads = ints_and_roads(gdf)
    # Work on copies so the column assignments below neither touch the
    # caller's frame nor need pandas' chained-assignment warning silenced.
    ints = ints.copy()
    roads = roads.copy()

    # Adjacency between roads: one row per (road, neighbouring road) pair.
    adj = within_dist_dupes(roads, 0.1, roads)
    # Every road trivially neighbours itself; drop those pairs.
    adj = adj[adj.index_right != adj.index]

    # Build the adjacency graph. Roads without neighbours are added as
    # isolated nodes so they get their own component label; labelling them
    # with their index instead could collide with a component number.
    G = nx.Graph()
    G.add_edges_from(zip(adj.index, adj.index_right))
    G.add_nodes_from(roads.index)

    # Largest groups get the smallest labels.
    components = sorted(nx.connected_components(G), key=len, reverse=True)
    component_of = {
        road: label
        for label, members in enumerate(components)
        for road in members
    }

    roads['component'] = roads.index.map(component_of)
    ints['component'] = -1
    roads = roads.dissolve(by='component', as_index=False)
    return pd.concat([ints, roads])


In [None]:
def make_edges(gdf_tot):
    """Build the weighted edge list of the intersection graph.

    Intersections are the nodes, identified by their 0-based row position
    among the intersections of ``gdf_tot``. Two nodes are linked once for
    every road that lies within 1 CRS unit of both; the edge weight is that
    road's ``width``. Nodes that end up without any edge receive a self-loop
    of weight 1 so that they still appear in the graph.

    Parameters
    ----------
    gdf_tot : GeoDataFrame with roads and intersections; it must carry the
        ``ID`` and ``width`` columns.

    Returns
    -------
    DataFrame with columns ``from``, ``to`` and ``weight``. Each connected
    pair is listed with ``from < to``; rows are ordered by ``from``.
    """
    ints, roads = ints_and_roads(gdf_tot)
    # Node ids must run from 0, hence the fresh positional indices.
    ints = ints.reset_index(drop=True)
    roads = roads.reset_index(drop=True)

    # One row per (road, intersection) contact; 'index_right' is the node id.
    stubs = within_dist_dupes(ints, 1, roads)
    # Only these columns are needed; dropping the rest keeps the self-join small.
    contacts = pd.DataFrame(stubs[['ID', 'index_right', 'width']])

    # Pair up all nodes touched by the same road. Keeping only from < to
    # removes self connections and lists each pair once per road.
    pairs = contacts.merge(contacts, on='ID', how='inner')
    pairs = pairs[pairs['index_right_x'] < pairs['index_right_y']]
    pairs = pairs.sort_values('index_right_x', kind='mergesort')  # stable sort

    edge_list = pd.DataFrame({
        'from': pairs['index_right_x'],
        'to': pairs['index_right_y'],
        'weight': pairs['width_x'],
    })

    # Nodes missing from the edge list (whatever the reason) get a self-loop.
    connected = set(edge_list['from']).union(edge_list['to'])
    isolated = sorted(set(range(len(ints))) - connected)
    if isolated:
        self_loops = pd.DataFrame({
            'from': isolated,
            'to': isolated,
            'weight': [1] * len(isolated),
        })
        edge_list = pd.concat([edge_list, self_loops])
    return edge_list
    


In [None]:
def calc_pos(gdf, G):
    """Compute a plotting position for every node of ``G``.

    The position of a node is the centroid of an intersection polygon of
    ``gdf``, returned as an ``(x, y)`` array.

    NOTE(review): nodes and intersections are paired purely by position —
    the i-th smallest node id gets the centroid of the i-th intersection row.
    This presumably relies on ``G`` containing exactly one node per
    intersection, numbered in row order — confirm against the caller.

    Returns
    -------
    dict mapping node id -> array([x, y]).
    """
    intersections = ints_and_roads(gdf)[0]
    centroids = intersections.centroid
    xs = centroids.geometry.x
    ys = centroids.geometry.y
    node_ids = sorted(G.nodes)
    return {node: xy for node, xy in zip(node_ids, np.column_stack((xs, ys)))}

In [None]:
def plot_clusters(positions, G):
    """Plot the road graph over a basemap.

    Nodes are coloured by connected component (colours cycle through a short
    palette), edges are coloured by their ``weight`` attribute using the
    inferno colour map. A graph without weighted edges is drawn with nodes only.

    Parameters
    ----------
    positions : dict mapping node id -> (x, y), e.g. from ``calc_pos``.
    G : networkx graph whose edges carry a ``weight`` attribute.
    """
    components = sorted(nx.connected_components(G), key=len, reverse=True)

    fig, ax = plt.subplots(1, 1, figsize=(8, 8))
    ax.set_title("Road graph. Nodes are colored by cluster, and edges are colored by width")
    ax.axis("off")

    colorlist = ['r', 'g', 'b', 'y', 'orange']
    # Draw the components one at a time so each gets its own colour.
    for i, component in enumerate(components):
        nx.draw_networkx_nodes(G, positions, nodelist=list(component),
                               node_color=colorlist[i % len(colorlist)], ax=ax,
                               node_size=0.8, alpha=0.3)

    # Draw all edges at once, coloured through the colour map.
    weights_by_edge = nx.get_edge_attributes(G, 'weight')
    if weights_by_edge:  # nothing to draw (and min/max undefined) otherwise
        weights = tuple(weights_by_edge.values())
        nx.draw_networkx_edges(G, positions, edge_color=weights,
                               edge_cmap=plt.cm.inferno,
                               edge_vmin=min(weights), edge_vmax=max(weights),
                               width=0.3,
                               node_size=0.9,
                               ax=ax)

    cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)

    plt.show()

## Creating the Networks

Our networks will have intersections as nodes and roads as edges. An edge is placed between two nodes when there is a road connecting them.  
Our dataset sometimes counts consecutive stretches of road as distinct ones; we must join consecutive roads together so that we can properly place edges between nodes.  

### Toy model: streets near tangenziali

First we show how dissolve joins adjacent roads.  Then we demonstrate how dissolving with tangenziali leads to large edges that are hard to deal with.  

In [None]:
# Add the estimated length ('root1') and 'width' columns to every polygon.
gdf = calc_widths(gdf)

In [None]:
# Toy model: every polygon belonging to a single named street.
toy = gdf.loc[gdf.NAME == 'VIA GAUDENZIO FANTOLI']

In [None]:
# Same street after merging its adjacent road polygons.
toy_d  = dissolver(toy)

In [None]:
# Side-by-side view of the toy street before and after dissolving,
# coloured by TYPE so roads and intersections are distinguishable.
fig, ax = plt.subplots(1,2)
fig.suptitle('before and after dissolve')

panels = ((ax[0], 'before', toy), (ax[1], 'after', toy_d))
for axis, label, layer in panels:
    axis.set_title(label)
    axis.set_xticks([])
    axis.set_yticks([])
    layer.plot(column='TYPE', cmap='viridis', edgecolor='r', linewidth=0.6, ax=axis)
    cx.add_basemap(axis, crs=toy.crs, zoom=15, source=cx.providers.CartoDB.Positron)

plt.show()

Consecutive roads were successfully dissolved into single roads.  Now the intersections (in yellow) will be properly detected as connected by edges.

Now let's add tangenziali to our toy model

In [None]:
# Enlarge the toy model with everything in gdf within a 100-unit buffer of it,
# then dissolve the enlarged set.
toy = within_dist(toy,100,gdf) #takes all streets within 100 m of toy
toy_d  = dissolver(toy)

In [None]:
# Side-by-side view of the enlarged toy model before and after dissolving,
# coloured by ID so that every polygon gets its own colour.
fig, ax = plt.subplots(1,2)
fig.suptitle('before and after dissolve, with tangenziali')

panels = ((ax[0], 'before', toy), (ax[1], 'after', toy_d))
for axis, label, layer in panels:
    axis.set_title(label)
    axis.set_xticks([])
    axis.set_yticks([])
    layer.plot(column='ID', ax=axis)
    cx.add_basemap(axis, crs=toy.crs, zoom=15, source=cx.providers.CartoDB.Positron)

plt.show()

Each individual street or intersection has its own color. In the "before" image there are many separate streets, whereas in the "after" image the tangenziale has combined with some other streets to become one large brown street.  This is a problem because it means that all the nodes that the tangenziale touches will be connected by this one very large edge.  

We can see a more in depth image using the gdf.explore() command.  Remember, TYPE that starts with 01 counts as a road, and with 02 counts as an intersection.  
By looking around you can see that many intersections are connected by one large brown road.  
For example, the intersection of VIALE DELL'AVIAZIONE (in cyan) connects to the same road polygon four times... 

In [None]:
# Interactive map of the dissolved toy model, one colour per polygon ID.
toy_d.explore(column = 'ID') #column = 'TYPE' to see color coding of intersections/roads instead

It's clear to see that the tangenziale has engulfed a piece of Via Gaudenzio Fantoli into it (NB THESE STREETS ARE NOT EVEN AT THE SAME HEIGHT, SO THIS IS ANOTHER CRITICALITY OF THIS METHOD) and via Monlue.

### Entire dataset with tangenziali

now let's look at the actual graph of the road network

In [None]:
# Full dataset (tangenziali included): dissolve adjacent roads, recompute
# length/width on the merged polygons, and renumber the rows from 0.
gdf2 = calc_widths(dissolver(gdf))
gdf2 = gdf2.reset_index(drop=True)

In [None]:
# Build the graph of the full dataset and report its three largest components.
edges = make_edges(gdf2)
G = nx.from_pandas_edgelist(
    edges,
    source='from',
    target='to',
    edge_attr=["weight"],
    create_using=nx.MultiGraph(),
)
Gcc = sorted(nx.connected_components(G), key=len, reverse=True)

largest, second_largest, third_largest = len(Gcc[0]), len(Gcc[1]), len(Gcc[2])
print("Number of components: ", len(Gcc),
      "\n largest: ", largest,
      "\n second largest: ", second_largest,
      "\n third largest: ", third_largest)


A perfect fully connected network would have just one giant component. We have a few exceptions in our dataset, but at least we see that the second largest component only has 15 nodes.  



### Graph plot:

We plot the graph network below. NB isolated nodes have been given a self loop to make them more visible in the plot. The loop has no geographical meaning.

In [None]:
# Place every node at the centroid of its intersection and draw the graph.
positions = calc_pos(gdf2, G)
plot_clusters(positions, G)

The large purplish clusters are all the regions where tangenziali are present.  The number of the nodes in the regions is correct, but all of them seem to be connected by these extremely long and identical edges.

## Entire dataset without tangenziali

Let's see what the network looks like without tangenziali.

In [None]:
# NOTE: this overwrites gdf itself, so re-running earlier cells afterwards
# will operate on the dataset without tangenziali.
gdf = gdf.loc[~gdf['NAME'].str.contains('TANGENZIALE', regex = False)] #removing tangenziali

# Same pipeline as for gdf2: dissolve, recompute length/width, renumber rows from 0.
gdf3 = dissolver(gdf)
gdf3 = calc_widths(gdf3)
gdf3.reset_index(inplace = True, drop = True)

In [None]:
# Build the graph of the dataset without tangenziali and report its three
# largest components.
edges3 = make_edges(gdf3)
# The graph must be built from edges3 (this dataset's edge list), not from
# `edges`, which belongs to the graph that still contains the tangenziali.
G3 = nx.from_pandas_edgelist(edges3, 'from', 'to', edge_attr=["weight"] , create_using=nx.MultiGraph())
Gcc = sorted(nx.connected_components(G3), key = len, reverse = True)

print("Number of components: ",len(Gcc),
      "\n largest: ",  len(Gcc[0]),
      "\n second largest: " , len(Gcc[1]),
      "\n third largest: " ,len(Gcc[2])
     )

Just by looking at the component sizes, it seems like the general connectivity of the network has not gotten worse

In [None]:
# Place every node at the centroid of its intersection and draw the graph.
positions = calc_pos(gdf3, G3)
plot_clusters(positions, G3)

Most of the criticalities have disappeared! Some remain because some of the very large streets in Milan are not technically tangenziali. They can be removed manually if necessary.

## Average length and width distribution of roads

Let's look at some metrics:

In [None]:
# Preview the first rows: the estimated length is in the 'root1' column, the estimated width in 'width'.
gdf2.head() #length is stored in root1 column, width in width

In [None]:
# Mean, standard deviation and median of the estimated length ('root1'),
# for the dataset with (gdf2) and without (gdf3) tangenziali.
print('length comparison \n',
      'with tangenziali: ', gdf2.root1.mean(),
      '+- ', gdf2.root1.std(),
      '\n \t median: ', gdf2.root1.median(),
      '\n without tangenziali: ' , gdf3.root1.mean(),
      '+- ', gdf3.root1.std(),
      '\n \t median: ',gdf3.root1.median())

In [None]:
# Log-scale histograms of the estimated length, with and without tangenziali.
fig, ax = plt.subplots(1,2)
fig.suptitle('length distribution with and without tangenziali')
ax[0].set_title('with')
ax[1].set_title('without')
ax[0].set_yscale('log')
ax[1].set_yscale('log')
# Same bin edges for both panels so they are directly comparable. The name
# avoids `range`, which would shadow the builtin used by the functions above.
length_bins = np.linspace(0, gdf2.root1.max(), 50)
gdf2.root1.plot(kind='hist', bins=length_bins, ax=ax[0])
gdf3.root1.plot(kind='hist', bins=length_bins, ax=ax[1])
plt.show()

We can see that removing tangenziali loses a large portion of the longest streets.

In [None]:
# Mean, standard deviation and median of the estimated width,
# for the dataset with (gdf2) and without (gdf3) tangenziali.
print('width comparison \n',
      'with tangenziali: ', gdf2.width.mean(),
      '+- ', gdf2.width.std(),
      '\n \t median: ', gdf2.width.median(),
      '\n without tangenziali: ' , gdf3.width.mean(),
      '+- ', gdf3.width.std(),
      '\n \t median: ',gdf3.width.median())

In [None]:
# Log-scale histograms of the estimated width, with and without tangenziali.
fig, ax = plt.subplots(1,2)
fig.suptitle('width distribution with and without tangenziali')
ax[0].set_title('with')
ax[1].set_title('without')
ax[0].set_yscale('log')
ax[1].set_yscale('log')
# Same bin edges for both panels so they are directly comparable. The name
# avoids `range`, which would shadow the builtin used by the functions above.
width_bins = np.linspace(0, gdf2.width.max(), 50)
gdf2.width.plot(kind='hist', bins=width_bins, ax=ax[0])
gdf3.width.plot(kind='hist', bins=width_bins, ax=ax[1])
plt.show()

Width is practically identical with and without. This makes sense, since tangenziali aren't wide enough to skew the metric.  

All this suggests that removing Tangenziali from the dataset could significantly clean up the dataset while preserving the structure and metrics of the network.  
Let me know what you think.

## Percolation and keeping track of removed lengths
To do