In [1]:
import pandas as pd
import psycopg2
import sys
import pandas
import numpy as np
import math
from pandas.io import sql
import sqlalchemy
from sqlalchemy import create_engine
from datetime import datetime
import os
from configobj import ConfigObj
import contextlib

# Put the parent directory on the import path so that the local
# `functions` package imported right after this can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from functions.utils import database

In [2]:
## Configuration: paths, schema and table names

# ConfigObj file that drives every step of this notebook.
config_path = "/home/jovyan/shared/rural_planner_refactoring/config_files_test/config_co"
parser = ConfigObj(config_path)

# Top-level entries: where the SQL templates live and the country sub-folder.
sql_path = parser['sql_path']
country_folder = parser['country_folder']

# Values read from the 'clustering_params' section.
clustering_cfg = parser['clustering_params']
schema = clustering_cfg['schema']
table_transport = clustering_cfg['table_transport']
table_infrastructure = clustering_cfg['table_infrastructure']
table_settlements = clustering_cfg['table_settlements']
table_coverage = clustering_cfg['table_coverage']
table_franchises = clustering_cfg['table_franchises']
table_schools = clustering_cfg['table_schools']
# Stored as text in the config file; the SQL templates receive it as an int.
radius = int(clustering_cfg['max_coverage_radius'])

# Values read from the 'clustering_3g_params' section.
clustering_3g_cfg = parser['clustering_3g_params']
output_table = clustering_3g_cfg['output_table']
table_nodes_original = clustering_3g_cfg['table_nodes_original']
table_nodes = clustering_3g_cfg['table_nodes']

In [3]:
# Create the node table by running the country-specific SQL template.

query_path = '/'.join([sql_path, country_folder, 'clustering_3g_create_node_table.sql'])

# Placeholders substituted into the SQL template.
node_table_params = {
    'schema': schema,
    'table_coverage': table_coverage,
    'table_settlements': table_settlements,
    'table_infrastructure': table_infrastructure,
    'table_nodes_original': table_nodes_original,
    'table_franchises': table_franchises,
    'table_transport': table_transport,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**node_table_params)
    db.execute(query_formatted)


In [4]:
# Create the working copy of the node table from the shared SQL template.

query_path = '/'.join([sql_path, 'clustering_create_node_table_copy.sql'])

# Placeholders substituted into the SQL template.
node_copy_params = {
    'schema': schema,
    'table_nodes': table_nodes,
    'table_nodes_original': table_nodes_original,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**node_copy_params)
    db.execute(query_formatted)


In [5]:
# Generate the tower clusters: run the country-specific template and load
# the result set into a DataFrame.

query_path = '/'.join([sql_path, country_folder, 'clustering_3g_towers.sql'])

# Placeholders substituted into the SQL template.
tower_cluster_params = {
    'schema': schema,
    'table_nodes': table_nodes,
    'radius': radius,
    'table_infrastructure': table_infrastructure,
    'table_coverage': table_coverage,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**tower_cluster_params)
    clusters_towers_3g = pd.read_sql_query(query_formatted, db)
    

In [7]:
# Delete already assigned nodes from node_table

query_path = sql_path + '/' + 'clustering_delete_assigned_nodes.sql'

# Gather every id that belongs to a tower cluster: the centroid (quoted here)
# followed by the cluster's `nodes` string, which is inserted verbatim and is
# expected to be already quoted and comma-separated (as in "'a' ,'b'").
# Every row is handled the same way. The previous version special-cased the
# last row but tested the `nodes` value of the wrong row, which could drop the
# last cluster's nodes or emit an empty list element, and it failed outright
# on frames with fewer than two rows.
excluded_parts = []
for centroid, nodes in zip(clusters_towers_3g['centroid'], clusters_towers_3g['nodes']):
    excluded_parts.append("'" + str(centroid) + "'")
    if nodes:
        # Skip empty/None node lists so no dangling comma is produced.
        excluded_parts.append(nodes)

excluded_ids = ', '.join(excluded_parts)

# NOTE(review): `excluded_ids` is presumably substituted into an IN (...) list
# in the SQL template -- confirm against the .sql file. With nothing to
# exclude the statement is skipped rather than sent with an empty list.
if excluded_ids:
    with open(query_path) as file, database(parser) as db:
        query = file.read()
        query_formatted = query.format(schema = schema,
                                 table_nodes = table_nodes,
                                 excluded_ids = excluded_ids)
        db.execute(query_formatted)

In [8]:
# Delete nodes that are towers, using the shared SQL template.

query_path = '/'.join([sql_path, 'clustering_3g_delete_unwanted_nodes.sql'])

# Placeholders substituted into the SQL template.
unwanted_nodes_params = {
    'schema': schema,
    'table_nodes': table_nodes,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**unwanted_nodes_params)
    db.execute(query_formatted)


In [9]:
#Clustering for SETTLEMENTS
#
# Repeatedly run the settlement clustering query, remove the ids of the first
# returned cluster from the node table, and query again until the clustering
# query returns no rows. All returned frames are accumulated into
# `clusters_greenfield_3g`.

query_path = sql_path + '/' + country_folder + '/' + 'clustering_settlements.sql'
del_query_path = sql_path + '/' + 'clustering_delete_assigned_nodes.sql'

with open(query_path) as file, open(del_query_path) as del_file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(schema = schema,
                             table_nodes = table_nodes,
                             table_coverage = table_coverage,
                             table_franchises = table_franchises,
                             table_schools = table_schools,
                             radius = radius)

    # The delete template is formatted inside the loop, once per cluster.
    # (It used to be formatted here too, with a stale `excluded_ids` left over
    # from an earlier cell; that result was never used.)
    del_query = del_file.read()

    df_clusters_int = pd.read_sql_query(query_formatted, db)
    cluster_columns = df_clusters_int.columns

    # Frames are collected in a list and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # accumulated frame on every iteration.
    cluster_frames = []

    num_nodes = 0
    excluded_ids = ''

    #We cluster the rest of nodes iteratively
    i = 0
    while(not df_clusters_int.empty):
        i = i + 1

        first_cluster = df_clusters_int.iloc[0]

        # `nodes` is inserted verbatim (already quoted in the query output);
        # the centroid is quoted here. An empty/None `nodes` is skipped so the
        # list never starts with a dangling comma.
        excluded_ids = '\'' + first_cluster['centroid'] + '\''
        if first_cluster['nodes']:
            excluded_ids = first_cluster['nodes'] + ', ' + excluded_ids

        del_query_formatted=del_query.format(schema = schema,
                                 table_nodes = table_nodes,
                                excluded_ids = excluded_ids)

        db.execute(del_query_formatted)

        num_nodes = num_nodes + first_cluster['cluster_size']

        # NOTE(review): the whole result frame is kept while only the first
        # row's ids are deleted; this matches the previous behaviour and
        # presumably relies on the query returning a single row -- confirm.
        cluster_frames.append(df_clusters_int)

        df_clusters_int = pd.read_sql_query(query_formatted,con=db)

        if(i%10 == 0):
            print(str(datetime.now()))
            print("Iteration " + str(i) + " with " + str(num_nodes) + " nodes clustered.")
            print(df_clusters_int)

if cluster_frames:
    clusters_greenfield_3g = pd.concat(cluster_frames, ignore_index=True)
else:
    # No cluster was found: keep an empty frame with the query's columns.
    clusters_greenfield_3g = pd.DataFrame(columns = cluster_columns)

2020-02-17 13:35:55.011124
Iteration 10 with 64 nodes clustered.
   centroid   centroid_type  centroid_weight  \
0  23686020  SETTLEMENT 3G+       851.131253   

                                            nodes  cluster_weight  \
0  '23168005' ,'23686006' ,'23686008' ,'23686029'     6230.859623   

   cluster_size  
0             5  
2020-02-17 13:35:55.718340
Iteration 20 with 101 nodes clustered.
   centroid   centroid_type  centroid_weight  \
0  70670001  SETTLEMENT 3G+       2172.28524   

                                            nodes  cluster_weight  \
0  '70001007' ,'70001013' ,'70001015' ,'70001021'      4317.71735   

   cluster_size  
0             5  
2020-02-17 13:35:56.394451
Iteration 30 with 137 nodes clustered.
           centroid   centroid_type  centroid_weight       nodes  \
0  N5D5986-W73D2023  SETTLEMENT 3G+       144.346049  '15814000'   

   cluster_weight  cluster_size  
0     3794.346049             2  
2020-02-17 13:35:57.036967
Iteration 40 with 177 nodes

In [11]:
# Load the nodes that are still unclustered so they can be included as
# one-node clusters.

query_path = '/'.join([sql_path, 'clustering_clusters_unclustered.sql'])

# Placeholders substituted into the SQL template.
unclustered_params = {
    'schema': schema,
    'table_nodes': table_nodes,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**unclustered_params)
    clusters_unclustered_3g = pd.read_sql_query(query_formatted, db)


In [12]:
# Stack the tower clusters, the iteratively built settlement clusters and the
# one-node clusters into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
clusters = pd.concat(
    [clusters_towers_3g, clusters_greenfield_3g, clusters_unclustered_3g],
    ignore_index=True,
)

In [17]:
# Write the combined clusters to the output table, replacing any existing
# table and forcing the weight/size columns to SQL integers.
# No SQL template is read in this cell, so no file is opened: the previous
# version opened whatever `query_path` an earlier cell had left behind and
# never used the handle.
with database(parser) as db:
    clusters.to_sql(output_table, con=db, if_exists = 'replace', schema = schema, index = False,
                dtype = {'centroid_weight': sqlalchemy.types.Integer(),
                         'cluster_weight': sqlalchemy.types.Integer(),
                         'cluster_size': sqlalchemy.types.Integer()
                        })

In [18]:
# Create the final table with all clusters: write the combined frame to the
# output table, then run the geometry template against it.

query_path = '/'.join([sql_path, 'clustering_add_geom_clusters.sql'])

# Columns stored as SQL integers regardless of their pandas dtype.
integer_columns = ('centroid_weight', 'cluster_weight', 'cluster_size')

with open(query_path) as file, database(parser) as db:
    clusters.to_sql(
        output_table,
        con=db,
        if_exists='replace',
        schema=schema,
        index=False,
        dtype={column: sqlalchemy.types.Integer() for column in integer_columns},
    )
    query = file.read()
    query_formatted = query.format(**{
        'schema': schema,
        'output_table': output_table,
        'table_nodes_original': table_nodes_original,
    })
    db.execute(query_formatted)


In [21]:
# Create the cluster links by running the shared SQL template.

query_path = '/'.join([sql_path, 'clustering_create_clusters_links.sql'])

# Placeholders substituted into the SQL template.
links_params = {
    'schema': schema,
    'output_table': output_table,
    'table_nodes_original': table_nodes_original,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**links_params)
    db.execute(query_formatted)


In [19]:
### Tests
# Sanity check: total weight over all clusters (displayed as the cell output).
clusters.loc[:, 'cluster_weight'].sum()
# Other ad-hoc checks, kept disabled:
#clusters_towers_tx['cluster_size'].sum()
#clusters_towers_no_tx['cluster_size'].sum()
#clusters_greenfield['cluster_size'].sum()
#clusters['centroid_type'].unique()
#clusters.head()

4965342.591953921