In [24]:
import pandas as pd
import psycopg2
import sys
import pandas
import math
from pandas.io import sql
import sqlalchemy
from sqlalchemy import create_engine
from datetime import datetime
import os
from configobj import ConfigObj
import contextlib

# Put the parent directory of the notebook's working directory on the import
# path so that the project-local `functions` package can be imported.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    
from functions.utils import database

In [25]:
## define paths and variables

# NOTE(review): absolute, machine-specific location of the country config file.
config_path = "/home/jovyan/shared/rural_planner_refactoring/config_files_test/config_co"

parser = ConfigObj(config_path)

# Top-level entries: where the SQL templates live and which country sub-folder to use.
sql_path = parser['sql_path']
country_folder = parser['country_folder']

# Everything else is read from the 'clustering_params' section.
clustering_params = parser['clustering_params']

schema = clustering_params['schema']

# Names of the tables substituted into the SQL templates.
output_table = clustering_params['output_table']
table_transport = clustering_params['table_transport']
table_infrastructure = clustering_params['table_infrastructure']
table_settlements = clustering_params['table_settlements']
table_coverage = clustering_params['table_coverage']
table_franchises = clustering_params['table_franchises']
table_nodes_original = clustering_params['table_nodes_original']
table_nodes = clustering_params['table_nodes']
table_schools = clustering_params['table_schools']

# Maximum coverage radius, converted to an integer (units are defined by the config/SQL).
radius = int(clustering_params['max_coverage_radius'])

In [26]:
#Create node table
# Fills the country-specific SQL template with the configured table names and
# executes it on the project database connection.

query_path = '/'.join([sql_path, country_folder, 'clustering_create_node_table.sql'])

# Placeholder values passed to the SQL template.
template_values = {
    'schema': schema,
    'table_coverage': table_coverage,
    'table_settlements': table_settlements,
    'table_infrastructure': table_infrastructure,
    'table_nodes_original': table_nodes_original,
    'table_franchises': table_franchises,
    'table_transport': table_transport,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**template_values)
    db.execute(query_formatted)


In [27]:
#Create node table copy
# Runs the shared (non country-specific) template that takes both the original
# and the working node table names.

query_path = '/'.join([sql_path, 'clustering_create_node_table_copy.sql'])

# Placeholder values passed to the SQL template.
template_values = {
    'schema': schema,
    'table_nodes': table_nodes,
    'table_nodes_original': table_nodes_original,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**template_values)
    db.execute(query_formatted)


In [28]:
# Generate clusters with tx
# Runs the country-specific priority-1 towers query and keeps its result set
# as the DataFrame `clusters_towers_p1`.

query_path = '/'.join([sql_path, country_folder, 'clustering_towers_priority_1.sql'])

# Placeholder values passed to the SQL template.
template_values = {
    'schema': schema,
    'table_nodes': table_nodes,
    'radius': radius,
    'table_infrastructure': table_infrastructure,
    'table_coverage': table_coverage,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**template_values)
    clusters_towers_p1 = pd.read_sql_query(query_formatted, db)
    

In [29]:
# Delete already assigned nodes from node_table
#
# Collects the IDs of every priority-1 cluster (its centroid plus its member
# nodes) into one comma-separated string and substitutes it into the shared
# delete-assigned-nodes SQL template as {excluded_ids}.

query_path = sql_path + '/' + 'clustering_delete_assigned_nodes.sql'

# Each centroid is wrapped in single quotes here. The 'nodes' value is inserted
# as-is (presumably it is already a quoted, comma-separated ID list -- confirm
# against the clustering query) and is skipped when it is the empty string.
# Deciding this per row fixes the previous version, which tested the stale
# loop index for the last row and could drop that row's nodes or emit a
# dangling comma; it also failed for results with zero or one rows.
id_fragments = []
for centroid, nodes in zip(clusters_towers_p1['centroid'], clusters_towers_p1['nodes']):
    id_fragments.append('\'' + centroid + '\'')
    if nodes != '':
        id_fragments.append(nodes)

excluded_ids = ', '.join(id_fragments)

# With no clusters there is nothing to delete; skipping also avoids formatting
# the template with an empty ID list (which is likely invalid SQL).
if id_fragments:
    with open(query_path) as file, database(parser) as db:
        query = file.read()
        query_formatted = query.format(schema = schema,
                                 table_nodes = table_nodes,
                                 excluded_ids = excluded_ids)
        db.execute(query_formatted)

In [30]:
#Clustering for towers with no tx
# Runs the country-specific priority-2 towers query and keeps its result set
# as the DataFrame `clusters_towers_p2`.

query_path = '/'.join([sql_path, country_folder, 'clustering_towers_priority_2.sql'])

# Placeholder values passed to the SQL template.
template_values = {
    'schema': schema,
    'table_nodes': table_nodes,
    'radius': radius,
    'table_infrastructure': table_infrastructure,
    'table_coverage': table_coverage,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**template_values)
    clusters_towers_p2 = pd.read_sql_query(query_formatted, db)
    

In [32]:
# Delete already assigned nodes from node_table
#
# Collects the IDs of every priority-2 cluster (its centroid plus its member
# nodes) into one comma-separated string and substitutes it into the shared
# delete-assigned-nodes SQL template as {excluded_ids}.

query_path = sql_path + '/' + 'clustering_delete_assigned_nodes.sql'

# Each centroid is wrapped in single quotes here. The 'nodes' value is inserted
# as-is (presumably it is already a quoted, comma-separated ID list -- confirm
# against the clustering query) and is skipped when it is the empty string.
# Deciding this per row fixes the previous version, which tested the stale
# loop index for the last row and could drop that row's nodes or emit a
# dangling comma; it also failed for results with zero or one rows.
id_fragments = []
for centroid, nodes in zip(clusters_towers_p2['centroid'], clusters_towers_p2['nodes']):
    id_fragments.append('\'' + centroid + '\'')
    if nodes != '':
        id_fragments.append(nodes)

excluded_ids = ', '.join(id_fragments)

# With no clusters there is nothing to delete; skipping also avoids formatting
# the template with an empty ID list (which is likely invalid SQL).
if id_fragments:
    with open(query_path) as file, database(parser) as db:
        query = file.read()
        query_formatted = query.format(schema = schema,
                                 table_nodes = table_nodes,
                                 excluded_ids = excluded_ids)
        db.execute(query_formatted)

In [33]:
#Delete nodes that are towers
# Executes the country-specific "delete unwanted nodes" template against the
# working node table.

query_path = '/'.join([sql_path, country_folder, 'clustering_delete_unwanted_nodes.sql'])

# Placeholder values passed to the SQL template.
template_values = {
    'schema': schema,
    'table_nodes': table_nodes,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**template_values)
    db.execute(query_formatted)


In [34]:
#Clustering for SETTLEMENTS
#
# Repeats until the settlements clustering query returns no rows:
#   1. take the first row of the query result,
#   2. delete that cluster's centroid and member nodes from the node table,
#   3. keep the query result and run the query again on the reduced table.
# The collected results end up in the DataFrame `clusters_greenfield`.

query_path = sql_path + '/' + country_folder + '/' + 'clustering_settlements.sql'
del_query_path = sql_path + '/' + 'clustering_delete_assigned_nodes.sql'

with open(query_path) as file, open(del_query_path) as del_file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(schema = schema,
                             table_nodes = table_nodes,
                             table_coverage = table_coverage,
                             table_franchises = table_franchises,
                             table_schools = table_schools,
                             radius = radius)

    # The delete template is formatted inside the loop, once per cluster.
    # (The earlier pre-loop formatting was unused and relied on `excluded_ids`
    # leaking in from a previous cell.)
    del_query = del_file.read()

    df_clusters_int = pd.read_sql_query(query_formatted, db)
    cluster_columns = df_clusters_int.columns

    # Results are gathered in a list and concatenated once at the end;
    # DataFrame.append is deprecated/removed in recent pandas and copies the
    # whole accumulated frame on every call.
    cluster_batches = []

    num_nodes = 0
    excluded_ids = ''

    #We cluster the rest of nodes iteratively
    i = 0
    while(not df_clusters_int.empty):
        i = i + 1

        first_cluster = df_clusters_int.iloc[0]

        # ID list for the delete query: member nodes (inserted as returned by
        # the query) followed by the quoted centroid. An empty 'nodes' value is
        # left out so the list never starts with a comma.
        id_fragments = [first_cluster['nodes'], '\'' + first_cluster['centroid'] + '\'']
        excluded_ids = ', '.join(fragment for fragment in id_fragments if fragment != '')

        del_query_formatted=del_query.format(schema = schema,
                                 table_nodes = table_nodes,
                                excluded_ids = excluded_ids)

        db.execute(del_query_formatted)

        num_nodes = num_nodes + first_cluster['cluster_size']

        cluster_batches.append(df_clusters_int)

        df_clusters_int = pd.read_sql_query(query_formatted,con=db)

        # Progress report every 10 iterations.
        if(i%10 == 0):
            print(str(datetime.now()))
            print("Iteration " + str(i) + " with " + str(num_nodes) + " nodes clustered.")
            print(df_clusters_int)

    # Keep an empty frame with the query's columns when nothing was clustered,
    # so later cells can still concatenate it.
    if cluster_batches:
        clusters_greenfield = pd.concat(cluster_batches, ignore_index=True)
    else:
        clusters_greenfield = pd.DataFrame(columns = cluster_columns)

2020-02-17 10:58:23.398236
Iteration 10 with 65 nodes clustered.
           centroid          centroid_type  centroid_weight  \
0  N4D4975-W69D7922  SETTLEMENT GREENFIELD       144.774516   

                                               nodes  cluster_weight  \
0  '99773000' ,'N4D5086-W69D8097' ,'N4D5194-W69D8...     8258.401332   

   cluster_size  
0             4  
2020-02-17 10:58:26.127640
Iteration 20 with 114 nodes clustered.
   centroid          centroid_type  centroid_weight                   nodes  \
0  76364014  SETTLEMENT GREENFIELD      3326.217426  '76364011' ,'76364018'   

   cluster_weight  cluster_size  
0     6576.592761             3  
2020-02-17 10:58:28.626244
Iteration 30 with 151 nodes clustered.
   centroid          centroid_type  centroid_weight  \
0  19517044  SETTLEMENT GREENFIELD      1092.833096   

                                               nodes  cluster_weight  \
0  '19517030' ,'19517032' ,'19517034' ,'19517035'...     5206.466847   

   cluster_s

2020-02-17 10:59:13.993065
Iteration 270 with 1056 nodes clustered.
   centroid          centroid_type  centroid_weight                   nodes  \
0  05002015  SETTLEMENT GREENFIELD       932.574442  '05002012' ,'05376001'   

   cluster_weight  cluster_size  
0     2063.015509             3  
2020-02-17 10:59:15.285277
Iteration 280 with 1091 nodes clustered.
   centroid          centroid_type  centroid_weight                   nodes  \
0  47555018  SETTLEMENT GREENFIELD       565.646018  '47555005' ,'47555009'   

   cluster_weight  cluster_size  
0     1983.982301             3  
2020-02-17 10:59:16.562010
Iteration 290 with 1118 nodes clustered.
   centroid          centroid_type  centroid_weight                   nodes  \
0  19760001  SETTLEMENT GREENFIELD        545.98081  '19392002' ,'19760012'   

   cluster_weight  cluster_size  
0     1901.309526             3  
2020-02-17 10:59:17.831769
Iteration 300 with 1148 nodes clustered.
   centroid          centroid_type  centroid_we

2020-02-17 10:59:47.744401
Iteration 560 with 1853 nodes clustered.
   centroid          centroid_type  centroid_weight  \
0  19809027  SETTLEMENT GREENFIELD       278.511538   

                            nodes  cluster_weight  cluster_size  
0  '19809011' ,'N2D7584-W77D4403'      960.384615             3  
2020-02-17 10:59:48.760527
Iteration 570 with 1876 nodes clustered.
   centroid          centroid_type  centroid_weight                   nodes  \
0  13244017  SETTLEMENT GREENFIELD       239.029683  '13244016' ,'70713004'   

   cluster_weight  cluster_size  
0      941.960464             3  
2020-02-17 10:59:49.778056
Iteration 580 with 1901 nodes clustered.
            centroid  centroid_type  centroid_weight                   nodes  \
0  N11D2950-W73D9720  SETTLEMENT 2G         35.59669  '47001002' ,'47001032'   

   cluster_weight  cluster_size  
0      923.140825             3  
2020-02-17 10:59:50.780279
Iteration 590 with 1929 nodes clustered.
   centroid          centroid

2020-02-17 11:00:20.049419
Iteration 840 with 2551 nodes clustered.
           centroid  centroid_type  centroid_weight       nodes  \
0  N3D8913-W73D6303  SETTLEMENT 2G       188.893126  '50006006'   

   cluster_weight  cluster_size  
0      382.961407             2  
2020-02-17 11:00:22.163688
Iteration 850 with 2573 nodes clustered.
           centroid          centroid_type  centroid_weight  \
0  N1D8208-W75D8168  SETTLEMENT GREENFIELD        62.001842   

                                               nodes  cluster_weight  \
0  'N1D8032-W75D8211' ,'N1D8240-W75D8182' ,'N1D84...       364.71672   

   cluster_size  
0             4  
2020-02-17 11:00:24.963511
Iteration 860 with 2596 nodes clustered.
   centroid          centroid_type  centroid_weight       nodes  \
0  76109035  SETTLEMENT GREENFIELD        160.23949  '76109090'   

   cluster_weight  cluster_size  
0       345.13121             2  
2020-02-17 11:00:26.461747
Iteration 870 with 2618 nodes clustered.
           cen

In [35]:
#Create data frame with the unclustered nodes to include them as one-node clusters
# Runs the shared "unclustered" query on the working node table and keeps the
# result as the DataFrame `clusters_unclustered`.

query_path = '/'.join([sql_path, 'clustering_clusters_unclustered.sql'])

# Placeholder values passed to the SQL template.
template_values = {
    'schema': schema,
    'table_nodes': table_nodes,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**template_values)
    clusters_unclustered = pd.read_sql_query(query_formatted, db)


In [36]:
# Stack all cluster groups into one frame with a fresh 0..n-1 index, in this
# order: priority-1 towers, priority-2 towers, settlements, unclustered nodes.
# pd.concat replaces the chained DataFrame.append calls, which were deprecated
# in pandas 1.4 and removed in pandas 2.0.
clusters = pd.concat(
    [clusters_towers_p1, clusters_towers_p2, clusters_greenfield, clusters_unclustered],
    ignore_index=True,
)

In [38]:
#Create final data frame with all clusters
# Writes `clusters` to the configured output table (replacing an existing
# table of that name) and then executes the "add geom clusters" SQL template.

query_path = '/'.join([sql_path, 'clustering_add_geom_clusters.sql'])

# These three columns are written with an explicit SQL integer type.
integer_columns = ('centroid_weight', 'cluster_weight', 'cluster_size')
column_types = {column: sqlalchemy.types.Integer() for column in integer_columns}

# Placeholder values passed to the SQL template.
template_values = {
    'schema': schema,
    'output_table': output_table,
    'table_nodes_original': table_nodes_original,
}

with open(query_path) as file, database(parser) as db:
    clusters.to_sql(output_table,
                    con=db,
                    schema=schema,
                    if_exists='replace',
                    index=False,
                    dtype=column_types)
    query = file.read()
    query_formatted = query.format(**template_values)
    db.execute(query_formatted)


In [127]:
#Create links data frame
# Executes the shared "create clusters links" template, which takes the output
# table and the original node table names.

query_path = '/'.join([sql_path, 'clustering_create_clusters_links.sql'])

# Placeholder values passed to the SQL template.
template_values = {
    'schema': schema,
    'output_table': output_table,
    'table_nodes_original': table_nodes_original,
}

with open(query_path) as file, database(parser) as db:
    query = file.read()
    query_formatted = query.format(**template_values)
    db.execute(query_formatted)
