## `map_region_city_coordinates.py`
1. Read the csv file from the `Output/output03-gnoc_networks-countrycodes.csv` into pandas dataframe
2. Add new columns `Latitude`, `Longitude` and `Region` to the dataframe by performing jmespath searches in `CityDatabase/Cities/<CountryCode>.json`
3. Write file to `Output/output04-gnoc_networks-region_city_coordinates.csv`

In [1]:
import pandas as pd
import json
import jmespath
import numpy as np
import os.path
import time



In [2]:
# Load the semicolon-separated, UTF-8 encoded output of the previous step.
df = pd.read_csv(
    "Output/output03-gnoc_networks-countrycodes.csv",
    sep=';',
    encoding='utf8',
)

In [3]:
# Order the rows by country (in place), then preview the first five rows.
df.sort_values(by='Country', inplace=True)
df.head(n=5)

Unnamed: 0,Network,City,Country,CountryCode
7051,10.129.5.0/24,Buenos Aires,Argentina,AR
7046,10.129.13.0/24,Buenos Aires,Argentina,AR
10184,10.129.9.128/27,FW-unknown,Argentina,AR
10185,10.129.36.0/25,FW-unknown,Argentina,AR
10186,10.129.32.128/27,FW-unknown,Argentina,AR


In [4]:
#### TESTING ###############
# Scratch cell: tries the JMESPath queries on the "IN" city file with a city
# name that occurs in more than one region, and prints the matching regions.

filename = "CityDatabase/Cities/"+str("IN")+".json"

# The context manager closes the file even if parsing fails. json.load() no
# longer accepts an ``encoding`` keyword (removed in Python 3.9); decoding is
# already handled by open().
with open(filename, mode='r', encoding="utf8") as regions:
    data = json.load(regions)

cityname = "Rampur"

# Query shapes used by the lookup cells; only region_query is executed here.
latitude_query = "regions[*].cities[?name=='" + cityname + "'].latitude"
longitude_query = "regions[*].cities[?name=='" + cityname + "'].longitude"
cityname_query = "regions[*].cities[?name=='" + cityname + "'].name"

region_query = "regions[].{region_name: name, city_names: cities[?name=='" + cityname + "'].name[]}"

path = jmespath.search(region_query, data)

# Print every region whose city list contains the name, one per line.
for x in path:
    if x['city_names']:
        print(x['region_name'])

# Same selection collected into a list.
q = [x['region_name'] for x in path if x['city_names']]

print(q)


Himachal Pradesh
Uttar Pradesh
['Himachal Pradesh', 'Uttar Pradesh']


In [None]:
def search_latitude(x):
    """Look up the latitude of a row's city in its country's JSON city file.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``'CountryCode'`` and ``'City'``. The country code selects
        the file ``CityDatabase/Cities/<CountryCode>.json``.

    Returns
    -------
    The ``latitude`` value stored in the JSON file when the city name matches
    in exactly one region. Otherwise the string ``'nan'`` (``str(np.nan)``):
    the country file does not exist, the city is not found, or the name occurs
    in more than one region and is therefore ambiguous.
    """
    filename = "CityDatabase/Cities/" + str(x['CountryCode']) + ".json"
    not_found = str(np.nan)

    if not os.path.exists(filename):
        return not_found

    # The context manager closes the file on every exit path (previously each
    # branch returned before close() was reached). json.load() no longer
    # accepts an ``encoding`` keyword (removed in Python 3.9); open() decodes.
    with open(filename, mode='r', encoding="utf8") as regions:
        data = json.load(regions)

    # Escape single quotes so a name such as "L'Aquila" cannot terminate the
    # JMESPath raw string literal early and break the query.
    city = str(x['City']).replace("'", "\\'")
    latitude_query = "regions[*].cities[?name=='" + city + "'].latitude"

    # One (possibly empty) list per region; a missing "regions" list yields
    # None, which is treated as "no match".
    path = jmespath.search(latitude_query, data)
    path = [y for y in (path or []) if y != []]

    # Only an unambiguous hit (exactly one region) produces a coordinate.
    if len(path) == 1:
        return path[0][0]
    return not_found
    
# Resolve a latitude for every (CountryCode, City) row and print the elapsed seconds.
start = time.time()
df['Latitude'] = df[['CountryCode', 'City']].apply(search_latitude, axis=1)
end = time.time()
print(end - start)

# 678 seconds for 14k entries
# sorting the data by city: 382 sec

In [None]:
def search_longitude(x):
    """Look up the longitude of a row's city in its country's JSON city file.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``'CountryCode'`` and ``'City'``. The country code selects
        the file ``CityDatabase/Cities/<CountryCode>.json``.

    Returns
    -------
    The ``longitude`` value stored in the JSON file when the city name matches
    in exactly one region. Otherwise the string ``'nan'`` (``str(np.nan)``):
    the country file does not exist, the city is not found, or the name occurs
    in more than one region and is therefore ambiguous.
    """
    filename = "CityDatabase/Cities/" + str(x['CountryCode']) + ".json"
    not_found = str(np.nan)

    if not os.path.exists(filename):
        return not_found

    # The context manager closes the file on every exit path (previously each
    # branch returned before close() was reached). json.load() no longer
    # accepts an ``encoding`` keyword (removed in Python 3.9); open() decodes.
    with open(filename, mode='r', encoding="utf8") as regions:
        data = json.load(regions)

    # Escape single quotes so a name such as "L'Aquila" cannot terminate the
    # JMESPath raw string literal early and break the query.
    city = str(x['City']).replace("'", "\\'")
    longitude_query = "regions[*].cities[?name=='" + city + "'].longitude"

    # One (possibly empty) list per region; a missing "regions" list yields
    # None, which is treated as "no match".
    path = jmespath.search(longitude_query, data)
    path = [y for y in (path or []) if y != []]

    # Only an unambiguous hit (exactly one region) produces a coordinate.
    if len(path) == 1:
        return path[0][0]
    return not_found
    
# Resolve a longitude for every (CountryCode, City) row and print the elapsed seconds.
start = time.time()
df['Longitude'] = df[['CountryCode', 'City']].apply(search_longitude, axis=1)
end = time.time()

print(end - start)

# 684 seconds for 14k entries
# sorting helps 386 sec

In [None]:
def search_region(x):
    """Look up the region that contains a row's city in its country's JSON file.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``'CountryCode'`` and ``'City'``. The country code selects
        the file ``CityDatabase/Cities/<CountryCode>.json``.

    Returns
    -------
    str
        The region name when the city name is found in exactly one region.
        Otherwise ``'nan'`` (``str(np.nan)``): the country file does not
        exist, no region lists the city, or several regions list it and the
        match is therefore ambiguous.
    """
    filename = "CityDatabase/Cities/" + str(x['CountryCode']) + ".json"
    not_found = str(np.nan)

    # Without a country file there is nothing to search.
    if not os.path.exists(filename):
        return not_found

    # The context manager closes the file on every exit path (previously each
    # branch returned before close() was reached). json.load() no longer
    # accepts an ``encoding`` keyword (removed in Python 3.9); open() decodes.
    with open(filename, mode='r', encoding="utf8") as regions:
        data = json.load(regions)

    # Escape single quotes so a name such as "L'Aquila" cannot terminate the
    # JMESPath raw string literal early and break the query.
    city = str(x['City']).replace("'", "\\'")
    region_query = "regions[].{region_name: name, city_names: cities[?name=='" + city + "'].name[]}"

    # One {region_name, city_names} entry per region; city_names is empty or
    # null for regions that do not list the city. A missing "regions" list
    # yields None, which is treated as "no match".
    path = jmespath.search(region_query, data)
    q = [entry['region_name'] for entry in (path or []) if entry['city_names']]

    # Only an unambiguous hit (exactly one region) is reported.
    if len(q) == 1:
        return str(q[0])
    return not_found



# Resolve a region for every (CountryCode, City) row and print the elapsed seconds.
start = time.time()
df['Region'] = df[['CountryCode', 'City']].apply(search_region, axis=1)
end = time.time()

print(end - start)


# 390 seconds for 14k entries

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Persist the frame for the next step, without writing the index.
# NOTE(review): the input was read with sep=';' while this writes with the
# default ',' delimiter - confirm that downstream readers expect a comma.
df.to_csv(
    "Output/output04-gnoc_networks-region_city_coordinates.csv",
    index=False,
    encoding="utf8",
)