# Merge Geodata

Now that we have a clean set of data, we can merge it with the cleaned geodata.  
  
First we need to import the dependencies.

In [1]:
import geopandas as gp
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
import warnings
# Silence all Python warnings for the rest of the notebook.
# NOTE(review): this blanket filter also hides any warnings raised by the
# read/merge/export cells further down — confirm that is intended.
warnings.filterwarnings('ignore')

Using GeoPandas, we can read in the shapefile.

In [2]:
# Load the first shapefile; per the output below it carries only an FID
# column and the geometry of each area.
gdf0 = gp.read_file('../../data/geolocation/ph_cities_v2/ph_cities_v2.shp')
gdf0

Unnamed: 0,FID,geometry
0,0,"MULTIPOLYGON (((118.57998 9.37215, 118.57976 9..."
1,1,"POLYGON ((120.60896 13.35233, 120.60711 13.398..."
2,2,"POLYGON ((120.43443 14.68814, 120.42332 14.685..."
3,3,"MULTIPOLYGON (((121.39392 18.48335, 121.39868 ..."
4,4,"POLYGON ((124.92778 10.66676, 124.92364 10.677..."
...,...,...
1629,1629,"MULTIPOLYGON (((122.06639 6.86972, 122.06639 6..."
1630,1630,"POLYGON ((123.13098 9.06249, 123.13156 9.06276..."
1631,1631,"POLYGON ((120.85121 15.45757, 120.85025 15.455..."
1632,1632,"POLYGON ((122.66113 10.77914, 122.66028 10.779..."


In [3]:
# Load the joined shapefile; per the output below each row has a PSGC code,
# name, province and coordinates alongside its geometry.
gdf = gp.read_file(
    '../../data/geolocation/ph_cities_joined_v2/ph_cities_v2.shp')
gdf

Unnamed: 0,psgc,name,city_munic,province,clean_idx,longitude,latitude,coords,geometry
0,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan",118.548417,9.437101,"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57982 9..."
1,1705101000,Abra De Ilog,Abra De Ilog,Occidental Mindoro,"abradeilog, occidentalmindoro",120.726826,13.443721,"13.4437209, 120.7268262","POLYGON ((120.60896 13.35233, 120.60797 13.373..."
2,300801000,Abucay,Abucay,Bataan,"abucay, bataan",120.534870,14.721315,"14.7213146, 120.5348704","POLYGON ((120.45676 14.69671, 120.45620 14.696..."
3,201501000,Abulug,Abulug,Cagayan,"abulug, cagayan",121.457273,18.443485,"18.4434854, 121.4572732","MULTIPOLYGON (((121.40276 18.40896, 121.40276 ..."
4,803701000,Abuyog,Abuyog,Leyte,"abuyog, leyte",125.011485,10.747102,"10.747102, 125.0114853","POLYGON ((125.04650 10.56751, 125.04588 10.576..."
...,...,...,...,...,...,...,...,...,...
1629,931700000,Zamboanga,City of Zamboanga,Zamboanga del Sur,"zamboanga, zamboangadelsur",122.079000,6.921400,"6.9214, 122.079","MULTIPOLYGON (((122.06639 6.86972, 122.06639 6..."
1630,704625000,Zamboanguita,Zamboanguita,Negros Oriental,"zamboanguita, negrosoriental",123.199424,9.100465,"9.1004649, 123.1994244","POLYGON ((123.20750 9.10485, 123.20722 9.10443..."
1631,304932000,Zaragoza,Zaragoza,Nueva Ecija,"zaragoza, nuevaecija",120.793554,15.447583,"15.4475833, 120.7935538","POLYGON ((120.81170 15.47132, 120.81309 15.470..."
1632,603047000,Zarraga,Zarraga,Iloilo,"zarraga, iloilo",122.609582,10.822379,"10.8223786, 122.6095819","POLYGON ((122.65892 10.79784, 122.65959 10.796..."


The first shapefile has a column named **'FID'**.  
  
We can use this as a reference for our **DTI data** later on when mapping.  
  
Let's merge it, together with that file's geometry, into our new dataframe.

In [4]:
# Replace gdf's own geometry with the one from gdf0 and copy gdf0's FID
# across, both in a single assign (geometry first, then FID).
# The new columns are matched to gdf by row index, so this assumes the two
# shapefiles list the areas in the same order — TODO confirm.
gdf = gdf.drop(columns='geometry').assign(
    geometry=gdf0['geometry'],
    FID=gdf0['FID'],
)
gdf

Unnamed: 0,psgc,name,city_munic,province,clean_idx,longitude,latitude,coords,geometry,FID
0,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan",118.548417,9.437101,"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57976 9...",0
1,1705101000,Abra De Ilog,Abra De Ilog,Occidental Mindoro,"abradeilog, occidentalmindoro",120.726826,13.443721,"13.4437209, 120.7268262","POLYGON ((120.60896 13.35233, 120.60711 13.398...",1
2,300801000,Abucay,Abucay,Bataan,"abucay, bataan",120.534870,14.721315,"14.7213146, 120.5348704","POLYGON ((120.43443 14.68814, 120.42332 14.685...",2
3,201501000,Abulug,Abulug,Cagayan,"abulug, cagayan",121.457273,18.443485,"18.4434854, 121.4572732","MULTIPOLYGON (((121.39392 18.48335, 121.39868 ...",3
4,803701000,Abuyog,Abuyog,Leyte,"abuyog, leyte",125.011485,10.747102,"10.747102, 125.0114853","POLYGON ((124.92778 10.66676, 124.92364 10.677...",4
...,...,...,...,...,...,...,...,...,...,...
1629,931700000,Zamboanga,City of Zamboanga,Zamboanga del Sur,"zamboanga, zamboangadelsur",122.079000,6.921400,"6.9214, 122.079","MULTIPOLYGON (((122.06639 6.86972, 122.06639 6...",1629
1630,704625000,Zamboanguita,Zamboanguita,Negros Oriental,"zamboanguita, negrosoriental",123.199424,9.100465,"9.1004649, 123.1994244","POLYGON ((123.13098 9.06249, 123.13156 9.06276...",1630
1631,304932000,Zaragoza,Zaragoza,Nueva Ecija,"zaragoza, nuevaecija",120.793554,15.447583,"15.4475833, 120.7935538","POLYGON ((120.85121 15.45757, 120.85025 15.455...",1631
1632,603047000,Zarraga,Zarraga,Iloilo,"zarraga, iloilo",122.609582,10.822379,"10.8223786, 122.6095819","POLYGON ((122.66113 10.77914, 122.66028 10.779...",1632


Check the shape and type of our DataFrame.

In [5]:
# (rows, columns) and the Python type of gdf. The output below reports a
# plain pandas DataFrame; it is turned back into a GeoDataFrame before export.
gdf.shape, type(gdf)

((1634, 10), pandas.core.frame.DataFrame)

In [6]:
# Per-column dtypes and non-null counts; the output below shows 1626 of 1634
# rows with a geometry, i.e. 8 rows have none.
gdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1634 entries, 0 to 1633
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   psgc        1634 non-null   int64   
 1   name        1634 non-null   object  
 2   city_munic  1634 non-null   object  
 3   province    1634 non-null   object  
 4   clean_idx   1634 non-null   object  
 5   longitude   1634 non-null   float64 
 6   latitude    1634 non-null   float64 
 7   coords      1634 non-null   object  
 8   geometry    1626 non-null   geometry
 9   FID         1634 non-null   int64   
dtypes: float64(2), geometry(1), int64(2), object(5)
memory usage: 127.8+ KB


Now we can read in the dataset.

In [7]:
# Load the merged DTI competitive-index dataset; per the output below it has
# one row per city/municipality and year, followed by the indicator columns.
df = pd.read_csv('../../data/dti-competitive-index/2-merged/complete_dataset_v2.csv')
df

Unnamed: 0,city_municipality,year,accommodation_capacity,active_establishments_in_the_locality,annual_disaster_drill,availability_of_basic_utilities,budget_for_drrmp,capacity_of_health_services,capacity_of_school_services,capacity_to_generate_local_resource,compliance_to_arta_citizens_charter,compliance_to_national_directives,cost_of_doing_business,cost_of_living,disaster_risk_reduction_plan,distance_to_ports,early_warning_system,education,emergency_infrastructure,employed_population,employment_generation,financial_deepening,financial_technology_capacity,getting_business_permits,health,information_technology_capacity,lgu_investment,land_use_plan,local_economy_growth,local_economy_size,local_risk_assessments,peace_and_order,presence_of_business_and_professional_organizations,presence_of_investment_promotion_unit,productivity,recognition_of_performance,road_network,safety_compliant_business,sanitary_system,social_protection,transportation_vehicles,utilities
0,Aborlan,2017,0.0000,0.2120,2.5000,2.5000,0.0017,0.1098,0.2879,1.4320,2.2597,2.3718,2.2277,1.5203,2.5000,2.1655,2.5000,0.0997,0.8750,0.0457,0.0218,0.1894,0.0463,0.7434,0.0623,0.2976,0.4592,1.8750,0.0012,0.0039,2.5000,0.4161,0.0000,2.5000,0.0145,0.2879,0.0000,0.1943,1.5706,0.1104,0.0204,0.9949
1,Aborlan,2018,0.0000,0.2337,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2.1771,1.9907,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0197,0.0423,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0224,0.0072,0.0000,0.0000,0.0218,0.0000,0.0047,0.0000,0.0000,0.1253,0.0000,0.0000,0.0000,0.0000
2,Aborlan,2019,0.0000,0.0000,2.2500,2.1047,0.2189,0.0614,0.1451,0.0074,1.9499,2.2000,1.9655,0.9127,2.4537,2.3204,2.5000,0.1300,0.4984,0.0401,0.0000,0.1339,0.0671,2.3451,0.0689,0.0806,0.3137,2.4231,0.0024,0.0051,2.5000,0.0012,0.0007,2.5000,0.0004,0.0595,0.1006,0.2553,1.4921,0.0007,0.0029,1.8558
3,Aborlan,2020,0.0000,0.1653,2.5000,2.3125,0.0852,0.0587,0.0488,0.0083,2.2404,2.4242,2.2798,0.3017,2.5000,2.4933,2.5000,0.1185,0.5004,0.0385,0.0651,0.1390,0.0631,2.3385,0.0830,0.2679,0.1665,2.5000,0.1419,0.0013,2.5000,0.0072,0.0216,0.6250,0.0413,0.0023,0.0006,0.1643,0.9946,0.0024,0.0026,1.8706
4,Aborlan,2021,0.0000,0.2606,1.2542,2.2948,0.0215,0.1064,0.0145,0.0061,2.1365,2.2619,2.3654,1.6250,2.4737,2.3721,1.2537,0.0835,0.0531,0.0060,0.1052,0.1231,0.0436,1.8169,0.0816,0.0909,0.0220,2.4545,0.0045,0.0067,2.5000,0.0283,0.1089,1.8750,0.0690,0.0000,0.1846,0.1683,1.2502,0.1047,0.0030,1.2478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9787,Zumarraga,2018,0.0000,0.0004,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.8120,1.0185,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
9788,Zumarraga,2019,0.0104,0.0093,2.4107,1.4719,0.0632,0.0324,0.2339,0.7070,2.3612,2.4994,1.4285,1.0000,2.4537,2.4148,2.5000,0.0473,0.4590,0.0073,0.0060,0.0000,0.0000,1.5524,0.0277,0.2097,0.0435,2.0000,0.0000,0.0000,2.5000,0.1843,0.0000,0.0000,0.0033,0.0000,0.0202,0.0000,1.0144,0.7160,0.0014,1.8799
9789,Zumarraga,2020,0.0000,1.0019,2.5000,1.9531,0.0636,0.1629,0.1926,0.3451,2.3957,2.5000,1.5196,1.6803,2.5000,2.3809,2.5000,0.1006,0.0036,0.0148,0.0385,0.0174,0.0000,1.4931,0.0329,0.5833,0.0587,2.5000,0.0090,0.0000,2.5000,0.5482,0.0027,1.2500,0.0298,0.2262,0.0024,0.0000,1.5045,0.1281,0.0015,1.8754
9790,Zumarraga,2021,0.0000,0.1886,0.0000,1.3688,0.0039,0.0830,0.1112,0.0018,2.1877,2.4405,2.3730,0.8438,2.4762,2.2281,2.1023,0.1826,0.2189,0.0000,0.0000,0.0130,0.0000,1.2088,0.0339,0.3571,0.2385,2.0000,0.0000,0.0039,2.5000,0.1875,0.0272,0.6250,0.0000,0.0000,0.0126,0.0000,1.2567,1.6311,0.0034,1.0391


Check the shape of our DataFrame.

In [8]:
# (rows, columns) of the DTI dataset.
df.shape

(9792, 42)

Let's have a quick count of the unique city names by using the `len` function.

In [9]:
# Number of distinct city/municipality names in the DTI data.
len(df['city_municipality'].unique())

1632

In [10]:
# Number of distinct names in the geodata, for comparison with the DTI count.
len(gdf['name'].unique())

1634

We can see there is a difference; let's have a look at why.

In [11]:
# Distinct names from the geodata, as a plain Python list.
city_list_geo = gdf['name'].unique().tolist()

In [12]:
# Distinct city/municipality names from the DTI data, as a plain Python list.
city_list = df['city_municipality'].unique().tolist()

In [13]:
def compare_lists(list1, list2):
    """Return what is left of ``list1`` after cancelling out ``list2``.

    A copy of ``list1`` is taken, then every element of ``list2`` that is
    still present in that copy removes one occurrence of itself from it.
    ``list1`` is left unmodified; elements that appear only in ``list2``
    are ignored.
    """
    leftover = list1.copy()
    for candidate in list2:
        # Nothing to cancel when the value is not (or no longer) present.
        if candidate not in leftover:
            continue
        leftover.remove(candidate)
    return leftover


In [14]:
# Names that are in the geodata but not in the DTI data.
# The comparison is one-directional: names present only in the DTI data
# would not be reported by this call.
differences = compare_lists(city_list_geo, city_list)
print(differences)

["Brooke's Point", "T'Boli"]


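There are two cities missing from the statistical data: **"Brooke's Point"** and **"T'Boli"**. Because the merge below is a left join, both stay in the result with empty (NaN) values in the DTI columns.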

### Merge
We can now merge the dataframes.

In [15]:
# Left join: keep every geodata row and attach the DTI rows whose
# city_municipality equals the geodata name, so one geodata row fans out to
# one row per matching DTI record. Geodata names without a match are kept
# with NaN in the DTI columns.
# validate="one_to_many" makes pandas raise a MergeError if a name occurs
# more than once in gdf (for example when this cell is re-run on the already
# merged frame) instead of silently multiplying rows.
gdf = gdf.merge(df,
                how="left",
                left_on="name",
                right_on="city_municipality",
                validate="one_to_many")

Let's have a look.

In [16]:
# Preview the first five rows of the merged frame.
gdf.head(5)

Unnamed: 0,psgc,name,city_munic,province,clean_idx,longitude,latitude,coords,geometry,FID,city_municipality,year,accommodation_capacity,active_establishments_in_the_locality,annual_disaster_drill,availability_of_basic_utilities,budget_for_drrmp,capacity_of_health_services,capacity_of_school_services,capacity_to_generate_local_resource,compliance_to_arta_citizens_charter,compliance_to_national_directives,cost_of_doing_business,cost_of_living,disaster_risk_reduction_plan,distance_to_ports,early_warning_system,education,emergency_infrastructure,employed_population,employment_generation,financial_deepening,financial_technology_capacity,getting_business_permits,health,information_technology_capacity,lgu_investment,land_use_plan,local_economy_growth,local_economy_size,local_risk_assessments,peace_and_order,presence_of_business_and_professional_organizations,presence_of_investment_promotion_unit,productivity,recognition_of_performance,road_network,safety_compliant_business,sanitary_system,social_protection,transportation_vehicles,utilities
0,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan",118.548417,9.437101,"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57976 9...",0,Aborlan,2017.0,0.0,0.212,2.5,2.5,0.0017,0.1098,0.2879,1.432,2.2597,2.3718,2.2277,1.5203,2.5,2.1655,2.5,0.0997,0.875,0.0457,0.0218,0.1894,0.0463,0.7434,0.0623,0.2976,0.4592,1.875,0.0012,0.0039,2.5,0.4161,0.0,2.5,0.0145,0.2879,0.0,0.1943,1.5706,0.1104,0.0204,0.9949
1,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan",118.548417,9.437101,"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57976 9...",0,Aborlan,2018.0,0.0,0.2337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1771,1.9907,0.0,0.0,0.0,0.0,0.0,0.0,0.0197,0.0423,0.0,0.0,0.0,0.0,0.0,0.0,0.0224,0.0072,0.0,0.0,0.0218,0.0,0.0047,0.0,0.0,0.1253,0.0,0.0,0.0,0.0
2,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan",118.548417,9.437101,"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57976 9...",0,Aborlan,2019.0,0.0,0.0,2.25,2.1047,0.2189,0.0614,0.1451,0.0074,1.9499,2.2,1.9655,0.9127,2.4537,2.3204,2.5,0.13,0.4984,0.0401,0.0,0.1339,0.0671,2.3451,0.0689,0.0806,0.3137,2.4231,0.0024,0.0051,2.5,0.0012,0.0007,2.5,0.0004,0.0595,0.1006,0.2553,1.4921,0.0007,0.0029,1.8558
3,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan",118.548417,9.437101,"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57976 9...",0,Aborlan,2020.0,0.0,0.1653,2.5,2.3125,0.0852,0.0587,0.0488,0.0083,2.2404,2.4242,2.2798,0.3017,2.5,2.4933,2.5,0.1185,0.5004,0.0385,0.0651,0.139,0.0631,2.3385,0.083,0.2679,0.1665,2.5,0.1419,0.0013,2.5,0.0072,0.0216,0.625,0.0413,0.0023,0.0006,0.1643,0.9946,0.0024,0.0026,1.8706
4,1705301000,Aborlan,Aborlan,Palawan,"aborlan, palawan",118.548417,9.437101,"9.4371009, 118.5484168","MULTIPOLYGON (((118.57998 9.37215, 118.57976 9...",0,Aborlan,2021.0,0.0,0.2606,1.2542,2.2948,0.0215,0.1064,0.0145,0.0061,2.1365,2.2619,2.3654,1.625,2.4737,2.3721,1.2537,0.0835,0.0531,0.006,0.1052,0.1231,0.0436,1.8169,0.0816,0.0909,0.022,2.4545,0.0045,0.0067,2.5,0.0283,0.1089,1.875,0.069,0.0,0.1846,0.1683,1.2502,0.1047,0.003,1.2478


In [17]:
# Wrap the merged frame in a GeoDataFrame, passing its existing 'geometry'
# column as the geometry.
# NOTE(review): no crs is passed here — confirm the coordinate reference
# system read from the shapefile is still attached after the merge.
gdf = gp.GeoDataFrame(gdf, geometry=gdf['geometry'])
# Confirm the conversion (the output below shows a GeoDataFrame).
type(gdf)

geopandas.geodataframe.GeoDataFrame

### Export New DataSet

Export as a new shapefile.

In [20]:
# Write the merged GeoDataFrame to a shapefile in the current working directory.
# NOTE(review): the Shapefile/DBF format limits field names to 10 characters,
# and several DTI columns share their first 10 characters (e.g. the
# capacity_of_... columns), so they will presumably be truncated/renamed on
# write — confirm, especially since warnings are silenced at the top.
gdf.to_file('complete_dataset_with_geo_data_v2.shp', driver='ESRI Shapefile')

Export as a .csv file.

In [21]:
# Write the same data as a flat CSV in the current working directory, without
# the index column. The geometry column is presumably serialised as text
# (WKT) — verify before reading it back.
gdf.to_csv('complete_dataset_with_geo_data_v2.csv', index=False)