In [23]:
import os
import geopandas as gpd
import pandas as pd
import numpy as np
import math
from shapely.geometry import Point
import time

In [24]:
# Get script and dataset file paths.
SCRIPT_PATH = "/Users/Syrin/Documents/GitHub/AI4PublicHealth/"

# Read the node intersection geojson file via geopandas and store as a pandas dataframe.
NODE_INTERSECTION_PATH = os.path.join(SCRIPT_PATH, "Datasets/Clipped_Mzuzu_Road_Intersections.geojson")
NODE_INTERSECTION_DATA = gpd.read_file(NODE_INTERSECTION_PATH)
NODE_INTERSECTION_DATA = pd.DataFrame(NODE_INTERSECTION_DATA)

# Read the road points w/ elevation .shp file via geopandas and store as a pandas dataframe.
ROAD_POINT_WITH_ELEVATION_PATH = os.path.join(SCRIPT_PATH, "Datasets/MZUZU_roads_pointdata_with_elevation.shp")
ROAD_POINT_WITH_ELEVATION_DATA = gpd.read_file(ROAD_POINT_WITH_ELEVATION_PATH)
ROAD_POINT_WITH_ELEVATION_DATA = pd.DataFrame(ROAD_POINT_WITH_ELEVATION_DATA)

# Read the road line data .shp file via geopandas and store as a pandas dataframe.
ROAD_LINE_PATH = os.path.join(SCRIPT_PATH, "Datasets/MZUZU_roads_lines_CORRECT.shp")
ROAD_LINE_DATA = gpd.read_file(ROAD_LINE_PATH)
ROAD_LINE_DATA = pd.DataFrame(ROAD_LINE_DATA)
INCLUDE_MULTILINE = False

In [25]:
def intersection_processing(intersection_df):
    """Clean the .shp file that contains the route data. Create a second pandas data frame to store a processed
        version of the original data from the .shp file. """

    # Create a secondary pandas data frame that contains the index of nodes, start/end longitude and latitude,
    # elevation, road condition, and road type.
    processed_data = []

    # TODO: Maybe there's a more efficient way to do this than to loop through the entire unprocessed data set
    for rows in range(len(intersection_df.index)):
        coordinates_line = intersection_df.iloc[rows, 2]
        # TODO: Check what the team meant by this comment
        # Maybe take out the start lat and long here if we combine the dataframes for the line and point data
        for item in coordinates_line:
            coordinates_line = item

            coordinates_line = list(coordinates_line.coords)

            start_longitude = coordinates_line[0][0]
            start_latitude = coordinates_line[0][1]

            processed_data.append((start_longitude, start_latitude))

    processed_data = pd.DataFrame(processed_data)
    processed_data = processed_data.rename(
        columns={0: "Longitude", 1: "Latitude"})

    return processed_data

intersection_nodes = intersection_processing(NODE_INTERSECTION_DATA)
intersection_nodes.sort_values(by=["Longitude"], ascending=False)
intersection_nodes

Unnamed: 0,Longitude,Latitude
0,33.998191,-11.393664
1,33.988289,-11.391190
2,33.989335,-11.392850
3,33.989530,-11.394799
4,33.977415,-11.392591
...,...,...
3604,33.982696,-11.435769
3605,33.980917,-11.435747
3606,33.981086,-11.435420
3607,33.980867,-11.435344


In [26]:
def road_line_processing(road_line_df):
    """Clean the .shp file that contains the route data. Create a second pandas data frame to store a processed
        version of the original data from the .shp file. """

    processed_data_line = []

    for rows in range(len(road_line_df.index)):
        coordinates_line = road_line_df.iloc[rows, 11]
        string_type = (type(coordinates_line))

        if INCLUDE_MULTILINE:
            if str(string_type) == "<class 'shapely.geometry.linestring.LineString'>":
                coordinates_line = list(coordinates_line.coords)

                start_longitude_line = coordinates_line[0][0]
                start_latitude_line = coordinates_line[0][1]
                end_longitude_line = coordinates_line[-1][0]
                end_latitude_line = coordinates_line[-1][1]

                processed_data_line.append(
                    (start_longitude_line, start_latitude_line, end_longitude_line, end_latitude_line))

            elif str(string_type) != "<class 'shapely.geometry.linestring.MultiLineString'>":
                for item in coordinates_line:
                    coordinates_line = item

                    coordinates_line = list(coordinates_line.coords)

                    start_longitude_line = coordinates_line[0][0]
                    start_latitude_line = coordinates_line[0][1]
                    end_longitude_line = coordinates_line[-1][0]
                    end_latitude_line = coordinates_line[-1][1]

                    processed_data_line.append(
                        (start_longitude_line, start_latitude_line, end_longitude_line, end_latitude_line))

            else:
                print("There is a unique string type that is neither LineString or MultiString:")
                print("    ", string_type)

        else:
            if str(string_type) == "<class 'shapely.geometry.linestring.LineString'>":
                coordinates_line = list(coordinates_line.coords)

                start_longitude_line = coordinates_line[0][0]
                start_latitude_line = coordinates_line[0][1]
                end_longitude_line = coordinates_line[-1][0]
                end_latitude_line = coordinates_line[-1][1]

                processed_data_line.append(
                    (start_longitude_line, start_latitude_line, end_longitude_line, end_latitude_line))

            else:
                continue

    processed_data_line = pd.DataFrame(processed_data_line)
    processed_data_line = processed_data_line.rename(
        columns={0: "Start Longitude", 1: "Start Latitude", 2: "End Longitude", 3: "End Latitude"})

    return processed_data_line

road_line_nodes = road_line_processing(ROAD_LINE_DATA)
road_line_nodes

Unnamed: 0,Start Longitude,Start Latitude,End Longitude,End Latitude
0,33.985349,-11.390521,33.984215,-11.369368
1,33.957398,-11.397216,34.048045,-11.394335
2,33.960214,-11.390768,33.962719,-11.399591
3,33.989335,-11.392850,33.992700,-11.409748
4,34.046345,-11.387836,34.044863,-11.388096
...,...,...,...,...
5458,33.970628,-11.449207,33.970089,-11.448832
5459,33.969799,-11.450311,33.969593,-11.450297
5460,33.964466,-11.436791,33.963564,-11.436120
5461,33.967368,-11.445715,33.964825,-11.444966


In [29]:
def road_elevation_processing(road_elevation_df, intersection_df):
    """Clean the .shp file that contains the route data. Create a second pandas data frame to store a processed
        version of the original data from the .shp file. """

    # Create a secondary pandas data frame that contains the index of nodes, start/end longitude and latitude,
    # elevation, road condition, and road type.
    processed_data = []

    intersection_longitude_list = intersection_df["Longitude"].values
    intersection_latitude_list = intersection_df["Latitude"].values

    # TODO: Maybe there's a more efficient way to do this than to loop through the entire unprocessed data set
    for rows in range(len(road_elevation_df.index)):
        fid = road_elevation_df.iloc[rows, 0]
        
        # TODO: Check what the team meant by this comment
        # Maybe take out the start lat and long here if we combine the dataframes for the line and point data
        coordinates = list(road_elevation_df.iloc[rows, 22].coords)
        start_longitude = coordinates[0][0]
        start_latitude = coordinates[0][1]

        elevation = road_elevation_df.iloc[rows, 17]
        distance = road_elevation_df.iloc[rows, 15]
        road_condition = road_elevation_df.iloc[rows, 10]
        road_type = road_elevation_df.iloc[rows, 9]
        

        if start_longitude in intersection_longitude_list:
            if start_latitude in intersection_latitude_list:
                processed_data.append((fid, start_longitude, start_latitude, elevation, distance, road_condition, road_type, True, False, False))
            
        else:
            processed_data.append((fid, start_longitude, start_latitude, elevation, distance, road_condition, road_type, False, False, False))

    processed_data = pd.DataFrame(processed_data)

    processed_data = processed_data.rename(
        columns={0: "FID", 1: "Longitude", 2: "Latitude", 3: "Elevation", 4: "Distance", 5: "Road Condition", 6: "Road Type",
                 7: "Intersection Node", 8: "Road Start", 9: "Road End"})

    return processed_data

road_elevation_nodes = road_elevation_processing(ROAD_POINT_WITH_ELEVATION_DATA, intersection_nodes)
road_elevation_nodes

Unnamed: 0,FID,Longitude,Latitude,Elevation,Distance,Road Condition,Road Type,Intersection Node,Road Start,Road End
0,3908.0,34.029856,-11.458530,1286,0.000623,,path,False,False,False
1,3906.0,34.028073,-11.458322,1279,0.000674,,track,False,False,False
2,3906.0,34.028159,-11.458254,1279,0.000784,,track,False,False,False
3,3906.0,34.028325,-11.458243,1277,0.000950,,track,False,False,False
4,3906.0,34.028443,-11.458259,1277,0.001069,,track,False,False,False
...,...,...,...,...,...,...,...,...,...,...
57585,3907.0,34.034627,-11.458959,1270,0.002183,,path,False,False,False
57586,3907.0,34.034729,-11.458953,1270,0.002285,,path,False,False,False
57587,3907.0,34.034887,-11.458986,1275,0.002447,,path,False,False,False
57588,3908.0,34.029513,-11.458301,1282,0.000201,,path,False,False,False


In [53]:
final_list = []
# road_elevation_nodes.sort_values(by=["FID"])

road_start_long = road_line_nodes["Start Longitude"].values
road_start_lat = road_line_nodes["Start Latitude"].values
road_end_long = road_line_nodes["End Longitude"].values
road_end_lat = road_line_nodes["End Latitude"].values

for rows in range(len(road_elevation_nodes.index)):
    longitude = road_elevation_nodes.iloc[rows, 1]
    latitude = road_elevation_nodes.iloc[rows, 2]
    
    if longitude in road_start_long:
        if latitude in road_start_lat:
            road_elevation_nodes.iloc[rows, 8] = True
    if longitude in road_start_long:
        if latitude in road_start_lat:
            road_elevation_nodes.iloc[rows, 9] = True
    else:
        continue
    
road_elevation_nodes

Unnamed: 0,FID,Longitude,Latitude,Elevation,Distance,Road Condition,Road Type,Intersection Node,Road Start,Road End
0,3908.0,34.029856,-11.458530,1286,0.000623,,path,False,False,False
1,3906.0,34.028073,-11.458322,1279,0.000674,,track,False,False,False
2,3906.0,34.028159,-11.458254,1279,0.000784,,track,False,False,False
3,3906.0,34.028325,-11.458243,1277,0.000950,,track,False,False,False
4,3906.0,34.028443,-11.458259,1277,0.001069,,track,False,False,False
...,...,...,...,...,...,...,...,...,...,...
57585,3907.0,34.034627,-11.458959,1270,0.002183,,path,False,False,False
57586,3907.0,34.034729,-11.458953,1270,0.002285,,path,False,False,False
57587,3907.0,34.034887,-11.458986,1275,0.002447,,path,False,False,False
57588,3908.0,34.029513,-11.458301,1282,0.000201,,path,False,False,False


In [52]:
start_list = []
end_list = [] 

for row in range(len(road_elevation_nodes.index)):
    if road_elevation_nodes.iloc[row, 8]:
        start_list.append(road_elevation_nodes.iloc[row])
    elif road_elevation_nodes.iloc[row, 9]:
        end_list.append(road_elevation_nodes.iloc[row])

start_list = pd.DataFrame(start_list)
end_list = pd.DataFrame(end_list)

print(len(start_list), len(end_list))
start_list

4813 0


Unnamed: 0,FID,Longitude,Latitude,Elevation,Distance,Road Condition,Road Type,Intersection Node,Road Start,Road End
12,3907.0,34.032584,-11.459063,1283,0.000000,,path,False,True,True
35,3903.0,34.037295,-11.459525,1287,0.000000,,residential,True,True,True
63,3598.0,34.020333,-11.445559,1265,0.002427,,residential,True,True,True
69,3598.0,34.021316,-11.446092,1266,0.003572,,residential,False,True,True
70,3598.0,34.021367,-11.446133,1266,0.003638,,residential,True,True,True
...,...,...,...,...,...,...,...,...,...,...
57384,3931.0,34.024723,-11.461591,1280,0.000000,,service,False,True,True
57389,3928.0,34.037094,-11.465254,1294,0.000000,,path,False,True,True
57423,3927.0,34.038098,-11.464864,1299,0.000000,,path,False,True,True
57451,3901.0,34.036662,-11.459522,1285,0.002362,,residential,True,True,True


In [48]:
print(start_list['FID'].nunique())
print(end_list['FID'].nunique())

3570
3570


In [51]:
distance_check_list = []

start_list.sort_values(by="FID")
end_list.sort_values(by="FID")

for start_list_row in range(len(start_list.index)):
    fid_reoccurance = []
    store_fid_1 = start_list.iloc[start_list_row, 0]
    
    
    if store_fid_1 not in fid_reoccurance:
        for end_list_row in range(len(end_list.index)):
            store_fid_2 = end_list.iloc[end_list_row, 0]

            if store_fid_1 == store_fid_2 and store_fid_1 not in fid_reoccurance:
                distance_check_list.append([start_list.iloc[start_list_row, 1], start_list.iloc[start_list_row, 2],
                                            end_list.iloc[end_list_row, 1], end_list.iloc[end_list_row, 2]])
                fid_reoccurance.append(store_fid_2)
    
distance_check_list = pd.DataFrame(distance_check_list)
distance_check_list

KeyboardInterrupt: 

In [6]:
intersection_confirm = []

for row in range(len(road_elevation_nodes.index)):
    if road_elevation_nodes.iloc[row][5]:
        lat = road_elevation_nodes.iloc[row, 1]
        long = road_elevation_nodes.iloc[row, 2]
        intersection_confirm.append([lat, long])
    else:
        continue

intersection_confirm = pd.DataFrame(intersection_confirm)
intersection_confirm = intersection_confirm.rename(
        columns={0: "Longitude", 1: "Latitude"})
intersection_confirm = intersection_confirm.reset_index()
intersection_confirm = intersection_confirm.drop("index", axis = 1)

intersection_confirm.sort_values(by=["Longitude"], ascending=False)
intersection_confirm

Unnamed: 0,Longitude,Latitude
0,34.037521,-11.459729
1,34.037295,-11.459525
2,34.020333,-11.445559
3,34.020684,-11.445623
4,34.021367,-11.446133
...,...,...
3584,34.037692,-11.459987
3585,34.036968,-11.458396
3586,34.036792,-11.459805
3587,34.036662,-11.459522


In [19]:
not_equal = []
compare_val = len(intersection_nodes.index) - len(intersection_confirm.index)

intersection_confirm_longitude_list = intersection_confirm["Longitude"].values
intersection_confirm_latitude_list = intersection_confirm["Latitude"].values

for row in range(len(intersection_nodes.index) - compare_val):
    if intersection_nodes.iloc[row, 0] in intersection_confirm_longitude_list:
        if intersection_nodes.iloc[row, 1] in intersection_confirm_latitude_list:
            continue
        else:
            not_equal.append(intersection_nodes.iloc[row])
    else:
        not_equal.append(intersection_nodes.iloc[row])

print(compare_val)
not_equal = pd.DataFrame(not_equal)
not_equal = not_equal.rename(
        columns={0: "Longitude", 1: "Latitude"})

not_equal = not_equal.reset_index()
not_equal = not_equal.drop("index", axis = 1)
not_equal.sort_values(by=["Longitude"], ascending=False)
not_equal

20


Unnamed: 0,Longitude,Latitude
0,34.003964,-11.41162
1,34.002936,-11.409206
2,33.990073,-11.414708
3,33.989918,-11.414716
4,33.989368,-11.415333
5,34.001827,-11.411644
6,34.001899,-11.412725
7,33.996206,-11.406807
8,34.003057,-11.411844
9,34.00282,-11.412563
