In [12]:
import os
import geopandas as gpd
import pandas as pd
import numpy as np
import math
from shapely.geometry import Point
import time

In [13]:
# Get script and dataset file paths.
SCRIPT_PATH = "/Users/Syrin/Documents/GitHub/AI4PublicHealth/"

# Read the node intersection geojson file via geopandas and store as a pandas dataframe.
NODE_INTERSECTION_PATH = os.path.join(SCRIPT_PATH, "Datasets/Mzuzu_Road_Intersections.geojson")
NODE_INTERSECTION_DATA = gpd.read_file(NODE_INTERSECTION_PATH)
NODE_INTERSECTION_DATA = pd.DataFrame(NODE_INTERSECTION_DATA)

# Read the road points w/ elevation .shp file via geopandas and store as a pandas dataframe.
ROAD_POINT_WITH_ELEVATION_PATH = os.path.join(SCRIPT_PATH, "Datasets/MZUZU_roads_pointdata_with_elevation.shp")
ROAD_POINT_WITH_ELEVATION_DATA = gpd.read_file(ROAD_POINT_WITH_ELEVATION_PATH)
ROAD_POINT_WITH_ELEVATION_DATA = pd.DataFrame(ROAD_POINT_WITH_ELEVATION_DATA)

# Read the road line data .shp file via geopandas and store as a pandas dataframe.
ROAD_LINE_PATH = os.path.join(SCRIPT_PATH, "Datasets/MZUZU_roads_lines_CORRECT.shp")
ROAD_LINE_DATA = gpd.read_file(ROAD_LINE_PATH)
ROAD_LINE_DATA = pd.DataFrame(ROAD_LINE_DATA)
INCLUDE_MULTILINE = True

In [14]:
def intersection_processing(intersection_df):
    """Clean the .shp file that contains the route data. Create a second pandas data frame to store a processed
        version of the original data from the .shp file. """

    # Create a secondary pandas data frame that contains the index of nodes, start/end longitude and latitude,
    # elevation, road condition, and road type.
    processed_data = []

    # TODO: Maybe there's a more efficient way to do this than to loop through the entire unprocessed data set
    for rows in range(len(intersection_df.index)):
        # TODO: Check what the team meant by this comment
        # Maybe take out the start lat and long here if we combine the dataframes for the line and point data
        coordinates = list(intersection_df.iloc[rows, 2].coords)
        start_longitude = coordinates[0][0]
        start_latitude = coordinates[0][1]

        processed_data.append((start_longitude, start_latitude))

    processed_data = pd.DataFrame(processed_data)
    processed_data = processed_data.rename(
        columns={0: "Longitude", 1: "Latitude"})

    return processed_data

intersection_nodes = intersection_processing(NODE_INTERSECTION_DATA)
intersection_nodes.sort_values(by=["Longitude"], ascending=False)
intersection_nodes.head(10)

Unnamed: 0,Longitude,Latitude
0,33.873307,-11.536691
1,33.988,-11.472417
2,33.990133,-11.469199
3,33.99261,-11.464942
4,33.993797,-11.464174
5,34.01627,-11.460825
6,33.991764,-11.445077
7,33.993533,-11.463745
8,33.977681,-11.458313
9,33.97891,-11.460184


In [15]:
def road_elevation_processing(road_elevation_df, intersection_df):
    """Clean the .shp file that contains the route data. Create a second pandas data frame to store a processed
        version of the original data from the .shp file. """

    # Create a secondary pandas data frame that contains the index of nodes, start/end longitude and latitude,
    # elevation, road condition, and road type.
    processed_data = []

    intersection_longitude_list = intersection_df["Longitude"].values
    intersection_latitude_list = intersection_df["Latitude"].values

    # TODO: Maybe there's a more efficient way to do this than to loop through the entire unprocessed data set
    for rows in range(len(road_elevation_df.index)):
        # TODO: Check what the team meant by this comment
        # Maybe take out the start lat and long here if we combine the dataframes for the line and point data
        coordinates = list(road_elevation_df.iloc[rows, 22].coords)
        start_longitude = coordinates[0][0]
        start_latitude = coordinates[0][1]

        elevation = road_elevation_df.iloc[rows, 17]
        road_condition = road_elevation_df.iloc[rows, 10]
        road_type = road_elevation_df.iloc[rows, 9]

        if start_longitude in intersection_longitude_list:
            if start_latitude in intersection_latitude_list:
                processed_data.append((start_longitude, start_latitude, elevation, road_condition, road_type, True))
        else:
            processed_data.append((start_longitude, start_latitude, elevation, road_condition, road_type, False))

    processed_data = pd.DataFrame(processed_data)

    processed_data = processed_data.rename(
        columns={0: "Longitude", 1: "Latitude", 2: "Elevation", 3: "Road Condition", 4: "Road Type",
                 5: "Intersection Node"})

    return processed_data

road_elevation_nodes = road_elevation_processing(ROAD_POINT_WITH_ELEVATION_DATA, intersection_nodes)
road_elevation_nodes.head(10)

Unnamed: 0,Longitude,Latitude,Elevation,Road Condition,Road Type,Intersection Node
0,34.029856,-11.45853,1286,,path,False
1,34.028073,-11.458322,1279,,track,False
2,34.028159,-11.458254,1279,,track,False
3,34.028325,-11.458243,1277,,track,False
4,34.028443,-11.458259,1277,,track,False
5,34.028502,-11.458317,1274,,track,False
6,34.028657,-11.458301,1274,,track,False
7,34.028759,-11.458301,1274,,track,False
8,34.028829,-11.458296,1274,,track,False
9,34.028931,-11.458259,1274,,track,False


In [5]:
def road_line_processing(road_line_df):
    """Clean the .shp file that contains the route data. Create a second pandas data frame to store a processed
        version of the original data from the .shp file. """

    processed_data_line = []

    for rows in range(len(road_line_df.index)):
        coordinates_line = road_line_df.iloc[rows, 11]
        string_type = (type(coordinates_line))

        if INCLUDE_MULTILINE:
            if str(string_type) == "<class 'shapely.geometry.linestring.LineString'>":
                coordinates_line = list(coordinates_line.coords)

                start_longitude_line = coordinates_line[0][0]
                start_latitude_line = coordinates_line[0][1]
                end_longitude_line = coordinates_line[-1][0]
                end_latitude_line = coordinates_line[-1][1]

                processed_data_line.append(
                    (start_longitude_line, start_latitude_line, end_longitude_line, end_latitude_line))

            elif str(string_type) != "<class 'shapely.geometry.linestring.MultiLineString'>":
                for item in coordinates_line:
                    coordinates_line = item

                    coordinates_line = list(coordinates_line.coords)

                    start_longitude_line = coordinates_line[0][0]
                    start_latitude_line = coordinates_line[0][1]
                    end_longitude_line = coordinates_line[-1][0]
                    end_latitude_line = coordinates_line[-1][1]

                    processed_data_line.append(
                        (start_longitude_line, start_latitude_line, end_longitude_line, end_latitude_line))

            else:
                print("There is a unique string type that is neither LineString or MultiString:")
                print("    ", string_type)

        else:
            if str(string_type) == "<class 'shapely.geometry.linestring.LineString'>":
                coordinates_line = list(coordinates_line.coords)

                start_longitude_line = round(coordinates_line[0][0], 5)
                start_latitude_line = round(coordinates_line[0][1], 5)
                end_longitude_line = round(coordinates_line[-1][0], 5)
                end_latitude_line = round(coordinates_line[-1][1], 5)

                processed_data_line.append(
                    (start_longitude_line, start_latitude_line, end_longitude_line, end_latitude_line))

            else:
                continue

    processed_data_line = pd.DataFrame(processed_data_line)
    processed_data_line = processed_data_line.rename(
        columns={0: "Start Longitude", 1: "Start Latitude", 2: "End Longitude", 3: "End Latitude"})

    return processed_data_line

road_line_nodes = road_line_processing(ROAD_LINE_DATA)
road_line_nodes.head(10)

Unnamed: 0,Start Longitude,Start Latitude,End Longitude,End Latitude
0,33.985349,-11.390521,33.984215,-11.369368
1,33.957398,-11.397216,34.048045,-11.394335
2,33.960214,-11.390768,33.962719,-11.399591
3,33.989335,-11.39285,33.9927,-11.409748
4,34.046345,-11.387836,34.044863,-11.388096
5,33.982526,-11.408098,33.977415,-11.392591
6,33.983392,-11.401632,33.986693,-11.390878
7,33.995284,-11.399235,33.988289,-11.39119
8,33.966338,-11.384478,33.971585,-11.391797
9,33.961973,-11.389829,33.962256,-11.391865


In [16]:
intersection_confirm = []

for row in range(len(road_elevation_nodes.index)):
    if road_elevation_nodes.iloc[row][5]:
        lat = road_elevation_nodes.iloc[row, 0]
        long = road_elevation_nodes.iloc[row, 1]
        intersection_confirm.append([lat, long])
    else:
        continue

intersection_confirm = pd.DataFrame(intersection_confirm)
intersection_confirm = intersection_confirm.rename(
        columns={0: "Longitude", 1: "Latitude"})
intersection_confirm = intersection_confirm.reset_index()
intersection_confirm = intersection_confirm.drop("index", axis = 1)

intersection_confirm.sort_values(by=["Longitude"], ascending=False)
print(intersection_confirm.info)
intersection_confirm.head(10)

<bound method DataFrame.info of       Longitude   Latitude
0     34.037521 -11.459729
1     34.037295 -11.459525
2     34.020333 -11.445559
3     34.020684 -11.445623
4     34.021367 -11.446133
...         ...        ...
3584  34.037692 -11.459987
3585  34.036968 -11.458396
3586  34.036792 -11.459805
3587  34.036662 -11.459522
3588  34.031377 -11.458153

[3589 rows x 2 columns]>


Unnamed: 0,Longitude,Latitude
0,34.037521,-11.459729
1,34.037295,-11.459525
2,34.020333,-11.445559
3,34.020684,-11.445623
4,34.021367,-11.446133
5,34.021786,-11.446316
6,34.022316,-11.445424
7,34.022069,-11.444594
8,34.018581,-11.444515
9,34.018565,-11.444985


In [17]:
not_equal = []
compare_val = len(intersection_nodes.index) - len(intersection_confirm.index)

for row in range(len(intersection_nodes.index) - compare_val):
    if intersection_nodes.iloc[row, 0] == intersection_confirm.iloc[row, 0] and intersection_nodes.iloc[row, 1] == intersection_confirm.iloc[row, 1]:
        continue
    else:
        not_equal.append(intersection_nodes.iloc[row])
        
not_equal = pd.DataFrame(intersection_nodes)
not_equal = intersection_nodes.rename(
        columns={0: "Longitude", 1: "Latitude"})
not_equal.head(len(not_equal.index))

Unnamed: 0,Longitude,Latitude
0,33.873307,-11.536691
1,33.988000,-11.472417
2,33.990133,-11.469199
3,33.992610,-11.464942
4,33.993797,-11.464174
...,...,...
3689,34.004502,-11.398306
3690,34.004840,-11.397878
3691,34.006794,-11.395404
3692,34.007214,-11.394662
