In [1]:
import logging
from typing import Union, Optional

import geopandas as gpd
import pandas as pd
import pandera as pa
from pandera import Field, SchemaModel, check, dataframe_check
from pandera.typing import Series

In [2]:
logger = logging.getLogger(__name__)

In [3]:


class ClientLines(SchemaModel):
    """
    Client Lines dataset
    """

    geometry: Series[gpd.array.GeometryDtype]
    level3: Series[str] = Field(unique=True, description="Unique line ID")
    level1: Optional[Series[str]] = Field(nullable=True)
    level2: Optional[Series[str]] = Field(nullable=True)
    phasingType: Optional[Series[str]] = Field(
        isin=["single-phase", "two-phase", "three-phase"], nullable=True
    )

    class Config:
        name = "Client lines"
        description = "Cleaned client lines dataset"
        unique_column_names = True

    @check("geometry", name="geometry_is_valid")
    def geometry_is_valid(cls, geom: Series[gpd.array.GeometryDtype]) -> Series[bool]:
        return geom.is_valid

    @check("geometry", name="geometry_is_linestring")
    def geometry_is_linestring(
        cls, geom: Series[gpd.array.GeometryDtype]
    ) -> Series[bool]:
        return geom.geom_type == "LineString"

    @dataframe_check
    def dataframe_in_utm(cls, gdf: gpd.GeoDataFrame) -> Series[bool]:
        """Ensure dataframe CRS is in UTM"""
        return gdf.estimate_utm_crs() == gdf.crs

In [4]:
MIN_LINE_LENGTH_IN_M = 2  # Default minimum span length

def validate(gdf: gpd.GeoDataFrame, schema):
    try:
        return schema.validate(gdf, lazy=True)
    except pa.errors.SchemaErrors as err:
        logger.error(err.failure_cases)

def validate_client_poles(gdf: gpd.GeoDataFrame):
    try:
        return ClientPoles.validate(gdf, lazy=True)
    except pa.errors.SchemaErrors as err:
        logger.error(err.failure_cases)
        assert False, "Validation failed."


def validate_client_lines(
    gdf: gpd.GeoDataFrame, min_line_length_in_m: float = MIN_LINE_LENGTH_IN_M
):
    try:
        geometry_column = pa.Column(
            gpd.array.GeometryDtype,
            name="geometry",
            checks=pa.Check(
                lambda x: x.length > min_line_length_in_m,
                error="Line should meet minimum line length.",
                name="geometry_min_length",
            ),
        )
        geometry_column.validate(gdf, lazy=True)
        return ClientLines.validate(gdf, lazy=True)
    except pa.errors.SchemaErrors as err:
        logger.error(err.failure_cases)
        assert False, "Validation failed."

In [5]:
lines = gpd.read_file('https://storage.googleapis.com/overstory-customer-test/take_home_exercise/demo_lines.geojson')
lines.head()
#validate_client_lines(lines)

Unnamed: 0,level1,level2,level3,phasingType,pointA,pointB,lineHeightInFt,geometry
0,AOI2,East12th,EA100,,206,402.0,35,"LINESTRING (505131.847 4398320.697, 505188.789..."
1,AOI2,East12th,EA101,three-phase,206,207.0,35,"LINESTRING (505188.789 4398319.902, 505229.010..."
2,AOI2,East12th,EA102,three-phase,207,209.0,35,"LINESTRING (505229.010 4398319.412, 505229.638..."
3,AOI2,East12th,EA103,three-phase,209,211.0,35,"LINESTRING (505229.638 4398265.080, 505229.506..."
4,AOI2,East12th,EA104,three-phase,211,213.0,35,"LINESTRING (505229.506 4398235.136, 505229.746..."


# Error caused by either: 
#### 1.  One or more columns in the dataframe doest adhere to the schema specified in the function.
#### 2. The phasingType column does not contain only the values 'single-phase', 'two-phase', and 'three-phase'
#### 3. There might be a data type inconsistency

In [6]:
# Checking if all geometries are valid
geo_lines_valid = lines['geometry'].is_valid
geometryLines_valid = geo_lines_valid .all()

print(f"All geometries valid: {geometryLines_valid}")

All geometries valid: True


In [7]:
# lines.crs 
from pandas import Series

def geometry_is_linestring(geom: gpd.GeoSeries) -> bool:
    return (geom.geom_type == "LineString").all()


all_linestrings = geometry_is_linestring(lines['geometry'])

print(f"All geometries are LineStrings: {all_linestrings}")

All geometries are LineStrings: True


In [8]:
lines.crs 

<Projected CRS: EPSG:32613>
Name: WGS 84 / UTM zone 13N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 108°W and 102°W, northern hemisphere between equator and 84°N, onshore and offshore. Canada - Northwest Territories (NWT); Nunavut; Saskatchewan. Mexico. United States (USA).
- bounds: (-108.0, 0.0, -102.0, 84.0)
Coordinate Operation:
- name: UTM zone 13N
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [9]:
#### 2. The phasingType column does not contain only the values 'single-phase', 'two-phase', and 'three-phase'
# checking if there is any other value apart from the said value (above)

print(lines['phasingType'].unique())

['N/A' 'three-phase' 'single-phase' 'two-phase']


In [10]:
# How many entries have 'N/A'
null_values= lines.loc[lines['phasingType'] == 'N/A']
len(null_values)

1

In [11]:
#drop the row with the N/A  and assign the dataframe a new variable name
lines_clean = lines.loc[lines['phasingType'] != 'N/A']

validate_client_lines(lines_clean)

Unnamed: 0,level1,level2,level3,phasingType,pointA,pointB,lineHeightInFt,geometry
1,AOI2,East12th,EA101,three-phase,206,207.0,35,"LINESTRING (505188.789 4398319.902, 505229.010..."
2,AOI2,East12th,EA102,three-phase,207,209.0,35,"LINESTRING (505229.010 4398319.412, 505229.638..."
3,AOI2,East12th,EA103,three-phase,209,211.0,35,"LINESTRING (505229.638 4398265.080, 505229.506..."
4,AOI2,East12th,EA104,three-phase,211,213.0,35,"LINESTRING (505229.506 4398235.136, 505229.746..."
5,AOI2,East12th,EA105,three-phase,213,406.0,35,"LINESTRING (505229.746 4398199.608, 505229.932..."
...,...,...,...,...,...,...,...,...
266,AOI2,Elm,E63,single-phase,64,65.0,35,"LINESTRING (506204.459 4400170.652, 506206.135..."
267,AOI2,Elm,E64,single-phase,65,66.0,35,"LINESTRING (506206.135 4400139.905, 506205.529..."
268,AOI2,Elm,E65,single-phase,66,67.0,35,"LINESTRING (506205.529 4400103.968, 506205.298..."
269,AOI2,Elm,E66,single-phase,67,68.0,35,"LINESTRING (506205.298 4400075.244, 506204.820..."


In [12]:
poles = gpd.read_file('https://storage.googleapis.com/overstory-customer-test/take_home_exercise/demo_poles.geojson')

#validate_client_poles(poles)

In [13]:
poles.head()

Unnamed: 0,level1,heightInFt,poleID,geometry
0,AOI2,35,1,POINT (506059.544 4400335.804)
1,AOI2,35,2,POINT (506106.465 4400335.842)
2,AOI2,35,3,POINT (506160.698 4400335.642)
3,AOI2,35,4,POINT (506205.816 4400337.059)
4,AOI2,35,5,POINT (506261.851 4400336.553)


In [14]:
poles.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 276 entries, 0 to 275
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   level1      276 non-null    object  
 1   heightInFt  276 non-null    int64   
 2   poleID      276 non-null    object  
 3   geometry    276 non-null    geometry
dtypes: geometry(1), int64(1), object(2)
memory usage: 8.8+ KB


In [15]:
# Rechecking the datatype
print(type(poles.loc[0, 'poleID']), type(poles.loc[0, 'heightInFt'])) 

<class 'str'> <class 'numpy.int64'>


In [17]:
# checking if all the geometries in the geometry column are of type "Point"
def geometry_is_point(geom: gpd.GeoSeries) -> bool:
    return (geom.geom_type == "Point").all()

all_points = geometry_is_point(poles['geometry'])

print(f"All geometries are Points: {all_points}")

All geometries are Points: True


In [19]:
poles.crs

<Projected CRS: EPSG:32613>
Name: WGS 84 / UTM zone 13N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: Between 108°W and 102°W, northern hemisphere between equator and 84°N, onshore and offshore. Canada - Northwest Territories (NWT); Nunavut; Saskatchewan. Mexico. United States (USA).
- bounds: (-108.0, 0.0, -102.0, 84.0)
Coordinate Operation:
- name: UTM zone 13N
- method: Transverse Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [18]:


class ClientPoles(SchemaModel):
    """
    Client Poles dataset
    """

    geometry: Series[gpd.array.GeometryDtype]
    poleID: Series[str] = Field(unique=True, description="Unique Pole ID")
    heightInFt: Optional[Series[int]] = Field(nullable=True)
    level1: Optional[Series[str]] = Field(nullable=True)

    class Config:
        name = "Client Poles"
        description = "Cleaned client poles dataset"
        unique_column_names = True

    @check("geometry", name="geometry_is_valid")
    def geometry_is_valid(cls, geom: Series[gpd.array.GeometryDtype]) -> Series[bool]:
        return geom.is_valid

    @check("geometry", name="geometry_is_point")
    def geometry_is_point(
        cls, geom: Series[gpd.array.GeometryDtype]
    ) -> Series[bool]:
        return geom.geom_type == "Point"

    @dataframe_check
    def dataframe_in_utm(cls, gdf: gpd.GeoDataFrame) -> Series[bool]:
        """Ensure dataframe CRS is in UTM"""
        return gdf.estimate_utm_crs() == gdf.crs

TypeError: 'type' object is not subscriptable

In [None]:
#validate_client_poles(poles)