# In [117]:
import numpy as np
import time
import json
import dask_geopandas as gdd

class ConfigValidator:
    """Validate a geospatial file against a config dict (dask-geopandas backed).

    The config has an ``'attributes'`` section (``dtypes``, ``ranges``,
    ``values``, ``subsets``, ``not_null``, ``check_functions``) and a
    ``'geometry'`` section (``crs``, ``types``, ``check_functions``); every
    sub-section is optional. Each failed check appends a message (and, where
    available, the offending rows) to the report file and sets
    ``validation_success_status`` to ``False``.
    """

    # Config dtype name -> dtype expected on the loaded column. 'json' is
    # handled separately by json_validation and therefore has no entry here.
    _DTYPE_MAP = {
        'int': np.dtype('int'),
        'int64': np.dtype('int64'),
        'float': np.dtype('float'),
        'float64': np.dtype('float64'),
        'double': np.dtype('float'),
        'text': np.dtype(object),
        'objectID': np.dtype(object),
        'date': np.dtype('datetime64[ns]'),
    }

    def __init__(self, config, input_file, report_path):
        """Open ``input_file`` with dask-geopandas and start the run timer.

        Args:
            config: validation rules (see class docstring).
            input_file: path handed to ``dask_geopandas.read_file``.
            report_path: text file that findings are appended to.
        """
        self.start_time = time.time()
        self.config = config
        self.input_file = input_file
        self.gdf = gdd.read_file(input_file, npartitions=4)
        # Row count is printed as a quick sanity check of what was loaded.
        print(len(self.gdf))
        self.report = report_path
        self.validation_success_status = True

    def validate_config_structure(self):
        """Check that the config has the required sections and known dtype keys.

        Raises:
            ValueError: if ``'attributes'`` or ``'geometry'`` is missing, or a
                key under ``attributes.dtypes`` is not a supported type name.
        """
        config = self.config
        if 'attributes' not in config:
            raise ValueError('Invalid Config - Property "attributes" not found')
        if 'geometry' not in config:
            raise ValueError('Invalid Config - Property "geometry" not found')
        if 'dtypes' in config['attributes']:
            # Derived from the dtype map so this list cannot drift away from
            # what dtypes_validation actually understands.
            valid_types = set(self._DTYPE_MAP) | {'json'}
            for key in config['attributes']['dtypes']:
                if key not in valid_types:
                    raise ValueError(f'Invalid Config - invalid dtype - "{key}"')

    def update_report(self, df_vals, msg):
        """Append ``msg`` and, if given, the rows in ``df_vals`` to the report.

        Args:
            df_vals: array of offending rows written tab-separated, or ``None``
                when only the message should be written.
            msg: human-readable description of the finding.
        """
        with open(self.report, 'a') as f:
            f.write(f'Message - {msg}\n\n\n')
            if df_vals is not None:
                np.savetxt(f, df_vals, fmt='%s', delimiter='\t')
                f.write('\n\n\n')

    def _report_failure(self, msg, invalid_df=None):
        """Mark the run as failed and write ``msg`` (plus rows, if any) to the report."""
        self.validation_success_status = False
        rows = None if invalid_df is None else invalid_df.values.compute()
        self.update_report(rows, msg)

    def _attribute_rules(self, *path):
        """Return the entry nested under ``config['attributes']`` at ``path``.

        An empty tuple is returned when any key along the path is absent, so
        callers can iterate the result unconditionally.
        """
        node = self.config['attributes']
        for key in path:
            if key not in node:
                return ()
            node = node[key]
        return node

    @staticmethod
    def _dtype_matches(actual, expected):
        """Return whether a column dtype satisfies the expected config dtype."""
        if expected.kind == 'M':
            # A unit-less datetime64 never compares equal to datetime64[ns];
            # accept any datetime column regardless of its unit.
            return getattr(actual, 'kind', None) == 'M'
        return actual == expected

    def dtypes_validation(self):
        """Report every column whose dtype differs from the configured one."""
        dtype_rules = self._attribute_rules('dtypes')
        if not dtype_rules:
            return
        actual_dtypes = self.gdf.dtypes
        for dtype in dtype_rules:
            for featurename in dtype_rules[dtype]:
                if dtype == 'json':
                    continue
                expected = self._DTYPE_MAP[dtype]
                actual = actual_dtypes[featurename]
                if not self._dtype_matches(actual, expected):
                    self._report_failure(f'Invalid data type for {featurename}, should be {expected} but is {actual}\n\n\n')

    def check_json_structure(self, val):
        """Return ``True`` if ``val`` parses as JSON, ``False`` otherwise."""
        try:
            json.loads(val)
        except (TypeError, ValueError, RecursionError):
            # TypeError: not a str/bytes (e.g. None); ValueError: malformed JSON.
            return False
        return True

    def json_structure_validation(self, featurename):
        """Return ``(all_valid, rows)`` for rows whose ``featurename`` is not valid JSON."""
        parses = self.gdf[featurename].apply(self.check_json_structure, meta=(featurename, 'bool'))
        filtered_gdf = self.gdf[parses == False]  # noqa: E712 - element-wise comparison
        return len(filtered_gdf) == 0, filtered_gdf

    def json_validation(self):
        """Run the JSON check on every column listed under ``dtypes.json``."""
        for featurename in self._attribute_rules('dtypes', 'json'):
            verdict, invalid_df = self.json_structure_validation(featurename)
            if not verdict:
                self._report_failure(f'JSON validation failed - Invalid json found in feature - {featurename} In the following rows', invalid_df)

    def inclusive_range_validation(self, featurename):
        """Return ``(ok, rows)`` for non-null values outside ``[lower, upper]``."""
        bounds = self.config['attributes']['ranges']['inclusive'][featurename]
        lower, upper = bounds[0], bounds[1]
        column = self.gdf[featurename]
        outside = (column < lower) | (column > upper)
        filtered_gdf = self.gdf[column.notnull() & outside]
        return len(filtered_gdf) == 0, filtered_gdf

    def exclusive_range_validation(self, featurename):
        """Return ``(ok, rows)`` for non-null values inside ``[lower, upper]``."""
        bounds = self.config['attributes']['ranges']['exclusive'][featurename]
        lower, upper = bounds[0], bounds[1]
        column = self.gdf[featurename]
        inside = (column >= lower) & (column <= upper)
        filtered_gdf = self.gdf[column.notnull() & inside]
        return len(filtered_gdf) == 0, filtered_gdf

    def ranges_validation(self):
        """Run the inclusive and exclusive range checks from ``attributes.ranges``."""
        inclusive = self._attribute_rules('ranges', 'inclusive')
        for featurename in inclusive:
            verdict, invalid_df = self.inclusive_range_validation(featurename)
            if not verdict:
                bounds = inclusive[featurename]
                lower, upper = bounds[0], bounds[1]
                self._report_failure(f'Inclusive ranges validation failed - Invalid value for {featurename}, value outside [{lower},{upper}] found in the following rows', invalid_df)

        exclusive = self._attribute_rules('ranges', 'exclusive')
        for featurename in exclusive:
            verdict, invalid_df = self.exclusive_range_validation(featurename)
            if not verdict:
                bounds = exclusive[featurename]
                lower, upper = bounds[0], bounds[1]
                self._report_failure(f'Exclusive ranges validation failed - Invalid value for {featurename}, value found in range [{lower}, {upper}] found in the following rows', invalid_df)

    def equal_value_validation(self, featurename):
        """Return ``(ok, rows)`` for non-null values different from the configured value."""
        val = self.config['attributes']['values']['equal'][featurename]
        column = self.gdf[featurename]
        filtered_gdf = self.gdf[column.notnull() & (column != val)]
        return len(filtered_gdf) == 0, filtered_gdf

    def not_equal_value_validation(self, featurename):
        """Return ``(ok, rows)`` for non-null values equal to the forbidden value."""
        val = self.config['attributes']['values']['not_equal'][featurename]
        column = self.gdf[featurename]
        filtered_gdf = self.gdf[column.notnull() & (column == val)]
        return len(filtered_gdf) == 0, filtered_gdf

    def values_validation(self):
        """Run the ``equal`` and ``not_equal`` checks from ``attributes.values``."""
        equal = self._attribute_rules('values', 'equal')
        for featurename in equal:
            verdict, invalid_df = self.equal_value_validation(featurename)
            if not verdict:
                val = equal[featurename]
                self._report_failure(f'Equal value validation failed - Invalid value for {featurename}, value found not equal to {val} in the following rows', invalid_df)

        not_equal = self._attribute_rules('values', 'not_equal')
        for featurename in not_equal:
            verdict, invalid_df = self.not_equal_value_validation(featurename)
            if not verdict:
                val = not_equal[featurename]
                self._report_failure(f'Non-Equal value validation failed - Invalid value for {featurename}, value found equal to {val} in the following rows', invalid_df)

    def inclusive_subset_validation(self, featurename):
        """Return ``(ok, rows)`` for non-null values not contained in the allowed list."""
        vals = self.config['attributes']['subsets']['inclusive'][featurename]
        column = self.gdf[featurename]
        filtered_gdf = self.gdf[column.notnull() & (~column.isin(vals))]
        return len(filtered_gdf) == 0, filtered_gdf

    def exclusive_subset_validation(self, featurename):
        """Return ``(ok, rows)`` for non-null values contained in the forbidden list."""
        vals = self.config['attributes']['subsets']['exclusive'][featurename]
        column = self.gdf[featurename]
        filtered_gdf = self.gdf[column.notnull() & column.isin(vals)]
        return len(filtered_gdf) == 0, filtered_gdf

    def subsets_validation(self):
        """Run the inclusive and exclusive membership checks from ``attributes.subsets``."""
        inclusive = self._attribute_rules('subsets', 'inclusive')
        for featurename in inclusive:
            verdict, invalid_df = self.inclusive_subset_validation(featurename)
            if not verdict:
                vals = inclusive[featurename]
                self._report_failure(f'Inclusive subsets validation failed - Invalid value for {featurename}, value found which does not belong to the {vals} in the following rows', invalid_df)

        exclusive = self._attribute_rules('subsets', 'exclusive')
        for featurename in exclusive:
            verdict, invalid_df = self.exclusive_subset_validation(featurename)
            if not verdict:
                vals = exclusive[featurename]
                self._report_failure(f'Exclusive subsets validation failed - Invalid value for {featurename}, value found which belongs to the {vals} in the following rows', invalid_df)

    def not_null_validation(self):
        """Report rows that are null in any column listed under ``attributes.not_null``."""
        for featurename in self._attribute_rules('not_null'):
            filtered_gdf = self.gdf[self.gdf[featurename].isnull()]
            if len(filtered_gdf) > 0:
                self._report_failure(f'Null check validation failed - Invalid value for {featurename}, null value found in the following rows', filtered_gdf)

    def create_function(self, code):
        """Compile ``code`` and return the function it defines under the name ``fun``.

        Raises:
            KeyError: if ``code`` does not define ``fun``.
        """
        # SECURITY: exec() runs arbitrary Python taken from the config. Only
        # use configs from a trusted source.
        func_dict = {}
        exec(code, globals(), func_dict)
        return func_dict['fun']

    def run_check_functions_validation(self, featurename, func):
        """Return ``(ok, rows)`` for rows where the check function returns ``False``.

        Args:
            featurename: column the function is applied to, value by value.
            func: source code defining ``fun(val)``.
        """
        check = self.create_function(func)
        passed = self.gdf[featurename].apply(check, meta=(featurename, 'bool'))
        filtered_gdf = self.gdf[passed == False]  # noqa: E712 - element-wise comparison
        return len(filtered_gdf) == 0, filtered_gdf

    def attributes_check_functions_validation(self):
        """Run every check function configured under ``attributes.check_functions``."""
        check_functions = self._attribute_rules('check_functions')
        for featurename in check_functions:
            for func in check_functions[featurename]:
                verdict, invalid_df = self.run_check_functions_validation(featurename, func)
                if not verdict:
                    self._report_failure(f'Function check failed - Invalid value for {featurename} - A function check {func} failed in the following rows', invalid_df)

    def crs_validation(self):
        """Report a mismatch between the file CRS and ``geometry.crs``, if configured."""
        geometry = self.config['geometry']
        if 'crs' in geometry:
            found = str(self.gdf.crs)
            if found != geometry['crs']:
                self._report_failure(f'Invalid crs {found} found')

    def geometry_types_validation(self):
        """Report every geometry type in the file that is not listed in ``geometry.types``."""
        geometry = self.config['geometry']
        if 'types' in geometry:
            valid_types = geometry['types']
            for geom_type in set(self.gdf.geom_type):
                if geom_type not in valid_types:
                    self._report_failure(f'Invalid geometry type {geom_type} found, it should be from {valid_types}')

    def geometry_check_function_validation(self):
        """Run every check function configured under ``geometry.check_functions``."""
        geometry = self.config['geometry']
        if 'check_functions' in geometry:
            for func in geometry['check_functions']:
                verdict, invalid_df = self.run_check_functions_validation('geometry', func)
                if not verdict:
                    self._report_failure(f'Function check failed - Invalid value for geometry - A function check {func} failed in the following rows', invalid_df)

    def validate(self):
        """Run all attribute and geometry checks, then write the summary to the report.

        The config structure itself is not checked here; call
        ``validate_config_structure()`` beforehand if that is needed.
        """
        # Attribute checks.
        self.dtypes_validation()
        self.json_validation()
        self.ranges_validation()
        self.values_validation()
        self.subsets_validation()
        self.not_null_validation()
        # Each check function runs on every value of its column; all must pass.
        self.attributes_check_functions_validation()

        # Geometry checks.
        self.crs_validation()
        self.geometry_types_validation()
        self.geometry_check_function_validation()

        time_taken = time.time() - self.start_time
        if self.validation_success_status:
            self.update_report(None, 'Validation successful')
        self.update_report(None, f'time taken for whole process: {time_taken}')
        print('Time :', time_taken)


### Testing

# In [146]:
# Sample validation config for the point shapefile, followed by a full run.
config = {
    'attributes' : {
        'dtypes' : {
            'int64' : ['Division'],
            'text' : ['FeatureNam', 'FeatureSta', 'Condition', 'Visible', 'Legible', 'Reflective',
                      'images', 'bboxes', 'geohash', 'fpath', 'RouteName', 'StreetName', 'UUID', 
                      'RouteMaint', 'RouteID', 'BeginFeatu', 'EndFeature', 'MaintCnt', 'LocCntyC', 
                      'RouteCla', 'RouteInv', 'Direction', 'TravelDir', 'UniqueID', 'SyncID'],
            'float' : ['lat', 'lon', 'MPLength', 'Length', 'Width', 'Area', 'BeginMp1', 'EndMp1',
                       'MaxMp1', 'Shape_Leng'],
            'json' : ['bboxes', 'images']
        },
        'ranges' : {
            'inclusive' : {
                'lat' : [0, 100],
                'lon' : [-100, 100],
                'Division' : [0, 100],
                'Length' : [0, 10000],
                'Width' : [0, 10000],
                'BeginMp1' : [0, 10000],
                'EndMp1' : [0, 10000],
                'MaxMp1' : [0, 10000],
                'MPLength' : [0, 10000],
            },
            'exclusive' : {
                'lat' : [100, 100],
                'lon' : [100, 100],
                'Division' : [100, 100],
                'Length' : [10000, 10000],
                'Width' : [10000, 10000],
                'BeginMp1' : [10000, 10000],
                'EndMp1' : [10000, 10000],
                'MaxMp1' : [10000, 10000],
                'MPLength' : [10000, 10000],
            }
        },
        'values' : {
            'equal' : {
                'RouteMaint' : 'System'
            },
            # Each column appears once; repeated keys in a dict literal are
            # silently collapsed by Python, so the earlier duplicates were dropped.
            'not_equal' : {
                'FeatureNam' : '-1',
                'FeatureSta' : '-1', 
                'Condition' : '-1', 
                'Visible' : '-1', 
                'Legible' : '-1', 
                'Reflective' : '-1',
                'images' : '-1',
                'bboxes' : '-1', 
                'geohash' : '-1', 
                'fpath' : '-1', 
                'RouteName' : '-1', 
                'StreetName' : '-1', 
                'UUID' : '-1', 
                'RouteMaint' : '-1', 
                'RouteID' : '-1', 
                'BeginFeatu' : '-1', 
                'EndFeature' : '-1', 
                'MaintCnt' : '-1', 
                'LocCntyC' : '-1', 
                'RouteCla' : '-1', 
                'RouteInv' : '-1', 
                'Direction' : '-1', 
                'TravelDir' : '-1', 
                'UniqueID' : '-1', 
                'SyncID' : '-1',
            }
        },
        'subsets' : {
            'inclusive' : {
                'Condition' : ['good', 'damaged'],
                'Visible' : ['0', '1'],
                'Legible' : ['0', '1'],
                'Reflective' : ['0', '1'],
            },
            'exclusive' : {
                'FeatureNam' : ['a', 'b'],
                'FeatureSta' : ['a', 'b'], 
                'Condition' : ['a', 'b'], 
                'Visible' : ['a', 'b'], 
                'Legible' : ['a', 'b'], 
                'Reflective' : ['a', 'b'],
                'images' : ['a', 'b'],
                'bboxes' : ['a', 'b'], 
                'geohash' : ['a', 'b'], 
                'fpath' : ['a', 'b'], 
                'RouteName' : ['a', 'b'], 
                'StreetName' : ['a', 'b'], 
                'UUID' : ['a', 'b'], 
                'RouteMaint' : ['a', 'b'], 
                'RouteID' : ['a', 'b'], 
                'BeginFeatu' : ['a', 'b'], 
                'EndFeature' : ['a', 'b'], 
                'MaintCnt' : ['a', 'b'], 
                'LocCntyC' : ['a', 'b'], 
                'RouteCla' : ['a', 'b'], 
                'RouteInv' : ['a', 'b'], 
                'Direction' : ['a', 'b'], 
                'TravelDir' : ['a', 'b'], 
                'UniqueID' : ['a', 'b'], 
                'SyncID' : ['a', 'b'],
            }
        },
        'not_null' : ['lat', 'lon', 'images', 'bboxes', 'geohash', 'fpath', 'Length', 'Width', 'Area'], #much more
        # Each entry is source code that must define ``fun(val)`` returning a bool.
        'check_functions' : {
            'lat' : [
                    '''def fun(val):
                        return val >= 0 and val <= 100
                    ''',
                    # '''def fun(val):
                    #     return val < 35
                    # '''
                    ]
        }
    },
    'geometry' : {
        'types' : ['Point']
    }
}
validator = ConfigValidator(config, '../shapefile_/point_.shp', 'report.txt')
validator.validate()

# Output:
# 903168
# Time : 35.86536979675293


# In [143]:
from geopandas import pd
import numpy as np
import threading
import time
import json
import dask_geopandas as gdd

class ConfigValidator:
    """Validate a geospatial file against a config dict, using worker threads.

    The file is loaded fully into memory and split into ``NUM_THREADS`` row
    batches. Row-level checks run once per batch on separate threads and
    their results are merged. The config has an ``'attributes'`` section
    (``dtypes``, ``ranges``, ``values``, ``subsets``, ``not_null``,
    ``check_functions``) and a ``'geometry'`` section (``crs``, ``types``,
    ``check_functions``); every sub-section is optional. Each failed check
    appends a message (and, where available, the offending rows) to the
    report file and sets ``validation_success_status`` to ``False``.
    """

    # Config dtype name -> dtype expected on the loaded column. 'json' is
    # handled separately by json_validation and therefore has no entry here.
    _DTYPE_MAP = {
        'int': np.dtype('int'),
        'int64': np.dtype('int64'),
        'float': np.dtype('float'),
        'float64': np.dtype('float64'),
        'double': np.dtype('float'),
        'text': np.dtype(object),
        'objectID': np.dtype(object),
        'date': np.dtype('datetime64[ns]'),
    }

    def __init__(self, config, input_file, report_path):
        """Load ``input_file`` into memory, split it into batches and start the timer.

        Args:
            config: validation rules (see class docstring).
            input_file: path handed to ``dask_geopandas.read_file``.
            report_path: text file that findings are appended to.
        """
        self.start_time = time.time()
        self.config = config
        self.input_file = input_file
        self.NUM_THREADS = 4
        self.gdf = gdd.read_file(input_file, npartitions=self.NUM_THREADS).compute()
        self.report = report_path
        self.validation_success_status = True
        print(len(self.gdf))
        self.gdf_batches = self._split_batches(self.gdf, self.NUM_THREADS)

    @staticmethod
    def _split_batches(frame, parts):
        """Split ``frame`` into exactly ``parts`` consecutive row slices.

        The first ``parts - 1`` slices hold ``len(frame) // parts`` rows each
        and the last slice takes the remainder, so frames with fewer rows
        than ``parts`` (including empty ones) are handled without error.
        """
        total = len(frame)
        size = total // parts
        return [
            frame.iloc[i * size: total if i == parts - 1 else (i + 1) * size]
            for i in range(parts)
        ]

    def validate_config_structure(self):
        """Check that the config has the required sections and known dtype keys.

        Raises:
            ValueError: if ``'attributes'`` or ``'geometry'`` is missing, or a
                key under ``attributes.dtypes`` is not a supported type name.
        """
        config = self.config
        if 'attributes' not in config:
            raise ValueError('Invalid Config - Property "attributes" not found')
        if 'geometry' not in config:
            raise ValueError('Invalid Config - Property "geometry" not found')
        if 'dtypes' in config['attributes']:
            # Derived from the dtype map so this list cannot drift away from
            # what dtypes_validation actually understands.
            valid_types = set(self._DTYPE_MAP) | {'json'}
            for key in config['attributes']['dtypes']:
                if key not in valid_types:
                    raise ValueError(f'Invalid Config - invalid dtype - "{key}"')
        # More checks are possible here (valid functions, valid featurename
        # lists, ...); not needed as the config will be generated from a
        # template or stored.

    def parallel_execution(self, function, *args):
        """Run ``function(thread_id, *args)`` on every batch in its own thread.

        ``function`` must return a ``(bool, DataFrame)`` pair: the verdict for
        its batch and the offending rows used to generate the report.

        Returns:
            ``(all_ok, invalid_rows)`` where ``all_ok`` is true only if every
            batch passed and ``invalid_rows`` is the concatenation of the
            per-batch frames.

        Raises:
            Exception: the first exception raised by a worker, re-raised in
                the calling thread after all workers have finished.
        """
        results = [None] * self.NUM_THREADS
        errors = [None] * self.NUM_THREADS

        def worker(thread_id):
            try:
                results[thread_id] = function(thread_id, *args)
            except Exception as exc:
                # Keep the real failure so it is not masked later by an
                # unrelated "cannot unpack NoneType" error.
                errors[thread_id] = exc

        threads = [
            threading.Thread(target=worker, args=(thread_id,))
            for thread_id in range(self.NUM_THREADS)
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        for exc in errors:
            if exc is not None:
                raise exc

        verdicts = [verdict for verdict, _ in results]
        invalid_dfs = [invalid_df for _, invalid_df in results]
        return all(verdicts), pd.concat(invalid_dfs)

    def update_report(self, df, msg):
        """Append ``msg`` and, if given, the rows of ``df`` to the report.

        Args:
            df: frame of offending rows written tab-separated, or ``None``
                when only the message should be written.
            msg: human-readable description of the finding.
        """
        with open(self.report, 'a') as f:
            f.write(f'Message - {msg}\n\n\n')
            if df is not None:
                np.savetxt(f, df.values, fmt='%s', delimiter='\t')
                f.write('\n\n\n')

    def _report_failure(self, msg, invalid_df=None):
        """Mark the run as failed and write ``msg`` (plus rows, if any) to the report."""
        self.validation_success_status = False
        self.update_report(invalid_df, msg)

    def _attribute_rules(self, *path):
        """Return the entry nested under ``config['attributes']`` at ``path``.

        An empty tuple is returned when any key along the path is absent, so
        callers can iterate the result unconditionally.
        """
        node = self.config['attributes']
        for key in path:
            if key not in node:
                return ()
            node = node[key]
        return node

    @staticmethod
    def _dtype_matches(actual, expected):
        """Return whether a column dtype satisfies the expected config dtype."""
        if expected.kind == 'M':
            # A unit-less datetime64 never compares equal to datetime64[ns];
            # accept any datetime column regardless of its unit.
            return getattr(actual, 'kind', None) == 'M'
        return actual == expected

    def dtypes_validation(self):
        """Report every column whose dtype differs from the configured one."""
        dtype_rules = self._attribute_rules('dtypes')
        if not dtype_rules:
            return
        actual_dtypes = self.gdf.dtypes
        for dtype in dtype_rules:
            for featurename in dtype_rules[dtype]:
                if dtype == 'json':
                    continue
                expected = self._DTYPE_MAP[dtype]
                actual = actual_dtypes[featurename]
                if not self._dtype_matches(actual, expected):
                    self._report_failure(f'Invalid data type for {featurename}, should be {expected} but is {actual}\n\n\n')

    def check_json_structure(self, val):
        """Return ``True`` if ``val`` parses as JSON, ``False`` otherwise."""
        try:
            json.loads(val)
        except (TypeError, ValueError, RecursionError):
            # TypeError: not a str/bytes (e.g. None); ValueError: malformed JSON.
            return False
        return True

    def json_structure_validation(self, thread_id, featurename):
        """Return ``(all_valid, rows)`` for batch rows whose ``featurename`` is not valid JSON."""
        batch = self.gdf_batches[thread_id]
        parses = batch[featurename].apply(self.check_json_structure)
        filtered_gdf = batch[parses == False]  # noqa: E712 - element-wise comparison
        return len(filtered_gdf) == 0, filtered_gdf

    def json_validation(self):
        """Run the JSON check on every column listed under ``dtypes.json``."""
        for featurename in self._attribute_rules('dtypes', 'json'):
            verdict, invalid_df = self.parallel_execution(self.json_structure_validation, featurename)
            if not verdict:
                self._report_failure(f'JSON validation failed - Invalid json found in feature - {featurename} In the following rows', invalid_df)

    def inclusive_range_validation(self, thread_id, featurename):
        """Return ``(ok, rows)`` for non-null batch values outside ``[lower, upper]``."""
        bounds = self.config['attributes']['ranges']['inclusive'][featurename]
        lower, upper = bounds[0], bounds[1]
        batch = self.gdf_batches[thread_id]
        column = batch[featurename]
        outside = (column < lower) | (column > upper)
        filtered_gdf = batch[column.notnull() & outside]
        return len(filtered_gdf) == 0, filtered_gdf

    def exclusive_range_validation(self, thread_id, featurename):
        """Return ``(ok, rows)`` for non-null batch values inside ``[lower, upper]``."""
        bounds = self.config['attributes']['ranges']['exclusive'][featurename]
        lower, upper = bounds[0], bounds[1]
        batch = self.gdf_batches[thread_id]
        column = batch[featurename]
        inside = (column >= lower) & (column <= upper)
        filtered_gdf = batch[column.notnull() & inside]
        return len(filtered_gdf) == 0, filtered_gdf

    def ranges_validation(self):
        """Run the inclusive and exclusive range checks from ``attributes.ranges``."""
        inclusive = self._attribute_rules('ranges', 'inclusive')
        for featurename in inclusive:
            verdict, invalid_df = self.parallel_execution(self.inclusive_range_validation, featurename)
            if not verdict:
                bounds = inclusive[featurename]
                lower, upper = bounds[0], bounds[1]
                self._report_failure(f'Inclusive ranges validation failed - Invalid value for {featurename}, value outside [{lower},{upper}] found in the following rows', invalid_df)

        exclusive = self._attribute_rules('ranges', 'exclusive')
        for featurename in exclusive:
            verdict, invalid_df = self.parallel_execution(self.exclusive_range_validation, featurename)
            if not verdict:
                bounds = exclusive[featurename]
                lower, upper = bounds[0], bounds[1]
                self._report_failure(f'Exclusive ranges validation failed - Invalid value for {featurename}, value found in range [{lower}, {upper}] found in the following rows', invalid_df)

    def equal_value_validation(self, thread_id, featurename):
        """Return ``(ok, rows)`` for non-null batch values different from the configured value."""
        val = self.config['attributes']['values']['equal'][featurename]
        batch = self.gdf_batches[thread_id]
        column = batch[featurename]
        filtered_gdf = batch[column.notnull() & (column != val)]
        return len(filtered_gdf) == 0, filtered_gdf

    def not_equal_value_validation(self, thread_id, featurename):
        """Return ``(ok, rows)`` for non-null batch values equal to the forbidden value."""
        val = self.config['attributes']['values']['not_equal'][featurename]
        batch = self.gdf_batches[thread_id]
        column = batch[featurename]
        filtered_gdf = batch[column.notnull() & (column == val)]
        return len(filtered_gdf) == 0, filtered_gdf

    def values_validation(self):
        """Run the ``equal`` and ``not_equal`` checks from ``attributes.values``."""
        equal = self._attribute_rules('values', 'equal')
        for featurename in equal:
            verdict, invalid_df = self.parallel_execution(self.equal_value_validation, featurename)
            if not verdict:
                val = equal[featurename]
                self._report_failure(f'Equal value validation failed - Invalid value for {featurename}, value found not equal to {val} in the following rows', invalid_df)

        not_equal = self._attribute_rules('values', 'not_equal')
        for featurename in not_equal:
            verdict, invalid_df = self.parallel_execution(self.not_equal_value_validation, featurename)
            if not verdict:
                val = not_equal[featurename]
                self._report_failure(f'Non-Equal value validation failed - Invalid value for {featurename}, value found equal to {val} in the following rows', invalid_df)

    def inclusive_subset_validation(self, thread_id, featurename):
        """Return ``(ok, rows)`` for non-null batch values not contained in the allowed list."""
        vals = self.config['attributes']['subsets']['inclusive'][featurename]
        batch = self.gdf_batches[thread_id]
        column = batch[featurename]
        filtered_gdf = batch[column.notnull() & (~column.isin(vals))]
        return len(filtered_gdf) == 0, filtered_gdf

    def exclusive_subset_validation(self, thread_id, featurename):
        """Return ``(ok, rows)`` for non-null batch values contained in the forbidden list."""
        vals = self.config['attributes']['subsets']['exclusive'][featurename]
        batch = self.gdf_batches[thread_id]
        column = batch[featurename]
        filtered_gdf = batch[column.notnull() & column.isin(vals)]
        return len(filtered_gdf) == 0, filtered_gdf

    def subsets_validation(self):
        """Run the inclusive and exclusive membership checks from ``attributes.subsets``."""
        inclusive = self._attribute_rules('subsets', 'inclusive')
        for featurename in inclusive:
            verdict, invalid_df = self.parallel_execution(self.inclusive_subset_validation, featurename)
            if not verdict:
                vals = inclusive[featurename]
                self._report_failure(f'Inclusive subsets validation failed - Invalid value for {featurename}, value found which does not belong to the {vals} in the following rows', invalid_df)

        exclusive = self._attribute_rules('subsets', 'exclusive')
        for featurename in exclusive:
            verdict, invalid_df = self.parallel_execution(self.exclusive_subset_validation, featurename)
            if not verdict:
                vals = exclusive[featurename]
                self._report_failure(f'Exclusive subsets validation failed - Invalid value for {featurename}, value found which belongs to the {vals} in the following rows', invalid_df)

    def not_null_validation(self):
        """Report rows that are null in any column listed under ``attributes.not_null``.

        Runs on the whole frame in the calling thread rather than per batch.
        """
        for featurename in self._attribute_rules('not_null'):
            filtered_gdf = self.gdf[self.gdf[featurename].isnull()]
            if len(filtered_gdf) > 0:
                self._report_failure(f'Null check validation failed - Invalid value for {featurename}, null value found in the following rows', filtered_gdf)

    def create_function(self, code):
        """Compile ``code`` and return the function it defines under the name ``fun``.

        Raises:
            KeyError: if ``code`` does not define ``fun``.
        """
        # SECURITY: exec() runs arbitrary Python taken from the config. Only
        # use configs from a trusted source.
        func_dict = {}
        exec(code, globals(), func_dict)
        return func_dict['fun']

    def run_check_functions_validation(self, thread_id, featurename, func):
        """Return ``(ok, rows)`` for batch rows where the check function returns ``False``.

        Args:
            thread_id: index of the batch to check.
            featurename: column the function is applied to, value by value.
            func: source code defining ``fun(val)``.
        """
        batch = self.gdf_batches[thread_id]
        passed = batch[featurename].apply(self.create_function(func))
        filtered_gdf = batch[passed == False]  # noqa: E712 - element-wise comparison
        return len(filtered_gdf) == 0, filtered_gdf

    def attributes_check_functions_validation(self):
        """Run every check function configured under ``attributes.check_functions``."""
        check_functions = self._attribute_rules('check_functions')
        for featurename in check_functions:
            for func in check_functions[featurename]:
                verdict, invalid_df = self.parallel_execution(self.run_check_functions_validation, featurename, func)
                if not verdict:
                    self._report_failure(f'Function check failed - Invalid value for {featurename} - A function check {func} failed in the following rows', invalid_df)

    def crs_validation(self):
        """Report a mismatch between the file CRS and ``geometry.crs``, if configured."""
        geometry = self.config['geometry']
        if 'crs' in geometry:
            found = str(self.gdf.crs)
            if found != geometry['crs']:
                self._report_failure(f'Invalid crs {found} found')

    def geometry_types_validation(self):
        """Report every geometry type in the file that is not listed in ``geometry.types``."""
        geometry = self.config['geometry']
        if 'types' in geometry:
            valid_types = geometry['types']
            for geom_type in set(self.gdf.geom_type):
                if geom_type not in valid_types:
                    self._report_failure(f'Invalid geometry type {geom_type} found, it should be from {valid_types}')

    def geometry_check_function_validation(self):
        """Run every check function configured under ``geometry.check_functions``."""
        geometry = self.config['geometry']
        if 'check_functions' in geometry:
            for func in geometry['check_functions']:
                verdict, invalid_df = self.parallel_execution(self.run_check_functions_validation, 'geometry', func)
                if not verdict:
                    self._report_failure(f'Function check failed - Invalid value for geometry - A function check {func} failed in the following rows', invalid_df)

    def validate(self):
        """Run all attribute and geometry checks, then write the summary to the report.

        The config structure itself is not checked here; call
        ``validate_config_structure()`` beforehand if that is needed.
        """
        # Attribute checks.
        self.dtypes_validation()
        self.json_validation()
        self.ranges_validation()
        self.values_validation()
        self.subsets_validation()
        self.not_null_validation()
        # Each check function runs on every value of its column; all must pass.
        self.attributes_check_functions_validation()

        # Geometry checks.
        self.crs_validation()
        self.geometry_types_validation()
        self.geometry_check_function_validation()

        time_taken = time.time() - self.start_time
        if self.validation_success_status:
            self.update_report(None, 'Validation successful')
        self.update_report(None, f'time taken for whole process: {time_taken}')
        print('Time :', time_taken)
