
Commit

normalization for centroids and itemized clusters, location clustering
Sieboldianus committed Jan 20, 2019
1 parent 9dcf22e commit 60970b6
Showing 3 changed files with 148 additions and 104 deletions.
9 changes: 6 additions & 3 deletions tagmaps/__main__.py
@@ -173,12 +173,15 @@ def main():
if cfg.cluster_locations and user_intf.abort is False:
log.info(
"\n########## STEP 6 of 6: Calculating Overall Location Clusters ##########")
shapes_and_meta_list.clear()
for clusterer in clusterer_list:
if clusterer.cls_type == LOCATIONS:
cluster_centroids = clusterer.get_cluster_centroids()
Compile.write_centroids(
clusterer.get_overall_clusters()
cluster_shapes = clusterer.get_cluster_centroids()
shapes_and_meta_list.append(cluster_shapes)
Compile.write_shapes(
bounds=lbsn_data.bounds,
cluster_centroids=cluster_centroids)
shapes_and_meta_list=shapes_and_meta_list)

#
# #if not 'clusterTreeCuttingDist' in locals():
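For orientation, the revised Step 6 reads roughly as follows once the added lines are consolidated (indentation is not preserved in the rendered diff, so the exact nesting of the final write call is inferred):

# Consolidated sketch of the new Step 6 (location clusters), names as in the diff.
shapes_and_meta_list.clear()
for clusterer in clusterer_list:
    if clusterer.cls_type == LOCATIONS:
        # presumably computes the overall clusters before centroids are read
        clusterer.get_overall_clusters()
        # now returns a 3-tuple (shapes, cls_type, itemized=False), see cluster.py below
        cluster_shapes = clusterer.get_cluster_centroids()
        shapes_and_meta_list.append(cluster_shapes)
# single writer entry point for itemized and overall clusters
Compile.write_shapes(
    bounds=lbsn_data.bounds,
    shapes_and_meta_list=shapes_and_meta_list)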
8 changes: 6 additions & 2 deletions tagmaps/classes/cluster.py
@@ -515,6 +515,7 @@ def get_itemized_clusters(self):

def get_cluster_centroids(self):
"""Get centroids for clustered data"""
itemized = False
resultshapes_and_meta = list()
for post_cluster in self.clustered_guids_all:
posts = [self.cleaned_post_dict[x] for x in post_cluster]
@@ -540,7 +541,9 @@ def get_cluster_centroids(self):
p_center = geometry.Point(x, y)
if p_center is not None and not p_center.is_empty:
resultshapes_and_meta.append((p_center, 1))
return resultshapes_and_meta
sys.stdout.flush()
# log.debug(f'{resultshapes_and_meta[:10]}')
return resultshapes_and_meta, self.cls_type, itemized

def get_cluster_shapes(self):
"""For each cluster of points,
@@ -549,6 +552,7 @@ def get_cluster_shapes(self):
Returns results as alphashapes_and_meta = list()
"""
itemized = True
saturation_exclude_count = 0
resultshapes_and_meta = list()
tnum = 0
@@ -638,4 +642,4 @@ def get_cluster_shapes(self):
logging.getLogger("tagmaps").info(
f'Excluded {saturation_exclude_count} '
f'{self.cls_type.rstrip("s")} on local saturation check.')
return resultshapes_and_meta, self.cls_type
return resultshapes_and_meta, self.cls_type, itemized
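Both cluster methods now report whether their result is itemized, so Compile.write_shapes (see compile_output.py below) can route results without knowing which clusterer produced them. A brief, illustrative sketch of the new return contract (the helper is hypothetical, not part of the commit):

def collect_cluster_result(clusterer, use_centroids, shapes_and_meta_list):
    # Hypothetical helper illustrating the 3-tuple contract introduced here.
    if use_centroids:
        # one centroid Point with a count per overall cluster; itemized is False
        result = clusterer.get_cluster_centroids()
    else:
        # alpha shapes carrying per-item weight metrics; itemized is True
        result = clusterer.get_cluster_shapes()
    shapes_and_meta_list.append(result)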
235 changes: 136 additions & 99 deletions tagmaps/classes/compile_output.py
@@ -22,46 +22,89 @@ def write_shapes(cls,
shapes_and_meta_list):
"""Main wrapper for writing
all results to output
shapes_and_meta_list is a list of tuples
(shapes: List, cls_type, itemized: bool),
with itemized = True for tag/emoji clusters
and itemized = False for overall (location) clusters
"""
bound_points_shapely = Utils._get_shapely_bounds(
bounds)
# data always in lat/lng WGS1984
__, epsg_code = Utils._get_best_utmzone(
bound_points_shapely)
cls._shape_writer_select(shapes_and_meta_list, epsg_code)

# Define polygon feature geometry
schema = {
'geometry': 'Polygon',
'properties': {'Join_Count': 'int',
'Views': 'int',
'COUNT_User': 'int',
'ImpTag': 'str',
'TagCountG': 'int',
'HImpTag': 'int',
'Weights': 'float',
'WeightsV2': 'float',
'WeightsV3': 'float',
# 'shapetype': 'str',
'emoji': 'int'},
}
cls._shape_writer(schema,
shapes_and_meta_list, epsg_code)
@classmethod
def _shape_writer_select(cls, shapes_and_meta_list,
epsg_code):
"""Wrapper to select different shape writer for itemized and
not itemized cluster results"""
itemized_shapes = list()
non_itemized_shapes = list()
for shapes_and_meta, cls_type, itemized in shapes_and_meta_list:
if itemized:
itemized_shapes.append(
(shapes_and_meta, cls_type))
else:
non_itemized_shapes.append(
(shapes_and_meta, cls_type))
if itemized_shapes:
cls._shape_writer(itemized_shapes,
epsg_code, True)
if non_itemized_shapes:
cls._shape_writer(non_itemized_shapes,
epsg_code, False)

@staticmethod
def _get_shape_schema(itemized):
"""Define polygon feature geometry"""
if itemized:
schema = {
'geometry': 'Polygon',
'properties': {'Join_Count': 'int',
'Views': 'int',
'COUNT_User': 'int',
'ImpTag': 'str',
'TagCountG': 'int',
'HImpTag': 'int',
'Weights': 'float',
'WeightsV2': 'float',
'WeightsV3': 'float',
# 'shapetype': 'str',
'emoji': 'int'},
}
else:
# Define a polygon feature geometry with one attribute
schema = {
'geometry': 'Point',
'properties': {'Join_Count': 'int',
'Weights': 'float'},
}
return schema

@classmethod
def _shape_writer(cls, schema,
shapes_and_meta_list, epsg_code):
def _shape_writer(cls, shapes_and_meta_list,
epsg_code, itemized):
# Initialize a new Shapefile
# WGS1984

schema = cls._get_shape_schema(itemized)
# better parametrization needed here:
# select name based on cls_type, not itemized
contains_emoji_output = cls._contains_emoji_output(
shapes_and_meta_list)
if (contains_emoji_output and len(shapes_and_meta_list) == 1):
# if only emoji output
shapefile_name = "allEmojiCluster"
if itemized:
if (contains_emoji_output and
len(shapes_and_meta_list) == 1):
# if only emoji output
shapefile_name = "allEmojiCluster"
else:
# if writing only tags
# or tags and emoji
shapefile_name = "allTagCluster"
else:
# if writing only tags
# or tags and emoji
shapefile_name = "allTagCluster"
shapefile_name = "allLocationCluster"
with fiona.open(
f'02_Output/{shapefile_name}.shp', mode='w',
encoding='UTF-8', driver='ESRI Shapefile',
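The "better parametrization" flagged in the comment at the top of _shape_writer is left for later; a hedged sketch of what selecting the output name from cls_type (rather than from the itemized flag) could look like — the dictionary keys are assumed cls_type values, and only the location-related naming actually appears in this commit:

# Hypothetical follow-up, not part of this commit.
CLS_TYPE_NAMES = {
    "tags": "allTagCluster",
    "emoji": "allEmojiCluster",
    "locations": "allLocationCluster",
}
shapefile_name = CLS_TYPE_NAMES.get(cls_type, "allTagCluster")

Note that tag and emoji results are currently written to a single file, so a pure cls_type lookup would still need to handle that combined case.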
@@ -70,13 +113,15 @@ def _shape_writer(cls, schema,
shapefile,
shapes_and_meta_list,
epsg_code, schema,
contains_emoji_output)
contains_emoji_output,
itemized)

@classmethod
def _attach_emojitable_handler(cls, shapefile,
shapes_and_meta_list,
epsg_code, schema,
contains_emoji_output):
contains_emoji_output,
itemized):
"""If Emoji Output present, open csv for writing
Note: refactor as optional property!
"""
@@ -87,12 +132,13 @@ def _attach_emojitable_handler(cls, shapefile,
cls._loop_shapemetalist(shapefile,
shapes_and_meta_list,
epsg_code, schema,
emoji_table)
emoji_table,
itemized)
else:
cls._loop_shapemetalist(shapefile,
shapes_and_meta_list,
epsg_code, schema,
None)
None, itemized)

@staticmethod
def _contains_emoji_output(shapes_and_meta_list):
@@ -105,16 +151,41 @@ def _contains_emoji_output(shapes_and_meta_list):

@classmethod
def _loop_shapemetalist(cls, shapefile, shapes_and_meta_list,
epsg_code, schema, emoji_table: TextIO):
epsg_code, schema, emoji_table: TextIO,
itemized):
for shapes, cls_type in shapes_and_meta_list:
# normalize types separately (e.g. emoji/tags)
global_weights = cls._get_weights(shapes)
cls._write_shapes(
shapefile, shapes,
cls_type, global_weights, emoji_table)
if itemized:
# normalize types separately (e.g. emoji/tags)
global_weights = cls._get_weights(shapes, [6, 7, 8])
cls._write_itemized_shapes(
shapefile, shapes,
cls_type, global_weights, emoji_table)
else:
# normalize types separately (e.g. emoji/tags)
global_weights = cls._get_weights(shapes, [1])
cls._write_nonitemized_shapes(
shapefile, shapes,
cls_type, global_weights, emoji_table)

@classmethod
def _write_nonitemized_shapes(
cls, shapefile,
shapes, cls_type,
weights: Dict[int, Tuple[float, float]],
emoji_table: TextIO
):
for alphashape_and_meta in shapes:
local_value = alphashape_and_meta[1]
local_value_normalized = cls._get_normalize_value(
local_value, weights.get(1))
shapefile.write({
'geometry': geometry.mapping(alphashape_and_meta[0]),
'properties': {'Join_Count': alphashape_and_meta[1],
'Weights': local_value_normalized},
})

@classmethod
def _write_shapes(
def _write_itemized_shapes(
cls, shapefile,
shapes, cls_type,
weights: Dict[int, Tuple[float, float]],
@@ -215,71 +286,37 @@ def _write_shape(
})

@classmethod
def write_centroids(cls,
bounds: AnalysisBounds,
cluster_centroids):

bound_points_shapely = Utils._get_shapely_bounds(
bounds)
# data always in lat/lng WGS1984
__, epsg_code = Utils._get_best_utmzone(
bound_points_shapely)

# Define a polygon feature geometry with one attribute
schema = {
'geometry': 'Point',
'properties': {'Join_Count': 'int'},
}

# Write a new Shapefile
# WGS1984
with fiona.open('02_Output/allPhotoCluster.shp',
mode='w',
driver='ESRI Shapefile', schema=schema,
crs=from_epsg(epsg_code)) as c:
# If there are multiple geometries, put the "for" loop here
idx = 0
for cluster_centroid in cluster_centroids:
idx += 1
c.write({
'geometry': geometry.mapping(cluster_centroid[0]),
'properties': {'Join_Count': cluster_centroid[1]},
})

@staticmethod
def _get_weights(shapes):
def _get_weights(cls, shapes, columns: List[int]):
"""Normalization of Values (1-1000 Range),
precalc Step (global weights)
precalc Step (global weights),
see
https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range
"""
# get the n'th column out for calculating the max/min
weightsv1_range = [x[6] for x in shapes]
weightsv2_range = [x[7] for x in shapes]
weightsv3_range = [x[8] for x in shapes]
weightsv1_min = min(weightsv1_range)
weightsv1_max = max(weightsv1_range)
weightsv2_min = min(weightsv2_range)
weightsv2_max = max(weightsv2_range)
weightsv3_min = min(weightsv3_range)
weightsv3_max = max(weightsv3_range)
# precalc, see
# https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range
idx = 1
weights_dict: Dict[int, Tuple[float, float]] = dict()
for column in columns:
# int: weights algorithm
# Tuple: min and max modifiers
values_min, values_max = cls._get_column_min_max(
shapes, column)
weights_mod_a = (1000-1)/(
values_max-values_min)
weights_mod_b = 1000 - weights_mod_a * values_max
weights_dict[idx] = (weights_mod_a, weights_mod_b)
idx += 1
return weights_dict

# int: weights algorithm
# Tuple: min and max modifiers
weights: Dict[int, Tuple[float, float]] = dict()
weightsv1_mod_a = (1000-1)/(
weightsv1_max-weightsv1_min)
weightsv1_mod_b = 1000 - weightsv1_mod_a * weightsv1_max
weights[1] = (weightsv1_mod_a, weightsv1_mod_b)
weightsv2_mod_a = (1000-1)/(
weightsv2_max-weightsv2_min)
weightsv2_mod_b = 1000 - weightsv2_mod_a * weightsv2_max
weights[2] = (weightsv2_mod_a, weightsv2_mod_b)
weightsv3_mod_a = (1000-1)/(
weightsv3_max-weightsv3_min)
weightsv3_mod_b = 1000 - weightsv3_mod_a * weightsv3_max
weights[3] = (weightsv3_mod_a, weightsv3_mod_b)
return weights
@staticmethod
def _get_column_min_max(
shapes,
column: int) -> Tuple[float, float]:
"""Get min and max values of specific column
for normalization"""
# get the n'th column out for calculating the max/min
weights_range = [x[column] for x in shapes]
weights_min = min(weights_range)
weights_max = max(weights_range)
return weights_min, weights_max

@staticmethod
def _normalize_value(
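_get_weights is now called with explicit column indices: 6-8 for the three weight variants of itemized clusters (written as Weights, WeightsV2 and WeightsV3 in the itemized schema) and 1 for the count of overall centroids, with _get_column_min_max supplying the per-column extrema. To make the 1-1000 precalculation concrete, a worked example with illustrative numbers (assuming the truncated _normalize_value applies value * mod_a + mod_b, as in the linked normalization approach):

# Illustrative values only: counts ranging from 2 to 50.
values_min, values_max = 2, 50
mod_a = (1000 - 1) / (values_max - values_min)  # 20.8125
mod_b = 1000 - mod_a * values_max               # -40.625
# Applying value * mod_a + mod_b:
#   50 -> 1000.0  (maximum maps to 1000)
#    2 ->    1.0  (minimum maps to 1)
#   26 ->  500.5  (midpoint maps to the middle of the range)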
