From ea66395945418cd688cfe1279f38ad0d67e9d764 Mon Sep 17 00:00:00 2001
From: AD
Date: Fri, 18 Jan 2019 14:23:44 +0100
Subject: [PATCH] refactor initial alpha shapes part, not tested

---
 tagmaps/__main__.py             |  29 +-
 tagmaps/classes/alpha_shapes.py | 231 ++++++++++++++
 tagmaps/classes/cluster.py      | 547 +++++++++++++++++---------------
 tagmaps/classes/load_data.py    |   6 +-
 tagmaps/classes/utils.py        | 297 ++++-------------
 5 files changed, 603 insertions(+), 507 deletions(-)
 create mode 100644 tagmaps/classes/alpha_shapes.py

diff --git a/tagmaps/__main__.py b/tagmaps/__main__.py
index bc8ac51..72cc413 100644
--- a/tagmaps/__main__.py
+++ b/tagmaps/__main__.py
@@ -124,7 +124,8 @@ def main():
             clusterer_type=cls_type,
             bounds=lbsn_data.bounds,
             cleaned_post_dict=cleaned_post_dict,
-            prepared_data=prepared_data
+            prepared_data=prepared_data,
+            local_saturation_check=cfg.local_saturation_check
         )
         clusterer_list.append(clusterer)

@@ -137,18 +138,20 @@ def main():
     if cfg.auto_mode or user_intf.abort is False:
         for clusterer in clusterer_list:
-            if not clusterer.ClusterType == ClusterGen.LOCATIONS:
-                if clusterer.ClusterType == ClusterGen.TAGS:
-                    log.info("Tag clustering: \n")
-                else:
-                    log.info("Emoji clustering: \n")
-                clusterer.cluster_all()
-                log.info(
-                    "########## STEP 4 of 6: Generating Alpha Shapes ##########")
-                clusterer.alpha_shapes()
-                log.info(
-                    "########## STEP 5 of 6: Writing Results to Shapefile ##########")
-                clusterer.write_results()
+            if clusterer.ClusterType == ClusterGen.LOCATIONS:
+                # skip location clustering for now
+                continue
+            if clusterer.ClusterType == ClusterGen.TAGS:
+                log.info("Tag clustering: \n")
+            else:
+                log.info("Emoji clustering: \n")
+            clusterer.cluster_all()
+            log.info(
+                "########## STEP 4 of 6: Generating Alpha Shapes ##########")
+            clusterer.alpha_shapes()
+            log.info(
+                "########## STEP 5 of 6: Writing Results to Shapefile ##########")
+            clusterer.write_results()
     else:
         print(f'\nUser abort.')
     if cfg.cluster_locations and user_intf.abort is False:
diff --git a/tagmaps/classes/alpha_shapes.py b/tagmaps/classes/alpha_shapes.py
new file mode 100644
index 0000000..6e8d489
--- /dev/null
+++ b/tagmaps/classes/alpha_shapes.py
@@ -0,0 +1,231 @@
+# -*- coding: utf-8 -*-
+
+"""
+Module for tag maps alpha shapes generation
+"""
+
+import math
+from math import sqrt
+from decimal import Decimal
+import pyproj
+import numpy as np
+import shapely.geometry as geometry
+from shapely.ops import transform, cascaded_union, polygonize
+from descartes import PolygonPatch
+from scipy.spatial import Delaunay
+from fiona.crs import from_epsg
+from tagmaps.classes.utils import Utils
+
+
+class AlphaShapes():
+
+    @staticmethod
+    def _get_best_utmzone(bound_points_shapely: geometry.MultiPoint):
+        """Calculate best UTM Zone SRID/EPSG Code
+
+        Args:
+            bound_points_shapely: point collection whose true centroid
+                is used for zone selection (coords may be multipoint)
+        """
+        input_lon_center = bound_points_shapely.centroid.coords[0][0]
+        input_lat_center = bound_points_shapely.centroid.coords[0][1]
+        epsg_code = AlphaShapes._convert_wgs_to_utm(
+            input_lon_center, input_lat_center)
+        crs_proj = pyproj.Proj(init=f'epsg:{epsg_code}')
+        return crs_proj
+
+    @staticmethod
+    def _convert_wgs_to_utm(lon: float, lat: float):
+        """Get EPSG code of the best UTM zone for a WGS1984 coordinate
+
+        Args:
+            lon: longitude (decimal degrees)
+            lat: latitude (decimal degrees)
+
+        Returns:
+            str: EPSG code ('326xx' for the northern,
+                '327xx' for the southern hemisphere)
+
+        Notes:
+            # https://stackoverflow.com/questions/40132542/get-a-cartesian-projection-accurate-around-a-lat-lng-pair
+        """
+
+        utm_band = str((math.floor((lon + 180) / 6) % 60) + 1)
+        if len(utm_band) == 1:
+            utm_band = '0'+utm_band
+        if lat >= 0:
+            epsg_code = '326' + utm_band
+        else:
+            epsg_code = '327' + utm_band
+        return epsg_code
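For reference, the zone arithmetic in `_convert_wgs_to_utm` can be checked in isolation. A minimal standalone sketch of the same logic (not part of the patch; the Dresden coordinate is only an illustrative test value):

```python
import math

def convert_wgs_to_utm(lon: float, lat: float) -> str:
    # UTM zones are 6 degrees of longitude wide; zone 1 starts at 180degW,
    # so (lon + 180) / 6 indexes the zone, wrapped into the range 1..60
    utm_band = str((math.floor((lon + 180) / 6) % 60) + 1).zfill(2)
    # EPSG 326xx = WGS84 / UTM north, 327xx = WGS84 / UTM south
    return ('326' if lat >= 0 else '327') + utm_band

# Dresden (13.73 E, 51.05 N) lies in UTM zone 33N -> EPSG:32633
assert convert_wgs_to_utm(13.73, 51.05) == '32633'
```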
+
+    @staticmethod
+    def get_cluster_shape(
+            toptag, clusterPhotoGuidList,
+            cleanedPhotoDict, crs_wgs, crs_proj,
+            clusterTreeCuttingDist, localSaturationCheck):
+        # we define a new list of temp alpha shapes outside the loop,
+        # so that it is not overwritten each time
+        listOfAlphashapesAndMeta_tmp = []
+        tagArea = 0
+        for photo_guids in clusterPhotoGuidList:
+            # for each cluster of this toptag
+            photos = [cleanedPhotoDict[x] for x in photo_guids]
+            photoCount = len(photo_guids)
+            uniqueUserCount = len(set([photo.user_guid for photo in photos]))
+            sumViews = sum([photo.post_views_count for photo in photos])
+            # calculate different weighting formulas
+            # (x**y means x raised to the power y; +1 to uniqueUserCount
+            # prevents the 1-2 range from being misaligned)
+            # weightsv1: standard weighting formula
+            weightsv1 = photoCount * (sqrt(1 / (photoCount / (uniqueUserCount+1))**3))
+            # weightsv2: less importance on user count
+            # in correlation to photo count [Join_Count]
+            weightsv2 = photoCount * (sqrt(1 / (photoCount / (uniqueUserCount+1))**2))
+            # weightsv3: ignores user count; this will emphasize
+            # individual and very active users
+            weightsv3 = sqrt((photoCount+(2*sqrt(photoCount)))*2)
+            # instead of lat/lng for each photo, we use photo_locID
+            # to identify a list of distinct locations
+            distinctLocations = set([photo.loc_id
+                                     for photo in photos])
+            # project each distinct location ("lat:lng") to UTM
+            points = [geometry.Point(
+                          pyproj.transform(
+                              crs_wgs, crs_proj,
+                              Decimal(location.split(':')[1]),
+                              Decimal(location.split(':')[0])))
+                      for location in distinctLocations]
+            point_collection = geometry.MultiPoint(list(points))
+            result_polygon = None
+
+            if len(points) >= 5:
+                if len(points) < 10:
+                    # convex hull
+                    result_polygon = point_collection.convex_hull
+                    result_polygon = result_polygon.buffer(
+                        clusterTreeCuttingDist/4, resolution=3)
+                    shapetype = "between 5 and 10 points_convexHull"
+                else:
+                    if len(points) > 500:
+                        startalpha = 1000000
+                    elif len(points) > 200:
+                        startalpha = 10000
+                    else:
+                        startalpha = 9000
+                    # concave hull/alpha shape
+                    result_polygon = AlphaShapes.alpha_shape(
+                        points, alpha=clusterTreeCuttingDist/startalpha)
+                    shapetype = "Initial Alpha Shape + Buffer"
+                    if type(result_polygon) is geometry.multipolygon.MultiPolygon or isinstance(result_polygon, bool):
+                        # repeat generating alpha shapes with smaller alpha
+                        # value if a multipolygon was generated: smaller
+                        # alpha values mean less granularity of the
+                        # resulting polygon, but a too large alpha may
+                        # result in an empty polygon
+                        # (this branch is sometimes executed
+                        # for larger scales)
+                        for i in range(1, 6):
+                            # increase the divisor, which decreases the
+                            # effective alpha (i**i: i to the power of i)
+                            alpha = startalpha + (startalpha * (i**i))
+                            result_polygon = AlphaShapes.alpha_shape(
+                                points, alpha=clusterTreeCuttingDist/alpha)
+                            if not type(result_polygon) is geometry.multipolygon.MultiPolygon and not isinstance(result_polygon, bool):
+                                shapetype = "Multipolygon Alpha Shape /" + str(alpha)
+                                break
+                    if type(result_polygon) is geometry.multipolygon.MultiPolygon or isinstance(result_polygon, bool):
+                        # try increasing alpha instead
+                        for i in range(1, 6):
+                            # decrease the divisor, which increases
+                            # the effective alpha
+                            alpha = startalpha / (i*i)
+                            result_polygon = AlphaShapes.alpha_shape(
+                                points, alpha=clusterTreeCuttingDist/alpha)
+                            if not type(result_polygon) is geometry.multipolygon.MultiPolygon and not isinstance(result_polygon, bool):
+                                shapetype = "Multipolygon Alpha Shape /" + str(alpha)
+                                break
+                    if type(result_polygon) is geometry.multipolygon.MultiPolygon:
+                        shapetype = "Multipolygon Alpha Shape -> Convex Hull"
+                        # if still of type multipolygon, try to
+                        # remove holes and do a convex_hull
+                        result_polygon = result_polygon.convex_hull
+                    elif isinstance(result_polygon, bool) or result_polygon.is_empty:
+                        # OR: in case there was a problem with generating
+                        # alpha shapes (circum_r = a*b*c/(4.0*area)
+                        # --> ZeroDivisionError: float division by zero);
+                        # this branch is rarely executed for large point
+                        # clusters where alpha is perhaps set too small
+                        shapetype = "BoolAlpha -> Fallback to PointCloud Convex Hull"
+                        result_polygon = point_collection.convex_hull
+                    # finally do a buffer to smooth the alpha shape
+                    result_polygon = result_polygon.buffer(
+                        clusterTreeCuttingDist/4, resolution=3)
+            elif 2 <= len(points) < 5:
+                shapetype = "between 2 and 5 points_buffer"
+                # distance between points, see
+                # http://www.mathwarehouse.com/algebra/distance_formula/index.php
+                # small clusters are presented as buffer
+                # with 0.5% of width-area
+                result_polygon = point_collection.buffer(
+                    clusterTreeCuttingDist/4, resolution=3)
+                result_polygon = result_polygon.convex_hull
+            elif len(points) == 1 or type(result_polygon) is geometry.point.Point or result_polygon is None:
+                shapetype = "1 point cluster"
+                # single dots are presented as buffer
+                # with 0.5% of width-area
+                result_polygon = point_collection.buffer(
+                    clusterTreeCuttingDist/4, resolution=3)
+            # final check for multipolygon
+            if type(result_polygon) is geometry.multipolygon.MultiPolygon:
+                # usually not executed
+                result_polygon = result_polygon.convex_hull
+            # Geom, Join_Count, Views, COUNT_User, ImpTag, TagCountG, HImpTag
+            if result_polygon is not None and not result_polygon.is_empty:
+                if localSaturationCheck:
+                    tagArea += result_polygon.area
+                listOfAlphashapesAndMeta_tmp.append(
+                    (result_polygon, photoCount, sumViews, uniqueUserCount,
+                     toptag[0], toptag[1],
+                     weightsv1, weightsv2, weightsv3, shapetype))
+        if len(listOfAlphashapesAndMeta_tmp) > 0:
+            # finally sort and append all cluster shapes for this tag
+            listOfAlphashapesAndMeta_tmp = sorted(
+                listOfAlphashapesAndMeta_tmp, key=lambda x: -x[6])
+        return listOfAlphashapesAndMeta_tmp, tagArea
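The alpha retry cascade above is easier to follow with the triangle criterion of `alpha_shape` (below) in mind: a Delaunay triangle only survives when its circumradius stays below `1.0/alpha`. A standalone sketch of that filter quantity (not part of the patch; the example triangle is made up):

```python
import math

def circumradius(pa, pb, pc):
    """Circumradius of the triangle spanned by three (x, y) points."""
    a = math.sqrt((pa[0]-pb[0])**2 + (pa[1]-pb[1])**2)
    b = math.sqrt((pb[0]-pc[0])**2 + (pb[1]-pc[1])**2)
    c = math.sqrt((pc[0]-pa[0])**2 + (pc[1]-pa[1])**2)
    s = (a + b + c) / 2.0                         # semiperimeter
    area = math.sqrt(s * (s-a) * (s-b) * (s-c))   # Heron's formula
    return a * b * c / (4.0 * area)

# This triangle has circumradius 0.5, so it survives the filter
# circum_r < 1.0/alpha for any alpha < 2.0:
print(round(circumradius((0, 0), (1, 0), (0.5, 0.5)), 2))  # 0.5
```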
+
+    @staticmethod
+    def alpha_shape(points, alpha):
+        """Compute the alpha shape (concave hull) of a set of points.
+
+        Alpha Shapes Code by KEVIN DWYER, see
+        http://blog.thehumangeo.com/2014/05/12/drawing-boundaries-in-python/
+        with minor adaptions to Tag Maps clustering.
+
+        @param points: Iterable container of points.
+        @param alpha: alpha value to influence the
+            gooeyness of the border. Smaller numbers
+            don't fall inward as much as larger numbers.
+            Too large, and you lose everything!
+        """
+        if len(points) < 4:
+            # When you have a triangle, there is no sense
+            # in computing an alpha shape.
+            return geometry.MultiPoint(list(points)).convex_hull
+
+        def add_edge(edges, edge_points, coords, i, j):
+            """Add a line between the i-th and j-th points,
+            if not in the list already
+            """
+            if (i, j) in edges or (j, i) in edges:
+                # already added
+                return
+            edges.add((i, j))
+            edge_points.append(coords[[i, j]])
+
+        coords = np.array([point.coords[0]
+                           for point in points])
+        # To avoid a QhullError on colinear points, the input could be
+        # joggled by passing qhull_options='QJ' to Delaunay, see
+        # https://de.mathworks.com/matlabcentral/answers/94438-why-does-the-delaunay-function-in-matlab-7-0-r14-produce-an-error-when-passed-colinear-points?s_tid=gn_loc_drop
+        # Qhull 3.1 added triangulated output ('Qt'), which should be used
+        # for Delaunay triangulations instead of joggled input ('QJ').
+        tri = Delaunay(coords)
+        edges = set()
+        edge_points = []
+        # loop over triangles:
+        # ia, ib, ic = indices of corner points of the triangle
+        for ia, ib, ic in tri.vertices:
+            pa = coords[ia]
+            pb = coords[ib]
+            pc = coords[ic]
+            # Lengths of sides of triangle
+            a = math.sqrt((pa[0]-pb[0])**2 + (pa[1]-pb[1])**2)
+            b = math.sqrt((pb[0]-pc[0])**2 + (pb[1]-pc[1])**2)
+            c = math.sqrt((pc[0]-pa[0])**2 + (pc[1]-pa[1])**2)
+            # Semiperimeter of triangle
+            s = (a + b + c)/2.0
+            # Area of triangle by Heron's formula
+            try:
+                area = math.sqrt(s*(s-a)*(s-b)*(s-c))
+            except ValueError:
+                return False
+            if area == 0:
+                return False
+            circum_r = a*b*c/(4.0*area)
+            # Here's the radius filter.
+            if circum_r < 1.0/alpha:
+                add_edge(edges, edge_points, coords, ia, ib)
+                add_edge(edges, edge_points, coords, ib, ic)
+                add_edge(edges, edge_points, coords, ic, ia)
+        m = geometry.MultiLineString(edge_points)
+        triangles = list(polygonize(m))
+        return cascaded_union(triangles)  # , edge_points
\ No newline at end of file
diff --git a/tagmaps/classes/cluster.py b/tagmaps/classes/cluster.py
index ee9525f..0356f7f 100644
--- a/tagmaps/classes/cluster.py
+++ b/tagmaps/classes/cluster.py
@@ -16,6 +16,7 @@ import shapely.geometry as geometry
 from multiprocessing.pool import ThreadPool

 from tagmaps.classes.utils import Utils
+from tagmaps.classes.alpha_shapes import AlphaShapes
 from tagmaps.classes.shared_structure import (
     CleanedPost, AnalysisBounds, PreparedData)

@@ -41,10 +42,13 @@ def __init__(self, bounds: AnalysisBounds,
                  top_list: List[Tuple[str, int]],
                  total_distinct_locations: int,
                  tmax: int,
-                 cluster_type: ClusterType = TAGS):
+                 cluster_type: ClusterType = TAGS,
+                 topitem: Tuple[str, int] = None,
+                 local_saturation_check: bool = True):
         self.cls_type = cluster_type
         self.tnum = 0
         self.tmax = tmax
+        self.topitem = topitem
         self.bounds = bounds
         self.cluster_distance = ClusterGen._init_cluster_dist(self.bounds)
         self.cleaned_post_dict = cleaned_post_dict
@@ -56,6 +60,11 @@ def __init__(self, bounds: AnalysisBounds,
         self.number_of_clusters = None
         self.mask_noisy = None
         self.clusterer = None
+        # storing cluster results:
+        self.single_items = defaultdict(list)
+        self.clustered_items = defaultdict(list)
+        self.local_saturation_check = local_saturation_check
+        self.alphashapes_and_meta = list()
         # set initial analysis bounds
         self._update_bounds()
         self.bound_points_shapely = (
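The three result containers added to `__init__` drive everything that follows. Their layout is only implied by how later code indexes them, so here is a hedged sketch of the expected contents (all values made up):

```python
from collections import defaultdict

# single_items: item -> guids of posts that ended up in no cluster
# clustered_items: item -> list of guid arrays, one per cluster,
#                  sorted largest first
single_items = defaultdict(list)
clustered_items = defaultdict(list)

# alphashapes_and_meta collects one tuple per cluster shape:
# (polygon, post_count, views, user_count, item_name, item_count,
#  weightsv1, weightsv2, weightsv3, shapetype)
shape_record = (None, 12, 340, 7, 'park', 95,
                8.4, 9.9, 9.3, 'Initial Alpha Shape + Buffer')
assert shape_record[6] == 8.4  # write_results later reads x[6] as weightsv1
```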
@@ -70,12 +79,13 @@
     def new_clusterer(cls,
                       clusterer_type: ClusterType,
                       bounds: AnalysisBounds,
                       cleaned_post_dict: Dict[str, CleanedPost],
-                      prepared_data: PreparedData
-                      ):
+                      prepared_data: PreparedData,
+                      local_saturation_check: bool):
         """Create new clusterer from type and input data

         Args:
-            clusterer_type (ClusterGen.ClusterType): Either Tags, Locations or Emoji
+            clusterer_type (ClusterGen.ClusterType): Either Tags,
+                Locations or Emoji
             bounds (LoadData.AnalysisBounds): Analysis spatial boundary
             cleaned_post_dict (Dict[str, CleanedPost]): Dict of cleaned posts
             prepared_data (LoadData.PreparedData): Statistics data
@@ -86,12 +96,15 @@ def new_clusterer(cls,
         if clusterer_type == cls.TAGS:
             top_list = prepared_data.top_tags_list
             tmax = prepared_data.tmax
+            topitem = prepared_data.single_mostused_tag
         elif clusterer_type == cls.EMOJI:
             top_list = prepared_data.top_emoji_list
             tmax = prepared_data.emax
+            topitem = prepared_data.single_mostused_emoji
         elif clusterer_type == cls.LOCATIONS:
             top_list = prepared_data.top_locations_list
             tmax = prepared_data.emax
+            topitem = prepared_data.single_mostused_location
         else:
             sys.exit("Cluster Type unknown.")
         clusterer = cls(
@@ -100,7 +113,9 @@ def new_clusterer(cls,
             top_list=top_list,
             total_distinct_locations=prepared_data.total_unique_locations,
             tmax=tmax,
-            cluster_type=clusterer_type)
+            cluster_type=clusterer_type,
+            topitem=topitem,
+            local_saturation_check=local_saturation_check)
         return clusterer

     @staticmethod
@@ -348,263 +363,289 @@ def cluster_points(self, points,
         # self.sel_colors will be used to gen preview map
         return None

-def cluster_all(self):
-    """Cluster all data attached to self
-    """
-    noClusterPhotos_perTag_DictOfLists = defaultdict(list)
-    clustersPerTag = defaultdict(list)
-    # Proceed with clustering all tags
-    # data always in lat/lng WGS1984
-    crs_wgs = pyproj.Proj(init='epsg:4326')
-    if cfg.override_crs is None:
-        # Calculate best UTM Zone SRID/EPSG Code
-        # True centroid (coords may be multipoint):
-        input_lon_center = self.bound_points_shapely.centroid.coords[0][0]
-        input_lat_center = self.bound_points_shapely.centroid.coords[0][1]
-        epsg_code = Utils.convert_wgs_to_utm(input_lon_center, input_lat_center)
-        crs_proj = pyproj.Proj(init=f'epsg:{epsg_code}')
-        project = lambda x, y: pyproj.transform(pyproj.Proj(init='epsg:4326'), pyproj.Proj(init=f'epsg:{epsg_code}'), x, y)
-        #geom_proj = transform(project, alphaShapeAndMeta[0])
+    def _cluster_item(self, sel_item: Tuple[str, int]):
+        """Cluster specific item"""
-    if cfg.local_saturation_check:
-        clusters, selected_post_guids = cluster_tag(prepared_data.single_mostused_tag, None, True)
-        numpy_selectedPhotoList_Guids = np.asarray(selected_post_guids)
-        mask_noisy = (clusters == -1)
-        number_of_clusters = len(np.unique(clusters[~mask_noisy]))
-        print(f'--> {number_of_clusters} cluster.')
-        clusterPhotosGuidsList = []
-        for x in range(number_of_clusters):
-            currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x]
-            clusterPhotosGuidsList.append(currentClusterPhotoGuids)
-        noClusterPhotos_perTag_DictOfLists[prepared_data.single_mostused_tag[0]] = list(numpy_selectedPhotoList_Guids[clusters==-1])
-        # Sort descending based on size of cluster: https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists
-        clusterPhotosGuidsList.sort(key=len, reverse=True)
-        if not len(clusterPhotosGuidsList) == 0:
-            clustersPerTag[prepared_data.single_mostused_tag[0]] = clusterPhotosGuidsList
-    global tnum
-    tnum = 1
-    for toptag in top_tags_list:
-        if cfg.local_saturation_check and tnum == 1 and toptag[0] in clustersPerTag:
-            #skip toptag if already clustered due to local saturation
-            continue
-        clusters, selected_post_guids = cluster_tag(toptag, None, True)
-        #print("baseDataList: ")
-        #print(str(type(selectedPhotoList)))
-        #for s in selectedPhotoList[:2]:
-        #    print(*s)
-        #print("resultData: ")
-        ##for s in clusters[:2]:
-        ##    print(*s)
-        #print(str(type(clusters)))
-        #print(clusters)
-        #clusters contains the cluster values (-1 = no cluster, 0 maybe, >0 = cluster
-        # in the same order, selectedPhotoList contains all original photo data, thus clusters[10] and selectedPhotoList[10] refer to the same photo
-
-        numpy_selectedPhotoList_Guids = np.asarray(selected_post_guids)
+        points = self._get_np_points(item=sel_item[0], silent=False)
+        clusters = self.cluster_points(points=points, preview_mode=False)
+        return clusters, points
+
+    @staticmethod
+    def _get_cluster_guids(clusters, selected_post_guids):
+        """Returns two lists: clustered and non clustered guids"""
+        clustered_guids = list()
+        np_selected_post_guids = np.asarray(selected_post_guids)
         mask_noisy = (clusters == -1)
         if len(selected_post_guids) == 1:
             number_of_clusters = 0
         else:
-            number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0)
-            #if number_of_clusters > 200:
-            #    log.info("--> Too many, skipped for this scale.")
-            #    continue
-        if not number_of_clusters == 0:
+            number_of_clusters = len(np.unique(clusters[~mask_noisy]))
+        if number_of_clusters == 0:
+            print("--> No cluster.")
+            non_clustered_guids = list(np_selected_post_guids)
+        else:
             print(f'--> {number_of_clusters} cluster.')
-            tnum += 1
-            photo_num = 0
-            #clusternum_photolist = zip(clusters,selectedPhotoList)
-            #clusterPhotosList = [[] for x in range(number_of_clusters)]
-            clusterPhotosGuidsList = []
             for x in range(number_of_clusters):
-                currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x]
-                clusterPhotosGuidsList.append(currentClusterPhotoGuids)
-            noClusterPhotos_perTag_DictOfLists[toptag[0]] = list(numpy_selectedPhotoList_Guids[clusters==-1])
-            # Sort descending based on size of cluster: https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists
-            clusterPhotosGuidsList.sort(key=len, reverse=True)
-            if not len(clusterPhotosGuidsList) == 0:
-                clustersPerTag[toptag[0]] = clusterPhotosGuidsList
+                current_clustered_guids = np_selected_post_guids[clusters == x]
+                clustered_guids.append(current_clustered_guids)
+            non_clustered_guids = list(np_selected_post_guids[clusters == -1])
+            # Sort descending based on size of cluster
+            # https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists
+            clustered_guids.sort(key=len, reverse=True)
+        return clustered_guids, non_clustered_guids
+
+    def _get_update_clusters(self, item,
+                             single_items_dict,
+                             cluster_items_dict):
+        """Get clusters for an item and write results to dicts"""
+        clusters, selected_post_guids = self._cluster_item(item)
+        result = self._get_cluster_guids(clusters, selected_post_guids)
+        clustered_guids = result[0]
+        non_clustered_guids = result[1]
+        single_items_dict[item[0]] = non_clustered_guids
+        if not len(clustered_guids) == 0:
+            cluster_items_dict[item[0]] = clustered_guids
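To illustrate the clustered/noise partition implemented in `_get_cluster_guids` above, here is a toy run (not part of the patch; HDBSCAN labels noise as -1, and labels align with guids by position):

```python
import numpy as np

clusters = np.array([0, 0, 1, -1, 1, 0, -1])  # toy labels from clusterer
guids = np.asarray(['a', 'b', 'c', 'd', 'e', 'f', 'g'])

mask_noisy = (clusters == -1)
number_of_clusters = len(np.unique(clusters[~mask_noisy]))  # -> 2
clustered = [guids[clusters == x] for x in range(number_of_clusters)]
clustered.sort(key=len, reverse=True)  # largest cluster first
non_clustered = list(guids[mask_noisy])
print([list(c) for c in clustered], non_clustered)
# [['a', 'b', 'f'], ['c', 'e']] ['d', 'g']
```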
+
+    def cluster_all(self):
+        """Cluster all items attached to self
+
+        Updates results to:
+            self.single_items
+            self.clustered_items
+        """
+        # get clusters for top item first
+        if self.local_saturation_check:
+            self._get_update_clusters(
+                self.topitem,
+                self.single_items,
+                self.clustered_items)
+            self.tnum = 1
+        # get remaining clusters
+        for item in self.top_list:
+            if (self.local_saturation_check and
+                    self.tnum == 1 and
+                    item[0] in self.clustered_items):
+                # skip item if already clustered
+                # due to local saturation
+                continue
+            self._get_update_clusters(
+                item,
+                self.single_items,
+                self.clustered_items)
+        # flush console output once
+        sys.stdout.flush()
+
+    def alpha_shapes(self):
+        """For each cluster of points, calculate boundary shape
+        and add statistics (HImpTag etc.)
+
+        Updates results to self.alphashapes_and_meta
+        """
+        saturation_exclude_count = 0
+        # data always in lat/lng WGS1984
+        crs_wgs = pyproj.Proj(init='epsg:4326')
+        crs_proj = AlphaShapes._get_best_utmzone(
+            self.bound_points_shapely)
+
+        alphashapes_and_meta = self.alphashapes_and_meta
+        self.tnum = 0
+        if self.local_saturation_check:
+            # calculate total area of Top1 item for the
+            # saturation check of lower level items
+            clustered_post_guids = self.clustered_items.get(
+                self.topitem[0], None)
+            # print("Topitem: " + str(self.topitem[0]))
+            if clustered_post_guids is None:
+                sys.exit(f'Something went wrong: '
+                         f'No posts found for topitem: '
+                         f'{self.topitem[0]}')
+            __, topitem_area = AlphaShapes.get_cluster_shape(
+                self.topitem, clustered_post_guids, self.cleaned_post_dict,
+                crs_wgs, crs_proj, self.cluster_distance,
+                self.local_saturation_check)
+        for item in self.top_list:
+            self.tnum += 1
+            clustered_post_guids = self.clustered_items.get(item[0], None)
+            # Generate item cluster shapes
+            if clustered_post_guids:
+                result = AlphaShapes.get_cluster_shape(
+                    item, clustered_post_guids, self.cleaned_post_dict,
+                    crs_wgs, crs_proj, self.cluster_distance,
+                    self.local_saturation_check)
+                alphashapes_and_meta_tmp = result[0]
+                item_area = result[1]
+                if (self.local_saturation_check
+                        and not item_area == 0
+                        and not self.tnum == 1):
+                    local_saturation = item_area/(topitem_area/100)
+                    # print("Local saturation for item " + item[0]
+                    #       + ": " + str(round(local_saturation, 0)))
+                    if local_saturation > 60:
+                        # skip item entirely due to saturation
+                        # (its cluster area exceeds 60% of the total
+                        # area of the top item's clusters)
+                        # print("Skipped: " + item[0] + " due to saturation ("
+                        #       + str(round(local_saturation, 0)) + "%).")
+                        saturation_exclude_count += 1
+                        continue  # next item
+
+                if len(alphashapes_and_meta_tmp) > 0:
+                    alphashapes_and_meta.extend(
+                        alphashapes_and_meta_tmp)
+
+            non_clustered_guids = self.single_items.get(item[0], None)
+            if non_clustered_guids:
+                shapetype = "Single cluster"
+                # print("Single: " + str(len(non_clustered_guids)))
+                posts = [self.cleaned_post_dict[x]
+                         for x in non_clustered_guids]
+                for single_post in posts:
+                    # project lat/lng to UTM
+                    x, y = pyproj.transform(
+                        crs_wgs, crs_proj,
+                        single_post.lng, single_post.lat)
+                    pcoordinate = geometry.Point(x, y)
+                    # single dots are presented as buffers
+                    # with 0.5% of width-area
+                    result_polygon = pcoordinate.buffer(
+                        self.cluster_distance/4,
+                        resolution=3)
+                    if (result_polygon is None or
+                            result_polygon.is_empty):
+                        continue
+                    # append statistics for item with no cluster
+                    alphashapes_and_meta.append((
+                        result_polygon, 1,
+                        max(single_post.post_views_count,
+                            single_post.post_like_count),
+                        1, str(item[0]),
+                        item[1], 1, 1, 1, shapetype))
+        self.log.info(f'{len(alphashapes_and_meta)} '
+                      f'Alpha Shapes. Done.')
+        if saturation_exclude_count > 0:
+            self.log.info(f'Excluded {saturation_exclude_count} '
+                          f'Tags on local saturation check.')
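`write_results` (below) rescales each weight column to a 1-1000 range via min-max normalization, precomputing slope `mod_a` and intercept `mod_b` so that the minimum maps to 1 and the maximum to 1000. A standalone check with made-up values (not part of the patch):

```python
def minmax_1_1000(values):
    """Linear rescale: min(values) -> 1, max(values) -> 1000."""
    v_min, v_max = min(values), max(values)
    mod_a = (1000 - 1) / (v_max - v_min)   # slope
    mod_b = 1000 - mod_a * v_max           # intercept
    return [mod_a * v + mod_b for v in values]

weights = [2.5, 40.0, 17.3]  # made-up weightsv1 column
print([round(w, 1) for w in minmax_1_1000(weights)])
# [1.0, 1000.0, 395.3]
```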
+
+    def write_results(self):
+        """Write all results to output
+        """
+        ## Output Boundary Shapes in merged Shapefile ##
+        self.log.info("########## STEP 5 of 6: Writing Results to Shapefile ##########")
+
+        # Calculate best UTM Zone SRID/EPSG Code
+        # (true centroid; coords may be multipoint)
+        input_lon_center = self.bound_points_shapely.centroid.coords[0][0]
+        input_lat_center = self.bound_points_shapely.centroid.coords[0][1]
+        epsg_code = AlphaShapes._convert_wgs_to_utm(
+            input_lon_center, input_lat_center)
+        project = lambda x, y: pyproj.transform(
+            pyproj.Proj(init='epsg:4326'),
+            pyproj.Proj(init=f'epsg:{epsg_code}'), x, y)
+
+        # Define polygon feature geometry
+        schema = {
+            'geometry': 'Polygon',
+            'properties': {'Join_Count': 'int',
+                           'Views': 'int',
+                           'COUNT_User': 'int',
+                           'ImpTag': 'str',
+                           'TagCountG': 'int',
+                           'HImpTag': 'int',
+                           'Weights': 'float',
+                           'WeightsV2': 'float',
+                           'WeightsV3': 'float',
+                           #'shapetype': 'str',
+                           'emoji': 'int'},
+        }
+
+        # Normalization of values (1-1000 range), precalc step:
+        #######################################
+        # get the n'th column out for calculating the max/min
+        weightsv1_range = [x[6] for x in self.alphashapes_and_meta]
+        weightsv2_range = [x[7] for x in self.alphashapes_and_meta]
+        weightsv3_range = [x[8] for x in self.alphashapes_and_meta]
+        weightsv1_min = min(weightsv1_range)
+        weightsv1_max = max(weightsv1_range)
+        weightsv2_min = min(weightsv2_range)
+        weightsv2_max = max(weightsv2_range)
+        weightsv3_min = min(weightsv3_range)
+        weightsv3_max = max(weightsv3_range)
+        # precalc, see
+        # https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range
+        weightsv1_mod_a = (1000-1)/(weightsv1_max-weightsv1_min)
+        weightsv1_mod_b = 1000 - weightsv1_mod_a * weightsv1_max
+        weightsv2_mod_a = (1000-1)/(weightsv2_max-weightsv2_min)
+        weightsv2_mod_b = 1000 - weightsv2_mod_a * weightsv2_max
+        weightsv3_mod_a = (1000-1)/(weightsv3_max-weightsv3_min)
+        weightsv3_mod_b = 1000 - weightsv3_mod_a * weightsv3_max
+        #######################################
+        # Write a new Shapefile
+        # WGS1984
+        if not cfg.cluster_tags and cfg.cluster_emoji:
+            shapefileName = "allEmojiCluster"
         else:
-            print("--> No cluster.")
-            noClusterPhotos_perTag_DictOfLists[toptag[0]] = list(numpy_selectedPhotoList_Guids)
-        #for x in clusters:
-        #    #photolist = []
-        #    if x >= 0: # no clusters: x = -1
-        #        clusterPhotosList[x].append([selectedPhotoList[photo_num]])
-        #        #clusterPhotosArray_dict[x].add(selectedPhotoList[photo_num])
-        #    else:
-        #        noClusterPhotos_perTag_DictOfLists[toptag[0]].append(selectedPhotoList[photo_num])
-        #    photo_num+=1
-
-        #print("resultList: ")
-        #for s in clusterPhotosList[:2]:
-        #    print(*s)
-        #print(str(toptag) + " - Number of clusters: " + str(len(clusterPhotosList)) + " Photo num: " + str(photo_num))
-
-        #plt.autoscale(enable=True)
-
-        #if tnum == 50:
-        #    break
-        #plt.savefig('foo.png')
-        #sys.exit()
-    sys.stdout.flush()
-    log.info("########## STEP 4 of 6: Generating Alpha Shapes ##########")
-    #if (tnum % 50 == 0):#modulo: if division has no remainder, force update cmd output
-    #sys.stdout.flush()
-    #for each cluster of points, calculate boundary shape and add statistics (HImpTag etc.)
- listOfAlphashapesAndMeta = [] - tnum = 0 - if cfg.local_saturation_check: - #calculate total area of Top1-Tag for 80% saturation check for lower level tags - saturationExcludeCount = 0 - clusterPhotoGuidList = clustersPerTag.get(prepared_data.single_mostused_tag[0], None) - #print("Toptag: " + str(singleMostUsedtag[0])) - if clusterPhotoGuidList is None: - sys.exit(f'No Photos found for toptag: {singleMostUsedtag}') - toptagArea = Utils.generateClusterShape(toptag,clusterPhotoGuidList,cleaned_post_dict,crs_wgs,crs_proj,clusterTreeCuttingDist,cfg.local_saturation_check)[1] - for toptag in top_tags_list: - tnum += 1 - clusterPhotoGuidList = clustersPerTag.get(toptag[0], None) - #Generate tag Cluster Shapes - if clusterPhotoGuidList: - listOfAlphashapesAndMeta_tmp,tagArea = Utils.generateClusterShape(toptag,clusterPhotoGuidList,cleaned_post_dict,crs_wgs,crs_proj,clusterTreeCuttingDist,cfg.local_saturation_check) - if cfg.local_saturation_check and not tagArea == 0 and not tnum == 1: - localSaturation = tagArea/(toptagArea/100) - #print("Local Saturation for Tag " + toptag[0] + ": " + str(round(localSaturation,0))) - if localSaturation > 60: - #skip tag entirely due to saturation (if total area > 80% of total area of toptag clusters) - #print("Skipped: " + toptag[0] + " due to saturation (" + str(round(localSaturation,0)) + "%).") - saturationExcludeCount += 1 - continue #next toptag - - if len(listOfAlphashapesAndMeta_tmp) > 0: - listOfAlphashapesAndMeta.extend(listOfAlphashapesAndMeta_tmp) - - singlePhotoGuidList = noClusterPhotos_perTag_DictOfLists.get(toptag[0], None) - if singlePhotoGuidList: - shapetype = "Single cluster" - #print("Single: " + str(len(singlePhotoGuidList))) - photos = [cleaned_post_dict[x] for x in singlePhotoGuidList] - for single_photo in photos: - #project lat/lng to UTM - x, y = pyproj.transform(crs_wgs, crs_proj, single_photo.lng, single_photo.lat) - pcoordinate = geometry.Point(x, y) - result_polygon = pcoordinate.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area - #result_polygon = pcoordinate.buffer(min(distXLng,distYLat)/100,resolution=3) - if result_polygon is not None and not result_polygon.is_empty: - listOfAlphashapesAndMeta.append((result_polygon,1,max(single_photo.post_views_count,single_photo.post_like_count),1,str(toptag[0]),toptag[1],1,1,1,shapetype)) - log.info(f'{len(listOfAlphashapesAndMeta)} Alpha Shapes. 
Done.') - if cfg.local_saturation_check and not saturationExcludeCount == 0: - log.info(f'Excluded {saturationExcludeCount} Tags on local saturation check.') - ##Output Boundary Shapes in merged Shapefile## - log.info("########## STEP 5 of 6: Writing Results to Shapefile ##########") - - #Calculate best UTM Zone SRID/EPSG Code - input_lon_center = bound_points_shapely.centroid.coords[0][0] #True centroid (coords may be multipoint) - input_lat_center = bound_points_shapely.centroid.coords[0][1] - epsg_code = Utils.convert_wgs_to_utm(input_lon_center, input_lat_center) - project = lambda x, y: pyproj.transform(pyproj.Proj(init='epsg:4326'), pyproj.Proj(init='epsg:{0}'.format(epsg_code)), x, y) - - # Define polygon feature geometry - schema = { - 'geometry': 'Polygon', - 'properties': {'Join_Count': 'int', - 'Views': 'int', - 'COUNT_User': 'int', - 'ImpTag': 'str', - 'TagCountG': 'int', - 'HImpTag': 'int', - 'Weights': 'float', - 'WeightsV2': 'float', - 'WeightsV3': 'float', - #'shapetype': 'str', - 'emoji': 'int'}, - } - - #Normalization of Values (1-1000 Range), precalc Step: - ####################################### - weightsv1_range = [x[6] for x in listOfAlphashapesAndMeta] #get the n'th column out for calculating the max/min - weightsv2_range = [x[7] for x in listOfAlphashapesAndMeta] - weightsv3_range = [x[8] for x in listOfAlphashapesAndMeta] - weightsv1_min = min(weightsv1_range) - weightsv1_max = max(weightsv1_range) - weightsv2_min = min(weightsv2_range) - weightsv2_max = max(weightsv2_range) - weightsv3_min = min(weightsv3_range) - weightsv3_max = max(weightsv3_range) - #precalc - #https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range - weightsv1_mod_a = (1000-1)/(weightsv1_max-weightsv1_min) - weightsv1_mod_b = 1000 - weightsv1_mod_a * weightsv1_max - weightsv2_mod_a = (1000-1)/(weightsv2_max-weightsv2_min) - weightsv2_mod_b = 1000 - weightsv2_mod_a * weightsv2_max - weightsv3_mod_a = (1000-1)/(weightsv3_max-weightsv3_min) - weightsv3_mod_b = 1000 - weightsv3_mod_a * weightsv3_max - ####################################### - # Write a new Shapefile - # WGS1984 - if (cfg.cluster_tags == False and cfg.cluster_emoji == True): - shapefileName = "allEmojiCluster" - else: - shapefileName = "allTagCluster" - with fiona.open(f'02_Output/{shapefileName}.shp', mode='w', encoding='UTF-8', driver='ESRI Shapefile', schema=schema,crs=from_epsg(epsg_code)) as c: - # Normalize Weights to 0-1000 Range - idx = 0 - for alphaShapeAndMeta in listOfAlphashapesAndMeta: - if idx == 0: - HImP = 1 - else: - if listOfAlphashapesAndMeta[idx][4] != listOfAlphashapesAndMeta[idx-1][4]: + shapefileName = "allTagCluster" + with fiona.open(f'02_Output/{shapefileName}.shp', mode='w', encoding='UTF-8', driver='ESRI Shapefile', schema=schema,crs=from_epsg(epsg_code)) as c: + # Normalize Weights to 0-1000 Range + idx = 0 + for alphaShapeAndMeta in alphashapes_and_meta = list(): + if idx == 0: HImP = 1 else: - HImP = 0 - #emoName = unicode_name(alphaShapeAndMeta[4]) - #Calculate Normalized Weights Values based on precalc Step - if not alphaShapeAndMeta[6] == 1: - weight1_normalized = weightsv1_mod_a * alphaShapeAndMeta[6] + weightsv1_mod_b - else: - weight1_normalized = 1 - if not alphaShapeAndMeta[7] == 1: - weight2_normalized = weightsv2_mod_a * alphaShapeAndMeta[7] + weightsv2_mod_b - else: - weight2_normalized = 1 - if not alphaShapeAndMeta[8] == 1: - weight3_normalized = weightsv3_mod_a * alphaShapeAndMeta[8] + weightsv3_mod_b - else: - weight3_normalized = 1 - idx += 1 - #project data 
-                    #geom_proj = transform(project, alphaShapeAndMeta[0])
-                    #c.write({
-                    #    'geometry': geometry.mapping(geom_proj),
-                    if cfg.cluster_emoji and alphaShapeAndMeta[4] in prepared_data.top_emoji_list:
-                        emoji = 1
-                        ImpTagText = ""
-                    else:
-                        emoji = 0
-                        ImpTagText = f'{alphaShapeAndMeta[4]}'
-                    c.write({
-                        'geometry': geometry.mapping(alphaShapeAndMeta[0]),
-                        'properties': {'Join_Count': alphaShapeAndMeta[1],
-                                       'Views': alphaShapeAndMeta[2],
-                                       'COUNT_User': alphaShapeAndMeta[3],
-                                       'ImpTag': ImpTagText,
-                                       'TagCountG': alphaShapeAndMeta[5],
-                                       'HImpTag': HImP,
-                                       'Weights': weight1_normalized,
-                                       'WeightsV2': weight2_normalized,
-                                       'WeightsV3': weight3_normalized,
-                                       #'shapetype': alphaShapeAndMeta[9],
-                                       'emoji': emoji},
-                        })
-    if cfg.cluster_emoji:
-        with open("02_Output/emojiTable.csv", "w", encoding='utf-8') as emojiTable:
-            emojiTable.write("FID,Emoji\n")
-            idx = 0
-            for alphaShapeAndMeta in listOfAlphashapesAndMeta:
-                if alphaShapeAndMeta[4] in prepared_data.top_emoji_list:
-                    ImpTagText = f'{alphaShapeAndMeta[4]}'
-                else:
-                    ImpTagText = ""
-                emojiTable.write(f'{idx},{ImpTagText}\n')
-                idx += 1
+            shapefileName = "allTagCluster"
+        with fiona.open(f'02_Output/{shapefileName}.shp', mode='w',
+                        encoding='UTF-8', driver='ESRI Shapefile',
+                        schema=schema, crs=from_epsg(epsg_code)) as c:
+            # Normalize Weights to 0-1000 Range
+            idx = 0
+            for alphaShapeAndMeta in self.alphashapes_and_meta:
+                if idx == 0:
+                    HImP = 1
+                else:
+                    if self.alphashapes_and_meta[idx][4] != self.alphashapes_and_meta[idx-1][4]:
+                        HImP = 1
+                    else:
+                        HImP = 0
+                #emoName = unicode_name(alphaShapeAndMeta[4])
+                # Calculate normalized weight values based on precalc step
+                if not alphaShapeAndMeta[6] == 1:
+                    weight1_normalized = weightsv1_mod_a * alphaShapeAndMeta[6] + weightsv1_mod_b
+                else:
+                    weight1_normalized = 1
+                if not alphaShapeAndMeta[7] == 1:
+                    weight2_normalized = weightsv2_mod_a * alphaShapeAndMeta[7] + weightsv2_mod_b
+                else:
+                    weight2_normalized = 1
+                if not alphaShapeAndMeta[8] == 1:
+                    weight3_normalized = weightsv3_mod_a * alphaShapeAndMeta[8] + weightsv3_mod_b
+                else:
+                    weight3_normalized = 1
+                idx += 1
+                # project data
+                #geom_proj = transform(project, alphaShapeAndMeta[0])
+                #c.write({
+                #    'geometry': geometry.mapping(geom_proj),
+                if cfg.cluster_emoji and alphaShapeAndMeta[4] in prepared_data.top_emoji_list:
+                    emoji = 1
+                    ImpTagText = ""
+                else:
+                    emoji = 0
+                    ImpTagText = f'{alphaShapeAndMeta[4]}'
+                c.write({
+                    'geometry': geometry.mapping(alphaShapeAndMeta[0]),
+                    'properties': {'Join_Count': alphaShapeAndMeta[1],
+                                   'Views': alphaShapeAndMeta[2],
+                                   'COUNT_User': alphaShapeAndMeta[3],
+                                   'ImpTag': ImpTagText,
+                                   'TagCountG': alphaShapeAndMeta[5],
+                                   'HImpTag': HImP,
+                                   'Weights': weight1_normalized,
+                                   'WeightsV2': weight2_normalized,
+                                   'WeightsV3': weight3_normalized,
+                                   #'shapetype': alphaShapeAndMeta[9],
+                                   'emoji': emoji},
+                    })
+        if cfg.cluster_emoji:
+            with open("02_Output/emojiTable.csv", "w",
+                      encoding='utf-8') as emojiTable:
+                emojiTable.write("FID,Emoji\n")
+                idx = 0
+                for alphaShapeAndMeta in self.alphashapes_and_meta:
+                    if alphaShapeAndMeta[4] in prepared_data.top_emoji_list:
+                        ImpTagText = f'{alphaShapeAndMeta[4]}'
+                    else:
+                        ImpTagText = ""
+                    emojiTable.write(f'{idx},{ImpTagText}\n')
+                    idx += 1
\ No newline at end of file
diff --git a/tagmaps/classes/load_data.py b/tagmaps/classes/load_data.py
index f532045..4b1ee20 100644
--- a/tagmaps/classes/load_data.py
+++ b/tagmaps/classes/load_data.py
@@ -109,7 +109,9 @@ def _process_inputfile(self, file_handle):
     def _parse_postlist(self, post_reader: TextIO):
         """Process posts according to specifications"""
+        # row_num = 0
         for post in post_reader:
+            # row_num += 1
             lbsn_post = self._parse_post(post)
             if lbsn_post is None:
                 continue
@@ -123,8 +125,10 @@ def _parse_postlist(self, post_reader: TextIO):
             f'Skipped posts: {self.stats.skipped_count} - skipped tags: '
             f'{self.stats.count_tags_skipped} of '
f'{self.stats.count_tags_global}') + # if (row_num % 10 == 0): + # modulo: print only once every 10 iterations print(msg, end='\r') - # log last message to file, clean last stdout + # log last message to file, clean stdout print(" " * len(msg), end='\n') sys.stdout.flush() self.log.info(msg) diff --git a/tagmaps/classes/utils.py b/tagmaps/classes/utils.py index c4f3ab4..0176d5a 100644 --- a/tagmaps/classes/utils.py +++ b/tagmaps/classes/utils.py @@ -18,11 +18,11 @@ import hashlib import io import logging -import fiona #Fiona needed for reading Shapefile +import fiona # Fiona needed for reading Shapefile from fiona.crs import from_epsg import shapely.geometry as geometry -import pyproj #import Proj, transform -#https://gis.stackexchange.com/questions/127427/transforming-shapely-polygon-and-multipolygon-objects +import pyproj # import Proj, transform +# https://gis.stackexchange.com/questions/127427/transforming-shapely-polygon-and-multipolygon-objects from shapely.ops import transform, cascaded_union, polygonize #from shapely.geometry import Polygon #from shapely.geometry import shape @@ -34,6 +34,7 @@ from tagmaps.config.config import BaseConfig from tagmaps.classes.shared_structure import CleanedPost + class Utils(): """Collection of various tools and helper functions @@ -79,7 +80,7 @@ def is_number(s): @classmethod def init_main(cls): """Initializing main procedure if package is executed directly""" - + # set console view parameters os.system('mode con: cols=197 lines=40') # initialize logger @@ -95,12 +96,13 @@ def set_logger(cls): """ Set logging handler manually, so we can also print to console while logging to file """ - + cls.init_output_dir() __log_file = "02_Output/log.txt" - + # Set Output to Replace in case of encoding issues (console/windows) - sys.stdout = io.TextIOWrapper(sys.stdout.detach(), sys.stdout.encoding, 'replace') + sys.stdout = io.TextIOWrapper( + sys.stdout.detach(), sys.stdout.encoding, 'replace') # Create logger with specific name log = logging.getLogger("tagmaps") log.format = '%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s' @@ -120,7 +122,7 @@ def init_output_dir(): if not os.path.exists(pathname + '/02_Output/'): os.makedirs(pathname + '/02_Output/') print("Folder /02_Output was created") - + @staticmethod def query_yes_no(question, default="yes"): """Ask a yes/no question via raw_input() and return their answer. @@ -153,9 +155,10 @@ def query_yes_no(question, default="yes"): else: sys.stdout.write("'yes' or 'no' " "(or 'y' or 'n').\n") + @staticmethod def daterange(start_date, end_date): - for n in range(int ((end_date - start_date).days)): + for n in range(int((end_date - start_date).days)): yield start_date + timedelta(n) @staticmethod @@ -172,7 +175,7 @@ def haversine(lon1, lat1, lon2, lat2): a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 c = 2 * asin(sqrt(a)) # Radius of earth in kilometers is 6371 - km = 6371* c + km = 6371 * c m = km*1000 return m @@ -182,29 +185,29 @@ def get_radians_from_meters(dist): degreesDist = dist/111.325 radiansDist = degreesDist/57.2958 return radiansDist - #https://www.mathsisfun.com/geometry/radians.html - #1 Radian is about 57.2958 degrees. - #then see https://sciencing.com/convert-distances-degrees-meters-7858322.html - #Multiply the number of degrees by 111.325 - #To convert this to meters, multiply by 1,000. So, 2 degrees is 222,65 meters. + # https://www.mathsisfun.com/geometry/radians.html + # 1 Radian is about 57.2958 degrees. 
+ # then see https://sciencing.com/convert-distances-degrees-meters-7858322.html + # Multiply the number of degrees by 111.325 + # To convert this to meters, multiply by 1,000. So, 2 degrees is 222,65 meters. @staticmethod def get_meters_from_radians(dist): dist = dist * 57.2958 dist = dist * 111.325 - metersDist = round(dist * 1000,1) + metersDist = round(dist * 1000, 1) return metersDist - #1 Radian is about 57.2958 degrees. - #then see https://sciencing.com/convert-distances-degrees-meters-7858322.html - #Multiply the number of degrees by 111.325 - #To convert this to meters, multiply by 1,000. So, 2 degrees is 222,65 meters. - #plt.close('all') #clear memory + # 1 Radian is about 57.2958 degrees. + # then see https://sciencing.com/convert-distances-degrees-meters-7858322.html + # Multiply the number of degrees by 111.325 + # To convert this to meters, multiply by 1,000. So, 2 degrees is 222,65 meters. + # plt.close('all') #clear memory @staticmethod def checkEmojiType(strEmo): """Is this function really needed, makes no difference! (really?)""" - if unicodedata.name(strEmo).startswith(("EMOJI MODIFIER","VARIATION SELECTOR","ZERO WIDTH")): + if unicodedata.name(strEmo).startswith(("EMOJI MODIFIER", "VARIATION SELECTOR", "ZERO WIDTH")): return False return True @@ -220,28 +223,28 @@ def extract_emoji(str): Utils.checkEmojiType(c) is True) return emoji_list - #see https://stackoverflow.com/questions/43852668/using-collections-counter-to-count-emojis-with-different-colors + # see https://stackoverflow.com/questions/43852668/using-collections-counter-to-count-emojis-with-different-colors # we want to ignore fitzpatrick modifiers and treat all differently colored emojis the same - #https://stackoverflow.com/questions/38100329/some-emojis-e-g-have-two-unicode-u-u2601-and-u-u2601-ufe0f-what-does - #COOKING - #OK HAND SIGN - #EMOJI MODIFIER FITZPATRICK TYPE-1-2 - #GRINNING FACE WITH SMILING EYES - #HEAVY BLACK HEART - #WEARY CAT FACE - #SMILING FACE WITH HEART-SHAPED EYES - #OK HAND SIGN - #EMOJI MODIFIER FITZPATRICK TYPE-1-2 - #GRINNING FACE WITH SMILING EYES - #PERSON WITH FOLDED HANDS - #EMOJI MODIFIER FITZPATRICK TYPE-3 - #WEARY CAT FACE - - ##Emojitest + # https://stackoverflow.com/questions/38100329/some-emojis-e-g-have-two-unicode-u-u2601-and-u-u2601-ufe0f-what-does + # COOKING + # OK HAND SIGN + # EMOJI MODIFIER FITZPATRICK TYPE-1-2 + # GRINNING FACE WITH SMILING EYES + # HEAVY BLACK HEART + # WEARY CAT FACE + # SMILING FACE WITH HEART-SHAPED EYES + # OK HAND SIGN + # EMOJI MODIFIER FITZPATRICK TYPE-1-2 + # GRINNING FACE WITH SMILING EYES + # PERSON WITH FOLDED HANDS + # EMOJI MODIFIER FITZPATRICK TYPE-3 + # WEARY CAT FACE + + # Emojitest #n = '❤️👨‍⚕️' - ##n = '👨‍⚕️' #medical emoji with zero-width joiner (http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html) + # n = '👨‍⚕️' #medical emoji with zero-width joiner (http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html) #nlist = def_functions.extract_emojis(n) - #with open("emojifile.txt", "w", encoding='utf-8') as emojifile: + # with open("emojifile.txt", "w", encoding='utf-8') as emojifile: # emojifile.write("Original: " + n + '\n') # for xstr in nlist: # emojifile.write('Emoji Extract: U+%04x' % ord(xstr) + '\n') @@ -249,17 +252,16 @@ def extract_emoji(str): # for _c in n: # emojifile.write(str(unicode_name(_c)) + '\n') # emojifile.write('Each Codepoint: U+%04x' % ord(_c) + '\n') - #def cleanEmoji(c): + # def cleanEmoji(c): # tuple = (u'\ufeff',u'\u200b',u'\u200d') # for ex in tuple: # c.replace(ex,"") # return(c) - 
#https://github.com/carpedm20/emoji/ - #https://github.com/carpedm20/emoji/issues/75 + # https://github.com/carpedm20/emoji/ + # https://github.com/carpedm20/emoji/issues/75 - - #this class is needed to override tkinter window with drag&drop option when overrideredirect = true - #class App: + # this class is needed to override tkinter window with drag&drop option when overrideredirect = true + # class App: # global tk # def __init__(self): # self.root = tk.Tk() @@ -280,10 +282,9 @@ def extract_emoji(str): # self.root._offsetx = event.x # self.root._offsety = event.y - #tc unicode problem - #https://stackoverflow.com/questions/40222971/python-find-equivalent-surrogate-pair-from-non-bmp-unicode-char + # tc unicode problem + # https://stackoverflow.com/questions/40222971/python-find-equivalent-surrogate-pair-from-non-bmp-unicode-char - def _surrogatepair(match): char = match.group() assert ord(char) > 0xffff @@ -296,18 +297,7 @@ def with_surrogates(text): _nonbmp = re.compile(r'[\U00010000-\U0010FFFF]') return _nonbmp.sub(_surrogatepair, text) - #https://stackoverflow.com/questions/40132542/get-a-cartesian-projection-accurate-around-a-lat-lng-pair - def convert_wgs_to_utm(lon, lat): - utm_band = str((math.floor((lon + 180) / 6 ) % 60) + 1) - if len(utm_band) == 1: - utm_band = '0'+utm_band - if lat >= 0: - epsg_code = '326' + utm_band - else: - epsg_code = '327' + utm_band - return epsg_code - - #def str2bool(v): + # def str2bool(v): # if v.lower() in ('yes', 'true', 't', 'y', '1'): # return True # elif v.lower() in ('no', 'false', 'f', 'n', '0'): @@ -315,112 +305,8 @@ def convert_wgs_to_utm(lon, lat): # else: # raise argparse.ArgumentTypeError('Boolean value expected.') - def generateClusterShape(toptag,clusterPhotoGuidList,cleanedPhotoDict,crs_wgs,crs_proj,clusterTreeCuttingDist,localSaturationCheck): - #we define a new list of Temp Alpha Shapes outside the loop, so that it is not overwritten each time - listOfAlphashapesAndMeta_tmp = [] - #points = [] - tagArea = 0 - for photo_guids in clusterPhotoGuidList: - #for each cluster for this toptag - photos = [cleanedPhotoDict[x] for x in photo_guids] - photoCount = len(photo_guids) - uniqueUserCount = len(set([photo.user_guid for photo in photos])) - sumViews = sum([photo.post_views_count for photo in photos]) - #calculate different weighting formulas - #weightsv1 = 1+ photoCount *(sqrt(1/( photoCount / uniqueUserCount )**3)) #-> Standard weighting formula (x**y means x raised to the power y); +1 to UserCount: prevent 1-2 Range from being misaligned - #weightsv2 = 1+ photoCount *(sqrt(1/( photoCount / uniqueUserCount )**2)) - weightsv1 = photoCount *(sqrt(1/( photoCount / (uniqueUserCount+1) )**3)) #-> Standard weighting formula (x**y means x raised to the power y); +1 to UserCount: prevent 1-2 Range from being misaligned - weightsv2 = photoCount *(sqrt(1/( photoCount / (uniqueUserCount+1) )**2)) #-> less importance on User_Count in correlation to photo count [Join_Count]; +1 to UserCount: prevent 1-2 Range from being misaligned - weightsv3 = sqrt((photoCount+(2*sqrt(photoCount)))*2) #-> Ignores User_Count, this will emphasize individual and very active users - #points = [geometry.Point(photo.lng, photo.lat) - # for photo in photos] - #instead of lat/lng for each photo, we use photo_locID to identify a list of distinct locations - distinctLocations = set([photo.loc_id - for photo in photos]) - #simple list comprehension without projection: - #points = [geometry.Point(Decimal(location.split(':')[1]), Decimal(location.split(':')[0])) - # for 
location in distinctLocations] - points = [geometry.Point(pyproj.transform(crs_wgs, crs_proj, Decimal(location.split(':')[1]), Decimal(location.split(':')[0]))) - for location in distinctLocations] - point_collection = geometry.MultiPoint(list(points)) - result_polygon = None - - if len(points) >= 5: - if len(points) < 10: - result_polygon = point_collection.convex_hull #convex hull - result_polygon = result_polygon.buffer(clusterTreeCuttingDist/4,resolution=3) - shapetype = "between 5 and 10 points_convexHull" - #result_polygon = result_polygon.buffer(min(distXLng,distYLat)/100,resolution=3) - else: - if len(points) > 500: - startalpha = 1000000 - elif len(points) > 200: - startalpha = 10000 - else: - startalpha = 9000 - result_polygon = Utils.alpha_shape(points,alpha=clusterTreeCuttingDist/startalpha) #concave hull/alpha shape /50000 - shapetype = "Initial Alpha Shape + Buffer" - if type(result_polygon) is geometry.multipolygon.MultiPolygon or isinstance(result_polygon, bool): - #repeat generating alpha shapes with smaller alpha value if Multigon is generated - #smaller alpha values mean less granularity of resulting polygon - #but too large alpha may result in empty polygon - #(this branch is sometimes executed for larger scales) - for i in range(1,6): - #try decreasing alpha - alpha = startalpha + (startalpha * (i**i)) #** means cube - result_polygon = Utils.alpha_shape(points,alpha=clusterTreeCuttingDist/alpha)#/100000 - if not type(result_polygon) is geometry.multipolygon.MultiPolygon and not isinstance(result_polygon, bool): - shapetype = "Multipolygon Alpha Shape /" + str(alpha) - break - if type(result_polygon) is geometry.multipolygon.MultiPolygon or isinstance(result_polygon, bool): - #try increasing alpha - for i in range(1,6): - #try decreasing alpha - alpha = startalpha / (i*i) - result_polygon = Utils.alpha_shape(points,alpha=clusterTreeCuttingDist/alpha)#/100000 - if not type(result_polygon) is geometry.multipolygon.MultiPolygon and not isinstance(result_polygon, bool): - shapetype = "Multipolygon Alpha Shape /" + str(alpha) - break - if type(result_polygon) is geometry.multipolygon.MultiPolygon: - shapetype = "Multipolygon Alpha Shape -> Convex Hull" - #if still of type multipolygon, try to remove holes and do a convex_hull - result_polygon = result_polygon.convex_hull - #OR: in case there was a problem with generating alpha shapes (circum_r = a*b*c/(4.0*area) --> ZeroDivisionError: float division by zero) - #this branch is rarely executed for large point clusters where alpha is perhaps set too small - elif isinstance(result_polygon, bool) or result_polygon.is_empty: - shapetype = "BoolAlpha -> Fallback to PointCloud Convex Hull" - result_polygon = point_collection.convex_hull #convex hull - #Finally do a buffer to smooth alpha - result_polygon = result_polygon.buffer(clusterTreeCuttingDist/4,resolution=3) - #result_polygon = result_polygon.buffer(min(distXLng,distYLat)/100,resolution=3) - elif 2 <= len(points) < 5: - shapetype = "between 2 and 5 points_buffer" - #calc distance between points http://www.mathwarehouse.com/algebra/distance_formula/index.php - #bdist = math.sqrt((points[0].coords.xy[0][0]-points[1].coords.xy[0][0])**2 + (points[0].coords.xy[1][0]-points[1].coords.xy[1][0])**2) - #print(str(bdist)) - result_polygon = point_collection.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area - result_polygon = result_polygon.convex_hull - #result_polygon = 
point_collection.buffer(min(distXLng,distYLat)/100,resolution=3) #single dots are presented as buffer with 0.5% of width-area - elif len(points)==1 or type(result_polygon) is geometry.point.Point or result_polygon is None: - shapetype = "1 point cluster" - result_polygon = point_collection.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area - #result_polygon = point_collection.buffer(min(distXLng,distYLat)/100,resolution=3) #single dots are presented as buffer with 0.5% of width-area - #final check for multipolygon - if type(result_polygon) is geometry.multipolygon.MultiPolygon: - #usually not executed - result_polygon = result_polygon.convex_hull - #Geom, Join_Count, Views, COUNT_User,ImpTag,TagCountG,HImpTag - if result_polygon is not None and not result_polygon.is_empty: - if localSaturationCheck: - tagArea += result_polygon.area - listOfAlphashapesAndMeta_tmp.append((result_polygon,photoCount,sumViews,uniqueUserCount,toptag[0],toptag[1],weightsv1,weightsv2,weightsv3,shapetype)) - if len(listOfAlphashapesAndMeta_tmp) > 0: - # finally sort and append all cluster shapes for this tag - listOfAlphashapesAndMeta_tmp = sorted(listOfAlphashapesAndMeta_tmp,key=lambda x: -x[6]) - return listOfAlphashapesAndMeta_tmp, tagArea - def plot_polygon(polygon): - fig = plt.figure(figsize=(10,10)) + fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(111) margin = .3 x_min, y_min, x_max, y_max = polygon.bounds @@ -432,72 +318,6 @@ def plot_polygon(polygon): ax.add_patch(patch) return fig - def alpha_shape(points, alpha): - """ - Alpha Shapes Code by KEVIN DWYER - see http://blog.thehumangeo.com/2014/05/12/drawing-boundaries-in-python/ - Compute the alpha shape (concave hull) of a set - of points. - @param points: Iterable container of points. - @param alpha: alpha value to influence the - gooeyness of the border. Smaller numbers - don't fall inward as much as larger numbers. - Too large, and you lose everything! - """ - if len(points) < 4: - # When you have a triangle, there is no sense - # in computing an alpha shape. - return geometry.MultiPoint(list(points)).convex_hull - def add_edge(edges, edge_points, coords, i, j): - """ - Add a line between the i-th and j-th points, - if not in the list already - """ - if (i, j) in edges or (j, i) in edges: - # already added - return - edges.add( (i, j) ) - edge_points.append(coords[ [i, j] ]) - coords = np.array([point.coords[0] - for point in points]) - - #print(str(len(coords))) - tri = Delaunay(coords)#,qhull_o}ptions = 'QJ') #To avoid this error, you can joggle the data by specifying the 'QJ' option to the DELAUNAY function. https://de.mathworks.com/matlabcentral/answers/94438-why-does-the-delaunay-function-in-matlab-7-0-r14-produce-an-error-when-passed-colinear-points?s_tid=gn_loc_drop - #tri = Delaunay(coords,{'QJ'}) #Version 3.1 added triangulated output ('Qt'). It should be used for Delaunay triangulations instead of using joggled input ('QJ'). 
- edges = set() - edge_points = [] - # loop over triangles: - # ia, ib, ic = indices of corner points of the - # triangle - for ia, ib, ic in tri.vertices: - pa = coords[ia] - pb = coords[ib] - pc = coords[ic] - # Lengths of sides of triangle - a = math.sqrt((pa[0]-pb[0])**2 + (pa[1]-pb[1])**2) - b = math.sqrt((pb[0]-pc[0])**2 + (pb[1]-pc[1])**2) - c = math.sqrt((pc[0]-pa[0])**2 + (pc[1]-pa[1])**2) - # Semiperimeter of triangle - s = (a + b + c)/2.0 - # Area of triangle by Heron's formula - try: - area = math.sqrt(s*(s-a)*(s-b)*(s-c)) - except ValueError: - return False - if area == 0: - return False - circum_r = a*b*c/(4.0*area) - # Here's the radius filter. - #print circum_r - if circum_r < 1.0/alpha: - add_edge(edges, edge_points, coords, ia, ib) - add_edge(edges, edge_points, coords, ib, ic) - add_edge(edges, edge_points, coords, ic, ia) - m = geometry.MultiLineString(edge_points) - triangles = list(polygonize(m)) - return cascaded_union(triangles)#, edge_points - #return geometry.polygon.asPolygon(edge_points,holes=None) - def fit_cluster(clusterer, data): clusterer.fit(data) return clusterer @@ -510,14 +330,14 @@ def get_rectangle_bounds(points): limXMax = np.max(points.T[0]) return limYMin, limYMax, limXMin, limXMax - def filterTags(taglist,SortOutAlways_set,SortOutAlways_inStr_set): + def filterTags(taglist, SortOutAlways_set, SortOutAlways_inStr_set): count_tags = 0 count_skipped = 0 - #Filter tags based on two stoplists + # Filter tags based on two stoplists photo_tags_filtered = set() for tag in taglist: count_tags += 1 - #exclude numbers and those tags that are in SortOutAlways_set + # exclude numbers and those tags that are in SortOutAlways_set if len(tag) == 1 or tag == '""' or tag.isdigit() or tag in SortOutAlways_set: count_skipped += 1 continue @@ -527,7 +347,4 @@ def filterTags(taglist,SortOutAlways_set,SortOutAlways_inStr_set): break else: photo_tags_filtered.add(tag) - return photo_tags_filtered, count_tags,count_skipped - - - + return photo_tags_filtered, count_tags, count_skipped
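To close, a hedged usage sketch for the reworked filterTags (not part of the patch; it assumes the tagmaps package is importable and uses made-up tag values). Note that tags dropped by the substring stoplist are excluded from the result but not counted in count_skipped:

```python
from tagmaps.classes.utils import Utils

taglist = ['park', 'instagood', '2019', 'dresdencity', 'elbe']
filtered, count_tags, count_skipped = Utils.filterTags(
    taglist, {'instagood'}, {'city'})
print(sorted(filtered), count_tags, count_skipped)
# ['elbe', 'park'] 5 2  ('dresdencity' is dropped but not counted)
```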