From e015f543386247977d9ce248dde82e40bf643f05 Mon Sep 17 00:00:00 2001 From: AD Date: Tue, 15 Jan 2019 16:15:58 +0100 Subject: [PATCH] intermediate refactor: cluster preview and map preview works TODO: process_data refactor --- tagmaps/__main__.py | 6 ++---- tagmaps/classes/cluster.py | 32 ++++++++++++++++++++------------ tagmaps/classes/interface.py | 22 ++++++++++++---------- tagmaps/classes/load_data.py | 26 +++++++++++++++----------- 4 files changed, 49 insertions(+), 37 deletions(-) diff --git a/tagmaps/__main__.py b/tagmaps/__main__.py index 5aaa87f..83d4fd2 100644 --- a/tagmaps/__main__.py +++ b/tagmaps/__main__.py @@ -84,7 +84,7 @@ def main(): log.info(lbsn_data.bounds.get_bound_report()) if (cfg.cluster_tags or cfg.cluster_emoji): - log.info("########## STEP 2 of 6: Tag Ranking ##########") + log.info("\n########## STEP 2 of 6: Tag Ranking ##########") prepared_data = lbsn_data.get_prepared_data() @@ -97,14 +97,12 @@ def main(): f'Total emoji count for cleaned (tmax) emoji list ' f'(Top {prepared_data.emax}): {prepared_data.total_emoji_count}.') - global top_tags_list top_tags_list = prepared_data.top_tags_list if cfg.statistics_only is False: # restart time monitoring for actual cluster step now = time.time() log.info( - "########## STEP 3 of 6: Tag Location Clustering ##########") - sys.stdout.flush() + "\n########## STEP 3 of 6: Tag Location Clustering ##########") cluster_tag_data = ClusterGen( lbsn_data.bounds, diff --git a/tagmaps/classes/cluster.py b/tagmaps/classes/cluster.py index 87019a8..1f17212 100644 --- a/tagmaps/classes/cluster.py +++ b/tagmaps/classes/cluster.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -Module for tag maps clustering functions +Module for tag maps clustering methods """ import warnings @@ -80,7 +80,7 @@ def _select_postguids(self, tag: str) -> Tuple[List[str], int]: distinct_localloc_count.add(cleaned_photo_location.loc_id) return selected_postguids_list, len(distinct_localloc_count) - def _getselect_postguids(self, tag: str, silent: bool = True): + def _getselect_postguids(self, tag: str, silent: bool = True) -> List[str]: """Get list of post guids with specific tag Args: @@ -105,19 +105,26 @@ def _getselect_postguids(self, tag: str, silent: bool = True): f'of total distinct locations in area)', end=" ") return selected_postguids_list - def _getselect_posts(self, selected_postguids_list): + def _getselect_posts(self, + selected_postguids_list: List[str] + ) -> List[CleanedPost]: selected_posts_list = [self.cleaned_post_dict[x] for x in selected_postguids_list] return selected_posts_list - def _get_np_points(self, tag: str = None, silent: bool = None): + def _get_np_points(self, + tag: str = None, + silent: bool = None + ) -> np.ndarray: """Gets numpy array of selected points from tags with latlng - toptag ([type], optional): Defaults to None. [description] - silent ([type], optional): Defaults to None. [description] + Args: + tag: tag to select posts + silent: if true, no console output (interface mode) Returns: - [type]: [description] + points: A list of lat/lng points to map + selected_postguids_list: List of selected post guids """ # no log reporting for selected points if silent is None: @@ -138,11 +145,12 @@ def _get_np_points(self, tag: str = None, silent: bool = None): # (limit by list of column-names) points = df.as_matrix(['lng', 'lat']) # only return preview fig without clustering - return points, selected_postguids_list + return points - def cluster_points(self, points, cluster_distance, - selected_postguids_list, - min_span_tree, preview_mode): + def cluster_points(self, points, + cluster_distance: float, + min_span_tree: bool = False, + preview_mode: bool = False): # cluster data # conversion to radians for HDBSCAN # (does not support decimal degrees) @@ -151,7 +159,7 @@ def cluster_points(self, points, cluster_distance, # (descending), calculate HDBSCAN Clusters # min_cluster_size default - 5% optimum: min_cluster_size = max( - 2, int(((len(selected_postguids_list))/100)*5)) + 2, int(((len(points))/100)*5)) self.clusterer = hdbscan.HDBSCAN( min_cluster_size=min_cluster_size, gen_min_span_tree=min_span_tree, diff --git a/tagmaps/classes/interface.py b/tagmaps/classes/interface.py index ddea25a..07a7f66 100644 --- a/tagmaps/classes/interface.py +++ b/tagmaps/classes/interface.py @@ -22,7 +22,6 @@ from tagmaps.classes.shared_structure import CleanedPost from tagmaps.classes.cluster import ClusterGen -plt.ion() # enable interactive mode for pyplot (not necessary?!) plt.ion() # label_size = 10 @@ -202,12 +201,10 @@ def _cluster_preview(self, sel_tag: Tuple[str, int]): # tkinter.messagebox.showinfo("Num of clusters: ", # str(len(sel_colors)) + " " + str(sel_colors[1])) # output/update matplotlib figures - (points, - selected_postguids_list) = self._clst._get_np_points( - sel_tag[0], silent=True) + points = self._clst._get_np_points(tag=sel_tag[0], silent=True) self._clst.cluster_points( - points, self.cluster_distance, - selected_postguids_list, self.create_min_spanning_tree, + points=points, + cluster_distance=self.cluster_distance, preview_mode=True) mask_noisy = self._clst.mask_noisy number_of_clusters = self._clst.number_of_clusters @@ -413,12 +410,12 @@ def _cluster_preview(self, sel_tag: Tuple[str, int]): from_=(self.cluster_distance/100), to=(self.cluster_distance*2)) - def _selection_preview(self, sel_tag): + def _selection_preview(self, sel_tag: Tuple[str, int]): """Update preview map based on tag selection""" # tkinter.messagebox.showinfo("Proceed", f'{sel_tag}') - points, __ = self._clst._get_np_points( - sel_tag[0], - silent=False) + points = self._clst._get_np_points( + tag=sel_tag[0], + silent=True) if self.fig1: plt.figure(1).clf() # clear figure 1 # earth = Basemap() @@ -556,6 +553,11 @@ def _cluster_current_display_tag(self): self._cluster_preview(self._clst.top_tags_list[0]) def _scaletest_current_display_tag(self): + if self.create_min_spanning_tree is False: + tkinter.messagebox.showinfo( + "Skip: ", + f'Currently deactivated') + return if self.current_display_tag: sel_tag = self.current_display_tag else: diff --git a/tagmaps/classes/load_data.py b/tagmaps/classes/load_data.py index 583921c..f543ce7 100644 --- a/tagmaps/classes/load_data.py +++ b/tagmaps/classes/load_data.py @@ -92,7 +92,7 @@ def _process_inputfile(self, file_handle): Output: produces a list of post that can be parsed """ - post_list = [] # needed? + post_list = list() # needed? if self.cfg.source_map.file_extension == "csv": post_list = csv.reader( file_handle, @@ -102,9 +102,9 @@ def _process_inputfile(self, file_handle): next(post_list, None) # skip headerline elif self.cfg.source_map.file_extension == "json": post_list = post_list + json.loads(file_handle.read()) - self._parse_postlist(post_list) + result_msg = self._parse_postlist(post_list) - def _parse_postlist(self, post_list): + def _parse_postlist(self, post_list: TextIO): """Process posts according to specifications""" for post in post_list: # skip duplicates and erroneous entries @@ -119,15 +119,19 @@ def _parse_postlist(self, post_list): continue self._merge_posts(lbsn_post) # status report - msg = \ - f'Cleaned output to {len(self.distinct_locations_set):02d} ' \ - f'distinct locations from ' \ - f'{self.stats.count_glob:02d} posts ' \ - f'(File {self.stats.partcount} of {len(self.filelist)}) - ' \ - f'Skipped posts: {self.stats.skipped_count} - skipped tags: ' \ - f'{self.stats.count_tags_skipped} of ' \ - f'{self.stats.count_tags_global}' + msg = ( + f'Cleaned output to {len(self.distinct_locations_set):02d} ' + f'distinct locations from ' + f'{self.stats.count_glob:02d} posts ' + f'(File {self.stats.partcount} of {len(self.filelist)}) - ' + f'Skipped posts: {self.stats.skipped_count} - skipped tags: ' + f'{self.stats.count_tags_skipped} of ' + f'{self.stats.count_tags_global}') print(msg, end='\r') + # log last message to file, clean last stdout + print(" " * len(msg), end='\n') + sys.stdout.flush() + self.log.info(msg) def _merge_posts(self, lbsn_post): """Method will union all tags of a single user for each location