From e015f543386247977d9ce248dde82e40bf643f05 Mon Sep 17 00:00:00 2001
From: AD <alexander.dunkel@tu-dresden.de>
Date: Tue, 15 Jan 2019 16:15:58 +0100
Subject: [PATCH] intermediate refactor: cluster preview and map preview work

TODO: process_data refactor
---
 tagmaps/__main__.py          |  6 ++----
 tagmaps/classes/cluster.py   | 32 ++++++++++++++++++++------------
 tagmaps/classes/interface.py | 22 ++++++++++++----------
 tagmaps/classes/load_data.py | 26 +++++++++++++++-----------
 4 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/tagmaps/__main__.py b/tagmaps/__main__.py
index 5aaa87f..83d4fd2 100644
--- a/tagmaps/__main__.py
+++ b/tagmaps/__main__.py
@@ -84,7 +84,7 @@ def main():
     log.info(lbsn_data.bounds.get_bound_report())
 
     if (cfg.cluster_tags or cfg.cluster_emoji):
-        log.info("########## STEP 2 of 6: Tag Ranking ##########")
+        log.info("\n########## STEP 2 of 6: Tag Ranking ##########")
 
         prepared_data = lbsn_data.get_prepared_data()
 
@@ -97,14 +97,12 @@ def main():
             f'Total emoji count for cleaned (tmax) emoji list '
             f'(Top {prepared_data.emax}): {prepared_data.total_emoji_count}.')
 
-        global top_tags_list
         top_tags_list = prepared_data.top_tags_list
         if cfg.statistics_only is False:
             # restart time monitoring for actual cluster step
             now = time.time()
             log.info(
-                "########## STEP 3 of 6: Tag Location Clustering ##########")
-            sys.stdout.flush()
+                "\n########## STEP 3 of 6: Tag Location Clustering ##########")
 
             cluster_tag_data = ClusterGen(
                 lbsn_data.bounds,
diff --git a/tagmaps/classes/cluster.py b/tagmaps/classes/cluster.py
index 87019a8..1f17212 100644
--- a/tagmaps/classes/cluster.py
+++ b/tagmaps/classes/cluster.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 """
-Module for tag maps clustering functions
+Module for tag maps clustering methods
 """
 
 import warnings
@@ -80,7 +80,7 @@ def _select_postguids(self, tag: str) -> Tuple[List[str], int]:
                 distinct_localloc_count.add(cleaned_photo_location.loc_id)
         return selected_postguids_list, len(distinct_localloc_count)
 
-    def _getselect_postguids(self, tag: str, silent: bool = True):
+    def _getselect_postguids(self, tag: str, silent: bool = True) -> List[str]:
         """Get list of post guids with specific tag
 
         Args:
@@ -105,19 +105,26 @@ def _getselect_postguids(self, tag: str, silent: bool = True):
               f'of total distinct locations in area)', end=" ")
         return selected_postguids_list
 
-    def _getselect_posts(self, selected_postguids_list):
+    def _getselect_posts(self,
+                         selected_postguids_list: List[str]
+                         ) -> List[CleanedPost]:
         selected_posts_list = [self.cleaned_post_dict[x]
                                for x in selected_postguids_list]
         return selected_posts_list
 
-    def _get_np_points(self, tag: str = None, silent: bool = None):
+    def _get_np_points(self,
+                       tag: str = None,
+                       silent: bool = None
+                       ) -> np.ndarray:
         """Gets numpy array of selected points from tags with latlng
 
-            toptag ([type], optional): Defaults to None. [description]
-            silent ([type], optional): Defaults to None. [description]
+        Args:
+            tag: tag to select posts
+            silent: if true, no console output (interface mode)
 
         Returns:
-            [type]: [description]
+            points: numpy array of lng/lat coordinate pairs
+                for the posts selected by tag
         """
         # no log reporting for selected points
         if silent is None:
@@ -138,11 +145,12 @@ def _get_np_points(self, tag: str = None, silent: bool = None):
         # (limit by list of column-names)
         points = df.as_matrix(['lng', 'lat'])
         # only return preview fig without clustering
-        return points, selected_postguids_list
+        return points
 
-    def cluster_points(self, points, cluster_distance,
-                       selected_postguids_list,
-                       min_span_tree, preview_mode):
+    def cluster_points(self, points,
+                       cluster_distance: float,
+                       min_span_tree: bool = False,
+                       preview_mode: bool = False):
         # cluster data
         # conversion to radians for HDBSCAN
         # (does not support decimal degrees)
@@ -151,7 +159,7 @@ def cluster_points(self, points, cluster_distance,
         # (descending), calculate HDBSCAN Clusters
         # min_cluster_size default - 5% optimum:
         min_cluster_size = max(
-            2, int(((len(selected_postguids_list))/100)*5))
+            2, int(((len(points))/100)*5))
         self.clusterer = hdbscan.HDBSCAN(
             min_cluster_size=min_cluster_size,
             gen_min_span_tree=min_span_tree,
diff --git a/tagmaps/classes/interface.py b/tagmaps/classes/interface.py
index ddea25a..07a7f66 100644
--- a/tagmaps/classes/interface.py
+++ b/tagmaps/classes/interface.py
@@ -22,7 +22,6 @@
 from tagmaps.classes.shared_structure import CleanedPost
 from tagmaps.classes.cluster import ClusterGen
 
-plt.ion()
 # enable interactive mode for pyplot (not necessary?!)
 plt.ion()
 # label_size = 10
@@ -202,12 +201,10 @@ def _cluster_preview(self, sel_tag: Tuple[str, int]):
         # tkinter.messagebox.showinfo("Num of clusters: ",
         # str(len(sel_colors)) + " " + str(sel_colors[1]))
         # output/update matplotlib figures
-        (points,
-         selected_postguids_list) = self._clst._get_np_points(
-            sel_tag[0], silent=True)
+        points = self._clst._get_np_points(tag=sel_tag[0], silent=True)
         self._clst.cluster_points(
-            points, self.cluster_distance,
-            selected_postguids_list, self.create_min_spanning_tree,
+            points=points,
+            cluster_distance=self.cluster_distance,
             preview_mode=True)
         mask_noisy = self._clst.mask_noisy
         number_of_clusters = self._clst.number_of_clusters
@@ -413,12 +410,12 @@ def _cluster_preview(self, sel_tag: Tuple[str, int]):
             from_=(self.cluster_distance/100),
             to=(self.cluster_distance*2))
 
-    def _selection_preview(self, sel_tag):
+    def _selection_preview(self, sel_tag: Tuple[str, int]):
         """Update preview map based on tag selection"""
         # tkinter.messagebox.showinfo("Proceed", f'{sel_tag}')
-        points, __ = self._clst._get_np_points(
-            sel_tag[0],
-            silent=False)
+        points = self._clst._get_np_points(
+            tag=sel_tag[0],
+            silent=True)
         if self.fig1:
             plt.figure(1).clf()  # clear figure 1
             # earth = Basemap()
@@ -556,6 +553,11 @@ def _cluster_current_display_tag(self):
             self._cluster_preview(self._clst.top_tags_list[0])
 
     def _scaletest_current_display_tag(self):
+        if self.create_min_spanning_tree is False:
+            tkinter.messagebox.showinfo(
+                "Skip: ",
+                f'Currently deactivated')
+            return
         if self.current_display_tag:
             sel_tag = self.current_display_tag
         else:
diff --git a/tagmaps/classes/load_data.py b/tagmaps/classes/load_data.py
index 583921c..f543ce7 100644
--- a/tagmaps/classes/load_data.py
+++ b/tagmaps/classes/load_data.py
@@ -92,7 +92,7 @@ def _process_inputfile(self, file_handle):
 
         Output: produces a list of post that can be parsed
         """
-        post_list = []  # needed?
+        post_list = list()  # needed?
         if self.cfg.source_map.file_extension == "csv":
             post_list = csv.reader(
                 file_handle,
@@ -102,9 +102,9 @@ def _process_inputfile(self, file_handle):
             next(post_list, None)  # skip headerline
         elif self.cfg.source_map.file_extension == "json":
             post_list = post_list + json.loads(file_handle.read())
-        self._parse_postlist(post_list)
+        result_msg = self._parse_postlist(post_list)
 
-    def _parse_postlist(self, post_list):
+    def _parse_postlist(self, post_list: TextIO):
         """Process posts according to specifications"""
         for post in post_list:
             # skip duplicates and erroneous entries
@@ -119,15 +119,19 @@ def _parse_postlist(self, post_list):
                 continue
             self._merge_posts(lbsn_post)
             # status report
-            msg = \
-                f'Cleaned output to {len(self.distinct_locations_set):02d} ' \
-                f'distinct locations from ' \
-                f'{self.stats.count_glob:02d} posts ' \
-                f'(File {self.stats.partcount} of {len(self.filelist)}) - ' \
-                f'Skipped posts: {self.stats.skipped_count} - skipped tags: ' \
-                f'{self.stats.count_tags_skipped} of ' \
-                f'{self.stats.count_tags_global}'
+            msg = (
+                f'Cleaned output to {len(self.distinct_locations_set):02d} '
+                f'distinct locations from '
+                f'{self.stats.count_glob:02d} posts '
+                f'(File {self.stats.partcount} of {len(self.filelist)}) - '
+                f'Skipped posts: {self.stats.skipped_count} - skipped tags: '
+                f'{self.stats.count_tags_skipped} of '
+                f'{self.stats.count_tags_global}')
             print(msg, end='\r')
+        # log last message to file, clean last stdout
+        print(" " * len(msg), end='\n')
+        sys.stdout.flush()
+        self.log.info(msg)
 
     def _merge_posts(self, lbsn_post):
         """Method will union all tags of a single user for each location