Skip to content

Commit

Permalink
Initial feat for cleaned data load (not ready)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sieboldianus committed Feb 21, 2019
1 parent ae23464 commit f8166f6
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 11 deletions.
2 changes: 1 addition & 1 deletion tagmaps/__main__.py
Expand Up @@ -75,7 +75,7 @@ def main():
# get statistics for input data
# and ingested data
input_data.input_stats_report()
tagmaps.global_stats_report()
tagmaps.global_stats_report(cleaned=True)
# get current time for monitoring
now = time.time()

Expand Down
34 changes: 31 additions & 3 deletions tagmaps/classes/prepare_data.py
Expand Up @@ -109,6 +109,7 @@ def add_record(
self.distinct_userlocations_set.add(post_locid_userid)
# print(f'Added: {post_locid_userid} to distinct_userlocations_set '
# f'(len: {len(distinct_userlocations_set)})')
# todo:
if isinstance(lbsn_post, CleanedPost):
# no need to merge terms and other parameter
return
Expand Down Expand Up @@ -197,7 +198,9 @@ def _get_item_stats(self) -> 'PreparedStats':
- prepare data for tag maps clustering
- store to self.data_prepared
"""
self._prepare_item_stats()
if self.cleaned_stats is None:
self.cleaned_stats = PreparedStats()
self._prepare_item_stats()
return self.cleaned_stats

def _prepare_item_stats(self):
Expand Down Expand Up @@ -525,8 +528,8 @@ def _parse_cleaned_post(cpost: Dict[str, str]) -> CleanedPost:
post_publish_date=cpost.get("post_publish_date"),
post_views_count=int(cpost.get("post_views_count")),
post_like_count=int(cpost.get("post_like_count")),
emoji=set(cpost.get("post_publish_date").split(';')),
hashtags=set(cpost.get("post_publish_date").split(';')),
emoji=set(cpost.get("emoji").split(';')),
hashtags=set(cpost.get("hashtags").split(';')),
loc_id=cpost.get("loc_id"),
loc_name=cpost.get("loc_name")
)
Expand Down Expand Up @@ -630,3 +633,28 @@ def _get_wordlist(cleaned_post_body):
wordlist = [word for word in cleaned_post_body.lower().split(
' ') if len(word) > 2]
return wordlist

def global_stats_report(self, cleaned=None):
    """Log global statistics after data has been read.

    Args:
        cleaned: Whether to also report statistics for the cleaned
            data (post, tag and emoji counts). ``None`` (the default)
            is treated as ``True`` so existing callers that pass no
            argument keep getting the full report.
    """
    if cleaned is None:
        cleaned = True
    self.log.info(
        f'Total user count (UC): '
        f'{len(self.locations_per_userid_dict)}')
    self.log.info(
        f'Total user post locations (UPL): '
        f'{len(self.distinct_userlocations_set)}')
    if not cleaned:
        return
    # Reuse the lazy-init helper instead of duplicating its logic here;
    # _get_item_stats() creates PreparedStats and prepares item stats
    # only on first use (see _get_item_stats above).
    self._get_item_stats()
    self.log.info(
        f'Total (cleaned) post count (PC): '
        f'{self.count_glob:02d}')
    self.log.info(
        f'Total (cleaned) tag count (PTC): '
        f'{self.cleaned_stats.total_tag_count}')
    self.log.info(
        f'Total (cleaned) emoji count (PEC): '
        f'{self.cleaned_stats.total_emoji_count}')
11 changes: 4 additions & 7 deletions tagmaps/tagmaps_.py
Expand Up @@ -198,14 +198,11 @@ def init_lbsn_data(self):
self.limit_bottom_user_count, self.topic_modeling)

@TMDec.data_added_check
def global_stats_report(self, cleaned=None):
    """Report global stats after data has been read.

    Thin wrapper that delegates the actual reporting to the underlying
    lbsn_data object.

    Args:
        cleaned: Whether to also report cleaned-data statistics.
            ``None`` (the default) is treated as ``True`` for
            backward compatibility with callers passing no argument.
    """
    if cleaned is None:
        cleaned = True
    self.lbsn_data.global_stats_report(cleaned=cleaned)

@TMDec.init_data_check
def load_cleaned_data(self, input_path):
Expand Down
2 changes: 2 additions & 0 deletions tests/test_emoji.py
Expand Up @@ -88,3 +88,5 @@ def test_capital_case():
# alternative Shapefile module pure Python
# https://github.com/GeospatialPython/pyshp#writing-shapefiles
# import shapefile
# to test:
# bba9f14180134069a9da5b3eb3539130822f917a55fc42d4adcc816018f79d9e "bei;bilzbad;flamingos;radebeul;@mcfitti;👯🎉😋;30grad;und","radebeul;bilzbad;flamingos","🐬;🌊;😋;🎉;👯"

0 comments on commit f8166f6

Please sign in to comment.