From f8166f6485f357617ef98b736ee6a7438005ced5 Mon Sep 17 00:00:00 2001 From: AD Date: Thu, 21 Feb 2019 15:00:33 +0100 Subject: [PATCH] Initial feat for cleaned data load (not ready) --- tagmaps/__main__.py | 2 +- tagmaps/classes/prepare_data.py | 34 ++++++++++++++++++++++++++++++--- tagmaps/tagmaps_.py | 11 ++++------- tests/test_emoji.py | 2 ++ 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/tagmaps/__main__.py b/tagmaps/__main__.py index 51bb49f..70e7020 100644 --- a/tagmaps/__main__.py +++ b/tagmaps/__main__.py @@ -75,7 +75,7 @@ def main(): # get statistics for input data # and indested data input_data.input_stats_report() - tagmaps.global_stats_report() + tagmaps.global_stats_report(cleaned=True) # get current time for monitoring now = time.time() diff --git a/tagmaps/classes/prepare_data.py b/tagmaps/classes/prepare_data.py index db55601..438b475 100644 --- a/tagmaps/classes/prepare_data.py +++ b/tagmaps/classes/prepare_data.py @@ -109,6 +109,7 @@ def add_record( self.distinct_userlocations_set.add(post_locid_userid) # print(f'Added: {post_locid_userid} to distinct_userlocations_set ' # f'(len: {len(distinct_userlocations_set)})') + # todo: if isinstance(lbsn_post, CleanedPost): # no need to merge terms and other parameter return @@ -197,7 +198,9 @@ def _get_item_stats(self) -> 'PreparedStats': - prepare data for tag maps clustering - store to self.data_prepared """ - self._prepare_item_stats() + if self.cleaned_stats is None: + self.cleaned_stats = PreparedStats() + self._prepare_item_stats() return self.cleaned_stats def _prepare_item_stats(self): @@ -525,8 +528,8 @@ def _parse_cleaned_post(cpost: Dict[str, str]) -> CleanedPost: post_publish_date=cpost.get("post_publish_date"), post_views_count=int(cpost.get("post_views_count")), post_like_count=int(cpost.get("post_like_count")), - emoji=set(cpost.get("post_publish_date").split(';')), - hashtags=set(cpost.get("post_publish_date").split(';')), + emoji=set(cpost.get("emoji").split(';')), + hashtags=set(cpost.get("hashtags").split(';')), loc_id=cpost.get("loc_id"), loc_name=cpost.get("loc_name") ) @@ -630,3 +633,28 @@ def _get_wordlist(cleaned_post_body): wordlist = [word for word in cleaned_post_body.lower().split( ' ') if len(word) > 2] return wordlist + + def global_stats_report(self, cleaned=None): + """Report global stats after data has been read""" + if cleaned is None: + cleaned = True + self.log.info( + f'Total user count (UC): ' + f'{len(self.locations_per_userid_dict)}') + self.log.info( + f'Total user post locations (UPL): ' + f'{len(self.distinct_userlocations_set)}') + if not cleaned: + return + if self.cleaned_stats is None: + self.cleaned_stats = PreparedStats() + self._prepare_item_stats() + self.log.info( + f'Total (cleaned) post count (PC): ' + f'{self.count_glob:02d}') + self.log.info( + f'Total (cleaned) tag count (PTC): ' + f'{self.cleaned_stats.total_tag_count}') + self.log.info( + f'Total (cleaned) emoji count (PEC): ' + f'{self.cleaned_stats.total_emoji_count}') diff --git a/tagmaps/tagmaps_.py b/tagmaps/tagmaps_.py index 4748558..96cf5cd 100644 --- a/tagmaps/tagmaps_.py +++ b/tagmaps/tagmaps_.py @@ -198,14 +198,11 @@ def init_lbsn_data(self): self.limit_bottom_user_count, self.topic_modeling) @TMDec.data_added_check - def global_stats_report(self): + def global_stats_report(self, cleaned=None): """Report global stats after data has been read""" - self.log.info( - f'Total user count (UC): ' - f'{len(self.lbsn_data.locations_per_userid_dict)}') - self.log.info( - f'Total user post locations (UPL): ' - f'{len(self.lbsn_data.distinct_userlocations_set)}') + if cleaned is None: + cleaned = True + self.lbsn_data.global_stats_report(cleaned=cleaned) @TMDec.init_data_check def load_cleaned_data(self, input_path): diff --git a/tests/test_emoji.py b/tests/test_emoji.py index 296be17..0dbd85c 100644 --- a/tests/test_emoji.py +++ b/tests/test_emoji.py @@ -88,3 +88,5 @@ def test_capital_case(): # alternative Shapefile module pure Python # https://github.com/GeospatialPython/pyshp#writing-shapefiles # import shapefile +# to test: +# bba9f14180134069a9da5b3eb3539130822f917a55fc42d4adcc816018f79d9e "bei;bilzbad;flamingos;radebeul;@mcfitti;👯🎉😋;30grad;und","radebeul;bilzbad;flamingos","🐬;🌊;😋;🎉;👯"