Skip to content

Commit

Permalink
Initial feat for cleaned data load (not ready)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sieboldianus committed Feb 21, 2019
1 parent ae23464 commit f8166f6
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 11 deletions.
2 changes: 1 addition & 1 deletion tagmaps/__main__.py
Expand Up @@ -75,7 +75,7 @@ def main():
# get statistics for input data
# and ingested data
input_data.input_stats_report()
tagmaps.global_stats_report()
tagmaps.global_stats_report(cleaned=True)
# get current time for monitoring
now = time.time()

Expand Down
34 changes: 31 additions & 3 deletions tagmaps/classes/prepare_data.py
Expand Up @@ -109,6 +109,7 @@ def add_record(
self.distinct_userlocations_set.add(post_locid_userid)
# print(f'Added: {post_locid_userid} to distinct_userlocations_set '
# f'(len: {len(distinct_userlocations_set)})')
# todo:
if isinstance(lbsn_post, CleanedPost):
# no need to merge terms and other parameter
return
Expand Down Expand Up @@ -197,7 +198,9 @@ def _get_item_stats(self) -> 'PreparedStats':
- prepare data for tag maps clustering
- store to self.data_prepared
"""
self._prepare_item_stats()
if self.cleaned_stats is None:
self.cleaned_stats = PreparedStats()
self._prepare_item_stats()
return self.cleaned_stats

def _prepare_item_stats(self):
Expand Down Expand Up @@ -525,8 +528,8 @@ def _parse_cleaned_post(cpost: Dict[str, str]) -> CleanedPost:
post_publish_date=cpost.get("post_publish_date"),
post_views_count=int(cpost.get("post_views_count")),
post_like_count=int(cpost.get("post_like_count")),
emoji=set(cpost.get("post_publish_date").split(';')),
hashtags=set(cpost.get("post_publish_date").split(';')),
emoji=set(cpost.get("emoji").split(';')),
hashtags=set(cpost.get("hashtags").split(';')),
loc_id=cpost.get("loc_id"),
loc_name=cpost.get("loc_name")
)
Expand Down Expand Up @@ -630,3 +633,28 @@ def _get_wordlist(cleaned_post_body):
wordlist = [word for word in cleaned_post_body.lower().split(
' ') if len(word) > 2]
return wordlist

def global_stats_report(self, cleaned=None):
    """Log global statistics after data has been read.

    Args:
        cleaned: Whether to also report statistics for the cleaned
            data (post, tag and emoji counts). ``None`` (the default)
            is treated as ``True`` so existing callers that pass no
            argument keep getting the full report.
    """
    if cleaned is None:
        cleaned = True
    self.log.info(
        f'Total user count (UC): '
        f'{len(self.locations_per_userid_dict)}')
    self.log.info(
        f'Total user post locations (UPL): '
        f'{len(self.distinct_userlocations_set)}')
    if not cleaned:
        return
    # Reuse the lazy-init helper instead of duplicating its logic here;
    # _get_item_stats() creates PreparedStats and prepares item stats
    # only on first use (see _get_item_stats above).
    self._get_item_stats()
    self.log.info(
        f'Total (cleaned) post count (PC): '
        f'{self.count_glob:02d}')
    self.log.info(
        f'Total (cleaned) tag count (PTC): '
        f'{self.cleaned_stats.total_tag_count}')
    self.log.info(
        f'Total (cleaned) emoji count (PEC): '
        f'{self.cleaned_stats.total_emoji_count}')
11 changes: 4 additions & 7 deletions tagmaps/tagmaps_.py
Expand Up @@ -198,14 +198,11 @@ def init_lbsn_data(self):
self.limit_bottom_user_count, self.topic_modeling)

@TMDec.data_added_check
def global_stats_report(self, cleaned=None):
    """Report global stats after data has been read.

    Thin wrapper that delegates the actual reporting to the underlying
    lbsn_data object.

    Args:
        cleaned: Whether to also report cleaned-data statistics.
            ``None`` (the default) is treated as ``True`` for
            backward compatibility with callers passing no argument.
    """
    if cleaned is None:
        cleaned = True
    self.lbsn_data.global_stats_report(cleaned=cleaned)

@TMDec.init_data_check
def load_cleaned_data(self, input_path):
Expand Down
2 changes: 2 additions & 0 deletions tests/test_emoji.py
Expand Up @@ -88,3 +88,5 @@ def test_capital_case():
# alternative Shapefile module pure Python
# https://github.com/GeospatialPython/pyshp#writing-shapefiles
# import shapefile
# to test:
# bba9f14180134069a9da5b3eb3539130822f917a55fc42d4adcc816018f79d9e "bei;bilzbad;flamingos;radebeul;@mcfitti;👯🎉😋;30grad;und","radebeul;bilzbad;flamingos","🐬;🌊;😋;🎉;👯"

0 comments on commit f8166f6

Please sign in to comment.