Skip to content

Commit

Permalink
feat: add topic clusterer
Browse files Browse the repository at this point in the history
  • Loading branch information
Sieboldianus committed Feb 19, 2019
1 parent c43d86f commit 4694157
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 27 deletions.
2 changes: 1 addition & 1 deletion tagmaps/__init__.py
Expand Up @@ -6,4 +6,4 @@
from .classes.prepare_data import PrepareData
from .classes.interface import UserInterface
from .classes.shared_structure import (
EMOJI, LOCATIONS, TAGS, PostStructure, ClusterType, PreparedStats)
EMOJI, LOCATIONS, TAGS, TOPICS, PostStructure, ClusterType, PreparedStats)
75 changes: 58 additions & 17 deletions tagmaps/classes/cluster.py
Expand Up @@ -21,7 +21,7 @@
from functools import partial

from tagmaps.classes.alpha_shapes import AlphaShapes
from tagmaps.classes.shared_structure import (EMOJI, LOCATIONS, TAGS,
from tagmaps.classes.shared_structure import (EMOJI, LOCATIONS, TAGS, TOPICS,
AnalysisBounds, CleanedPost,
ClusterType, PreparedStats)
from tagmaps.classes.plotting import TPLT
Expand Down Expand Up @@ -113,8 +113,11 @@ def new_clusterer(cls,
top_list = cleaned_stats.top_emoji_list
elif clusterer_type == LOCATIONS:
top_list = cleaned_stats.top_locations_list
elif clusterer_type == TOPICS:
# TODO: topics have no dedicated top list yet; reuse the top tags list for now
top_list = cleaned_stats.top_tags_list
else:
raise ValueError("Cluster Type unknown.")
raise ValueError(f"Cluster Type unknown: {clusterer_type}")

clusterer = cls(
bounds=bounds,
Expand Down Expand Up @@ -175,20 +178,25 @@ def _select_postguids(self, item: str) -> Tuple[List[str], int]:
"""
distinct_localloc_count = set()
selected_postguids_list = list()
for cleaned_photo_location in self.cleaned_post_list:
for cleaned_post_location in self.cleaned_post_list:
if self.cls_type == TAGS:
self._filter_tags(
item, cleaned_photo_location,
item, cleaned_post_location,
selected_postguids_list,
distinct_localloc_count)
elif self.cls_type == EMOJI:
self._filter_emoji(
item, cleaned_photo_location,
item, cleaned_post_location,
selected_postguids_list,
distinct_localloc_count)
elif self.cls_type == LOCATIONS:
self._filter_locations(
item, cleaned_photo_location,
item, cleaned_post_location,
selected_postguids_list,
distinct_localloc_count)
elif self.cls_type == TOPICS:
self._filter_topics(
item, cleaned_post_location,
selected_postguids_list,
distinct_localloc_count)
else:
Expand All @@ -208,6 +216,31 @@ def _filter_tags(
distinct_localloc_count.add(
cleaned_photo_location.loc_id)

@staticmethod
def _filter_topics(
        item: List[str],
        cleaned_photo_location: CleanedPost,
        selected_postguids_list: List[str],
        distinct_localloc_count: Set[str]):
    """Select a post when any topic term occurs in one of its fields.

    The topic terms in ``item`` are compared against the post's
    hashtags, post body and emoji, in that order. When one of these
    fields matches, the post's guid is appended to
    ``selected_postguids_list`` and its loc_id is added to
    ``distinct_localloc_count``; both collections are modified in place.
    Nothing is returned.
    """
    post = cleaned_photo_location
    searched_fields = ("hashtags", "post_body", "emoji")
    # the generator keeps the lookup lazy: a later field is only read
    # when none of the earlier ones matched
    matched = any(
        ClusterGen._compare_anyinlist(item, getattr(post, field_name))
        for field_name in searched_fields)
    if not matched:
        return
    selected_postguids_list.append(post.guid)
    distinct_localloc_count.add(post.loc_id)

@staticmethod
def _compare_anyinlist(items, item_list):
"""Check if any term of topic is in list"""
if any(x in items for x in item_list):
return True
return False

@staticmethod
def _filter_emoji(
item: str,
Expand Down Expand Up @@ -242,12 +275,13 @@ def _getselect_postguids(self, item: str,
query_result = self._select_postguids(item)
selected_postguids_list = query_result[0]
distinct_localloc_count = query_result[1]

if silent:
return selected_postguids_list
# console reporting
if self.cls_type == EMOJI:
item_text = Utils._get_emojiname(item)
if self.cls_type == TOPICS:
item_text = '-'.join(item)
else:
item_text = item
type_text = self.cls_type.rstrip('s')
Expand Down Expand Up @@ -290,7 +324,7 @@ def _get_np_points_guids(self,
"""Gets numpy array of selected points with latlng containing _item
Args:
item: tag, emoji, location
item: tag, emoji, location; or topic (list of terms)
silent: if true, no console output (interface mode)
Returns:
Expand Down Expand Up @@ -731,7 +765,7 @@ def _get_sel_preview(self, item):
points = self._get_np_points(
item=item,
silent=True)
fig = TPLT._get_sel_preview(points, item, self.bounds)
fig = TPLT._get_sel_preview(points, item, self.bounds, self.cls_type)
return fig

def _get_cluster_preview(self, item):
Expand All @@ -747,9 +781,12 @@ def _get_cluster_preview(self, item):
item=item,
preview_mode=True)
fig = TPLT._get_cluster_preview(
points, sel_colors, item, self.bounds, mask_noisy,
self.cluster_distance, number_of_clusters,
self.autoselect_clusters)
points=points, sel_colors=sel_colors, item_text=item,
bounds=self.bounds, mask_noisy=mask_noisy,
cluster_distance=self.cluster_distance,
number_of_clusters=number_of_clusters,
auto_select_clusters=self.autoselect_clusters,
cls_type=self.cls_type)
return fig

def _get_clustershapes_preview(self, item):
Expand All @@ -771,19 +808,23 @@ def _get_clustershapes_preview(self, item):
# cluster_guids: those guids that are clustered
cluster_guids, _ = self._get_cluster_guids(
clusters, selected_post_guids)
shapes, _ = self._get_item_clustershapes(item, cluster_guids)
if self.cls_type == TOPICS:
first_item = item[0]
shapes, _ = self._get_item_clustershapes(first_item, cluster_guids)
# proj shapes back to WGS1984 for plotting in matplotlib
# simple list comprehension with projection:
project = partial(
pyproj.transform,
self.crs_proj, # source coordinate system
self.crs_wgs) # destination coordinate system
shapes_wgs = [transform(project, shape[0]) for shape in shapes]

fig = TPLT._get_cluster_preview(
points, sel_colors, item, self.bounds, mask_noisy,
self.cluster_distance, number_of_clusters,
self.autoselect_clusters, shapes_wgs)
points=points, sel_colors=sel_colors, item_text=item,
bounds=self.bounds, mask_noisy=mask_noisy,
cluster_distance=self.cluster_distance,
number_of_clusters=number_of_clusters,
auto_select_clusters=self.autoselect_clusters,
shapes=shapes_wgs, cls_type=self.cls_type)
return fig

def get_singlelinkagetree_preview(self, item):
Expand Down
2 changes: 1 addition & 1 deletion tagmaps/classes/interface.py
Expand Up @@ -22,7 +22,7 @@
from tagmaps.classes.utils import Utils
from tagmaps.classes.shared_structure import (
CleanedPost, AnalysisBounds,
ClusterType, TAGS, LOCATIONS, EMOJI)
ClusterType, TAGS, LOCATIONS, EMOJI, TOPICS)
from tagmaps.classes.cluster import ClusterGen
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure
Expand Down
14 changes: 8 additions & 6 deletions tagmaps/classes/plotting.py
Expand Up @@ -10,7 +10,7 @@
from tagmaps.classes.shared_structure import CleanedPost, AnalysisBounds
from tagmaps.classes.utils import Utils
from tagmaps.classes.shared_structure import (
TAGS, LOCATIONS, EMOJI)
TAGS, LOCATIONS, EMOJI, TOPICS)


class TPLT():
Expand Down Expand Up @@ -67,18 +67,18 @@ def _get_fig_points(fig, points, bounds):
return fig

@staticmethod
def _get_sel_preview(points, item, bounds):
def _get_sel_preview(points, item, bounds, cls_type):
"""Returns plt map for item selection preview"""
# img_ratio = TPLT._get_img_ratio(bounds)
fig = None
fig = TPLT._get_fig_points(fig, points, bounds)
fig.suptitle(item, fontsize=18, fontweight='bold')
TPLT._set_plt_suptitle(fig, item, cls_type)
return fig

@staticmethod
def _get_cluster_preview(points, sel_colors, item_text, bounds, mask_noisy,
cluster_distance, number_of_clusters, auto_select_clusters=None,
shapes=None, fig=None):
shapes=None, fig=None, cls_type=None):
if auto_select_clusters is None:
auto_select_clusters = False
# img_ratio = TPLT._get_img_ratio(bounds)
Expand All @@ -90,7 +90,7 @@ def _get_cluster_preview(points, sel_colors, item_text, bounds, mask_noisy,
ax.scatter(points.T[0], points.T[1],
c=sel_colors, **TPLT.PLOT_KWDS)
fig.canvas.set_window_title('Cluster Preview')
TPLT._set_plt_suptitle_st(fig, item_text)
TPLT._set_plt_suptitle(fig, item_text, cls_type)
dist_text = ''
if shapes:
for shape in shapes:
Expand Down Expand Up @@ -179,6 +179,8 @@ def _get_pltspec_suptitle(item: str, cls_type=None) -> str:
elif cls_type == EMOJI:
emoji_name = Utils._get_emojiname(item)
title = f'{item} ({emoji_name})'
elif cls_type == TOPICS:
title = '-'.join(item)
else:
title = item.upper()
return title
Expand Down Expand Up @@ -206,4 +208,4 @@ def _get_poly_patch(ax, polygon):
# # distYLat = Utils.haversine(limXMin,limYMax,limXMin,limYMin)
# # distXLng = Utils.haversine(limXMax,limYMin,limXMin,limYMin)
# img_ratio = dist_x_lng/(dist_y_lat*2)
# return img_ratio
# return img_ratio
2 changes: 2 additions & 0 deletions tagmaps/classes/shared_structure.py
Expand Up @@ -12,10 +12,12 @@
# String identifiers for the supported cluster types
LOCATIONS: str = 'Locations'
TAGS: str = 'Tags'
EMOJI: str = 'Emoji'
# TOPICS is a plain string identifier like the others (not a list)
TOPICS: str = 'Topics'
# Each cluster type identifier paired with an integer (1-4);
# variable-length tuple of (identifier, int) pairs
ClusterType: Tuple[Tuple[str, int], ...] = (
    (LOCATIONS, 1),
    (TAGS, 2),
    (EMOJI, 3),
    (TOPICS, 4),
)

CleanedPost_ = namedtuple(
Expand Down
8 changes: 6 additions & 2 deletions tagmaps/tagmaps_.py
Expand Up @@ -18,7 +18,7 @@
from .classes.load_data import LoadData
from .classes.prepare_data import PrepareData
from .classes.shared_structure import (
EMOJI, LOCATIONS, TAGS, PostStructure, ClusterType, PreparedStats)
EMOJI, LOCATIONS, TAGS, TOPICS, PostStructure, ClusterType, PreparedStats)
from .classes.utils import Utils


Expand Down Expand Up @@ -137,7 +137,7 @@ def __init__(
output_folder=None, remove_long_tail=True,
limit_bottom_user_count=5, topic_modeling=False,
local_saturation_check=False, max_items=None,
logging_level=None):
logging_level=None, topic_cluster=None):
"""Init settings for Tag Maps Clustering"""
self.write_cleaned_data = write_cleaned_data
self.output_folder = output_folder
Expand All @@ -146,6 +146,8 @@ def __init__(
self.topic_modeling = topic_modeling
if max_items is None:
max_items = 1000
if topic_cluster is None:
topic_cluster = False
self.max_items = max_items
self.local_saturation_check = local_saturation_check
# initialize list of types to cluster
Expand All @@ -156,6 +158,8 @@ def __init__(
self.cluster_types.append(EMOJI)
if location_cluster:
self.cluster_types.append(LOCATIONS)
if topic_cluster:
self.cluster_types.append(TOPICS)
# create output dir if not exists
Utils.init_output_dir(self.output_folder)
# init logger (logging to console and file log.txt)
Expand Down

0 comments on commit 4694157

Please sign in to comment.