diff --git a/.gitignore b/.gitignore index b4a4665..4158343 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,8 @@ graph.png .vs/standalone_tag_cluster_hdbscan/v15/.suo .vs/ + emojifile.txt + +tagmaps.egg-info +tests/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b42f1c5 --- /dev/null +++ b/README.md @@ -0,0 +1,76 @@ +Tag Maps +============= +Spatio-Temporal Tag and Photo Location Clustering for generating Tag Maps + +**Tag Maps** are similar to Tag Clouds, but Tag Maps use the spatial information that is attached to geotagged photographs, in addition to tag frequency, to visualize tags on a map. +This Library uses the single-linkage tree that is available from [HDBSCAN](https://github.com/scikit-learn-contrib/hdbscan) to cut trees at a specific user-defined distance for all available tags in the given dataset. +Afterwards, Alpha Shapes are generated as a means to 'soft' placement of tags on a map, according to their area of use. Two Shapefiles are generated that can be used to visualize maps, for example, in ArcGIS. + +```diff +- Please be advised: This repository contains, in its current form, +- Python code that is very difficult to read or contribute to. +- I intend to put more work into structuring code ...as soon as more time is available. +``` + +![Tag Map Example](/resources/img6.png?raw=true) + +## Installation + +1. The easiest way for Windows users is to download the Pre-compiled build that is available [here](https://cloudstore.zih.tu-dresden.de/index.php/s/8EFfeJcpNCStQ9X/download) (315MB!) and run `generateTagClusters.exe` + - you can also compile the program yourself using the `setup.py` with [cx_Freeze](https://anthony-tuininga.github.io/cx_Freeze/): run `python setup.py build` + - or simple run `generateTagClusters.py` if you have Python and all dependencies installed +2. Place geotagged photo data in `/01_Input` subfolder + - example files/format are available in the Pre-compiled build zip-file above +3. Output files will be saved in `/02_Output` (2 Shapefiles in WGS1984 projection, one containing all Tag Cluster and one with the Photo Location Clusters) +4. Visualize Shapefiles using ArcGIS (I haven't tried other GIS Software such as QGIS, but it should theoretically be possible..) + - download `BasemapLayout_World.mxd` from [resources folder](/resources/BasemapLayout_World.mxd) and replace missing links with 2 resulting Shapefiles in `/02_Output` + - adjust minimum and maximum Font Sizes, Weighting Formula or other metrics to your needs. There are two Power Point files available which explain the complete process: [Tag Clustering](/resources/01_TagMaps.pptx) and [Photo Location Clustering](/resources/02_PhotoDensityMaps.pptx) + +## Code + +At the moment, the code is pretty messy. I wrote this in less than a week, without any other contributors in mind. I will work on this when there's more time available.. + +## Resources + +* Check out [this album on Flickr](https://www.flickr.com/photos/64974314@N08/albums/72157628868173205) with some more Tag Maps examples +* There's also an semi-interactive interface to explore some Tag Maps [here](http://maps.alexanderdunkel.com/) +* Check out my blog [here](http://blog.alexanderdunkel.com/) with some background information + + +## Contributors + +* todo: future goals, extending scope of program beyond Flickr photo data (include Twitter & Instagram, for example) + +## Built With +This project includes and makes use of several other projects/libraries/frameworks: + +>[*Alpha Shapes*](http://blog.thehumangeo.com/2014/05/12/drawing-boundaries-in-python/) Kevin Dwyer/ Sean Gillies +>>Generating Concave Hull for Point Clouds + +>[*HDBSCAN*](https://github.com/scikit-learn-contrib/hdbscan) McInnes, J. Healy, S. Astels - BSD licensed +>> A high performance implementation of HDBSCAN clustering. + +>[*Shapely*](https://github.com/Toblerity/Shapely) +>> Manipulation and analysis of geometric objects + +>[*SciPy and Convex Hull*](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.ConvexHull.html#scipy.spatial.ConvexHull) +>> Simple shapes for point clusters are generated using SciPy's excellent Convex Hull functions + +## License + +GNU GPLv3 + +## Changelog & Download + +2018-01-31: [**TagMaps v0.9.2**](https://cloudstore.zih.tu-dresden.de/index.php/s/8EFfeJcpNCStQ9X/download) + +* Because Tag Maps can be generated from local to regional to continental scale, finding an algorythm that fits all was not straight forward. The current implementation will produce shapes for all of these scales without any user input. +* This Final Alpha Shape Implementation is motivated from [Kevin Dwyer/ Sean Gillies](http://blog.thehumangeo.com/2014/05/12/drawing-boundaries-in-python/) great base code +* Implementation of Auto-Projection from Geographic to Projected Coordinate System. The code will select the most suitable UTM Zone for projecting data. + +2018-01-17: **TagMaps v0.9.1** + +* First build +* Initial commit, still lots of unnecessary comments/code parts left in code + +[//]: # (Readme formatting based on https://gist.github.com/PurpleBooth/109311bb0361f32d87a2) diff --git a/cx_setup.py b/cx_setup.py new file mode 100644 index 0000000..3acb6d3 --- /dev/null +++ b/cx_setup.py @@ -0,0 +1,52 @@ +import sys +from cx_Freeze import setup, Executable + +#Derive Package Paths Dynamically +import os.path +PYTHON_INSTALL_DIR = os.path.dirname(os.path.dirname(os.__file__)) +os.environ['TCL_LIBRARY'] = os.path.join(PYTHON_INSTALL_DIR, 'tcl', 'tcl8.6') +os.environ['TK_LIBRARY'] = os.path.join(PYTHON_INSTALL_DIR, 'tcl', 'tk8.6') + +# Dependencies are automatically detected, but it might need fine tuning. +#build_exe_options = {"packages": ["os"], "excludes": []} +includes_mod = ["tkinter.filedialog", + 'numpy.core._methods', + 'numpy.lib.format', + 'matplotlib.backends.backend_tkagg', + 'seaborn', + 'seaborn.cm', + 'scipy.sparse.csgraph', + 'argparse', + 'scipy.sparse.csgraph._validation' + ]#,'scipy.spatial.ckdtree' + +include_folders_files = ["C:/Python36/DLLs/tcl86t.dll", + "C:/Python36/DLLs/tk86t.dll", + 'tagmaps/01_Input/', + 'tagmaps/00_Config/', + '00_generateClusters_OnlyEmoji.cmd', + '00_generateClusters_OnlyPhotoLocations.cmd', + '00_generateClusters_OnlyTags.cmd', + ("D:/03_EvaVGI/05_Code/Py/standalone_tag_cluster_hdbscan/tagmaps/matplotlibrc","matplotlibrc") + ] +packages_mod = ["tkinter", "hdbscan"] +excludes_mod = ["scipy.spatial.cKDTree"] + + + +# GUI applications require a different base on Windows (the default is for a +# console application). +base = None +#base = "Console" +#if sys.platform == "win32": +# base = "Win32GUI" + +executables = [ + Executable('tagmaps/__main__.py', base=base) +] + +setup( name = "tagmaps", + version = "0.9.3", + description = "Tag Clustering for Tag Maps", + options = {'build_exe': {'includes': includes_mod, 'include_files': include_folders_files,'packages':packages_mod,'excludes':excludes_mod}}, + executables = executables) \ No newline at end of file diff --git a/setup.py b/setup.py index 0f676c7..0bb5f4c 100644 --- a/setup.py +++ b/setup.py @@ -1,51 +1,27 @@ -import sys -from cx_Freeze import setup, Executable - -#Derive Package Paths Dynamically -import os.path -PYTHON_INSTALL_DIR = os.path.dirname(os.path.dirname(os.__file__)) -os.environ['TCL_LIBRARY'] = os.path.join(PYTHON_INSTALL_DIR, 'tcl', 'tcl8.6') -os.environ['TK_LIBRARY'] = os.path.join(PYTHON_INSTALL_DIR, 'tcl', 'tk8.6') - -# Dependencies are automatically detected, but it might need fine tuning. -#build_exe_options = {"packages": ["os"], "excludes": []} -includes_mod = ['numpy.core._methods', - 'numpy.lib.format', - 'matplotlib.backends.backend_tkagg', - 'seaborn', - 'seaborn.cm', - 'scipy.sparse.csgraph', - 'argparse', - 'scipy.sparse.csgraph._validation', - ]#,'scipy.spatial.ckdtree' - -include_folders_files = ["C:/Python36/DLLs/tcl86t.dll", - "C:/Python36/DLLs/tk86t.dll", - 'tagmaps/01_Input/', - 'tagmaps/00_Config/', - '00_generateClusters_OnlyEmoji.cmd', - '00_generateClusters_OnlyPhotoLocations.cmd', - '00_generateClusters_OnlyTags.cmd', - ("D:/03_EvaVGI/05_Code/Py/standalone_tag_cluster_hdbscan/tagmaps/matplotlibrc","matplotlibrc") - ] -packages_mod = ["tkinter", "tkinter.filedialog"] -excludes_mod = ["scipy.spatial.cKDTree"] +# -*- coding: utf-8 -*- +from setuptools import setup +import sys - -# GUI applications require a different base on Windows (the default is for a -# console application). -base = None -#base = "Console" -#if sys.platform == "win32": -# base = "Win32GUI" - -executables = [ - Executable('tagmaps/generateTagClusters.py', base=base) -] - -setup( name = "generateTagClusters", +with open('README.md') as f: + long_description = f.read() + + +## setuptools dev +setup( name = "tagmaps", version = "0.9.3", description = "Tag Clustering for Tag Maps", - options = {'build_exe': {'includes': includes_mod, 'include_files': include_folders_files,'packages':packages_mod,'excludes':excludes_mod}}, - executables = executables) \ No newline at end of file + long_description=long_description, + long_description_content_type='text/markdown', + author='Alexander Dunkel', + author_email='alexander.dunkel@tu-dresden.de', + url='https://gitlab.vgiscience.de/ad/TagCluster', + license='GNU GPLv3 or any higher', + packages=['tagmaps'], + install_requires=[ + ], + entry_points={ + 'console_scripts': [ + 'tagmaps = tagmaps.__main__:main' + ] + }) \ No newline at end of file diff --git a/tagmaps/__init__.py b/tagmaps/__init__.py index e826ce7..43918de 100644 --- a/tagmaps/__init__.py +++ b/tagmaps/__init__.py @@ -1,5 +1,3 @@ -from .hdbscan_ import HDBSCAN, hdbscan -from .robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage -from .validity import validity_index -from .prediction import approximate_predict, membership_vector, all_points_membership_vectors - +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from .classes.utils import Utils diff --git a/tagmaps/__main__.py b/tagmaps/__main__.py new file mode 100644 index 0000000..96aac89 --- /dev/null +++ b/tagmaps/__main__.py @@ -0,0 +1,1876 @@ +#!/usr/bin/env python +# coding: utf-8 +# generateTagClusters + +""" +generateTagClusters.py + +- will read in geotagged (lat/lng) decimal degree point data +- will generate HDBSCAN Cluster Hierarchy +- will output Alpha Shapes/Delauney for cluster cut at specific distance +""" + +__author__ = "Alexander Dunkel" +__license__ = "GNU GPLv3" +__version__ = "0.9.3" + +import csv +import os +os.system('mode con: cols=197 lines=40') +import sys +import re +from glob import glob +from _csv import QUOTE_MINIMAL +from collections import defaultdict +from collections import Counter +from collections import namedtuple +import collections +import tkinter as tk +from tkinter.messagebox import showerror +import tkinter.messagebox + +#from .utils import * +import datetime +import warnings +from unicodedata import name as unicode_name + +#Cluster stuff +import numpy as np +import matplotlib +import matplotlib.pyplot as plt +plt.ion() #enable interactive mode for pyplot (not necessary?!) +import seaborn as sns +import sklearn.datasets as data +import pandas as pd +import hdbscan + +from multiprocessing.pool import ThreadPool +pool = ThreadPool(processes=1) +import time +from time import sleep +import copy + +#needed for anonymization (Topic Clustering) +import hashlib + +#enable for map display +#from mpl_toolkits.basemap import Basemap +#from PIL import Image + +import fiona #Fiona needed for reading Shapefile +from fiona.crs import from_epsg +import shapely.geometry as geometry +import pyproj #import Proj, transform +#https://gis.stackexchange.com/questions/127427/transforming-shapely-polygon-and-multipolygon-objects + +from shapely.ops import transform +from decimal import Decimal + +#alternative Shapefile module pure Python +#https://github.com/GeospatialPython/pyshp#writing-shapefiles +#import shapefile + + +##Emojitest +#n = '❤️👨‍⚕️' +#n = '😍,146' +##print(n.encode("utf-8")) +###n = '👨‍⚕️' #medical emoji with zero-width joiner (http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html) +#nlist = Utils.extract_emojis(n) +#with open("emojifile.txt", "w", encoding='utf-8') as emojifile: +# emojifile.write("Original: " + n + '\n') +# for xstr in nlist: +# emojifile.write('Emoji Extract: U+%04x' % ord(xstr) + '\n') +# emojifile.write(xstr + '\n') +# for _c in n: +# emojifile.write(str(unicode_name(_c)) + '\n') +# emojifile.write('Each Codepoint: U+%04x' % ord(_c) + '\n') + +#initialize global variables for analysis bounds (lat, lng coordinates) +limLatMin = None +limLatMax = None +limLngMin = None +limLngMax = None +abort = False +#definition of global figure for reusing windows +fig1 = None +fig2 = None +fig3 = None +fig4 = None +proceedClusting = False +currentDisplayTag = None +imgRatio = 0 +floaterX = 0 +floaterY = 0 +clusterTreeCuttingDist = 0 +topTagsList = [] +lastselectionList = [] +tnum = 0 +tkScalebar = None + +def main(): + from tagmaps.classes.utils import Utils + ###################### + ####config section#### + ###################### + log_file = "02_Output/log.txt" + log_texts_list = [] + + def print_store_log(text,end=None): + if end is None: + addEnd = False + end = '\n' + else: + addEnd = True + #watch out for non-printable characters in console + try: + print(text,end=end) + except UnicodeEncodeError: + print("#".join(re.findall("[a-zA-Z]+", text))) + log_texts_list.append(text + end) + + ##args + #Choose one of four options for Input data type: + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('-s', "--source", default= "fromLBSN") # naming it "source" + parser.add_argument('-r', "--removeLongTail", type=Utils.str2bool, nargs='?', const=True,default= True) + parser.add_argument('-e', "--EPSG") + parser.add_argument('-t', "--clusterTags", type=Utils.str2bool, nargs='?', const=True,default= True) + parser.add_argument('-p', "--clusterPhotos", type=Utils.str2bool, nargs='?', const=True,default= True) + parser.add_argument('-c', "--localSaturationCheck", type=Utils.str2bool, nargs='?', const=True, default= False) + parser.add_argument('-j', "--tokenizeJapanese", type=Utils.str2bool, nargs='?', const=True, default= False) + parser.add_argument('-o', "--clusterEmojis", type=Utils.str2bool, nargs='?', const=True, default= True) + parser.add_argument('-m', "--topicModeling", type=Utils.str2bool, nargs='?', const=True, default= False) + parser.add_argument('-w', "--writeCleanedData", type=Utils.str2bool, nargs='?', const=True, default= True) + parser.add_argument('-i', "--shapefileIntersect", type=Utils.str2bool, nargs='?', const=True, default= False) + parser.add_argument('-f', "--shapefilePath", default= "") + parser.add_argument('-is',"--ignoreStoplists", type=Utils.str2bool, nargs='?', const=True, default= False) + parser.add_argument('-ip',"--ignorePlaceCorrections", type=Utils.str2bool, nargs='?', const=True, default= False) + parser.add_argument('-stat',"--statisticsOnly", type=Utils.str2bool, nargs='?', const=True, default= False) + args = parser.parse_args() # returns data from the options specified (source) + DSource = args.source + clusterTags = args.clusterTags + clusterPhotos = args.clusterPhotos + removeLongTail = args.removeLongTail + clusterEmojis = args.clusterEmojis + topicModeling = args.topicModeling + writeCleanedData = args.writeCleanedData + localSaturationCheck = args.localSaturationCheck + shapefileIntersect = args.shapefileIntersect + shapefilePath = args.shapefilePath + ignoreStoplists = args.ignoreStoplists + ignorePlaceCorrections = args.ignorePlaceCorrections + statisticsOnly = args.statisticsOnly + writeGISCompLine = True # writes placeholder entry after headerline for avoiding GIS import format issues + + ##Load Filterlists + SortOutAlways_file = "00_Config/SortOutAlways.txt" + SortOutAlways_inStr_file = "00_Config/SortOutAlways_inStr.txt" + SortOutAlways_set = set() + SortOutAlways_inStr_set = set() + if not os.path.isfile(SortOutAlways_file): + print(f'{SortOutAlways_file} not found.') + #else read logfile + else: + if ignoreStoplists == False: + with open(SortOutAlways_file, newline='', encoding='utf8') as f: #read each unsorted file and sort lines based on datetime (as string) + SortOutAlways_set = set([line.lower().rstrip('\r\n') for line in f]) + print(f'Loaded {len(SortOutAlways_set)} stoplist items.') + if not os.path.isfile(SortOutAlways_inStr_file): + print(f'{SortOutAlways_inStr_file} not found.') + #else read logfile + else: + if ignoreStoplists == False: + with open(SortOutAlways_inStr_file, newline='', encoding='utf8') as f: #read each unsorted file and sort lines based on datetime (as string) + SortOutAlways_inStr_set = set([line.lower().rstrip('\r\n') for line in f]) + print(f'Loaded {len(SortOutAlways_inStr_set)} inStr stoplist items.') + #manually filter places or correct lat/lng + SortOutPlaces_file = "00_Config/SortOutPlaces.txt" + CorrectPlaceLatLng_file = "00_Config/CorrectPlaceLatLng.txt" + SortOutPlaces_set = set() + CorrectPlaceLatLng_dict = dict() + SortOutPlaces = False + if os.path.isfile(SortOutPlaces_file): + if ignoreStoplists == False: + with open(SortOutPlaces_file, newline='', encoding='utf8') as f: + f.readline() + #placeid + SortOutPlaces_set = set([line.rstrip('\r\n').split(",")[0] for line in f if len(line) > 0]) + SortOutPlaces = True + print(f'Loaded {len(SortOutPlaces_set)} stoplist places.') + CorrectPlaces = False + if os.path.isfile(CorrectPlaceLatLng_file): + if ignorePlaceCorrections == False: + with open(CorrectPlaceLatLng_file, newline='', encoding='utf8') as f: + f.readline() + for line in f: + if len(line) > 0: + linesplit = line.rstrip('\r\n').split(",") + if len(linesplit) > 1: + #placeid = #lat,lng + CorrectPlaceLatLng_dict[linesplit[0]] = (linesplit[1],linesplit[2]) + CorrectPlaces = True + print(f'Loaded {len(CorrectPlaceLatLng_dict)} place lat/lng corrections.') + + ###SHAPEFILESTUFF### + if shapefileIntersect: + if shapefilePath == "": + sys.exit(f'No Shapefile-Path specified. Exiting..') + from shapely.geometry import Polygon + from shapely.geometry import shape + from shapely.geometry import Point + PShape = fiona.open(shapefilePath) + ######Single Polygon:###### + first = PShape.next() + print("Loaded Shapefile with " + str(len(first['geometry']['coordinates'][0])) + " Vertices.") # (GeoJSON format) + shp_geom = shape(first['geometry']) #shape(first) + + ######Multipolygon:###### + #vcount = PShape.next()['geometry']['coordinates'] #needed for count of vertices + #geom = MultiPolygon([shape(pol['geometry']) for pol in PShape]) + #shp_geom = geom + #print("Loaded Shapefile with Vertices ", sum([len(poly[0]) for poly in vcount])) # (GeoJSON format) + ###END SHAPEFILESTUFF### + + if args.EPSG is None: + overrideCRS = None + else: + #try loading Custom CRS at beginning + crs_proj = pyproj.Proj(init='epsg:{0}'.format(overrideCRS)) + print("Custom CRS set: " + str(crs_proj.srs)) + epsg_code = overrideCRS + + tokenizeJapanese = args.tokenizeJapanese + if tokenizeJapanese: + from jNlp.jTokenize import jTokenize + #print(str(removeLongTail)) + pathname = os.getcwd() + if not os.path.exists(pathname + '/02_Output/'): + os.makedirs(pathname + '/02_Output/') + print("Folder /02_Output was created") + #if not os.path.exists(pathname + '/Output/ClusterImg/'): + # os.makedirs(pathname + '/Output/ClusterImg/') + # print("Folder /Output/ClusterImg/ was created") + + # READ All JSON in Current Folder and join to list + #partnum = 0 + guid_list = set() #global list of guids + count_glob = 0 + partcount = 0 + #filenameprev = "" + if (DSource == "fromFlickr_CSV"): + filelist = glob('01_Input/*.txt') + GMTTimetransform = 0 + guid_columnNameID = 5 #guid + Sourcecode = 2 + quoting_opt = csv.QUOTE_NONE + elif (DSource == "fromInstagram_PGlbsnEmoji") or (DSource == "fromLBSN") or (DSource == "fromLBSN_old"): + filelist = glob('01_Input/*.csv') + guid_columnNameID = 1 #guid + quoting_opt = csv.QUOTE_MINIMAL + elif (DSource == "fromSensorData_InfWuerz"): + filelist = glob('01_Input/*.csv') + GMTTimetransform = 0 + guid_columnNameID = 1 #guid + Sourcecode = 11 + quoting_opt = csv.QUOTE_NONE + else: + sys.exit("Source not supported yet.") + + print('\n') + print_store_log("########## STEP 1 of 6: Data Cleanup ##########") + if (len(filelist) == 0): + sys.exit(f'No *.json/csv/txt files found.') + else: + if clusterTags or clusterEmojis: + inputtext = input(f'Files to process: {len(filelist)}. \nOptional: Enter a Number for the variety of Tags to process (Default is 1000)\nPress Enter to proceed.. \n') + if inputtext == "" or not inputtext.isdigit(): + tmax = 1000 + else: + tmax = int(inputtext) + + skippedCount = 0 + appendToAlreadyExist = False + count_non_geotagged = 0 + count_outside_shape = 0 + count_tags_global = 0 + count_emojis_global = 0 + count_tags_skipped = 0 + shapeFileExcludelocIDhash = set() + shapeFileIncludedlocIDhash = set() + + def setLatLngBounds(Lat,Lng): + global limLatMin, limLatMax, limLngMin, limLngMax + if limLatMin is None or (Lat < limLatMin and not Lat == 0): + limLatMin = Lat + if limLatMax is None or (Lat > limLatMax and not Lat == 0): + limLatMax = Lat + if limLngMin is None or (Lng < limLngMin and not Lng == 0): + limLngMin = Lng + if limLngMax is None or (Lng > limLngMax and not Lng == 0): + limLngMax = Lng + def is_number(s): + try: + float(s) + return True + except ValueError: + return False + photoIDHash = set() + LocationsPerUserID_dict = defaultdict(set) + UserLocationTagList_dict = defaultdict(set) + if topicModeling: + UserTopicList_dict = defaultdict(set) + UserPhotoIDS_dict = defaultdict(set) + UserPhotoFirstThumb_dict = defaultdict(str) + UserLocationWordList_dict = defaultdict(set) + UserLocationsFirstPhoto_dict = defaultdict(set) + if clusterEmojis: + overallNumOfEmojis_global = collections.Counter() + + #UserDict_TagCounters = defaultdict(set) + UserDict_TagCounters_global = defaultdict(set) + #UserIDsPerLocation_dict = defaultdict(set) + #PhotoLocDict = defaultdict(set) + distinctLocations_set = set() + distinctUserLocations_set = set() + count_loc = 0 + now = time.time() + for file_name in filelist: + #filename = "02_Output/" + os.path.basename(file_name) + #with open(filename, 'a', encoding='utf8') as file: + # file.write("ID_Date,SOURCE,Latitude,Longitude,PhotoID,Owner,UserID,Name,URL,DateTaken,UploadDate,Views,Tags,MTags,Likes,Comments,Shortcode,Type,LocName,LocID" + '\n') + # file.write('"2000-01-01 00:00:00","TESTLINE","43.2544706","28.023467","24PHOTOID3534","testowner","812643S9812644","testcaption","https://scontent.cdninstagram.com/t/s640x640/22344798_1757535311005731_6649353052090269696_n.jpg","2000-01-01 00:00:00","2000-01-01 00:00:00","22",";blacksea;romania;ig;seaside;mareaneagra;travel;getfit;trip;travelog;sun;beachy;avenit;mytinyatlas;islandhopping;flashesofdelight;beachvibes;beautiful;waves;barbershop;sea;love;photooftheday;picoftheday;vsco;vscocam;snapshot;instatravel;instamood;ich;io;summer;photography;europa;happy;end;je;lacrusesc;contrejour;chiaroscuro;morninsunshine;treadmill;gainz;workout;sunshine;getstrong;eu;rumunsko;calatoriecupasiune;superduper;selfie;lazyday;","TESTMTAG","50","25","BaE5OZpgfRu","Image","Sunshine Boulevard Sunshine Boulevard Sunshine Bou","821648SS21642"' +'\n') + + photolist = [] # clear photolist for every file + ##f_count += 1 + ##if f_count > 25: + ## break + # guid_list.clear() #duplicate detection only for last 500k items + with open(file_name, newline='', encoding='utf8') as f: # On input, if newline is None, universal newlines mode is enabled. Lines in the input can end in '\n', '\r', or '\r\n', and these are translated into '\n' before being returned to the caller. + partcount += 1 + if (DSource == "fromInstagram_LocMedia_CSV" or DSource == "fromLBSN" or DSource == "fromLBSN_old" or DSource == "fromInstagram_UserMedia_CSV" or DSource == "fromFlickr_CSV" or DSource == "fromInstagram_PGlbsnEmoji" or DSource == "fromSensorData_InfWuerz"): + photolist = csv.reader(f, delimiter=',', quotechar='"', quoting=quoting_opt) #QUOTE_NONE is important because media saved from php/Flickr does not contain any " check; only ',' are replaced + next(photolist, None) # skip headerline + elif (DSource == "fromInstagram_HashMedia_JSON"): + photolist = photolist + json.loads(f.read()) + #PhotosPerDayLists = defaultdict(list) + #keyCreatedHash = set() + for item in photolist: + #duplicate check based on GUID + if item[guid_columnNameID] in photoIDHash: + skippedCount += 1 + continue + else: + photoIDHash.add(item[guid_columnNameID]) + if (DSource == "fromInstagram_LocMedia_CSV"): + if len(item) < 15: + #skip + skippedCount += 1 + continue + else: + photo_source = Sourcecode #LocamediaCode + photo_guid = item[0] #guid + photo_userid = item[2] + #photo_owner = item[7] ##!!! + photo_shortcode = item[5] + photo_uploadDate = item[3] # format datetime as string + photo_idDate = photo_uploadDate #use upload date as sorting ID + photo_caption = item[7] + photo_likes = item[12] + photo_tags = ";" + item[8] + ";" + photo_thumbnail = item[6] + photo_comments = item[13] + photo_mediatype = item[4] + photo_locID = item[14] + photo_locName = item[1] + #assign lat/lng coordinates from dict + if (photo_locID in loc_dict): + photo_latitude = loc_dict[photo_locID][0] + photo_longitude = loc_dict[photo_locID][1] + #setLatLngBounds(photo_latitude,photo_longitude) + if shapefileIntersect: + #skip all outside shapefile + if photo_locID in shapeFileExcludelocIDhash: + count_outside_shape += 1 + continue + if not photo_locID in shapeFileIncludedlocIDhash: + LngLatPoint = Point(photo_longitude, photo_latitude) + if not LngLatPoint.within(shp_geom): + count_outside_shape += 1 + shapeFileExcludelocIDhash.add(photo_locID) + continue + else: + shapeFileIncludedlocIDhash.add(photo_locID) + else: + if excludeWhereMissingGeocode: + skippedCount += 1 + continue #skip non-geotagged medias + else: + photo_latitude = "" + photo_longitude = "" + #assign usernames from dict + if photo_userid in user_dict: + photo_owner = user_dict[photo_userid] + elif photo_userid in netlytics_usernameid_dict: + photo_owner = netlytics_usernameid_dict[photo_userid] + else: + photo_owner = "" + #empty for instagram: + photo_mTags = "" + photo_dateTaken = "" + photo_views = "" + elif DSource == "fromInstagram_UserMedia_CSV": + if len(item) < 15: + #skip + skippedCount += 1 + continue + else: + photo_source = Sourcecode #LocMediaCode + photo_guid = item[0].split("_")[0] #guid + photo_userid = item[0].split("_")[1] #userid + photo_owner = item[1] ##!!! + photo_shortcode = item[6] + photo_uploadDate = item[4] # format datetime as string + photo_idDate = photo_uploadDate #use upload date as sorting ID + photo_caption = item[8] + photo_likes = item[13] + photo_tags = ";" + item[9] + ";" + photo_thumbnail = item[7] + photo_comments = item[14] + photo_mediatype = item[5] + photo_locID = item[2] + if photo_locID == "": + count_non_geotagged += 1 + continue #skip non-geotagged medias + photo_locName = item[3] + #assign lat/lng coordinates from dict + if (photo_locID in loc_dict): + photo_latitude = loc_dict[photo_locID][0] + photo_longitude = loc_dict[photo_locID][1] + if shapefileIntersect: + #skip all outside shapefile + if photo_locID in shapeFileExcludelocIDhash: + count_outside_shape += 1 + continue + if not photo_locID in shapeFileIncludedlocIDhash: + LngLatPoint = Point(photo_longitude, photo_latitude) + if not LngLatPoint.within(shp_geom): + count_outside_shape += 1 + shapeFileExcludelocIDhash.add(photo_locID) + continue + else: + shapeFileIncludedlocIDhash.add(photo_locID) + else: + if excludeWhereMissingGeocode: + skippedCount += 1 + continue #skip non-geotagged medias + else: + photo_latitude = "" + photo_longitude = "" + #empty for Instagram: + photo_mTags = "" + photo_dateTaken = "" + photo_views = "" + elif DSource == "fromFlickr_CSV": + if len(item) < 12: + #skip + skippedCount += 1 + continue + else: + photo_source = Sourcecode #LocMediaCode + photo_guid = item[5] #photoID + photo_userid = item[7] #userid + photo_owner = item[6] ##!!! + photo_shortcode = "" + photo_uploadDate = datetime.datetime.strptime(item[9],'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S') # format datetime as string + photo_dateTaken = datetime.datetime.strptime(item[8] ,'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S') + photo_idDate = photo_dateTaken #use date taken date as sorting ID + photo_caption = item[3] + photo_likes = "" + #Filter tags based on two stoplists + if clusterTags or topicModeling: + photo_tags = set(filter(None, item[11].lower().split(";"))) #filter empty strings from photo_tags list and convert to set (hash) with unique values + #Filter tags based on two stoplists + photo_tags, count_tags, count_skipped = Utils.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set) + count_tags_global += count_tags + count_tags_skipped += count_skipped + else: + photo_tags = set() + #if not "water" in photo_tags: + # continue + photo_thumbnail = item[4] + photo_comments = "" + photo_mediatype = "" + photo_locName = "" + if is_number(item[1]): + photo_latitude = Decimal(item[1]) + else: + skippedCount += 1 + continue + if is_number(item[2]): + photo_longitude = Decimal(item[2]) + else: + skippedCount += 1 + continue + setLatLngBounds(photo_latitude,photo_longitude) + photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng + photo_mTags = "" #not used currently but available + photo_views = item[10] + elif (DSource == "fromInstagram_HashMedia_JSON"): + photo_source = Sourcecode #HashMediaCode + if item.get('owner'): + photo_userid = item["owner"]["id"] + else: + # skip problematic entries + skippedCount += 1 + continue + if item.get('edge_liked_by'): + photo_likes = item["edge_liked_by"]["count"] + else: + photo_likes = "" + if item.get('edge_media_to_caption') and not len(item.get("edge_media_to_caption", {}).get("edges")) == 0: + photo_caption = item["edge_media_to_caption"]["edges"][0]["node"]["text"].replace('\n', ' ').replace('\r', '') + else: + photo_caption = "" + if item.get('edge_media_to_comment'): + photo_comments = item["edge_media_to_comment"]["count"] + else: + photo_comments = "" + if item.get('id'): + photo_guid = item["id"] + else: + # skip problematic entries + skippedCount += 1 + continue + if item.get('is_video'): + photo_mediatype = "video" + else: + photo_mediatype = "image" + if item.get('location'): + photo_locID = item["location"]["id"] + photo_locName = item["location"]["name"] + else: + # skip non geotagged + count_non_geotagged += 1 + continue + if item.get('shortcode'): + photo_shortcode = item["shortcode"] + else: + photo_shortcode = "" + if item.get('tags'): + photo_tags =';'.join(item["tags"]).replace('\n', ' ').replace('\r', '') + photo_tags = ";%s;" % (photo_tags.replace(",", ";")) + else: + photo_tags = "" + if item.get('taken_at_timestamp'): + pdate = datetime.datetime.fromtimestamp(int(item["taken_at_timestamp"])) + timedelta(hours = GMTTimetransform) #GMT conversion + photo_uploadDate = pdate.strftime('%Y-%m-%d %H:%M:%S') # format datetime as string + photo_idDate = photo_uploadDate + else: + # skip problematic entries + skippedCount += 1 + continue + if item.get('thumbnail_src'): + photo_thumbnail = item["thumbnail_src"] + else: + photo_thumbnail = "" + + #assign lat/lng coordinates from dict + if (photo_locID in loc_dict): + photo_latitude = loc_dict[photo_locID][0] + photo_longitude = loc_dict[photo_locID][1] + if shapefileIntersect: + #skip all outside shapefile + if photo_locID in shapeFileExcludelocIDhash: + count_outside_shape += 1 + continue + if not photo_locID in shapeFileIncludedlocIDhash: + LngLatPoint = Point(photo_longitude, photo_latitude) + if not LngLatPoint.within(shp_geom): + count_outside_shape += 1 + shapeFileExcludelocIDhash.add(photo_locID) + continue + else: + shapeFileIncludedlocIDhash.add(photo_locID) + else: + if excludeWhereMissingGeocode: + #count_non_geotagged += 1 + skippedCount += 1 + continue #skip non-geotagged medias + else: + photo_latitude = "" + photo_longitude = "" + + #assign usernames from dict + if photo_userid in user_dict: + photo_owner = user_dict[photo_userid] + elif photo_userid in netlytics_usernameid_dict: + photo_owner = netlytics_usernameid_dict[photo_userid] + else: + photo_owner = "" + #empty for instagram: + photo_mTags = "" + photo_dateTaken = "" + photo_views = "" + elif DSource == "fromInstagram_PGlbsnEmoji": + if len(item) < 15: + #skip + skippedCount += 1 + continue + else: + photo_source = item[0] + photo_guid = item[1] #guid + photo_userid = item[7] #guid + photo_owner = ""#item[1] ##!!! + photo_shortcode = item[18] + photo_uploadDate = item[8] #guid + photo_idDate = photo_uploadDate #use upload date as sorting ID + photo_caption = item[9] + photo_likes = item[13] + #photo_tags = ";" + item[11] + ";" + tags_filtered = Utils.extract_emojis(photo_caption) + if not len(tags_filtered) == 0: + count_tags_global += len(tags_filtered) + photo_tags = set(tags_filtered) + else: + photo_tags = set() + #photo_emojis = extract_emojis(photo_caption) + photo_thumbnail = item[17] + photo_comments = item[14] + photo_mediatype = item[19] + + photo_locName = item[4] #guid + if item[2] == "" or item[3] == "": + count_non_geotagged += 1 + continue #skip non-geotagged medias + else: + photo_latitude = Decimal(item[2]) #guid + photo_longitude = Decimal(item[3]) #guid + setLatLngBounds(photo_latitude,photo_longitude) + photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng + #empty for Instagram: + photo_mTags = "" + photo_dateTaken = "" + photo_views = 0 + elif DSource == "fromLBSN": + if len(item) < 15: + #skip + skippedCount += 1 + continue + else: + photo_source = item[0] #LocMediaCode + photo_guid = item[1] #guid + photo_userid = item[4] #guid + photo_owner = ""#item[1] ##!!! + photo_shortcode = None#item[18] + photo_uploadDate = item[6] #guid + photo_idDate = None#photo_uploadDate #use upload date as sorting ID + #Process Spatial Query first (if skipping necessary) + if SortOutPlaces: + if not item[19] == "": + if item[19] in SortOutPlaces_set: + skippedCount += 1 + continue + if item[2] == "" or item[3] == "": + count_non_geotagged += 1 + continue #skip non-geotagged medias + else: + if CorrectPlaces and not item[19] and item[19] in CorrectPlaceLatLng_dict: + photo_latitude = Decimal(CorrectPlaceLatLng_dict[item[19]][0]) #correct lat/lng + photo_longitude = Decimal(CorrectPlaceLatLng_dict[item[19]][1]) #correct lat/lng + else: + photo_latitude = Decimal(item[2]) #guid + photo_longitude = Decimal(item[3]) #guid + setLatLngBounds(photo_latitude,photo_longitude) + photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng + #assign lat/lng coordinates from dict + if shapefileIntersect: + #skip all outside shapefile + if photo_locID in shapeFileExcludelocIDhash: + skippedCount += 1 + continue + if not photo_locID in shapeFileIncludedlocIDhash: + LngLatPoint = Point(photo_longitude, photo_latitude) + if not LngLatPoint.within(shp_geom): + skippedCount += 1 + shapeFileExcludelocIDhash.add(photo_locID) + continue + else: + shapeFileIncludedlocIDhash.add(photo_locID) + if clusterTags or clusterEmojis or topicModeling: + photo_caption = item[14] + else: + photo_caption = "" + photo_likes = 0 + if not item[9] == "": + try: + photo_likes = int(item[9]) + except TypeError: + pass + photo_tags = set() + if clusterTags or topicModeling: + photo_tags = set(filter(None, item[11].lower().split(";"))) #[1:-1] removes curly brackets, second [1:-1] removes quotes + #Filter tags based on two stoplists + if ignoreStoplists: + count_tags = len(photo_tags) + count_skipped = 0 + else: + photo_tags,count_tags,count_skipped = Utils.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set) + count_tags_global += count_tags + count_tags_skipped += count_skipped + if clusterEmojis: + emojis_filtered = set(Utils.extract_emojis(photo_caption)) + if not len(emojis_filtered) == 0: + count_emojis_global += len(emojis_filtered) + overallNumOfEmojis_global.update(emojis_filtered) + photo_tags = set.union(emojis_filtered) + #photo_tags = ";" + item[11] + ";" + photo_thumbnail = None#item[17] + photo_comments = None#item[14] + photo_mediatype = None#item[19] + photo_locName = item[4] #guid + #empty for Instagram: + photo_mTags = "" + photo_dateTaken = "" + photo_views = 0 + if not item[8] == "": + try: + photo_views = int(item[8]) + except TypeError: + pass + elif DSource == "fromLBSN_old": + if len(item) < 15: + #skip + skippedCount += 1 + continue + else: + photo_source = item[0] #LocMediaCode + photo_guid = item[1] #guid + photo_userid = item[7] #guid + photo_owner = ""#item[1] ##!!! + photo_shortcode = None#item[18] + photo_uploadDate = item[8] #guid + photo_idDate = None#photo_uploadDate #use upload date as sorting ID + if clusterTags or clusterEmojis or topicModeling: + photo_caption = item[9] + else: + photo_caption = "" + photo_likes = None#item[13] + photo_tags = set() + if clusterTags or topicModeling: + photo_tags = set(filter(None, item[11].strip('"').lstrip('{').rstrip('}').lower().split(","))) #[1:-1] removes curly brackets, second [1:-1] removes quotes + #Filter tags based on two stoplists + photo_tags,count_tags,count_skipped = Utils.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set) + count_tags_global += count_tags + count_tags_skipped += count_skipped + if clusterEmojis: + emojis_filtered = set(Utils.extract_emojis(photo_caption)) + if not len(emojis_filtered) == 0: + count_emojis_global += len(emojis_filtered) + overallNumOfEmojis_global.update(emojis_filtered) + photo_tags = set.union(emojis_filtered) + #photo_tags = ";" + item[11] + ";" + photo_thumbnail = None#item[17] + photo_comments = None#item[14] + photo_mediatype = None#item[19] + photo_locName = item[4] #guid + if item[2] == "" or item[3] == "": + count_non_geotagged += 1 + continue #skip non-geotagged medias + else: + photo_latitude = Decimal(item[2]) #guid + photo_longitude = Decimal(item[3]) #guid + setLatLngBounds(photo_latitude,photo_longitude) + photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng + #empty for Instagram: + photo_mTags = "" + photo_dateTaken = "" + photo_views = 0 + elif DSource == "fromSensorData_InfWuerz": + if len(item) < 5: + #skip + skippedCount += 1 + continue + else: + photo_source = Sourcecode #LocMediaCode + photo_guid = item[1] #guid + photo_userid = item[4] #meta_device_id + photo_owner = ""#item[1] ##!!! + photo_shortcode = "" + photo_uploadDate = item[3] #meta_timestamp_received + photo_idDate = item[2] #meta_timestamp_recorded + photo_caption = item[8] + if not len(photo_caption) == 0: + removeSpecialChars = photo_caption.translate ({ord(c): " " for c in "?.!/;:,[]()'-&#"}) + wordlist = [word for word in removeSpecialChars.lower().split(' ') if not word == "" and len(word) > 1] + photo_tags = set(wordlist) + else: + photo_tags = set() + photo_tags_filtered = set() + for tag in photo_tags: + count_tags_global += 1 + #exclude numbers and those tags that are in SortOutAlways_set + if tag.isdigit() or tag in SortOutAlways_set: + count_tags_skipped += 1 + continue + for inStr in SortOutAlways_inStr_set: + if inStr in tag: + count_tags_skipped += 1 + break + else: + photo_tags_filtered.add(tag) + photo_tags = photo_tags_filtered + if item[6] == "" or item[7] == "": + count_non_geotagged += 1 + continue #skip non-geotagged medias + else: + photo_latitude = Decimal(item[7]) #guid + photo_longitude = Decimal(item[6]) #guid + setLatLngBounds(photo_latitude,photo_longitude) + photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng + #empty for SensorWuerz: + photo_likes = "" + photo_thumbnail = "" + photo_comments = "" + photo_mediatype = "" + photo_locName = "" + photo_mTags = "" + photo_dateTaken = "" + photo_views = 0 + #this code will union all tags of a single user for each location + #further information is derived from the first image for each user-location + photo_locIDUserID = photo_locID + '::' + str(photo_userid) #create userid_loc_id + distinctLocations_set.add(photo_locID) + #print(f'Added: {photo_locID} to distinctLocations_set (len: {len(distinctLocations_set)})') + distinctUserLocations_set.add(photo_locIDUserID) + #print(f'Added: {photo_locIDUserID} to distinctUserLocations_set (len: {len(distinctUserLocations_set)})') + if not photo_userid in LocationsPerUserID_dict or not photo_locID in LocationsPerUserID_dict[photo_userid]: + LocationsPerUserID_dict[photo_userid] |= {photo_locID} # Bit wise or and assignment in one step. -> assign locID to UserDict list if not already contained + count_loc += 1 + UserLocationsFirstPhoto_dict[photo_locIDUserID] = (photo_source, + photo_guid, + photo_owner, + photo_userid, + photo_caption, + photo_thumbnail, + photo_dateTaken, + photo_uploadDate, + photo_views, + photo_tags, + photo_mTags, + photo_likes, + photo_comments, + photo_shortcode, + photo_mediatype, + photo_locName, + photo_locID) + UserLocationTagList_dict[photo_locIDUserID] |= photo_tags #union tags per userid/unique location + removeSpecialChars = photo_caption.translate ({ord(c): " " for c in "?.!/;:,[]()'-&#"}) + if tokenizeJapanese: + wordlist = [word for word in jTokenize(input_sentence) for input_sentence in removeSpecialChars.split(' ')] + else: + wordlist = [word for word in removeSpecialChars.lower().split(' ') if len(word) > 2] #first replace specia characters in caption, then split by space-character + UserLocationWordList_dict[photo_locIDUserID] |= set(wordlist) #union words per userid/unique location + count_glob += 1 + + ##Calculate toplists + if photo_tags: + UserDict_TagCounters_global[photo_userid].update(photo_tags) #add tagcount of this media to overall tagcount or this user, initialize counter for user if not already done + msg = f'Cleaned output to {len(distinctLocations_set):02d} distinct locations from {count_glob:02d} photos (File {partcount} of {len(filelist)}) - Skipped Media: {skippedCount} - Skipped Tags: {count_tags_skipped} of {count_tags_global}' + print(msg, end='\r') + #Append last message to log file + log_texts_list.append(msg) + total_distinct_locations = len(distinctLocations_set) + print_store_log(f'\nTotal users: {len(LocationsPerUserID_dict)} (UC)') + print_store_log(f'Total photos: {count_glob:02d} (PC)') + print_store_log(f'Total tags (PTC): {count_tags_global}') + print_store_log(f'Total emojis (PEC): {count_emojis_global}') + print_store_log(f'Total user photo locations (UPL): {len(distinctUserLocations_set)}') + + #boundary: + print_store_log(f'Bounds are: Min {float(limLngMin)} {float(limLatMin)} Max {float(limLngMax)} {float(limLatMax)}') + + #create structure for tuple with naming for easy referencing + cleanedPhotoLocation_tuple = namedtuple('cleanedPhotoLocation_tuple', 'source lat lng photo_guid photo_owner userid photo_caption photo_dateTaken photo_uploadDate photo_views photo_tags photo_thumbnail photo_mTags photo_likes photo_comments photo_shortcode photo_mediatype photo_locName photo_locID') + cleanedPhotoDict = defaultdict(cleanedPhotoLocation_tuple) + with open("02_Output/Output_cleaned.txt", 'w', encoding='utf8') as csvfile: + csvfile.write("SOURCE,Latitude,Longitude,PhotoID,Owner,UserID,Name,DateTaken,UploadDate,Views,Tags,URL,MTags,Likes,Comments,Shortcode,Type,LocName,LocID," + '\n') + datawriter = csv.writer(csvfile, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) + for user_key, locationhash in LocationsPerUserID_dict.items(): + for location in locationhash: + locIDUserID = str(location) + '::' + str(user_key) + photo_latlng = location.split(':') + photo = UserLocationsFirstPhoto_dict.get(locIDUserID,(" "," "," "," "," "," "," "," "," "," "," "," "," "," "," "," "," ")) + #create tuple with cleaned photo data + cleanedPhotoLocation = cleanedPhotoLocation_tuple(photo[0],#Source = 0 + float(photo_latlng[0]), #Lat = 1 + float(photo_latlng[1]), #Lng = 2 + photo[1],#photo_guid = 3 + photo[2],#photo_owner = 4 + user_key, #userid = 5 + UserLocationWordList_dict.get(locIDUserID,("",)),#photo_caption = 6 + photo[6],#photo_dateTaken = 7 + photo[7],#photo_uploadDate = 8 + photo[8],#photo_views = 9 + UserLocationTagList_dict.get(locIDUserID,("",)),#photo_tags = 10 + photo[5],#photo_thumbnail = 11 + photo[10],#photo_mTags = 12 + photo[11],#photo_likes = 13 + photo[12],#photo_comments = 14 + photo[13],#photo_shortcode = 15 + photo[14],#photo_mediatype = 16 + photo[15],#photo_locName = 17 + photo[16]#photo_locID = 18 + ) + if writeCleanedData: + ###optional Write Cleaned Data to CSV/TXT + datawriter.writerow([cleanedPhotoLocation.source,#Source = 0 + cleanedPhotoLocation.lat, #Lat = 1 + cleanedPhotoLocation.lng, #Lng = 2 + cleanedPhotoLocation.photo_guid,#photo_guid = 3 + cleanedPhotoLocation.photo_owner,#photo_owner = 4 + cleanedPhotoLocation.userid, #userid = 5 + ";".join(cleanedPhotoLocation.photo_caption),#photo_caption = 6 + cleanedPhotoLocation.photo_dateTaken,#photo_dateTaken = 7 + cleanedPhotoLocation.photo_uploadDate,#photo_uploadDate = 8 + cleanedPhotoLocation.photo_views,#photo_views = 9 + ";".join(cleanedPhotoLocation.photo_tags),#photo_tags = 10 + cleanedPhotoLocation.photo_thumbnail,#photo_thumbnail = 11 + cleanedPhotoLocation.photo_mTags,#photo_mTags = 12 + cleanedPhotoLocation.photo_likes,#photo_likes = 13 + cleanedPhotoLocation.photo_comments,#photo_comments = 14 + cleanedPhotoLocation.photo_shortcode,#photo_shortcode = 15 + cleanedPhotoLocation.photo_mediatype,#photo_mediatype = 16 + cleanedPhotoLocation.photo_locName,#photo_locName = 17 + cleanedPhotoLocation.photo_locID]#photo_locID = 18 + ) + ##optional Write Cleaned Search Terms to CSV for Topic Modeling + #topics = cleanedPhotoLocation.photo_caption.union(cleanedPhotoLocation.photo_tags) + if topicModeling: + if not len(cleanedPhotoLocation.photo_tags) == 0: + UserTopicList_dict[user_key] |= cleanedPhotoLocation.photo_tags + UserTopicList_dict[user_key] |= cleanedPhotoLocation.photo_caption #also use descriptions for Topic Modeling + UserPhotoIDS_dict[user_key] |= {cleanedPhotoLocation.photo_guid} # Bit wise or and assignment in one step. -> assign PhotoGuid to UserDict list if not already contained + #UserPhotoFirstThumb_dict[user_key] = photo[5] + cleanedPhotoDict[cleanedPhotoLocation.photo_guid] = cleanedPhotoLocation + if topicModeling: + #export list of cleaned topics on a per user basis for LDA/TSNE etc. + with open("02_Output/Output_usertopics_anonymized.csv", 'w', encoding='utf8') as csvfile: + csvfile.write("TOPICS,PhotoIDs,UserID" + '\n') + datawriter = csv.writer(csvfile, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) + for user_key, topics in UserTopicList_dict.items(): + datawriter.writerow([" ".join(topics),"{" + ",".join([hashlib.sha3_256(photoid.encode("utf8")).hexdigest() for photoid in UserPhotoIDS_dict.get(user_key,None)]) + "}",hashlib.sha3_256(user_key.encode("utf8")).hexdigest()]) + with open("02_Output/Output_usertopics.csv", 'w', encoding='utf8') as csvfile: + csvfile.write("TOPICS,PhotoIDs,UserID" + '\n') + datawriter = csv.writer(csvfile, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) + for user_key, topics in UserTopicList_dict.items(): + datawriter.writerow([" ".join(topics),"{" + ",".join(UserPhotoIDS_dict.get(user_key,None)) + "}",str(user_key)]) + + if (clusterTags or clusterEmojis): + print_store_log("########## STEP 2 of 6: Tag Ranking ##########") + overallNumOfUsersPerTag_global = collections.Counter() + for user_key, taghash in UserDict_TagCounters_global.items(): + #taghash contains unique values (= strings) for each user, thus summing up these taghashes counts each user only once per tag + overallNumOfUsersPerTag_global.update(taghash) + + global topTagsList + topTagsList = overallNumOfUsersPerTag_global.most_common(tmax) + #remove all tags that are used by less than two photographers + if removeLongTail is True: + indexMin = next((i for i, (t1, t2) in enumerate(topTagsList) if t2 < 2), None) + if indexMin: + lenBefore = len(topTagsList) + del topTagsList[indexMin:] + lenAfter = len(topTagsList) + tmax = lenAfter + if not lenBefore == lenAfter: + print_store_log(f'Filtered {lenBefore - lenAfter} Tags that were used by less than 2 users.') + #optional write topemojis to file + globalEmojiSet = {} + if clusterEmojis: + topEmojisList = overallNumOfEmojis_global.most_common() + globalEmojiSet = {tuple[0] for tuple in topEmojisList} + if (not len(globalEmojiSet) == 0): + topemojis = ''.join("%s,%i" % v + '\n' for v in topEmojisList) + with open("02_Output/Output_topemojis.txt", 'w', encoding="utf8") as file: #overwrite + file.write("emoji,usercount\n") + file.write(topemojis) + + if clusterTags: + #optional write toptags to file + toptags = ''.join("%s,%i" % v + '\n' for v in topTagsList if not v[0] in globalEmojiSet) + if (not len(topTagsList) == 0): + with open("02_Output/Output_toptags.txt", 'w', encoding="utf8") as file: #overwrite + file.write("tag,usercount\n") + file.write(toptags) + + if statisticsOnly == False: + singleMostUsedtag = topTagsList[0] + now = time.time() + print_store_log("########## STEP 3 of 6: Tag Location Clustering ##########") + #prepare some variables + label_size = 10 + #plt.rcParams['xtick.labelsize'] = label_size + #plt.rcParams['ytick.labelsize'] = label_size + plot_kwds = {'alpha' : 0.5, 's' : 10, 'linewidths':0} + sys.stdout.flush() + + distY = 0 + distX = 0 + global limYMin, limYMax, limXMin, limXMax, imgRatio, floaterX, floaterY + + #Optional: set global plotting bounds + #plt.gca().set_xlim([limXMin, limXMax]) + #plt.gca().set_ylim([limYMin, limYMax]) + cleanedPhotoList = list(cleanedPhotoDict.values()) + df = pd.DataFrame(cleanedPhotoList) + points = df.as_matrix(['lng','lat']) + limYMin,limYMax,limXMin,limXMax = Utils.getRectangleBounds(points) + bound_points_shapely = geometry.MultiPoint([(limXMin, limYMin), (limXMax, limYMax)]) + distYLat = limYMax - limYMin + distXLng = limXMax - limXMin + #distYLat = Utils.haversine(limXMin,limYMax,limXMin,limYMin) + #distXLng = Utils.haversine(limXMax,limYMin,limXMin,limYMin) + #imgRatio = distXLng/(distYLat*2) + + imgRatio = distXLng/(distYLat*2) + distY = Utils.haversine(limXMin, limYMin, limXMin, limYMax) + distX = Utils.haversine(limXMin, limYMin, limXMax, limYMin) + global clusterTreeCuttingDist + clusterTreeCuttingDist = (min(distX,distY)/100)*7 #4% of research area width/height (max) = default value #223.245922725 #= 0.000035 radians dist + + ######### TKinter Preparation ########### + class App(tk.Tk): + def __init__(self): + tk.Tk.__init__(self) + self.overrideredirect(True) #this is needed to make the root disappear + self.geometry('%dx%d+%d+%d' % (0, 0, 0, 0)) + #create separate floating window + self.floater = FloatingWindow(self) + + class FloatingWindow(tk.Toplevel): + #global app + def __init__(self, *args, **kwargs): + tk.Toplevel.__init__(self, *args, **kwargs) + self.overrideredirect(True) + self.configure(background='gray7') + #self.label = tk.Label(self, text="Click on the grip to move") + self.grip = tk.Label(self, bitmap="gray25") + self.grip.pack(side="left", fill="y") + #self.label.pack(side="right", fill="both", expand=True) + self.grip.bind("", self.StartMove) + self.grip.bind("", self.StopMove) + self.grip.bind("", self.OnMotion) + #center Floating Window + w = self.winfo_reqwidth() + h = self.winfo_reqheight() + ws = self.winfo_screenwidth() + hs = self.winfo_screenheight() + x = (ws/2) - (w/2) + y = (hs/2) - (h/2) + self.geometry('+%d+%d' % (x, y)) + def StartMove(self, event): + self.x = event.x + self.y = event.y + + def StopMove(self, event): + self.x = None + self.y = None + + def OnMotion(self, event): + deltax = event.x - self.x + deltay = event.y - self.y + x = self.winfo_x() + deltax + y = self.winfo_y() + deltay + self.geometry("+%s+%s" % (x, y)) + + #Initialize TKinter Interface + app = App() + #necessary override for error reporting during tkinter mode: + import traceback + def report_callback_exception(self, exc, val, tb): + showerror("Error", message=str(val)) + exc_type, exc_value, exc_traceback = sys.exc_info() + print("*** print_tb:") + traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) + print("*** print_exception:") + traceback.print_exception(exc_type, exc_value, exc_traceback, + limit=2, file=sys.stdout) + print("*** print_exc:") + traceback.print_exc() + print("*** format_exc, first and last line:") + formatted_lines = traceback.format_exc().splitlines() + print(formatted_lines[0]) + print(formatted_lines[-1]) + print("*** format_exception:") + print(repr(traceback.format_exception(exc_type, exc_value, + exc_traceback))) + print("*** extract_tb:") + print(repr(traceback.extract_tb(exc_traceback))) + print("*** format_tb:") + print(repr(traceback.format_tb(exc_traceback))) + print("*** tb_lineno:", exc_traceback.tb_lineno) + tk.Tk.report_callback_exception = report_callback_exception + + #the following code part is a bit garbled due to using TKinter interface + ###################################################################################################################################################### + ###################################################################################################################################################### + ###################################################################################################################################################### + + #definition of global vars for interface and graph design + canvasWidth = 1320 + canvasHeight = 440 + #Cluster preparation + sns.set_context('poster') + sns.set_style('white') + #sns.set_color_codes() + #matplotlib.style.use('ggplot') + plt.style.use('ggplot') + graphFrame = None + + genPreviewMap = tk.IntVar(value = 0) + createMinimumSpanningTree = False + autoselectClusters = False + + + def quitTkinter(): + #exits Tkinter gui and continues with code execution after mainloop() + #global app + #app.floater.destroy() + #tkinter.messagebox.showinfo("Closing App", "Closing App") + #plt.quit() + global abort + abort = True + app.update() #see https://stackoverflow.com/questions/35040168/python-tkinter-error-cant-evoke-event-command + app.destroy() + app.quit() ##root.quit() causes mainloop to exit, see https://stackoverflow.com/questions/2307464/what-is-the-difference-between-root-destroy-and-root-quit + def proceedWithCluster(): + global proceedClusting + global fig1 + proceedClusting = True + #def vis_tag(tag): + #tkinter.messagebox.showinfo("Proceed", "Proceed") + #if plt.figure(1): + # plt.figure(1).clf() + app.update() #see https://stackoverflow.com/questions/35040168/python-tkinter-error-cant-evoke-event-command + app.destroy() + app.quit() + + def sel_photos(tag,cleanedPhotoList): + #select photos from list based on a specific tag + distinctLocalLocationCount = set() + selectedPhotoList_Guids = [] + for cleanedPhotoLocation in cleanedPhotoList: + if tag in (cleanedPhotoLocation.photo_tags) or (tag in cleanedPhotoLocation.photo_caption): + selectedPhotoList_Guids.append(cleanedPhotoLocation.photo_guid) + distinctLocalLocationCount.add(cleanedPhotoLocation.photo_locID) + return selectedPhotoList_Guids, len(distinctLocalLocationCount) + + def cluster_tag(toptag=None,preview=None,silent=None): + if preview is None: + preview = False + if silent is None: + silent = False + global currentDisplayTag + global limYMin, limYMax, limXMin, limXMax, imgRatio, floaterX, floaterY + global fig1, fig2, fig3, fig4 + selectedPhotoList_Guids, distinctLocalLocationCount = sel_photos(toptag[0],cleanedPhotoList) + percentageOfTotalLocations = distinctLocalLocationCount/(total_distinct_locations/100) + if silent: + if toptag[0] in globalEmojiSet: + text = unicode_name(toptag[0]) + else: + text = toptag[0] + print_store_log(f'({tnum} of {tmax}) Found {len(selectedPhotoList_Guids)} photos (UPL) for tag \'{text}\' ({percentageOfTotalLocations:.0f}% of total distinct locations in area)', end=" ") + + + #clustering + if len(selectedPhotoList_Guids) < 2: + return [], selectedPhotoList_Guids + selectedPhotoList = [cleanedPhotoDict[x] for x in selectedPhotoList_Guids] + #only used for tag clustering, otherwise (photo location clusters), global vars are used (df, points) + df = pd.DataFrame(selectedPhotoList) + points = df.as_matrix(['lng','lat']) #converts pandas data to numpy array (limit by list of column-names) + + #only return preview fig without clustering + if preview == True: + #only map data + if genPreviewMap.get() == 1: + if fig1: + plt.figure(1).clf() #clear figure 1 + #earth = Basemap() + #earth.bluemarble(alpha=0.42) + #earth.drawcoastlines(color='#555566', linewidth=1) + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + #reuse window of figure 1 for new figure + plt.scatter(points.T[0], points.T[1], color='red', **plot_kwds) + fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) + fig1.canvas.set_window_title('Preview Map') + #displayImgPath = pathname + '/Output/ClusterImg/00_displayImg.png' + #fig1.figure.savefig(displayImgPath) + else: + + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + plt.scatter(points.T[0], points.T[1], color='red', **plot_kwds) + fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) + #earth = Basemap() + #earth.bluemarble(alpha=0.42) + #earth.drawcoastlines(color='#555566', linewidth=1) + fig1.canvas.set_window_title('Preview Map') + plt.gca().set_xlim([limXMin, limXMax]) + plt.gca().set_ylim([limYMin, limYMax]) + plt.tick_params(labelsize=10) + currentDisplayTag = toptag + else: + #cluster data + tagRadiansData = np.radians(points) #conversion to radians for HDBSCAN (does not support decimal degrees) + #for each tag in overallNumOfUsersPerTag_global.most_common(1000) (descending), calculate HDBSCAN Clusters + + minClusterSize = max(2,int(((len(selectedPhotoList_Guids))/100)*5)) #4% optimum + clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize,gen_min_span_tree=createMinimumSpanningTree,allow_single_cluster=True,min_samples=1) + #clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize,gen_min_span_tree=True,min_samples=1) + #clusterer = hdbscan.HDBSCAN(min_cluster_size=10,metric='haversine',gen_min_span_tree=False,allow_single_cluster=True) + #clusterer = hdbscan.robust_single_linkage_.RobustSingleLinkage(cut=0.000035) + #srsl_plt = hdbscan.robust_single_linkage_.plot() + + #Start clusterer on different thread to prevent GUI from freezing + #See: http://stupidpythonideas.blogspot.de/2013/10/why-your-gui-app-freezes.html + #https://stackoverflow.com/questions/6893968/how-to-get-the-return-value-from-a-thread-in-python + #if silent: + # #on silent command line operation, don't use multiprocessing + # clusterer = fit_cluster(clusterer,tagRadiansData) + #else: + with warnings.catch_warnings(): + #disable joblist multithread warning + warnings.simplefilter('ignore', UserWarning) + async_result = pool.apply_async(Utils.fit_cluster, (clusterer, tagRadiansData)) + clusterer = async_result.get() + #clusterer.fit(tagRadiansData) + #updateNeeded = False + + if autoselectClusters: + sel_labels = clusterer.labels_ #auto selected clusters + else: + sel_labels = clusterer.single_linkage_tree_.get_clusters(Utils.getRadiansFromMeters(clusterTreeCuttingDist), min_cluster_size=2) #0.000035 without haversine: 223 m (or 95 m for 0.000015) + + #exit function in case final processing loop (no figure generating) + if silent: + return sel_labels, selectedPhotoList_Guids + mask_noisy = (sel_labels == -1) + number_of_clusters = len(np.unique(sel_labels[~mask_noisy])) #len(sel_labels) + #palette = sns.color_palette("hls", ) + #palette = sns.color_palette(None, len(sel_labels)) #sns.color_palette("hls", ) #sns.color_palette(None, 100) + palette = sns.color_palette("husl", number_of_clusters+1) + sel_colors = [palette[x] if x >= 0 + else (0.5, 0.5, 0.5) + #for x in clusterer.labels_ ] + for x in sel_labels] #clusterer.labels_ (best selection) or sel_labels (cut distance) + #tkinter.messagebox.showinfo("Num of clusters: ", str(len(sel_colors)) + " " + str(sel_colors[1])) + #output/update matplotlib figures + + if fig1: + plt.figure(1).clf() + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') #plt references the last figure accessed + ax = plt.scatter(points.T[0], points.T[1], color=sel_colors, **plot_kwds) + fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) + fig1.canvas.set_window_title('Cluster Preview') + distText = '' + if autoselectClusters == False: + distText = '@ ' + str(clusterTreeCuttingDist) +'m' + plt.title(f'Cluster Preview {distText}', fontsize=12,loc='center') + #plt.title('Cluster Preview') + #xmax = ax.get_xlim()[1] + #ymax = ax.get_ylim()[1] + noisyTxt = '{}/{}'.format(mask_noisy.sum(), len(mask_noisy)) + plt.text(limXMax, limYMax,f'{number_of_clusters} Cluster (Noise: {noisyTxt})',fontsize=10,horizontalalignment='right',verticalalignment='top',fontweight='bold') + else: + plt.scatter(points.T[0], points.T[1], c=sel_colors, **plot_kwds) + fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) + fig1.canvas.set_window_title('Cluster Preview') + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + distText = '' + if autoselectClusters == False: + distText = '@ ' + str(clusterTreeCuttingDist) +'m' + plt.title(f'Cluster Preview {distText}', fontsize=12,loc='center') + #xmax = fig1.get_xlim()[1] + #ymax = fig1.get_ylim()[1] + noisyTxt = '{} / {}'.format(mask_noisy.sum(), len(mask_noisy)) + plt.text(limXMax, limYMax,f'{number_of_clusters} Clusters (Noise: {noisyTxt})',fontsize=10,horizontalalignment='right',verticalalignment='top',fontweight='bold') + plt.gca().set_xlim([limXMin, limXMax]) + plt.gca().set_ylim([limYMin, limYMax]) + plt.tick_params(labelsize=10) + #if len(tagRadiansData) < 10000: + if fig2: + plt.figure(2).clf() + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + #plt.title('Condensed Tree', fontsize=12,loc='center') + clusterer.condensed_tree_.plot(select_clusters=False, selection_palette=sel_colors)#,label_clusters=False + #tkinter.messagebox.showinfo("Num of clusters: ", str(len(sel_colors)) + " " + str(sel_colors[0])) + else: + plt.figure(2).canvas.set_window_title('Condensed Tree') + fig2 = clusterer.condensed_tree_.plot(select_clusters=False, selection_palette=sel_colors)#,label_clusters=False + #tkinter.messagebox.showinfo("Num of clusters: ", str(len(sel_colors)) + " " + str(sel_colors[1])) + #fig2 = clusterer.condensed_tree_.plot(select_clusters=False, selection_palette=sel_colors,label_clusters=True) + #plt.title('Condensed Tree', fontsize=12,loc='center') + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + plt.tick_params(labelsize=10) + if fig3: + plt.figure(3).clf() + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + plt.title('Single Linkage Tree', fontsize=12,loc='center') + #clusterer.single_linkage_tree_.plot(truncate_mode='lastp',p=50) + ax = clusterer.single_linkage_tree_.plot(truncate_mode='lastp',p=max(50,min(number_of_clusters*10,256))) #p is the number of max count of leafs in the tree, this should at least be the number of clusters*10, not lower than 50 [but max 500 to not crash] + #tkinter.messagebox.showinfo("messagr", str(type(ax))) + #plot cutting distance + y = Utils.getRadiansFromMeters(clusterTreeCuttingDist) + xmin = ax.get_xlim()[0] + xmax = ax.get_xlim()[1] + line, = ax.plot([xmin, xmax], [y, y], color='k', label=f'Cluster (Cut) Distance {clusterTreeCuttingDist}m') + line.set_label(f'Cluster (Cut) Distance {clusterTreeCuttingDist}m') + ax.legend(fontsize=10) + vals = ax.get_yticks() + ax.set_yticklabels(['{:3.1f}m'.format(Utils.getMetersFromRadians(x)) for x in vals]) + else: + plt.figure(3).canvas.set_window_title('Single Linkage Tree') + fig3 = clusterer.single_linkage_tree_.plot(truncate_mode='lastp',p=max(50,min(number_of_clusters*10,256))) + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + plt.title('Single Linkage Tree', fontsize=12,loc='center') + #tkinter.messagebox.showinfo("messagr", str(type(fig3))) + #plot cutting distance + y = Utils.getRadiansFromMeters(clusterTreeCuttingDist) + xmin = fig3.get_xlim()[0] + xmax = fig3.get_xlim()[1] + line, = fig3.plot([xmin, xmax], [y, y], color='k', label=f'Cluster (Cut) Distance {clusterTreeCuttingDist}m') + line.set_label(f'Cluster (Cut) Distance {clusterTreeCuttingDist}m') + fig3.legend(fontsize=10) + vals = fig3.get_yticks() + fig3.set_yticklabels([f'{Utils.getMetersFromRadians(x):3.1f}m' for x in vals]) + plt.tick_params(labelsize=10) + if createMinimumSpanningTree: + if fig4: + plt.figure(4).clf() + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + #plt.title('Single Linkage Tree', fontsize=12,loc='center') + #clusterer.single_linkage_tree_.plot(truncate_mode='lastp',p=50) + ax = clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis', + edge_alpha=0.6, + node_size=10, + edge_linewidth=1) #tkinter.messagebox.showinfo("messagr", str(type(ax))) + fig4.canvas.set_window_title('Minimum Spanning Tree') + plt.title(f'Minimum Spanning Tree @ {clusterTreeCuttingDist}m', fontsize=12,loc='center') + ax.legend(fontsize=10) + #ax=plt.gca() #plt.gca() for current axis, otherwise set appropriately. + #im=ax.images #this is a list of all images that have been plotted + #cb=im[0].colorbar ##in this case I assume to be interested to the last one plotted, otherwise use the appropriate index + #cb.ax.tick_params(labelsize=10) + #vals = cb.ax.get_yticks() + #cb.ax.set_yticklabels(['{:3.1f}m'.format(getMetersFromRadians(x)) for x in vals]) + else: + plt.figure(4).canvas.set_window_title('Minimum Spanning Tree') + fig4 = clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis', + edge_alpha=0.6, + node_size=10, + edge_linewidth=1) #tkinter.messagebox.showinfo("messagr", str(type(ax))) + plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') + plt.title(f'Minimum Spanning Tree @ {clusterTreeCuttingDist}m', fontsize=12,loc='center') + fig4.legend(fontsize=10) + #ax=plt.gca() #plt.gca() for current axis, otherwise set appropriately. + #im=ax.images #this is a list of all images that have been plotted + #cb=im[0].colorbar ##in this case I assume to be interested to the last one plotted, otherwise use the appropriate index + #cb.ax.tick_params(labelsize=10) + #vals = cb.ax.get_yticks() + #cb.ax.set_yticklabels(['{:3.1f}m'.format(getMetersFromRadians(x)) for x in vals]) + plt.tick_params(labelsize=10) + #adjust scalebar limits + global tkScalebar + tkScalebar.configure(from_=(clusterTreeCuttingDist/100), to=(clusterTreeCuttingDist*2)) + + def change_clusterDist(val): + #tkinter.messagebox.showinfo("messagr", val) + global clusterTreeCuttingDist + #global canvas + #global tkScalebar + clusterTreeCuttingDist = float(val)#tkScalebar.get() + + def onselect(evt): + # Note here that Tkinter passes an event object to onselect() + global lastselectionList + global tnum + w = evt.widget + if lastselectionList: #if not empty + changedSelection = set(lastselectionList).symmetric_difference(set(w.curselection())) + lastselectionList = w.curselection() + else: + lastselectionList = w.curselection() + changedSelection = w.curselection() + index = int(list(changedSelection)[0]) + value = w.get(index) + #tkinter.messagebox.showinfo("You selected ", value) + tnum = 1 + cluster_tag(topTagsList[index],True) #generate only preview map + #plt.close('all') + def cluster_currentDisplayTag(): + if currentDisplayTag: + #tkinter.messagebox.showinfo("Clustertag: ", str(currentDisplayTag)) + cluster_tag(currentDisplayTag) + else: + cluster_tag(topTagsList[0]) + def scaletest_currentDisplayTag(): + global tkScalebar + global fig1 + if currentDisplayTag: + clustertag = currentDisplayTag + else: + clustertag = topTagsList[0] + scalecalclist = [] + dmax = int(clusterTreeCuttingDist*10) + dmin = int(clusterTreeCuttingDist/10) + dstep = int(((clusterTreeCuttingDist*10)-(clusterTreeCuttingDist/10))/100) + for i in range(dmin,dmax,dstep): + change_clusterDist(i) + tkScalebar.set(i) + app.update() + #clusterTreeCuttingDist = i + clusters, selectedPhotoList_Guids = cluster_tag(clustertag, None, True) + mask_noisy = (clusters == -1) + number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0) + if number_of_clusters == 1: + break + form_string = f'{i},{number_of_clusters},{mask_noisy.sum()},{len(mask_noisy)},\n' + scalecalclist.append(form_string) + with open(f'02_Output/scaletest_{clustertag[0]}.txt', "w", encoding='utf-8') as logfile_a: + for scalecalc in scalecalclist: + logfile_a.write(scalecalc) + plt.figure(1).clf() + plt.suptitle(clustertag[0].upper(), fontsize=18, fontweight='bold') #plt references the last figure accessed + fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) + fig1.canvas.set_window_title('Cluster Preview') + distText = '' + if autoselectClusters == False: + distText = f'@ {clusterTreeCuttingDist}m' + plt.title(f'Cluster Preview {distText}', fontsize=12,loc='center') + noisyTxt = f'{mask_noisy.sum()}/{len(mask_noisy)}' + plt.text(limXMax, limYMax,f'{number_of_clusters} Cluster (Noise: {noisyTxt})',fontsize=10,horizontalalignment='right',verticalalignment='top',fontweight='bold') + + def delete(listbox): + global topTagsList + global lastselectionList + lastselectionList = [] + # Delete from Listbox + selection = listbox.curselection() + for index in selection[::-1]: + listbox.delete(index) + del(topTagsList[index]) + ###################################################################################################################################################### + ###################################################################################################################################################### + ###################################################################################################################################################### + + #A frame is created for each window/part of the gui; after it is used, it is destroyed with frame.destroy() + + listboxFrame = tk.Frame(app.floater) + canvas = tk.Canvas(listboxFrame, width=150, height=200, highlightthickness=0,background="gray7") + l = tk.Label(canvas, text="Optional: Exclude tags.", background="gray7",fg="gray80",font="Arial 10 bold") + l.pack(padx=10, pady=10) + l = tk.Label(canvas, text="Select all tags you wish to exclude from analysis \n and click on remove to proceed.", background="gray7",fg="gray80") + l.pack(padx=10, pady=10) + #if DSource == "fromInstagram_PGlbsnEmoji": + # listbox_font = ("twitter Color Emoji", 12, "bold") + # #listbox_font = ("Symbola", 12, "bold") + #else: + listbox_font = None + listbox = tk.Listbox(canvas,selectmode=tk.MULTIPLE, bd=0,background="gray29",fg="gray91",width=30,font=listbox_font) + listbox.bind('<>', onselect) + scroll = tk.Scrollbar(canvas, orient=tk.VERTICAL,background="gray20",borderwidth=0) + scroll.configure(command=listbox.yview) + scroll.pack(side="right", fill="y") + listbox.pack() + listbox.config(yscrollcommand=scroll.set) + for item in topTagsList: #only for first 500 entries: use topTagsList[:500] + try: + listbox.insert(tk.END, f'{item[0]} ({item[1]} user)') + except tk.TclError: + emoji = unicode_name(item[0]) #Utils.with_surrogates() + listbox.insert(tk.END, f'{emoji} ({item[1]} user)') + canvas.pack(fill='both',padx=0, pady=0) + listboxFrame.pack(fill='both',padx=0, pady=0) + buttonsFrame = tk.Frame(app.floater) + canvas = tk.Canvas(buttonsFrame, width=150, height=200, highlightthickness=0,background="gray7") + b = tk.Button(canvas, text = "Remove Tag(s)", command = lambda: delete(listbox), background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") + b.pack(padx=10, pady=10) + c = tk.Checkbutton(canvas, text="Map Tags", variable=genPreviewMap,background="gray7",fg="gray80",borderwidth=0,font="Arial 10 bold") + c.pack(padx=10, pady=10) + global tkScalebar + tkScalebar = tk.Scale(canvas, from_=(clusterTreeCuttingDist/100), to=(clusterTreeCuttingDist*2), orient=tk.HORIZONTAL,resolution=0.1,command=change_clusterDist,length=300,label="Cluster Cut Distance (in Meters)",background="gray20",borderwidth=0,fg="gray80",font="Arial 10 bold") + tkScalebar.set(clusterTreeCuttingDist)#(clusterTreeCuttingDist*10) - (clusterTreeCuttingDist/10)/2) #set position of slider to center + tkScalebar.pack() + b = tk.Button(canvas, text = "Cluster Preview", command=cluster_currentDisplayTag, background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") + b.pack(padx=10, pady=10,side="left") + b = tk.Button(canvas, text = "Scale Test", command=scaletest_currentDisplayTag, background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") + b.pack(padx=10, pady=10,side="left") + b = tk.Button(canvas, text = "Proceed..", command=proceedWithCluster, background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") + b.pack(padx=10, pady=10,side="left") + b = tk.Button(canvas, text = "Quit", command=quitTkinter, background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") + b.pack(padx=10, pady=10,side="left") + canvas.pack(fill='both',padx=0, pady=0) + buttonsFrame.pack(fill='both',padx=0, pady=0) + + #END OF TKINTER LOOP, welcome back to command line interface + app.mainloop() + plt.close("all") + + noClusterPhotos_perTag_DictOfLists = defaultdict(list) + clustersPerTag = defaultdict(list) + if proceedClusting: + #Proceed with clustering all tags + crs_wgs = pyproj.Proj(init='epsg:4326') #data always in lat/lng WGS1984 + if overrideCRS is None: + #Calculate best UTM Zone SRID/EPSG Code + input_lon_center = bound_points_shapely.centroid.coords[0][0] #True centroid (coords may be multipoint) + input_lat_center = bound_points_shapely.centroid.coords[0][1] + epsg_code = Utils.convert_wgs_to_utm(input_lon_center, input_lat_center) + crs_proj = pyproj.Proj(init=f'epsg:{epsg_code}') + project = lambda x, y: pyproj.transform(pyproj.Proj(init='epsg:4326'), pyproj.Proj(init=f'epsg:{epsg_code}'), x, y) + #geom_proj = transform(project, alphaShapeAndMeta[0]) + + if localSaturationCheck: + clusters, selectedPhotoList_Guids = cluster_tag(singleMostUsedtag, None, True) + numpy_selectedPhotoList_Guids = np.asarray(selectedPhotoList_Guids) + mask_noisy = (clusters == -1) + number_of_clusters = len(np.unique(clusters[~mask_noisy])) + print_store_log(f'--> {number_of_clusters} cluster.') + clusterPhotosGuidsList = [] + for x in range(number_of_clusters): + currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x] + clusterPhotosGuidsList.append(currentClusterPhotoGuids) + noClusterPhotos_perTag_DictOfLists[singleMostUsedtag[0]] = list(numpy_selectedPhotoList_Guids[clusters==-1]) + # Sort descending based on size of cluster: https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists + clusterPhotosGuidsList.sort(key=len, reverse=True) + if not len(clusterPhotosGuidsList) == 0: + clustersPerTag[singleMostUsedtag[0]] = clusterPhotosGuidsList + tnum = 1 + for toptag in topTagsList: + if localSaturationCheck and tnum == 1 and toptag[0] in clustersPerTag: + #skip toptag if already clustered due to local saturation + continue + clusters, selectedPhotoList_Guids = cluster_tag(toptag, None, True) + #print("baseDataList: ") + #print(str(type(selectedPhotoList))) + #for s in selectedPhotoList[:2]: + # print(*s) + #print("resultData: ") + ##for s in clusters[:2]: + ## print(*s) + #print(str(type(clusters))) + #print(clusters) + #clusters contains the cluster values (-1 = no cluster, 0 maybe, >0 = cluster + # in the same order, selectedPhotoList contains all original photo data, thus clusters[10] and selectedPhotoList[10] refer to the same photo + + numpy_selectedPhotoList_Guids = np.asarray(selectedPhotoList_Guids) + mask_noisy = (clusters == -1) + if len(selectedPhotoList_Guids) == 1: + number_of_clusters = 0 + else: + number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0) + #if number_of_clusters > 200: + # print_store_log("--> Too many, skipped for this scale.") + # continue + if not number_of_clusters == 0: + print_store_log(f'--> {number_of_clusters} cluster.') + tnum += 1 + photo_num = 0 + #clusternum_photolist = zip(clusters,selectedPhotoList) + #clusterPhotosList = [[] for x in range(number_of_clusters)] + clusterPhotosGuidsList = [] + for x in range(number_of_clusters): + currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x] + clusterPhotosGuidsList.append(currentClusterPhotoGuids) + noClusterPhotos_perTag_DictOfLists[toptag[0]] = list(numpy_selectedPhotoList_Guids[clusters==-1]) + # Sort descending based on size of cluster: https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists + clusterPhotosGuidsList.sort(key=len, reverse=True) + if not len(clusterPhotosGuidsList) == 0: + clustersPerTag[toptag[0]] = clusterPhotosGuidsList + else: + print_store_log("--> No cluster.") + noClusterPhotos_perTag_DictOfLists[toptag[0]] = list(numpy_selectedPhotoList_Guids) + #for x in clusters: + # #photolist = [] + # if x >= 0: # no clusters: x = -1 + # clusterPhotosList[x].append([selectedPhotoList[photo_num]]) + # #clusterPhotosArray_dict[x].add(selectedPhotoList[photo_num]) + # else: + # noClusterPhotos_perTag_DictOfLists[toptag[0]].append(selectedPhotoList[photo_num]) + # photo_num+=1 + + #print("resultList: ") + #for s in clusterPhotosList[:2]: + # print(*s) + #print(str(toptag) + " - Number of clusters: " + str(len(clusterPhotosList)) + " Photo num: " + str(photo_num)) + + #plt.autoscale(enable=True) + + #if tnum == 50: + # break + #plt.savefig('foo.png') + #sys.exit() + print_store_log("########## STEP 4 of 6: Generating Alpha Shapes ##########") + #if (tnum % 50 == 0):#modulo: if division has no remainder, force update cmd output + sys.stdout.flush() + #for each cluster of points, calculate boundary shape and add statistics (HImpTag etc.) + listOfAlphashapesAndMeta = [] + tnum = 0 + if localSaturationCheck: + #calculate total area of Top1-Tag for 80% saturation check for lower level tags + saturationExcludeCount = 0 + clusterPhotoGuidList = clustersPerTag.get(singleMostUsedtag[0], None) + #print("Toptag: " + str(singleMostUsedtag[0])) + if clusterPhotoGuidList is None: + sys.exit(f'No Photos found for toptag: {singleMostUsedtag}') + toptagArea = Utils.generateClusterShape(toptag,clusterPhotoGuidList,cleanedPhotoDict,crs_wgs,crs_proj,clusterTreeCuttingDist,localSaturationCheck)[1] + for toptag in topTagsList: + tnum += 1 + clusterPhotoGuidList = clustersPerTag.get(toptag[0], None) + #Generate tag Cluster Shapes + if clusterPhotoGuidList: + listOfAlphashapesAndMeta_tmp,tagArea = Utils.generateClusterShape(toptag,clusterPhotoGuidList,cleanedPhotoDict,crs_wgs,crs_proj,clusterTreeCuttingDist,localSaturationCheck) + if localSaturationCheck and not tagArea == 0 and not tnum == 1: + localSaturation = tagArea/(toptagArea/100) + #print("Local Saturation for Tag " + toptag[0] + ": " + str(round(localSaturation,0))) + if localSaturation > 60: + #skip tag entirely due to saturation (if total area > 80% of total area of toptag clusters) + #print("Skipped: " + toptag[0] + " due to saturation (" + str(round(localSaturation,0)) + "%).") + saturationExcludeCount += 1 + continue #next toptag + + if len(listOfAlphashapesAndMeta_tmp) > 0: + listOfAlphashapesAndMeta.extend(listOfAlphashapesAndMeta_tmp) + + singlePhotoGuidList = noClusterPhotos_perTag_DictOfLists.get(toptag[0], None) + if singlePhotoGuidList: + shapetype = "Single cluster" + #print("Single: " + str(len(singlePhotoGuidList))) + photos = [cleanedPhotoDict[x] for x in singlePhotoGuidList] + for single_photo in photos: + #project lat/lng to UTM + x, y = pyproj.transform(crs_wgs, crs_proj, single_photo.lng, single_photo.lat) + pcoordinate = geometry.Point(x, y) + result_polygon = pcoordinate.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area + #result_polygon = pcoordinate.buffer(min(distXLng,distYLat)/100,resolution=3) + if result_polygon is not None and not result_polygon.is_empty: + listOfAlphashapesAndMeta.append((result_polygon,1,max(single_photo.photo_views,single_photo.photo_likes),1,str(toptag[0]),toptag[1],1,1,1,shapetype)) + print_store_log(f'{len(listOfAlphashapesAndMeta)} Alpha Shapes. Done.') + if localSaturationCheck and not saturationExcludeCount == 0: + print_store_log(f'Excluded {saturationExcludeCount} Tags on local saturation check.') + ##Output Boundary Shapes in merged Shapefile## + print_store_log("########## STEP 5 of 6: Writing Results to Shapefile ##########") + + ##Calculate best UTM Zone SRID/EPSG Code + #input_lon_center = bound_points_shapely.centroid.coords[0][0] #True centroid (coords may be multipoint) + #input_lat_center = bound_points_shapely.centroid.coords[0][1] + #epsg_code = Utils.convert_wgs_to_utm(input_lon_center, input_lat_center) + #project = lambda x, y: pyproj.transform(pyproj.Proj(init='epsg:4326'), pyproj.Proj(init='epsg:{0}'.format(epsg_code)), x, y) + + # Define polygon feature geometry + schema = { + 'geometry': 'Polygon', + 'properties': {'Join_Count': 'int', + 'Views': 'int', + 'COUNT_User': 'int', + 'ImpTag': 'str', + 'TagCountG': 'int', + 'HImpTag': 'int', + 'Weights': 'float', + 'WeightsV2': 'float', + 'WeightsV3': 'float', + #'shapetype': 'str', + 'emoji': 'int'}, + } + + #Normalization of Values (1-1000 Range), precalc Step: + ####################################### + weightsv1_range = [x[6] for x in listOfAlphashapesAndMeta] #get the n'th column out for calculating the max/min + weightsv2_range = [x[7] for x in listOfAlphashapesAndMeta] + weightsv3_range = [x[8] for x in listOfAlphashapesAndMeta] + weightsv1_min = min(weightsv1_range) + weightsv1_max = max(weightsv1_range) + weightsv2_min = min(weightsv2_range) + weightsv2_max = max(weightsv2_range) + weightsv3_min = min(weightsv3_range) + weightsv3_max = max(weightsv3_range) + #precalc + #https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range + weightsv1_mod_a = (1000-1)/(weightsv1_max-weightsv1_min) + weightsv1_mod_b = 1000 - weightsv1_mod_a * weightsv1_max + weightsv2_mod_a = (1000-1)/(weightsv2_max-weightsv2_min) + weightsv2_mod_b = 1000 - weightsv2_mod_a * weightsv2_max + weightsv3_mod_a = (1000-1)/(weightsv3_max-weightsv3_min) + weightsv3_mod_b = 1000 - weightsv3_mod_a * weightsv3_max + ####################################### + # Write a new Shapefile + # WGS1984 + if (clusterTags == False and clusterEmojis == True): + shapefileName = "allEmojiCluster" + else: + shapefileName = "allTagCluster" + with fiona.open(f'02_Output/{shapefileName}.shp', mode='w', encoding='UTF-8', driver='ESRI Shapefile', schema=schema,crs=from_epsg(epsg_code)) as c: + # Normalize Weights to 0-1000 Range + idx = 0 + for alphaShapeAndMeta in listOfAlphashapesAndMeta: + if idx == 0: + HImP = 1 + else: + if listOfAlphashapesAndMeta[idx][4] != listOfAlphashapesAndMeta[idx-1][4]: + HImP = 1 + else: + HImP = 0 + #emoName = unicode_name(alphaShapeAndMeta[4]) + #Calculate Normalized Weights Values based on precalc Step + if not alphaShapeAndMeta[6] == 1: + weight1_normalized = weightsv1_mod_a * alphaShapeAndMeta[6] + weightsv1_mod_b + else: + weight1_normalized = 1 + if not alphaShapeAndMeta[7] == 1: + weight2_normalized = weightsv2_mod_a * alphaShapeAndMeta[7] + weightsv2_mod_b + else: + weight2_normalized = 1 + if not alphaShapeAndMeta[8] == 1: + weight3_normalized = weightsv3_mod_a * alphaShapeAndMeta[8] + weightsv3_mod_b + else: + weight3_normalized = 1 + idx += 1 + #project data + #geom_proj = transform(project, alphaShapeAndMeta[0]) + #c.write({ + # 'geometry': geometry.mapping(geom_proj), + if clusterEmojis and alphaShapeAndMeta[4] in globalEmojiSet: + emoji = 1 + ImpTagText = "" + else: + emoji = 0 + ImpTagText = f'{alphaShapeAndMeta[4]}' + c.write({ + 'geometry': geometry.mapping(alphaShapeAndMeta[0]), + 'properties': {'Join_Count': alphaShapeAndMeta[1], + 'Views': alphaShapeAndMeta[2], + 'COUNT_User': alphaShapeAndMeta[3], + 'ImpTag': ImpTagText, + 'TagCountG': alphaShapeAndMeta[5], + 'HImpTag': HImP, + 'Weights': weight1_normalized, + 'WeightsV2': weight2_normalized, + 'WeightsV3': weight3_normalized, + #'shapetype': alphaShapeAndMeta[9], + 'emoji': emoji}, + }) + if clusterEmojis: + with open("02_Output/emojiTable.csv", "w", encoding='utf-8') as emojiTable: + emojiTable.write("FID,Emoji\n") + idx = 0 + for alphaShapeAndMeta in listOfAlphashapesAndMeta: + if alphaShapeAndMeta[4] in globalEmojiSet: + ImpTagText = f'{alphaShapeAndMeta[4]}' + else: + ImpTagText = "" + emojiTable.write(f'{idx},{ImpTagText}\n') + idx += 1 + + else: + print(f'\nUser abort.') + if abort == False and clusterPhotos == True: + print_store_log("########## STEP 6 of 6: Calculating Overall Photo Location Clusters ##########") + + if not 'clusterTreeCuttingDist' in locals(): + clusterTreeCuttingDist = int(input("Specify Cluster (Cut) Distance:\n")) + selectedPhotoList_Guids = [] + if not 'cleanedPhotoList' in locals(): + cleanedPhotoList = list(cleanedPhotoDict.values()) + for cleanedPhotoLocation in cleanedPhotoList: + selectedPhotoList_Guids.append(cleanedPhotoLocation.photo_guid) + selectedPhotoList = cleanedPhotoList + df = pd.DataFrame(selectedPhotoList) + points = df.as_matrix(['lng','lat']) + tagRadiansData = np.radians(points) + clusterer = hdbscan.HDBSCAN(min_cluster_size=2,gen_min_span_tree=False,allow_single_cluster=False,min_samples=1) + #clusterer.fit(tagRadiansData) + with warnings.catch_warnings(): + #disable joblist multithread warning + warnings.simplefilter('ignore', UserWarning) + async_result = pool.apply_async(Utils.fit_cluster, (clusterer, tagRadiansData)) + clusterer = async_result.get() + clusters = clusterer.single_linkage_tree_.get_clusters(Utils.getRadiansFromMeters(clusterTreeCuttingDist/8), min_cluster_size=2) + listOfPhotoClusters = [] + numpy_selectedPhotoList_Guids = np.asarray(selectedPhotoList_Guids) + mask_noisy = (clusters == -1) + number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0) + print_store_log(f'--> {number_of_clusters} Photo cluster.') + #clusternum_photolist = zip(clusters,selectedPhotoList) + #clusterPhotosList = [[] for x in range(number_of_clusters)] + clusterPhotosGuidsList = [] + for x in range(number_of_clusters): + currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x] + clusterPhotosGuidsList.append(currentClusterPhotoGuids) + noClusterPhotos = list(numpy_selectedPhotoList_Guids[clusters==-1]) + clusterPhotosGuidsList.sort(key=len,reverse=True) + if clusterTags is False: + #detect projection if not already + limYMin,limYMax,limXMin,limXMax = Utils.getRectangleBounds(points) + bound_points_shapely = geometry.MultiPoint([(limXMin, limYMin), (limXMax, limYMax)]) + crs_wgs = pyproj.Proj(init='epsg:4326') #data always in lat/lng WGS1984 + if overrideCRS is None: + #Calculate best UTM Zone SRID/EPSG Code + input_lon_center = bound_points_shapely.centroid.coords[0][0] #True centroid (coords may be multipoint) + input_lat_center = bound_points_shapely.centroid.coords[0][1] + epsg_code = Utils.convert_wgs_to_utm(input_lon_center, input_lat_center) + crs_proj = pyproj.Proj(init=f'epsg:{epsg_code}') + for photo_cluster in clusterPhotosGuidsList: + photos = [cleanedPhotoDict[x] for x in photo_cluster] + uniqueUserCount = len(set([photo.userid for photo in photos])) + #get points and project coordinates to suitable UTM + points = [geometry.Point(pyproj.transform(crs_wgs, crs_proj, photo.lng, photo.lat)) + for photo in photos] + point_collection = geometry.MultiPoint(list(points)) + result_polygon = point_collection.convex_hull #convex hull + result_centroid = result_polygon.centroid + if result_centroid is not None and not result_centroid.is_empty: + listOfPhotoClusters.append((result_centroid,uniqueUserCount)) + #clusterPhotoGuidList = clustersPerTag.get(None, None) + #noclusterphotos = [cleanedPhotoDict[x] for x in singlePhotoGuidList] + for photoGuid_noCluster in noClusterPhotos: + photo = cleanedPhotoDict[photoGuid_noCluster] + x, y = pyproj.transform(crs_wgs, crs_proj, photo.lng, photo.lat) + pcenter = geometry.Point(x, y) + if pcenter is not None and not pcenter.is_empty: + listOfPhotoClusters.append((pcenter,1)) + # Define a polygon feature geometry with one attribute + schema = { + 'geometry': 'Point', + 'properties': {'Join_Count': 'int'}, + } + + # Write a new Shapefile + # WGS1984 + with fiona.open('02_Output/allPhotoCluster.shp', mode='w', driver='ESRI Shapefile', schema=schema,crs=from_epsg(epsg_code)) as c: + ## If there are multiple geometries, put the "for" loop here + idx = 0 + for photoCluster in listOfPhotoClusters: + idx += 1 + c.write({ + 'geometry': geometry.mapping(photoCluster[0]), + 'properties': {'Join_Count': photoCluster[1]}, + }) + print("Writing log file..") + later = time.time() + hours, rem = divmod(later-now, 3600) + minutes, seconds = divmod(rem, 60) + difference = int(later - now) + reportMsg = f'\nDone.\n{int(hours):0>2} Hours {int(minutes):0>2} Minutes and {seconds:05.2f} Seconds passed.' + log_texts_list.append(reportMsg) + with open(log_file, "w", encoding='utf-8') as logfile_a: + for logtext in log_texts_list: + logfile_a.write(logtext) + print(reportMsg) + input("Press any key to exit...") + sys.exit() + +if __name__ == "__main__": + main() diff --git a/tagmaps/classes/__init__.py b/tagmaps/classes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tagmaps/classes/utils.py b/tagmaps/classes/utils.py new file mode 100644 index 0000000..fc5e86e --- /dev/null +++ b/tagmaps/classes/utils.py @@ -0,0 +1,413 @@ +#!/usr/bin/env python +# coding: utf-8 +# TimeTransponse_from_json definition of functions file +import sys +import tkinter as tk +import emoji +import unicodedata +import math +import numpy as np +import math +from math import radians, cos, sin, asin, sqrt +import re +import fiona #Fiona needed for reading Shapefile +from fiona.crs import from_epsg +import shapely.geometry as geometry +import pyproj #import Proj, transform +#https://gis.stackexchange.com/questions/127427/transforming-shapely-polygon-and-multipolygon-objects +from shapely.ops import transform, cascaded_union, polygonize +#from shapely.geometry import Polygon +#from shapely.geometry import shape +#from shapely.geometry import Point +from decimal import Decimal +from descartes import PolygonPatch +from scipy.spatial import Delaunay + +class Utils(): + def query_yes_no(question, default="yes"): + """Ask a yes/no question via raw_input() and return their answer. + + "question" is a string that is presented to the user. + "default" is the presumed answer if the user just hits . + It must be "yes" (the default), "no" or None (meaning + an answer is required of the user). + + The "answer" return value is True for "yes" or False for "no". + """ + valid = {"yes": True, "y": True, "ye": True, + "no": False, "n": False} + if default is None: + prompt = " [y/n] " + elif default == "yes": + prompt = " [Y/n] " + elif default == "no": + prompt = " [y/N] " + else: + raise ValueError("invalid default answer: '%s'" % default) + + while True: + sys.stdout.write(question + prompt) + choice = input().lower() + if default is not None and choice == '': + return valid[default] + elif choice in valid: + return valid[choice] + else: + sys.stdout.write("'yes' or 'no' " + "(or 'y' or 'n').\n") + + def daterange(start_date, end_date): + for n in range(int ((end_date - start_date).days)): + yield start_date + timedelta(n) + + + def haversine(lon1, lat1, lon2, lat2): + """ + Calculate the great circle distance between two points + on the earth (specified in decimal degrees) + """ + # convert decimal degrees to radians + lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) + # haversine formula + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 + c = 2 * asin(sqrt(a)) + # Radius of earth in kilometers is 6371 + km = 6371* c + m = km*1000 + return m + + def getRadiansFromMeters(dist): + dist = dist/1000 + degreesDist = dist/111.325 + radiansDist = degreesDist/57.2958 + return radiansDist + #https://www.mathsisfun.com/geometry/radians.html + #1 Radian is about 57.2958 degrees. + #then see https://sciencing.com/convert-distances-degrees-meters-7858322.html + #Multiply the number of degrees by 111.325 + #To convert this to meters, multiply by 1,000. So, 2 degrees is 222,65 meters. + def getMetersFromRadians(dist): + dist = dist * 57.2958 + dist = dist * 111.325 + metersDist = round(dist * 1000,1) + + return metersDist + #1 Radian is about 57.2958 degrees. + #then see https://sciencing.com/convert-distances-degrees-meters-7858322.html + #Multiply the number of degrees by 111.325 + #To convert this to meters, multiply by 1,000. So, 2 degrees is 222,65 meters. + #plt.close('all') #clear memory + + #This function is not really needed, makes no difference! (really?) + def checkEmojiType(strEmo): + if unicodedata.name(strEmo).startswith(("EMOJI MODIFIER","VARIATION SELECTOR","ZERO WIDTH")): + return False + else: + return True + #see https://stackoverflow.com/questions/43852668/using-collections-counter-to-count-emojis-with-different-colors + # we want to ignore fitzpatrick modifiers and treat all differently colored emojis the same + #https://stackoverflow.com/questions/38100329/some-emojis-e-g-have-two-unicode-u-u2601-and-u-u2601-ufe0f-what-does + #COOKING + #OK HAND SIGN + #EMOJI MODIFIER FITZPATRICK TYPE-1-2 + #GRINNING FACE WITH SMILING EYES + #HEAVY BLACK HEART + #WEARY CAT FACE + #SMILING FACE WITH HEART-SHAPED EYES + #OK HAND SIGN + #EMOJI MODIFIER FITZPATRICK TYPE-1-2 + #GRINNING FACE WITH SMILING EYES + #PERSON WITH FOLDED HANDS + #EMOJI MODIFIER FITZPATRICK TYPE-3 + #WEARY CAT FACE + + ##Emojitest + #n = '❤️👨‍⚕️' + ##n = '👨‍⚕️' #medical emoji with zero-width joiner (http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html) + #nlist = def_functions.extract_emojis(n) + #with open("emojifile.txt", "w", encoding='utf-8') as emojifile: + # emojifile.write("Original: " + n + '\n') + # for xstr in nlist: + # emojifile.write('Emoji Extract: U+%04x' % ord(xstr) + '\n') + # emojifile.write(xstr + '\n') + # for _c in n: + # emojifile.write(str(unicode_name(_c)) + '\n') + # emojifile.write('Each Codepoint: U+%04x' % ord(_c) + '\n') + #def cleanEmoji(c): + # tuple = (u'\ufeff',u'\u200b',u'\u200d') + # for ex in tuple: + # c.replace(ex,"") + # return(c) + #https://github.com/carpedm20/emoji/ + #https://github.com/carpedm20/emoji/issues/75 + def extract_emojis(str): + #str = str.decode('utf-32').encode('utf-32', 'surrogatepass') + #return list(c for c in str if c in emoji.UNICODE_EMOJI) + return list(c for c in str if c in emoji.UNICODE_EMOJI and checkEmojiType(c) is True) + #this class is needed to override tkinter window with drag&drop option when overrideredirect = true + #class App: + # global tk + # def __init__(self): + # self.root = tk.Tk() + # #tk.Tk.__init__(self,master) + # self.root.overrideredirect(True) + # self.root.configure(background='gray7') + # self.root._offsetx = 0 + # self.root._offsety = 0 + # self.root.bind('',self.clickwin) + # self.root.bind('',self.dragwin) + # + # def dragwin(self,event): + # x = self.root.winfo_pointerx() - self._offsetx + # y = self.root.winfo_pointery() - self._offsety + # self.root.geometry('+{x}+{y}'.format(x=x,y=y)) + # + # def clickwin(self,event): + # self.root._offsetx = event.x + # self.root._offsety = event.y + + #tc unicode problem + #https://stackoverflow.com/questions/40222971/python-find-equivalent-surrogate-pair-from-non-bmp-unicode-char + + _nonbmp = re.compile(r'[\U00010000-\U0010FFFF]') + def _surrogatepair(match): + char = match.group() + assert ord(char) > 0xffff + encoded = char.encode('utf-16-le') + return ( + chr(int.from_bytes(encoded[:2], 'little')) + + chr(int.from_bytes(encoded[2:], 'little'))) + def with_surrogates(text): + return _nonbmp.sub(_surrogatepair, text) + + #https://stackoverflow.com/questions/40132542/get-a-cartesian-projection-accurate-around-a-lat-lng-pair + def convert_wgs_to_utm(lon, lat): + utm_band = str((math.floor((lon + 180) / 6 ) % 60) + 1) + if len(utm_band) == 1: + utm_band = '0'+utm_band + if lat >= 0: + epsg_code = '326' + utm_band + else: + epsg_code = '327' + utm_band + return epsg_code + + def str2bool(v): + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + def generateClusterShape(toptag,clusterPhotoGuidList,cleanedPhotoDict,crs_wgs,crs_proj,clusterTreeCuttingDist,localSaturationCheck): + #we define a new list of Temp Alpha Shapes outside the loop, so that it is not overwritten each time + listOfAlphashapesAndMeta_tmp = [] + #points = [] + tagArea = 0 + for photo_guids in clusterPhotoGuidList: + #for each cluster for this toptag + photos = [cleanedPhotoDict[x] for x in photo_guids] + photoCount = len(photo_guids) + uniqueUserCount = len(set([photo.userid for photo in photos])) + sumViews = sum([photo.photo_views for photo in photos]) + #calculate different weighting formulas + #weightsv1 = 1+ photoCount *(sqrt(1/( photoCount / uniqueUserCount )**3)) #-> Standard weighting formula (x**y means x raised to the power y); +1 to UserCount: prevent 1-2 Range from being misaligned + #weightsv2 = 1+ photoCount *(sqrt(1/( photoCount / uniqueUserCount )**2)) + weightsv1 = photoCount *(sqrt(1/( photoCount / (uniqueUserCount+1) )**3)) #-> Standard weighting formula (x**y means x raised to the power y); +1 to UserCount: prevent 1-2 Range from being misaligned + weightsv2 = photoCount *(sqrt(1/( photoCount / (uniqueUserCount+1) )**2)) #-> less importance on User_Count in correlation to photo count [Join_Count]; +1 to UserCount: prevent 1-2 Range from being misaligned + weightsv3 = sqrt((photoCount+(2*sqrt(photoCount)))*2) #-> Ignores User_Count, this will emphasize individual and very active users + #points = [geometry.Point(photo.lng, photo.lat) + # for photo in photos] + #instead of lat/lng for each photo, we use photo_locID to identify a list of distinct locations + distinctLocations = set([photo.photo_locID + for photo in photos]) + #simple list comprehension without projection: + #points = [geometry.Point(Decimal(location.split(':')[1]), Decimal(location.split(':')[0])) + # for location in distinctLocations] + points = [geometry.Point(pyproj.transform(crs_wgs, crs_proj, Decimal(location.split(':')[1]), Decimal(location.split(':')[0]))) + for location in distinctLocations] + point_collection = geometry.MultiPoint(list(points)) + result_polygon = None + + if len(points) >= 5: + if len(points) < 10: + result_polygon = point_collection.convex_hull #convex hull + result_polygon = result_polygon.buffer(clusterTreeCuttingDist/4,resolution=3) + shapetype = "between 5 and 10 points_convexHull" + #result_polygon = result_polygon.buffer(min(distXLng,distYLat)/100,resolution=3) + else: + if len(points) > 500: + startalpha = 1000000 + elif len(points) > 200: + startalpha = 10000 + else: + startalpha = 9000 + result_polygon = Utils.alpha_shape(points,alpha=clusterTreeCuttingDist/startalpha) #concave hull/alpha shape /50000 + shapetype = "Initial Alpha Shape + Buffer" + if type(result_polygon) is geometry.multipolygon.MultiPolygon or isinstance(result_polygon, bool): + #repeat generating alpha shapes with smaller alpha value if Multigon is generated + #smaller alpha values mean less granularity of resulting polygon + #but too large alpha may result in empty polygon + #(this branch is sometimes executed for larger scales) + for i in range(1,6): + #try decreasing alpha + alpha = startalpha + (startalpha * (i**i)) #** means cube + result_polygon = Utils.alpha_shape(points,alpha=clusterTreeCuttingDist/alpha)#/100000 + if not type(result_polygon) is geometry.multipolygon.MultiPolygon and not isinstance(result_polygon, bool): + shapetype = "Multipolygon Alpha Shape /" + str(alpha) + break + if type(result_polygon) is geometry.multipolygon.MultiPolygon or isinstance(result_polygon, bool): + #try increasing alpha + for i in range(1,6): + #try decreasing alpha + alpha = startalpha / (i*i) + result_polygon = Utils.alpha_shape(points,alpha=clusterTreeCuttingDist/alpha)#/100000 + if not type(result_polygon) is geometry.multipolygon.MultiPolygon and not isinstance(result_polygon, bool): + shapetype = "Multipolygon Alpha Shape /" + str(alpha) + break + if type(result_polygon) is geometry.multipolygon.MultiPolygon: + shapetype = "Multipolygon Alpha Shape -> Convex Hull" + #if still of type multipolygon, try to remove holes and do a convex_hull + result_polygon = result_polygon.convex_hull + #OR: in case there was a problem with generating alpha shapes (circum_r = a*b*c/(4.0*area) --> ZeroDivisionError: float division by zero) + #this branch is rarely executed for large point clusters where alpha is perhaps set too small + elif isinstance(result_polygon, bool) or result_polygon.is_empty: + shapetype = "BoolAlpha -> Fallback to PointCloud Convex Hull" + result_polygon = point_collection.convex_hull #convex hull + #Finally do a buffer to smooth alpha + result_polygon = result_polygon.buffer(clusterTreeCuttingDist/4,resolution=3) + #result_polygon = result_polygon.buffer(min(distXLng,distYLat)/100,resolution=3) + elif 2 <= len(points) < 5: + shapetype = "between 2 and 5 points_buffer" + #calc distance between points http://www.mathwarehouse.com/algebra/distance_formula/index.php + #bdist = math.sqrt((points[0].coords.xy[0][0]-points[1].coords.xy[0][0])**2 + (points[0].coords.xy[1][0]-points[1].coords.xy[1][0])**2) + #print(str(bdist)) + result_polygon = point_collection.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area + result_polygon = result_polygon.convex_hull + #result_polygon = point_collection.buffer(min(distXLng,distYLat)/100,resolution=3) #single dots are presented as buffer with 0.5% of width-area + elif len(points)==1 or type(result_polygon) is geometry.point.Point or result_polygon is None: + shapetype = "1 point cluster" + result_polygon = point_collection.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area + #result_polygon = point_collection.buffer(min(distXLng,distYLat)/100,resolution=3) #single dots are presented as buffer with 0.5% of width-area + #final check for multipolygon + if type(result_polygon) is geometry.multipolygon.MultiPolygon: + #usually not executed + result_polygon = result_polygon.convex_hull + #Geom, Join_Count, Views, COUNT_User,ImpTag,TagCountG,HImpTag + if result_polygon is not None and not result_polygon.is_empty: + if localSaturationCheck: + tagArea += result_polygon.area + listOfAlphashapesAndMeta_tmp.append((result_polygon,photoCount,sumViews,uniqueUserCount,toptag[0],toptag[1],weightsv1,weightsv2,weightsv3,shapetype)) + if len(listOfAlphashapesAndMeta_tmp) > 0: + # finally sort and append all cluster shapes for this tag + listOfAlphashapesAndMeta_tmp = sorted(listOfAlphashapesAndMeta_tmp,key=lambda x: -x[6]) + return listOfAlphashapesAndMeta_tmp, tagArea + + def plot_polygon(polygon): + fig = plt.figure(figsize=(10,10)) + ax = fig.add_subplot(111) + margin = .3 + x_min, y_min, x_max, y_max = polygon.bounds + ax.set_xlim([x_min-margin, x_max+margin]) + ax.set_ylim([y_min-margin, y_max+margin]) + patch = PolygonPatch(polygon, fc='#999999', + ec='#000000', fill=True, + zorder=-1) + ax.add_patch(patch) + return fig + + def alpha_shape(points, alpha): + """ + Alpha Shapes Code by KEVIN DWYER + see http://blog.thehumangeo.com/2014/05/12/drawing-boundaries-in-python/ + Compute the alpha shape (concave hull) of a set + of points. + @param points: Iterable container of points. + @param alpha: alpha value to influence the + gooeyness of the border. Smaller numbers + don't fall inward as much as larger numbers. + Too large, and you lose everything! + """ + if len(points) < 4: + # When you have a triangle, there is no sense + # in computing an alpha shape. + return geometry.MultiPoint(list(points)).convex_hull + def add_edge(edges, edge_points, coords, i, j): + """ + Add a line between the i-th and j-th points, + if not in the list already + """ + if (i, j) in edges or (j, i) in edges: + # already added + return + edges.add( (i, j) ) + edge_points.append(coords[ [i, j] ]) + coords = np.array([point.coords[0] + for point in points]) + + #print(str(len(coords))) + tri = Delaunay(coords)#,qhull_o}ptions = 'QJ') #To avoid this error, you can joggle the data by specifying the 'QJ' option to the DELAUNAY function. https://de.mathworks.com/matlabcentral/answers/94438-why-does-the-delaunay-function-in-matlab-7-0-r14-produce-an-error-when-passed-colinear-points?s_tid=gn_loc_drop + #tri = Delaunay(coords,{'QJ'}) #Version 3.1 added triangulated output ('Qt'). It should be used for Delaunay triangulations instead of using joggled input ('QJ'). + edges = set() + edge_points = [] + # loop over triangles: + # ia, ib, ic = indices of corner points of the + # triangle + for ia, ib, ic in tri.vertices: + pa = coords[ia] + pb = coords[ib] + pc = coords[ic] + # Lengths of sides of triangle + a = math.sqrt((pa[0]-pb[0])**2 + (pa[1]-pb[1])**2) + b = math.sqrt((pb[0]-pc[0])**2 + (pb[1]-pc[1])**2) + c = math.sqrt((pc[0]-pa[0])**2 + (pc[1]-pa[1])**2) + # Semiperimeter of triangle + s = (a + b + c)/2.0 + # Area of triangle by Heron's formula + try: + area = math.sqrt(s*(s-a)*(s-b)*(s-c)) + except ValueError: + return False + if area == 0: + return False + circum_r = a*b*c/(4.0*area) + # Here's the radius filter. + #print circum_r + if circum_r < 1.0/alpha: + add_edge(edges, edge_points, coords, ia, ib) + add_edge(edges, edge_points, coords, ib, ic) + add_edge(edges, edge_points, coords, ic, ia) + m = geometry.MultiLineString(edge_points) + triangles = list(polygonize(m)) + return cascaded_union(triangles)#, edge_points + #return geometry.polygon.asPolygon(edge_points,holes=None) + + def fit_cluster(clusterer, data): + clusterer.fit(data) + return clusterer + def getRectangleBounds(points): + limYMin = np.min(points.T[1]) + limYMax = np.max(points.T[1]) + limXMin = np.min(points.T[0]) + limXMax = np.max(points.T[0]) + return limYMin,limYMax,limXMin,limXMax + def filterTags(taglist,SortOutAlways_set,SortOutAlways_inStr_set): + count_tags = 0 + count_skipped = 0 + #Filter tags based on two stoplists + photo_tags_filtered = set() + for tag in taglist: + count_tags += 1 + #exclude numbers and those tags that are in SortOutAlways_set + if len(tag) == 1 or tag == '""' or tag.isdigit() or tag in SortOutAlways_set: + count_skipped += 1 + continue + for inStr in SortOutAlways_inStr_set: + if inStr in tag: + count_skipped += 1 + break + else: + photo_tags_filtered.add(tag) + return photo_tags_filtered,count_tags,count_skipped \ No newline at end of file diff --git a/tagmaps/generateTagClusters.py b/tagmaps/generateTagClusters.py deleted file mode 100644 index 5d990c2..0000000 --- a/tagmaps/generateTagClusters.py +++ /dev/null @@ -1,1863 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -# generateTagClusters - -""" -generateTagClusters.py - -- will read in geotagged (lat/lng) decimal degree point data -- will generate HDBSCAN Cluster Hierarchy -- will output Alpha Shapes/Delauney for cluster cut at specific distance -""" - -__author__ = "Alexander Dunkel" -__license__ = "GNU GPLv3" -__version__ = "0.9.3" - -import csv -import os -os.system('mode con: cols=197 lines=40') -import sys -import re -from glob import glob -from _csv import QUOTE_MINIMAL -from collections import defaultdict -from collections import Counter -from collections import namedtuple -import collections -import tkinter as tk -from tkinter.messagebox import showerror -import tkinter.messagebox -from .utils import * -import datetime -import warnings -from unicodedata import name as unicode_name - -#Cluster stuff -import numpy as np -import matplotlib -import matplotlib.pyplot as plt -plt.ion() #enable interactive mode for pyplot (not necessary?!) -import seaborn as sns -import sklearn.datasets as data -import pandas as pd -import hdbscan - -from multiprocessing.pool import ThreadPool -pool = ThreadPool(processes=1) -import time -from time import sleep -import copy - -#needed for anonymization (Topic Clustering) -import hashlib - -#enable for map display -#from mpl_toolkits.basemap import Basemap -#from PIL import Image - -import fiona #Fiona needed for reading Shapefile -from fiona.crs import from_epsg -import shapely.geometry as geometry -import pyproj #import Proj, transform -#https://gis.stackexchange.com/questions/127427/transforming-shapely-polygon-and-multipolygon-objects - -from shapely.ops import transform -from decimal import Decimal - -#alternative Shapefile module pure Python -#https://github.com/GeospatialPython/pyshp#writing-shapefiles -#import shapefile - - -##Emojitest -#n = '❤️👨‍⚕️' -#n = '😍,146' -##print(n.encode("utf-8")) -###n = '👨‍⚕️' #medical emoji with zero-width joiner (http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html) -#nlist = utils.extract_emojis(n) -#with open("emojifile.txt", "w", encoding='utf-8') as emojifile: -# emojifile.write("Original: " + n + '\n') -# for xstr in nlist: -# emojifile.write('Emoji Extract: U+%04x' % ord(xstr) + '\n') -# emojifile.write(xstr + '\n') -# for _c in n: -# emojifile.write(str(unicode_name(_c)) + '\n') -# emojifile.write('Each Codepoint: U+%04x' % ord(_c) + '\n') - -###################### -####config section#### -###################### -log_file = "02_Output/log.txt" -log_texts_list = [] - -def print_store_log(text,end=None): - if end is None: - addEnd = False - end = '\n' - else: - addEnd = True - #watch out for non-printable characters in console - try: - print(text,end=end) - except UnicodeEncodeError: - print("#".join(re.findall("[a-zA-Z]+", text))) - log_texts_list.append(text + end) - -##args -#Choose one of four options for Input data type: -import argparse -parser = argparse.ArgumentParser() -parser.add_argument('-s', "--source", default= "fromLBSN") # naming it "source" -parser.add_argument('-r', "--removeLongTail", type=utils.str2bool, nargs='?', const=True,default= True) -parser.add_argument('-e', "--EPSG") -parser.add_argument('-t', "--clusterTags", type=utils.str2bool, nargs='?', const=True,default= True) -parser.add_argument('-p', "--clusterPhotos", type=utils.str2bool, nargs='?', const=True,default= True) -parser.add_argument('-c', "--localSaturationCheck", type=utils.str2bool, nargs='?', const=True, default= False) -parser.add_argument('-j', "--tokenizeJapanese", type=utils.str2bool, nargs='?', const=True, default= False) -parser.add_argument('-o', "--clusterEmojis", type=utils.str2bool, nargs='?', const=True, default= True) -parser.add_argument('-m', "--topicModeling", type=utils.str2bool, nargs='?', const=True, default= False) -parser.add_argument('-w', "--writeCleanedData", type=utils.str2bool, nargs='?', const=True, default= True) -parser.add_argument('-i', "--shapefileIntersect", type=utils.str2bool, nargs='?', const=True, default= False) -parser.add_argument('-f', "--shapefilePath", default= "") -parser.add_argument('-is',"--ignoreStoplists", type=utils.str2bool, nargs='?', const=True, default= False) -parser.add_argument('-ip',"--ignorePlaceCorrections", type=utils.str2bool, nargs='?', const=True, default= False) -parser.add_argument('-stat',"--statisticsOnly", type=utils.str2bool, nargs='?', const=True, default= False) -args = parser.parse_args() # returns data from the options specified (source) -DSource = args.source -clusterTags = args.clusterTags -clusterPhotos = args.clusterPhotos -removeLongTail = args.removeLongTail -clusterEmojis = args.clusterEmojis -topicModeling = args.topicModeling -writeCleanedData = args.writeCleanedData -localSaturationCheck = args.localSaturationCheck -shapefileIntersect = args.shapefileIntersect -shapefilePath = args.shapefilePath -ignoreStoplists = args.ignoreStoplists -ignorePlaceCorrections = args.ignorePlaceCorrections -statisticsOnly = args.statisticsOnly -writeGISCompLine = True # writes placeholder entry after headerline for avoiding GIS import format issues - -##Load Filterlists -SortOutAlways_file = "00_Config/SortOutAlways.txt" -SortOutAlways_inStr_file = "00_Config/SortOutAlways_inStr.txt" -SortOutAlways_set = set() -SortOutAlways_inStr_set = set() -if not os.path.isfile(SortOutAlways_file): - print(f'{SortOutAlways_file} not found.') -#else read logfile -else: - if ignoreStoplists == False: - with open(SortOutAlways_file, newline='', encoding='utf8') as f: #read each unsorted file and sort lines based on datetime (as string) - SortOutAlways_set = set([line.lower().rstrip('\r\n') for line in f]) - print(f'Loaded {len(SortOutAlways_set)} stoplist items.') -if not os.path.isfile(SortOutAlways_inStr_file): - print(f'{SortOutAlways_inStr_file} not found.') -#else read logfile -else: - if ignoreStoplists == False: - with open(SortOutAlways_inStr_file, newline='', encoding='utf8') as f: #read each unsorted file and sort lines based on datetime (as string) - SortOutAlways_inStr_set = set([line.lower().rstrip('\r\n') for line in f]) - print(f'Loaded {len(SortOutAlways_inStr_set)} inStr stoplist items.') -#manually filter places or correct lat/lng -SortOutPlaces_file = "00_Config/SortOutPlaces.txt" -CorrectPlaceLatLng_file = "00_Config/CorrectPlaceLatLng.txt" -SortOutPlaces_set = set() -CorrectPlaceLatLng_dict = dict() -SortOutPlaces = False -if os.path.isfile(SortOutPlaces_file): - if ignoreStoplists == False: - with open(SortOutPlaces_file, newline='', encoding='utf8') as f: - f.readline() - #placeid - SortOutPlaces_set = set([line.rstrip('\r\n').split(",")[0] for line in f if len(line) > 0]) - SortOutPlaces = True - print(f'Loaded {len(SortOutPlaces_set)} stoplist places.') -CorrectPlaces = False -if os.path.isfile(CorrectPlaceLatLng_file): - if ignorePlaceCorrections == False: - with open(CorrectPlaceLatLng_file, newline='', encoding='utf8') as f: - f.readline() - for line in f: - if len(line) > 0: - linesplit = line.rstrip('\r\n').split(",") - if len(linesplit) > 1: - #placeid = #lat,lng - CorrectPlaceLatLng_dict[linesplit[0]] = (linesplit[1],linesplit[2]) - CorrectPlaces = True - print(f'Loaded {len(CorrectPlaceLatLng_dict)} place lat/lng corrections.') - -###SHAPEFILESTUFF### -if shapefileIntersect: - if shapefilePath == "": - sys.exit(f'No Shapefile-Path specified. Exiting..') - from shapely.geometry import Polygon - from shapely.geometry import shape - from shapely.geometry import Point - PShape = fiona.open(shapefilePath) - - ######Single Polygon:###### - first = PShape.next() - print("Loaded Shapefile with " + str(len(first['geometry']['coordinates'][0])) + " Vertices.") # (GeoJSON format) - shp_geom = shape(first['geometry']) #shape(first) - - ######Multipolygon:###### - #vcount = PShape.next()['geometry']['coordinates'] #needed for count of vertices - #geom = MultiPolygon([shape(pol['geometry']) for pol in PShape]) - #shp_geom = geom - #print("Loaded Shapefile with Vertices ", sum([len(poly[0]) for poly in vcount])) # (GeoJSON format) -###END SHAPEFILESTUFF### - -if args.EPSG is None: - overrideCRS = None -else: - #try loading Custom CRS at beginning - crs_proj = pyproj.Proj(init='epsg:{0}'.format(overrideCRS)) - print("Custom CRS set: " + str(crs_proj.srs)) - epsg_code = overrideCRS - -tokenizeJapanese = args.tokenizeJapanese -if tokenizeJapanese: - from jNlp.jTokenize import jTokenize -#print(str(removeLongTail)) -pathname = os.getcwd() -if not os.path.exists(pathname + '/02_Output/'): - os.makedirs(pathname + '/02_Output/') - print("Folder /02_Output was created") -#if not os.path.exists(pathname + '/Output/ClusterImg/'): -# os.makedirs(pathname + '/Output/ClusterImg/') -# print("Folder /Output/ClusterImg/ was created") - -# READ All JSON in Current Folder and join to list -#partnum = 0 -guid_list = set() #global list of guids -count_glob = 0 -partcount = 0 -#filenameprev = "" -if (DSource == "fromFlickr_CSV"): - filelist = glob('01_Input/*.txt') - GMTTimetransform = 0 - guid_columnNameID = 5 #guid - Sourcecode = 2 - quoting_opt = csv.QUOTE_NONE -elif (DSource == "fromInstagram_PGlbsnEmoji") or (DSource == "fromLBSN") or (DSource == "fromLBSN_old"): - filelist = glob('01_Input/*.csv') - guid_columnNameID = 1 #guid - quoting_opt = csv.QUOTE_MINIMAL -elif (DSource == "fromSensorData_InfWuerz"): - filelist = glob('01_Input/*.csv') - GMTTimetransform = 0 - guid_columnNameID = 1 #guid - Sourcecode = 11 - quoting_opt = csv.QUOTE_NONE -else: - sys.exit("Source not supported yet.") - -print('\n') -print_store_log("########## STEP 1 of 6: Data Cleanup ##########") -if (len(filelist) == 0): - sys.exit(f'No *.json/csv/txt files found.') -else: - if clusterTags or clusterEmojis: - inputtext = input(f'Files to process: {len(filelist)}. \nOptional: Enter a Number for the variety of Tags to process (Default is 1000)\nPress Enter to proceed.. \n') - if inputtext == "" or not inputtext.isdigit(): - tmax = 1000 - else: - tmax = int(inputtext) - -skippedCount = 0 -appendToAlreadyExist = False -count_non_geotagged = 0 -count_outside_shape = 0 -count_tags_global = 0 -count_emojis_global = 0 -count_tags_skipped = 0 -shapeFileExcludelocIDhash = set() -shapeFileIncludedlocIDhash = set() - -#initialize global variables for analysis bounds (lat, lng coordinates) -limLatMin = None -limLatMax = None -limLngMin = None -limLngMax = None -def setLatLngBounds(Lat,Lng): - global limLatMin, limLatMax, limLngMin, limLngMax - if limLatMin is None or (Lat < limLatMin and not Lat == 0): - limLatMin = Lat - if limLatMax is None or (Lat > limLatMax and not Lat == 0): - limLatMax = Lat - if limLngMin is None or (Lng < limLngMin and not Lng == 0): - limLngMin = Lng - if limLngMax is None or (Lng > limLngMax and not Lng == 0): - limLngMax = Lng -def is_number(s): - try: - float(s) - return True - except ValueError: - return False -photoIDHash = set() -LocationsPerUserID_dict = defaultdict(set) -UserLocationTagList_dict = defaultdict(set) -if topicModeling: - UserTopicList_dict = defaultdict(set) - UserPhotoIDS_dict = defaultdict(set) - UserPhotoFirstThumb_dict = defaultdict(str) -UserLocationWordList_dict = defaultdict(set) -UserLocationsFirstPhoto_dict = defaultdict(set) -if clusterEmojis: - overallNumOfEmojis_global = collections.Counter() - -#UserDict_TagCounters = defaultdict(set) -UserDict_TagCounters_global = defaultdict(set) -#UserIDsPerLocation_dict = defaultdict(set) -#PhotoLocDict = defaultdict(set) -distinctLocations_set = set() -distinctUserLocations_set = set() -count_loc = 0 -now = time.time() -for file_name in filelist: - #filename = "02_Output/" + os.path.basename(file_name) - #with open(filename, 'a', encoding='utf8') as file: - # file.write("ID_Date,SOURCE,Latitude,Longitude,PhotoID,Owner,UserID,Name,URL,DateTaken,UploadDate,Views,Tags,MTags,Likes,Comments,Shortcode,Type,LocName,LocID" + '\n') - # file.write('"2000-01-01 00:00:00","TESTLINE","43.2544706","28.023467","24PHOTOID3534","testowner","812643S9812644","testcaption","https://scontent.cdninstagram.com/t/s640x640/22344798_1757535311005731_6649353052090269696_n.jpg","2000-01-01 00:00:00","2000-01-01 00:00:00","22",";blacksea;romania;ig;seaside;mareaneagra;travel;getfit;trip;travelog;sun;beachy;avenit;mytinyatlas;islandhopping;flashesofdelight;beachvibes;beautiful;waves;barbershop;sea;love;photooftheday;picoftheday;vsco;vscocam;snapshot;instatravel;instamood;ich;io;summer;photography;europa;happy;end;je;lacrusesc;contrejour;chiaroscuro;morninsunshine;treadmill;gainz;workout;sunshine;getstrong;eu;rumunsko;calatoriecupasiune;superduper;selfie;lazyday;","TESTMTAG","50","25","BaE5OZpgfRu","Image","Sunshine Boulevard Sunshine Boulevard Sunshine Bou","821648SS21642"' +'\n') - - photolist = [] # clear photolist for every file - ##f_count += 1 - ##if f_count > 25: - ## break - # guid_list.clear() #duplicate detection only for last 500k items - with open(file_name, newline='', encoding='utf8') as f: # On input, if newline is None, universal newlines mode is enabled. Lines in the input can end in '\n', '\r', or '\r\n', and these are translated into '\n' before being returned to the caller. - partcount += 1 - if (DSource == "fromInstagram_LocMedia_CSV" or DSource == "fromLBSN" or DSource == "fromLBSN_old" or DSource == "fromInstagram_UserMedia_CSV" or DSource == "fromFlickr_CSV" or DSource == "fromInstagram_PGlbsnEmoji" or DSource == "fromSensorData_InfWuerz"): - photolist = csv.reader(f, delimiter=',', quotechar='"', quoting=quoting_opt) #QUOTE_NONE is important because media saved from php/Flickr does not contain any " check; only ',' are replaced - next(photolist, None) # skip headerline - elif (DSource == "fromInstagram_HashMedia_JSON"): - photolist = photolist + json.loads(f.read()) - #PhotosPerDayLists = defaultdict(list) - #keyCreatedHash = set() - for item in photolist: - #duplicate check based on GUID - if item[guid_columnNameID] in photoIDHash: - skippedCount += 1 - continue - else: - photoIDHash.add(item[guid_columnNameID]) - if (DSource == "fromInstagram_LocMedia_CSV"): - if len(item) < 15: - #skip - skippedCount += 1 - continue - else: - photo_source = Sourcecode #LocamediaCode - photo_guid = item[0] #guid - photo_userid = item[2] - #photo_owner = item[7] ##!!! - photo_shortcode = item[5] - photo_uploadDate = item[3] # format datetime as string - photo_idDate = photo_uploadDate #use upload date as sorting ID - photo_caption = item[7] - photo_likes = item[12] - photo_tags = ";" + item[8] + ";" - photo_thumbnail = item[6] - photo_comments = item[13] - photo_mediatype = item[4] - photo_locID = item[14] - photo_locName = item[1] - #assign lat/lng coordinates from dict - if (photo_locID in loc_dict): - photo_latitude = loc_dict[photo_locID][0] - photo_longitude = loc_dict[photo_locID][1] - #setLatLngBounds(photo_latitude,photo_longitude) - if shapefileIntersect: - #skip all outside shapefile - if photo_locID in shapeFileExcludelocIDhash: - count_outside_shape += 1 - continue - if not photo_locID in shapeFileIncludedlocIDhash: - LngLatPoint = Point(photo_longitude, photo_latitude) - if not LngLatPoint.within(shp_geom): - count_outside_shape += 1 - shapeFileExcludelocIDhash.add(photo_locID) - continue - else: - shapeFileIncludedlocIDhash.add(photo_locID) - else: - if excludeWhereMissingGeocode: - skippedCount += 1 - continue #skip non-geotagged medias - else: - photo_latitude = "" - photo_longitude = "" - #assign usernames from dict - if photo_userid in user_dict: - photo_owner = user_dict[photo_userid] - elif photo_userid in netlytics_usernameid_dict: - photo_owner = netlytics_usernameid_dict[photo_userid] - else: - photo_owner = "" - #empty for instagram: - photo_mTags = "" - photo_dateTaken = "" - photo_views = "" - elif DSource == "fromInstagram_UserMedia_CSV": - if len(item) < 15: - #skip - skippedCount += 1 - continue - else: - photo_source = Sourcecode #LocMediaCode - photo_guid = item[0].split("_")[0] #guid - photo_userid = item[0].split("_")[1] #userid - photo_owner = item[1] ##!!! - photo_shortcode = item[6] - photo_uploadDate = item[4] # format datetime as string - photo_idDate = photo_uploadDate #use upload date as sorting ID - photo_caption = item[8] - photo_likes = item[13] - photo_tags = ";" + item[9] + ";" - photo_thumbnail = item[7] - photo_comments = item[14] - photo_mediatype = item[5] - photo_locID = item[2] - if photo_locID == "": - count_non_geotagged += 1 - continue #skip non-geotagged medias - photo_locName = item[3] - #assign lat/lng coordinates from dict - if (photo_locID in loc_dict): - photo_latitude = loc_dict[photo_locID][0] - photo_longitude = loc_dict[photo_locID][1] - if shapefileIntersect: - #skip all outside shapefile - if photo_locID in shapeFileExcludelocIDhash: - count_outside_shape += 1 - continue - if not photo_locID in shapeFileIncludedlocIDhash: - LngLatPoint = Point(photo_longitude, photo_latitude) - if not LngLatPoint.within(shp_geom): - count_outside_shape += 1 - shapeFileExcludelocIDhash.add(photo_locID) - continue - else: - shapeFileIncludedlocIDhash.add(photo_locID) - else: - if excludeWhereMissingGeocode: - skippedCount += 1 - continue #skip non-geotagged medias - else: - photo_latitude = "" - photo_longitude = "" - #empty for Instagram: - photo_mTags = "" - photo_dateTaken = "" - photo_views = "" - elif DSource == "fromFlickr_CSV": - if len(item) < 12: - #skip - skippedCount += 1 - continue - else: - photo_source = Sourcecode #LocMediaCode - photo_guid = item[5] #photoID - photo_userid = item[7] #userid - photo_owner = item[6] ##!!! - photo_shortcode = "" - photo_uploadDate = datetime.datetime.strptime(item[9],'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S') # format datetime as string - photo_dateTaken = datetime.datetime.strptime(item[8] ,'%m/%d/%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S') - photo_idDate = photo_dateTaken #use date taken date as sorting ID - photo_caption = item[3] - photo_likes = "" - #Filter tags based on two stoplists - if clusterTags or topicModeling: - photo_tags = set(filter(None, item[11].lower().split(";"))) #filter empty strings from photo_tags list and convert to set (hash) with unique values - #Filter tags based on two stoplists - photo_tags, count_tags, count_skipped = utils.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set) - count_tags_global += count_tags - count_tags_skipped += count_skipped - else: - photo_tags = set() - #if not "water" in photo_tags: - # continue - photo_thumbnail = item[4] - photo_comments = "" - photo_mediatype = "" - photo_locName = "" - if is_number(item[1]): - photo_latitude = Decimal(item[1]) - else: - skippedCount += 1 - continue - if is_number(item[2]): - photo_longitude = Decimal(item[2]) - else: - skippedCount += 1 - continue - setLatLngBounds(photo_latitude,photo_longitude) - photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng - photo_mTags = "" #not used currently but available - photo_views = item[10] - elif (DSource == "fromInstagram_HashMedia_JSON"): - photo_source = Sourcecode #HashMediaCode - if item.get('owner'): - photo_userid = item["owner"]["id"] - else: - # skip problematic entries - skippedCount += 1 - continue - if item.get('edge_liked_by'): - photo_likes = item["edge_liked_by"]["count"] - else: - photo_likes = "" - if item.get('edge_media_to_caption') and not len(item.get("edge_media_to_caption", {}).get("edges")) == 0: - photo_caption = item["edge_media_to_caption"]["edges"][0]["node"]["text"].replace('\n', ' ').replace('\r', '') - else: - photo_caption = "" - if item.get('edge_media_to_comment'): - photo_comments = item["edge_media_to_comment"]["count"] - else: - photo_comments = "" - if item.get('id'): - photo_guid = item["id"] - else: - # skip problematic entries - skippedCount += 1 - continue - if item.get('is_video'): - photo_mediatype = "video" - else: - photo_mediatype = "image" - if item.get('location'): - photo_locID = item["location"]["id"] - photo_locName = item["location"]["name"] - else: - # skip non geotagged - count_non_geotagged += 1 - continue - if item.get('shortcode'): - photo_shortcode = item["shortcode"] - else: - photo_shortcode = "" - if item.get('tags'): - photo_tags =';'.join(item["tags"]).replace('\n', ' ').replace('\r', '') - photo_tags = ";%s;" % (photo_tags.replace(",", ";")) - else: - photo_tags = "" - if item.get('taken_at_timestamp'): - pdate = datetime.datetime.fromtimestamp(int(item["taken_at_timestamp"])) + timedelta(hours = GMTTimetransform) #GMT conversion - photo_uploadDate = pdate.strftime('%Y-%m-%d %H:%M:%S') # format datetime as string - photo_idDate = photo_uploadDate - else: - # skip problematic entries - skippedCount += 1 - continue - if item.get('thumbnail_src'): - photo_thumbnail = item["thumbnail_src"] - else: - photo_thumbnail = "" - - #assign lat/lng coordinates from dict - if (photo_locID in loc_dict): - photo_latitude = loc_dict[photo_locID][0] - photo_longitude = loc_dict[photo_locID][1] - if shapefileIntersect: - #skip all outside shapefile - if photo_locID in shapeFileExcludelocIDhash: - count_outside_shape += 1 - continue - if not photo_locID in shapeFileIncludedlocIDhash: - LngLatPoint = Point(photo_longitude, photo_latitude) - if not LngLatPoint.within(shp_geom): - count_outside_shape += 1 - shapeFileExcludelocIDhash.add(photo_locID) - continue - else: - shapeFileIncludedlocIDhash.add(photo_locID) - else: - if excludeWhereMissingGeocode: - #count_non_geotagged += 1 - skippedCount += 1 - continue #skip non-geotagged medias - else: - photo_latitude = "" - photo_longitude = "" - - #assign usernames from dict - if photo_userid in user_dict: - photo_owner = user_dict[photo_userid] - elif photo_userid in netlytics_usernameid_dict: - photo_owner = netlytics_usernameid_dict[photo_userid] - else: - photo_owner = "" - #empty for instagram: - photo_mTags = "" - photo_dateTaken = "" - photo_views = "" - elif DSource == "fromInstagram_PGlbsnEmoji": - if len(item) < 15: - #skip - skippedCount += 1 - continue - else: - photo_source = item[0] - photo_guid = item[1] #guid - photo_userid = item[7] #guid - photo_owner = ""#item[1] ##!!! - photo_shortcode = item[18] - photo_uploadDate = item[8] #guid - photo_idDate = photo_uploadDate #use upload date as sorting ID - photo_caption = item[9] - photo_likes = item[13] - #photo_tags = ";" + item[11] + ";" - tags_filtered = utils.extract_emojis(photo_caption) - if not len(tags_filtered) == 0: - count_tags_global += len(tags_filtered) - photo_tags = set(tags_filtered) - else: - photo_tags = set() - #photo_emojis = extract_emojis(photo_caption) - photo_thumbnail = item[17] - photo_comments = item[14] - photo_mediatype = item[19] - - photo_locName = item[4] #guid - if item[2] == "" or item[3] == "": - count_non_geotagged += 1 - continue #skip non-geotagged medias - else: - photo_latitude = Decimal(item[2]) #guid - photo_longitude = Decimal(item[3]) #guid - setLatLngBounds(photo_latitude,photo_longitude) - photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng - #empty for Instagram: - photo_mTags = "" - photo_dateTaken = "" - photo_views = 0 - elif DSource == "fromLBSN": - if len(item) < 15: - #skip - skippedCount += 1 - continue - else: - photo_source = item[0] #LocMediaCode - photo_guid = item[1] #guid - photo_userid = item[4] #guid - photo_owner = ""#item[1] ##!!! - photo_shortcode = None#item[18] - photo_uploadDate = item[6] #guid - photo_idDate = None#photo_uploadDate #use upload date as sorting ID - #Process Spatial Query first (if skipping necessary) - if SortOutPlaces: - if not item[19] == "": - if item[19] in SortOutPlaces_set: - skippedCount += 1 - continue - if item[2] == "" or item[3] == "": - count_non_geotagged += 1 - continue #skip non-geotagged medias - else: - if CorrectPlaces and not item[19] and item[19] in CorrectPlaceLatLng_dict: - photo_latitude = Decimal(CorrectPlaceLatLng_dict[item[19]][0]) #correct lat/lng - photo_longitude = Decimal(CorrectPlaceLatLng_dict[item[19]][1]) #correct lat/lng - else: - photo_latitude = Decimal(item[2]) #guid - photo_longitude = Decimal(item[3]) #guid - setLatLngBounds(photo_latitude,photo_longitude) - photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng - #assign lat/lng coordinates from dict - if shapefileIntersect: - #skip all outside shapefile - if photo_locID in shapeFileExcludelocIDhash: - skippedCount += 1 - continue - if not photo_locID in shapeFileIncludedlocIDhash: - LngLatPoint = Point(photo_longitude, photo_latitude) - if not LngLatPoint.within(shp_geom): - skippedCount += 1 - shapeFileExcludelocIDhash.add(photo_locID) - continue - else: - shapeFileIncludedlocIDhash.add(photo_locID) - if clusterTags or clusterEmojis or topicModeling: - photo_caption = item[14] - else: - photo_caption = "" - photo_likes = 0 - if not item[9] == "": - try: - photo_likes = int(item[9]) - except TypeError: - pass - photo_tags = set() - if clusterTags or topicModeling: - photo_tags = set(filter(None, item[11].lower().split(";"))) #[1:-1] removes curly brackets, second [1:-1] removes quotes - #Filter tags based on two stoplists - if ignoreStoplists: - count_tags = len(photo_tags) - count_skipped = 0 - else: - photo_tags,count_tags,count_skipped = utils.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set) - count_tags_global += count_tags - count_tags_skipped += count_skipped - if clusterEmojis: - emojis_filtered = set(utils.extract_emojis(photo_caption)) - if not len(emojis_filtered) == 0: - count_emojis_global += len(emojis_filtered) - overallNumOfEmojis_global.update(emojis_filtered) - photo_tags = set.union(emojis_filtered) - #photo_tags = ";" + item[11] + ";" - photo_thumbnail = None#item[17] - photo_comments = None#item[14] - photo_mediatype = None#item[19] - photo_locName = item[4] #guid - #empty for Instagram: - photo_mTags = "" - photo_dateTaken = "" - photo_views = 0 - if not item[8] == "": - try: - photo_views = int(item[8]) - except TypeError: - pass - elif DSource == "fromLBSN_old": - if len(item) < 15: - #skip - skippedCount += 1 - continue - else: - photo_source = item[0] #LocMediaCode - photo_guid = item[1] #guid - photo_userid = item[7] #guid - photo_owner = ""#item[1] ##!!! - photo_shortcode = None#item[18] - photo_uploadDate = item[8] #guid - photo_idDate = None#photo_uploadDate #use upload date as sorting ID - if clusterTags or clusterEmojis or topicModeling: - photo_caption = item[9] - else: - photo_caption = "" - photo_likes = None#item[13] - photo_tags = set() - if clusterTags or topicModeling: - photo_tags = set(filter(None, item[11].strip('"').lstrip('{').rstrip('}').lower().split(","))) #[1:-1] removes curly brackets, second [1:-1] removes quotes - #Filter tags based on two stoplists - photo_tags,count_tags,count_skipped = utils.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set) - count_tags_global += count_tags - count_tags_skipped += count_skipped - if clusterEmojis: - emojis_filtered = set(utils.extract_emojis(photo_caption)) - if not len(emojis_filtered) == 0: - count_emojis_global += len(emojis_filtered) - overallNumOfEmojis_global.update(emojis_filtered) - photo_tags = set.union(emojis_filtered) - #photo_tags = ";" + item[11] + ";" - photo_thumbnail = None#item[17] - photo_comments = None#item[14] - photo_mediatype = None#item[19] - photo_locName = item[4] #guid - if item[2] == "" or item[3] == "": - count_non_geotagged += 1 - continue #skip non-geotagged medias - else: - photo_latitude = Decimal(item[2]) #guid - photo_longitude = Decimal(item[3]) #guid - setLatLngBounds(photo_latitude,photo_longitude) - photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng - #empty for Instagram: - photo_mTags = "" - photo_dateTaken = "" - photo_views = 0 - elif DSource == "fromSensorData_InfWuerz": - if len(item) < 5: - #skip - skippedCount += 1 - continue - else: - photo_source = Sourcecode #LocMediaCode - photo_guid = item[1] #guid - photo_userid = item[4] #meta_device_id - photo_owner = ""#item[1] ##!!! - photo_shortcode = "" - photo_uploadDate = item[3] #meta_timestamp_received - photo_idDate = item[2] #meta_timestamp_recorded - photo_caption = item[8] - if not len(photo_caption) == 0: - removeSpecialChars = photo_caption.translate ({ord(c): " " for c in "?.!/;:,[]()'-&#"}) - wordlist = [word for word in removeSpecialChars.lower().split(' ') if not word == "" and len(word) > 1] - photo_tags = set(wordlist) - else: - photo_tags = set() - photo_tags_filtered = set() - for tag in photo_tags: - count_tags_global += 1 - #exclude numbers and those tags that are in SortOutAlways_set - if tag.isdigit() or tag in SortOutAlways_set: - count_tags_skipped += 1 - continue - for inStr in SortOutAlways_inStr_set: - if inStr in tag: - count_tags_skipped += 1 - break - else: - photo_tags_filtered.add(tag) - photo_tags = photo_tags_filtered - if item[6] == "" or item[7] == "": - count_non_geotagged += 1 - continue #skip non-geotagged medias - else: - photo_latitude = Decimal(item[7]) #guid - photo_longitude = Decimal(item[6]) #guid - setLatLngBounds(photo_latitude,photo_longitude) - photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng - #empty for SensorWuerz: - photo_likes = "" - photo_thumbnail = "" - photo_comments = "" - photo_mediatype = "" - photo_locName = "" - photo_mTags = "" - photo_dateTaken = "" - photo_views = 0 - #this code will union all tags of a single user for each location - #further information is derived from the first image for each user-location - photo_locIDUserID = photo_locID + '::' + str(photo_userid) #create userid_loc_id - distinctLocations_set.add(photo_locID) - #print(f'Added: {photo_locID} to distinctLocations_set (len: {len(distinctLocations_set)})') - distinctUserLocations_set.add(photo_locIDUserID) - #print(f'Added: {photo_locIDUserID} to distinctUserLocations_set (len: {len(distinctUserLocations_set)})') - if not photo_userid in LocationsPerUserID_dict or not photo_locID in LocationsPerUserID_dict[photo_userid]: - LocationsPerUserID_dict[photo_userid] |= {photo_locID} # Bit wise or and assignment in one step. -> assign locID to UserDict list if not already contained - count_loc += 1 - UserLocationsFirstPhoto_dict[photo_locIDUserID] = (photo_source, - photo_guid, - photo_owner, - photo_userid, - photo_caption, - photo_thumbnail, - photo_dateTaken, - photo_uploadDate, - photo_views, - photo_tags, - photo_mTags, - photo_likes, - photo_comments, - photo_shortcode, - photo_mediatype, - photo_locName, - photo_locID) - UserLocationTagList_dict[photo_locIDUserID] |= photo_tags #union tags per userid/unique location - removeSpecialChars = photo_caption.translate ({ord(c): " " for c in "?.!/;:,[]()'-&#"}) - if tokenizeJapanese: - wordlist = [word for word in jTokenize(input_sentence) for input_sentence in removeSpecialChars.split(' ')] - else: - wordlist = [word for word in removeSpecialChars.lower().split(' ') if len(word) > 2] #first replace specia characters in caption, then split by space-character - UserLocationWordList_dict[photo_locIDUserID] |= set(wordlist) #union words per userid/unique location - count_glob += 1 - - ##Calculate toplists - if photo_tags: - UserDict_TagCounters_global[photo_userid].update(photo_tags) #add tagcount of this media to overall tagcount or this user, initialize counter for user if not already done - msg = f'Cleaned output to {len(distinctLocations_set):02d} distinct locations from {count_glob:02d} photos (File {partcount} of {len(filelist)}) - Skipped Media: {skippedCount} - Skipped Tags: {count_tags_skipped} of {count_tags_global}' - print(msg, end='\r') -#Append last message to log file -log_texts_list.append(msg) -total_distinct_locations = len(distinctLocations_set) -print_store_log(f'\nTotal users: {len(LocationsPerUserID_dict)} (UC)') -print_store_log(f'Total photos: {count_glob:02d} (PC)') -print_store_log(f'Total tags (PTC): {count_tags_global}') -print_store_log(f'Total emojis (PEC): {count_emojis_global}') -print_store_log(f'Total user photo locations (UPL): {len(distinctUserLocations_set)}') - -#boundary: -print_store_log(f'Bounds are: Min {float(limLngMin)} {float(limLatMin)} Max {float(limLngMax)} {float(limLatMax)}') - -#create structure for tuple with naming for easy referencing -cleanedPhotoLocation_tuple = namedtuple('cleanedPhotoLocation_tuple', 'source lat lng photo_guid photo_owner userid photo_caption photo_dateTaken photo_uploadDate photo_views photo_tags photo_thumbnail photo_mTags photo_likes photo_comments photo_shortcode photo_mediatype photo_locName photo_locID') -cleanedPhotoDict = defaultdict(cleanedPhotoLocation_tuple) -with open("02_Output/Output_cleaned.txt", 'w', encoding='utf8') as csvfile: - csvfile.write("SOURCE,Latitude,Longitude,PhotoID,Owner,UserID,Name,DateTaken,UploadDate,Views,Tags,URL,MTags,Likes,Comments,Shortcode,Type,LocName,LocID," + '\n') - datawriter = csv.writer(csvfile, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) - for user_key, locationhash in LocationsPerUserID_dict.items(): - for location in locationhash: - locIDUserID = str(location) + '::' + str(user_key) - photo_latlng = location.split(':') - photo = UserLocationsFirstPhoto_dict.get(locIDUserID,(" "," "," "," "," "," "," "," "," "," "," "," "," "," "," "," "," ")) - #create tuple with cleaned photo data - cleanedPhotoLocation = cleanedPhotoLocation_tuple(photo[0],#Source = 0 - float(photo_latlng[0]), #Lat = 1 - float(photo_latlng[1]), #Lng = 2 - photo[1],#photo_guid = 3 - photo[2],#photo_owner = 4 - user_key, #userid = 5 - UserLocationWordList_dict.get(locIDUserID,("",)),#photo_caption = 6 - photo[6],#photo_dateTaken = 7 - photo[7],#photo_uploadDate = 8 - photo[8],#photo_views = 9 - UserLocationTagList_dict.get(locIDUserID,("",)),#photo_tags = 10 - photo[5],#photo_thumbnail = 11 - photo[10],#photo_mTags = 12 - photo[11],#photo_likes = 13 - photo[12],#photo_comments = 14 - photo[13],#photo_shortcode = 15 - photo[14],#photo_mediatype = 16 - photo[15],#photo_locName = 17 - photo[16]#photo_locID = 18 - ) - if writeCleanedData: - ###optional Write Cleaned Data to CSV/TXT - datawriter.writerow([cleanedPhotoLocation.source,#Source = 0 - cleanedPhotoLocation.lat, #Lat = 1 - cleanedPhotoLocation.lng, #Lng = 2 - cleanedPhotoLocation.photo_guid,#photo_guid = 3 - cleanedPhotoLocation.photo_owner,#photo_owner = 4 - cleanedPhotoLocation.userid, #userid = 5 - ";".join(cleanedPhotoLocation.photo_caption),#photo_caption = 6 - cleanedPhotoLocation.photo_dateTaken,#photo_dateTaken = 7 - cleanedPhotoLocation.photo_uploadDate,#photo_uploadDate = 8 - cleanedPhotoLocation.photo_views,#photo_views = 9 - ";".join(cleanedPhotoLocation.photo_tags),#photo_tags = 10 - cleanedPhotoLocation.photo_thumbnail,#photo_thumbnail = 11 - cleanedPhotoLocation.photo_mTags,#photo_mTags = 12 - cleanedPhotoLocation.photo_likes,#photo_likes = 13 - cleanedPhotoLocation.photo_comments,#photo_comments = 14 - cleanedPhotoLocation.photo_shortcode,#photo_shortcode = 15 - cleanedPhotoLocation.photo_mediatype,#photo_mediatype = 16 - cleanedPhotoLocation.photo_locName,#photo_locName = 17 - cleanedPhotoLocation.photo_locID]#photo_locID = 18 - ) - ##optional Write Cleaned Search Terms to CSV for Topic Modeling - #topics = cleanedPhotoLocation.photo_caption.union(cleanedPhotoLocation.photo_tags) - if topicModeling: - if not len(cleanedPhotoLocation.photo_tags) == 0: - UserTopicList_dict[user_key] |= cleanedPhotoLocation.photo_tags - UserTopicList_dict[user_key] |= cleanedPhotoLocation.photo_caption #also use descriptions for Topic Modeling - UserPhotoIDS_dict[user_key] |= {cleanedPhotoLocation.photo_guid} # Bit wise or and assignment in one step. -> assign PhotoGuid to UserDict list if not already contained - #UserPhotoFirstThumb_dict[user_key] = photo[5] - cleanedPhotoDict[cleanedPhotoLocation.photo_guid] = cleanedPhotoLocation -if topicModeling: - #export list of cleaned topics on a per user basis for LDA/TSNE etc. - with open("02_Output/Output_usertopics_anonymized.csv", 'w', encoding='utf8') as csvfile: - csvfile.write("TOPICS,PhotoIDs,UserID" + '\n') - datawriter = csv.writer(csvfile, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) - for user_key, topics in UserTopicList_dict.items(): - datawriter.writerow([" ".join(topics),"{" + ",".join([hashlib.sha3_256(photoid.encode("utf8")).hexdigest() for photoid in UserPhotoIDS_dict.get(user_key,None)]) + "}",hashlib.sha3_256(user_key.encode("utf8")).hexdigest()]) - with open("02_Output/Output_usertopics.csv", 'w', encoding='utf8') as csvfile: - csvfile.write("TOPICS,PhotoIDs,UserID" + '\n') - datawriter = csv.writer(csvfile, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) - for user_key, topics in UserTopicList_dict.items(): - datawriter.writerow([" ".join(topics),"{" + ",".join(UserPhotoIDS_dict.get(user_key,None)) + "}",str(user_key)]) - -abort = False -if (clusterTags or clusterEmojis): - print_store_log("########## STEP 2 of 6: Tag Ranking ##########") - overallNumOfUsersPerTag_global = collections.Counter() - for user_key, taghash in UserDict_TagCounters_global.items(): - #taghash contains unique values (= strings) for each user, thus summing up these taghashes counts each user only once per tag - overallNumOfUsersPerTag_global.update(taghash) - - - topTagsList = overallNumOfUsersPerTag_global.most_common(tmax) - #remove all tags that are used by less than two photographers - if removeLongTail is True: - indexMin = next((i for i, (t1, t2) in enumerate(topTagsList) if t2 < 2), None) - if indexMin: - lenBefore = len(topTagsList) - del topTagsList[indexMin:] - lenAfter = len(topTagsList) - tmax = lenAfter - if not lenBefore == lenAfter: - print_store_log(f'Filtered {lenBefore - lenAfter} Tags that were used by less than 2 users.') - #optional write topemojis to file - globalEmojiSet = {} - if clusterEmojis: - topEmojisList = overallNumOfEmojis_global.most_common() - globalEmojiSet = {tuple[0] for tuple in topEmojisList} - if (not len(globalEmojiSet) == 0): - topemojis = ''.join("%s,%i" % v + '\n' for v in topEmojisList) - with open("02_Output/Output_topemojis.txt", 'w', encoding="utf8") as file: #overwrite - file.write("emoji,usercount\n") - file.write(topemojis) - - if clusterTags: - #optional write toptags to file - toptags = ''.join("%s,%i" % v + '\n' for v in topTagsList if not v[0] in globalEmojiSet) - if (not len(topTagsList) == 0): - with open("02_Output/Output_toptags.txt", 'w', encoding="utf8") as file: #overwrite - file.write("tag,usercount\n") - file.write(toptags) - - if statisticsOnly == False: - singleMostUsedtag = topTagsList[0] - now = time.time() - print_store_log("########## STEP 3 of 6: Tag Location Clustering ##########") - #prepare some variables - tnum = 0 - first = True - label_size = 10 - #plt.rcParams['xtick.labelsize'] = label_size - #plt.rcParams['ytick.labelsize'] = label_size - plot_kwds = {'alpha' : 0.5, 's' : 10, 'linewidths':0} - sys.stdout.flush() - proceedClusting = False - - distY = 0 - distX = 0 - imgRatio = 0 - - #Optional: set global plotting bounds - #plt.gca().set_xlim([limXMin, limXMax]) - #plt.gca().set_ylim([limYMin, limYMax]) - cleanedPhotoList = list(cleanedPhotoDict.values()) - df = pd.DataFrame(cleanedPhotoList) - points = df.as_matrix(['lng','lat']) - limYMin,limYMax,limXMin,limXMax = utils.getRectangleBounds(points) - bound_points_shapely = geometry.MultiPoint([(limXMin, limYMin), (limXMax, limYMax)]) - distYLat = limYMax - limYMin - distXLng = limXMax - limXMin - #distYLat = utils.haversine(limXMin,limYMax,limXMin,limYMin) - #distXLng = utils.haversine(limXMax,limYMin,limXMin,limYMin) - #imgRatio = distXLng/(distYLat*2) - imgRatio = distXLng/(distYLat*2) - distY = utils.haversine(limXMin, limYMin, limXMin, limYMax) - distX = utils.haversine(limXMin, limYMin, limXMax, limYMin) - clusterTreeCuttingDist = (min(distX,distY)/100)*7 #4% of research area width/height (max) = default value #223.245922725 #= 0.000035 radians dist - - ######### TKinter Preparation ########### - class App(tk.Tk): - def __init__(self): - tk.Tk.__init__(self) - self.overrideredirect(True) #this is needed to make the root disappear - self.geometry('%dx%d+%d+%d' % (0, 0, 0, 0)) - #create separate floating window - self.floater = FloatingWindow(self) - - class FloatingWindow(tk.Toplevel): - #global app - def __init__(self, *args, **kwargs): - tk.Toplevel.__init__(self, *args, **kwargs) - self.overrideredirect(True) - self.configure(background='gray7') - #self.label = tk.Label(self, text="Click on the grip to move") - self.grip = tk.Label(self, bitmap="gray25") - self.grip.pack(side="left", fill="y") - #self.label.pack(side="right", fill="both", expand=True) - self.grip.bind("", self.StartMove) - self.grip.bind("", self.StopMove) - self.grip.bind("", self.OnMotion) - #center Floating Window - w = self.winfo_reqwidth() - h = self.winfo_reqheight() - ws = self.winfo_screenwidth() - hs = self.winfo_screenheight() - x = (ws/2) - (w/2) - y = (hs/2) - (h/2) - self.geometry('+%d+%d' % (x, y)) - def StartMove(self, event): - self.x = event.x - self.y = event.y - - def StopMove(self, event): - self.x = None - self.y = None - - def OnMotion(self, event): - deltax = event.x - self.x - deltay = event.y - self.y - x = self.winfo_x() + deltax - y = self.winfo_y() + deltay - self.geometry("+%s+%s" % (x, y)) - - #Initialize TKinter Interface - app = App() - #necessary override for error reporting during tkinter mode: - import traceback - def report_callback_exception(self, exc, val, tb): - showerror("Error", message=str(val)) - exc_type, exc_value, exc_traceback = sys.exc_info() - print("*** print_tb:") - traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) - print("*** print_exception:") - traceback.print_exception(exc_type, exc_value, exc_traceback, - limit=2, file=sys.stdout) - print("*** print_exc:") - traceback.print_exc() - print("*** format_exc, first and last line:") - formatted_lines = traceback.format_exc().splitlines() - print(formatted_lines[0]) - print(formatted_lines[-1]) - print("*** format_exception:") - print(repr(traceback.format_exception(exc_type, exc_value, - exc_traceback))) - print("*** extract_tb:") - print(repr(traceback.extract_tb(exc_traceback))) - print("*** format_tb:") - print(repr(traceback.format_tb(exc_traceback))) - print("*** tb_lineno:", exc_traceback.tb_lineno) - tk.Tk.report_callback_exception = report_callback_exception - - #the following code part is a bit garbled due to using TKinter interface - ###################################################################################################################################################### - ###################################################################################################################################################### - ###################################################################################################################################################### - - #definition of global vars for interface and graph design - canvasWidth = 1320 - canvasHeight = 440 - floaterX = 0 - floaterY = 0 - #Cluster preparation - sns.set_context('poster') - sns.set_style('white') - #sns.set_color_codes() - #matplotlib.style.use('ggplot') - plt.style.use('ggplot') - graphFrame = None - lastselectionList = [] - currentDisplayTag = None - genPreviewMap = tk.IntVar(value = 0) - createMinimumSpanningTree = False - autoselectClusters = False - #definition of global figure for reusing windows - fig1 = None - fig2 = None - fig3 = None - fig4 = None - - def quitTkinter(): - #exits Tkinter gui and continues with code execution after mainloop() - #global app - #app.floater.destroy() - #tkinter.messagebox.showinfo("Closing App", "Closing App") - #plt.quit() - global abort - abort = True - app.update() #see https://stackoverflow.com/questions/35040168/python-tkinter-error-cant-evoke-event-command - app.destroy() - app.quit() ##root.quit() causes mainloop to exit, see https://stackoverflow.com/questions/2307464/what-is-the-difference-between-root-destroy-and-root-quit - def proceedWithCluster(): - global proceedClusting - global fig1 - proceedClusting = True - #def vis_tag(tag): - #tkinter.messagebox.showinfo("Proceed", "Proceed") - #if plt.figure(1): - # plt.figure(1).clf() - app.update() #see https://stackoverflow.com/questions/35040168/python-tkinter-error-cant-evoke-event-command - app.destroy() - app.quit() - - def sel_photos(tag,cleanedPhotoList): - #select photos from list based on a specific tag - distinctLocalLocationCount = set() - selectedPhotoList_Guids = [] - for cleanedPhotoLocation in cleanedPhotoList: - if tag in (cleanedPhotoLocation.photo_tags) or (tag in cleanedPhotoLocation.photo_caption): - selectedPhotoList_Guids.append(cleanedPhotoLocation.photo_guid) - distinctLocalLocationCount.add(cleanedPhotoLocation.photo_locID) - return selectedPhotoList_Guids, len(distinctLocalLocationCount) - - def cluster_tag(toptag=None,preview=None,silent=None): - if preview is None: - preview = False - if silent is None: - silent = False - global first - global currentDisplayTag - global limYMin, limYMax, limXMin, limXMax, imgRatio, floaterX, floaterY - global fig1, fig2, fig3, fig4 - selectedPhotoList_Guids, distinctLocalLocationCount = sel_photos(toptag[0],cleanedPhotoList) - percentageOfTotalLocations = distinctLocalLocationCount/(total_distinct_locations/100) - if silent: - if toptag[0] in globalEmojiSet: - text = unicode_name(toptag[0]) - else: - text = toptag[0] - print_store_log(f'({tnum} of {tmax}) Found {len(selectedPhotoList_Guids)} photos (UPL) for tag \'{text}\' ({percentageOfTotalLocations:.0f}% of total distinct locations in area)', end=" ") - - - #clustering - if len(selectedPhotoList_Guids) < 2: - return [], selectedPhotoList_Guids - selectedPhotoList = [cleanedPhotoDict[x] for x in selectedPhotoList_Guids] - #only used for tag clustering, otherwise (photo location clusters), global vars are used (df, points) - df = pd.DataFrame(selectedPhotoList) - points = df.as_matrix(['lng','lat']) #converts pandas data to numpy array (limit by list of column-names) - - #only return preview fig without clustering - if preview == True: - #only map data - if genPreviewMap.get() == 1: - if fig1: - plt.figure(1).clf() #clear figure 1 - #earth = Basemap() - #earth.bluemarble(alpha=0.42) - #earth.drawcoastlines(color='#555566', linewidth=1) - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - #reuse window of figure 1 for new figure - plt.scatter(points.T[0], points.T[1], color='red', **plot_kwds) - fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) - fig1.canvas.set_window_title('Preview Map') - #displayImgPath = pathname + '/Output/ClusterImg/00_displayImg.png' - #fig1.figure.savefig(displayImgPath) - else: - - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - plt.scatter(points.T[0], points.T[1], color='red', **plot_kwds) - fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) - #earth = Basemap() - #earth.bluemarble(alpha=0.42) - #earth.drawcoastlines(color='#555566', linewidth=1) - fig1.canvas.set_window_title('Preview Map') - plt.gca().set_xlim([limXMin, limXMax]) - plt.gca().set_ylim([limYMin, limYMax]) - plt.tick_params(labelsize=10) - currentDisplayTag = toptag - else: - #cluster data - tagRadiansData = np.radians(points) #conversion to radians for HDBSCAN (does not support decimal degrees) - #for each tag in overallNumOfUsersPerTag_global.most_common(1000) (descending), calculate HDBSCAN Clusters - - minClusterSize = max(2,int(((len(selectedPhotoList_Guids))/100)*5)) #4% optimum - clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize,gen_min_span_tree=createMinimumSpanningTree,allow_single_cluster=True,min_samples=1) - #clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize,gen_min_span_tree=True,min_samples=1) - #clusterer = hdbscan.HDBSCAN(min_cluster_size=10,metric='haversine',gen_min_span_tree=False,allow_single_cluster=True) - #clusterer = hdbscan.robust_single_linkage_.RobustSingleLinkage(cut=0.000035) - #srsl_plt = hdbscan.robust_single_linkage_.plot() - - #Start clusterer on different thread to prevent GUI from freezing - #See: http://stupidpythonideas.blogspot.de/2013/10/why-your-gui-app-freezes.html - #https://stackoverflow.com/questions/6893968/how-to-get-the-return-value-from-a-thread-in-python - #if silent: - # #on silent command line operation, don't use multiprocessing - # clusterer = fit_cluster(clusterer,tagRadiansData) - #else: - with warnings.catch_warnings(): - #disable joblist multithread warning - warnings.simplefilter('ignore', UserWarning) - async_result = pool.apply_async(utils.fit_cluster, (clusterer, tagRadiansData)) - clusterer = async_result.get() - #clusterer.fit(tagRadiansData) - #updateNeeded = False - - if autoselectClusters: - sel_labels = clusterer.labels_ #auto selected clusters - else: - sel_labels = clusterer.single_linkage_tree_.get_clusters(utils.getRadiansFromMeters(clusterTreeCuttingDist), min_cluster_size=2) #0.000035 without haversine: 223 m (or 95 m for 0.000015) - - #exit function in case final processing loop (no figure generating) - if silent: - return sel_labels, selectedPhotoList_Guids - mask_noisy = (sel_labels == -1) - number_of_clusters = len(np.unique(sel_labels[~mask_noisy])) #len(sel_labels) - #palette = sns.color_palette("hls", ) - #palette = sns.color_palette(None, len(sel_labels)) #sns.color_palette("hls", ) #sns.color_palette(None, 100) - palette = sns.color_palette("husl", number_of_clusters+1) - sel_colors = [palette[x] if x >= 0 - else (0.5, 0.5, 0.5) - #for x in clusterer.labels_ ] - for x in sel_labels] #clusterer.labels_ (best selection) or sel_labels (cut distance) - #tkinter.messagebox.showinfo("Num of clusters: ", str(len(sel_colors)) + " " + str(sel_colors[1])) - #output/update matplotlib figures - - if fig1: - plt.figure(1).clf() - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') #plt references the last figure accessed - ax = plt.scatter(points.T[0], points.T[1], color=sel_colors, **plot_kwds) - fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) - fig1.canvas.set_window_title('Cluster Preview') - distText = '' - if autoselectClusters == False: - distText = '@ ' + str(clusterTreeCuttingDist) +'m' - plt.title(f'Cluster Preview {distText}', fontsize=12,loc='center') - #plt.title('Cluster Preview') - #xmax = ax.get_xlim()[1] - #ymax = ax.get_ylim()[1] - noisyTxt = '{}/{}'.format(mask_noisy.sum(), len(mask_noisy)) - plt.text(limXMax, limYMax,f'{number_of_clusters} Cluster (Noise: {noisyTxt})',fontsize=10,horizontalalignment='right',verticalalignment='top',fontweight='bold') - else: - plt.scatter(points.T[0], points.T[1], c=sel_colors, **plot_kwds) - fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) - fig1.canvas.set_window_title('Cluster Preview') - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - distText = '' - if autoselectClusters == False: - distText = '@ ' + str(clusterTreeCuttingDist) +'m' - plt.title(f'Cluster Preview {distText}', fontsize=12,loc='center') - #xmax = fig1.get_xlim()[1] - #ymax = fig1.get_ylim()[1] - noisyTxt = '{} / {}'.format(mask_noisy.sum(), len(mask_noisy)) - plt.text(limXMax, limYMax,f'{number_of_clusters} Clusters (Noise: {noisyTxt})',fontsize=10,horizontalalignment='right',verticalalignment='top',fontweight='bold') - plt.gca().set_xlim([limXMin, limXMax]) - plt.gca().set_ylim([limYMin, limYMax]) - plt.tick_params(labelsize=10) - #if len(tagRadiansData) < 10000: - if fig2: - plt.figure(2).clf() - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - #plt.title('Condensed Tree', fontsize=12,loc='center') - clusterer.condensed_tree_.plot(select_clusters=False, selection_palette=sel_colors)#,label_clusters=False - #tkinter.messagebox.showinfo("Num of clusters: ", str(len(sel_colors)) + " " + str(sel_colors[0])) - else: - plt.figure(2).canvas.set_window_title('Condensed Tree') - fig2 = clusterer.condensed_tree_.plot(select_clusters=False, selection_palette=sel_colors)#,label_clusters=False - #tkinter.messagebox.showinfo("Num of clusters: ", str(len(sel_colors)) + " " + str(sel_colors[1])) - #fig2 = clusterer.condensed_tree_.plot(select_clusters=False, selection_palette=sel_colors,label_clusters=True) - #plt.title('Condensed Tree', fontsize=12,loc='center') - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - plt.tick_params(labelsize=10) - if fig3: - plt.figure(3).clf() - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - plt.title('Single Linkage Tree', fontsize=12,loc='center') - #clusterer.single_linkage_tree_.plot(truncate_mode='lastp',p=50) - ax = clusterer.single_linkage_tree_.plot(truncate_mode='lastp',p=max(50,min(number_of_clusters*10,256))) #p is the number of max count of leafs in the tree, this should at least be the number of clusters*10, not lower than 50 [but max 500 to not crash] - #tkinter.messagebox.showinfo("messagr", str(type(ax))) - #plot cutting distance - y = utils.getRadiansFromMeters(clusterTreeCuttingDist) - xmin = ax.get_xlim()[0] - xmax = ax.get_xlim()[1] - line, = ax.plot([xmin, xmax], [y, y], color='k', label=f'Cluster (Cut) Distance {clusterTreeCuttingDist}m') - line.set_label(f'Cluster (Cut) Distance {clusterTreeCuttingDist}m') - ax.legend(fontsize=10) - vals = ax.get_yticks() - ax.set_yticklabels(['{:3.1f}m'.format(utils.getMetersFromRadians(x)) for x in vals]) - else: - plt.figure(3).canvas.set_window_title('Single Linkage Tree') - fig3 = clusterer.single_linkage_tree_.plot(truncate_mode='lastp',p=max(50,min(number_of_clusters*10,256))) - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - plt.title('Single Linkage Tree', fontsize=12,loc='center') - #tkinter.messagebox.showinfo("messagr", str(type(fig3))) - #plot cutting distance - y = utils.getRadiansFromMeters(clusterTreeCuttingDist) - xmin = fig3.get_xlim()[0] - xmax = fig3.get_xlim()[1] - line, = fig3.plot([xmin, xmax], [y, y], color='k', label=f'Cluster (Cut) Distance {clusterTreeCuttingDist}m') - line.set_label(f'Cluster (Cut) Distance {clusterTreeCuttingDist}m') - fig3.legend(fontsize=10) - vals = fig3.get_yticks() - fig3.set_yticklabels([f'{utils.getMetersFromRadians(x):3.1f}m' for x in vals]) - plt.tick_params(labelsize=10) - if createMinimumSpanningTree: - if fig4: - plt.figure(4).clf() - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - #plt.title('Single Linkage Tree', fontsize=12,loc='center') - #clusterer.single_linkage_tree_.plot(truncate_mode='lastp',p=50) - ax = clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis', - edge_alpha=0.6, - node_size=10, - edge_linewidth=1) #tkinter.messagebox.showinfo("messagr", str(type(ax))) - fig4.canvas.set_window_title('Minimum Spanning Tree') - plt.title(f'Minimum Spanning Tree @ {clusterTreeCuttingDist}m', fontsize=12,loc='center') - ax.legend(fontsize=10) - #ax=plt.gca() #plt.gca() for current axis, otherwise set appropriately. - #im=ax.images #this is a list of all images that have been plotted - #cb=im[0].colorbar ##in this case I assume to be interested to the last one plotted, otherwise use the appropriate index - #cb.ax.tick_params(labelsize=10) - #vals = cb.ax.get_yticks() - #cb.ax.set_yticklabels(['{:3.1f}m'.format(getMetersFromRadians(x)) for x in vals]) - else: - plt.figure(4).canvas.set_window_title('Minimum Spanning Tree') - fig4 = clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis', - edge_alpha=0.6, - node_size=10, - edge_linewidth=1) #tkinter.messagebox.showinfo("messagr", str(type(ax))) - plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') - plt.title(f'Minimum Spanning Tree @ {clusterTreeCuttingDist}m', fontsize=12,loc='center') - fig4.legend(fontsize=10) - #ax=plt.gca() #plt.gca() for current axis, otherwise set appropriately. - #im=ax.images #this is a list of all images that have been plotted - #cb=im[0].colorbar ##in this case I assume to be interested to the last one plotted, otherwise use the appropriate index - #cb.ax.tick_params(labelsize=10) - #vals = cb.ax.get_yticks() - #cb.ax.set_yticklabels(['{:3.1f}m'.format(getMetersFromRadians(x)) for x in vals]) - plt.tick_params(labelsize=10) - #adjust scalebar limits - global tkScalebar - tkScalebar.configure(from_=(clusterTreeCuttingDist/100), to=(clusterTreeCuttingDist*2)) - - def change_clusterDist(val): - #tkinter.messagebox.showinfo("messagr", val) - global clusterTreeCuttingDist - #global canvas - #global tkScalebar - clusterTreeCuttingDist = float(val)#tkScalebar.get() - - def onselect(evt): - # Note here that Tkinter passes an event object to onselect() - global lastselectionList - global tnum - w = evt.widget - if lastselectionList: #if not empty - changedSelection = set(lastselectionList).symmetric_difference(set(w.curselection())) - lastselectionList = w.curselection() - else: - lastselectionList = w.curselection() - changedSelection = w.curselection() - index = int(list(changedSelection)[0]) - value = w.get(index) - #tkinter.messagebox.showinfo("You selected ", value) - tnum = 1 - cluster_tag(topTagsList[index],True) #generate only preview map - #plt.close('all') - def cluster_currentDisplayTag(): - if currentDisplayTag: - #tkinter.messagebox.showinfo("Clustertag: ", str(currentDisplayTag)) - cluster_tag(currentDisplayTag) - else: - cluster_tag(topTagsList[0]) - def scaletest_currentDisplayTag(): - global tkScalebar - global fig1 - if currentDisplayTag: - clustertag = currentDisplayTag - else: - clustertag = topTagsList[0] - scalecalclist = [] - dmax = int(clusterTreeCuttingDist*10) - dmin = int(clusterTreeCuttingDist/10) - dstep = int(((clusterTreeCuttingDist*10)-(clusterTreeCuttingDist/10))/100) - for i in range(dmin,dmax,dstep): - change_clusterDist(i) - tkScalebar.set(i) - app.update() - #clusterTreeCuttingDist = i - clusters, selectedPhotoList_Guids = cluster_tag(clustertag, None, True) - mask_noisy = (clusters == -1) - number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0) - if number_of_clusters == 1: - break - form_string = f'{i},{number_of_clusters},{mask_noisy.sum()},{len(mask_noisy)},\n' - scalecalclist.append(form_string) - with open(f'02_Output/scaletest_{clustertag[0]}.txt', "w", encoding='utf-8') as logfile_a: - for scalecalc in scalecalclist: - logfile_a.write(scalecalc) - plt.figure(1).clf() - plt.suptitle(clustertag[0].upper(), fontsize=18, fontweight='bold') #plt references the last figure accessed - fig1 = plt.figure(num=1,figsize=(11, int(11*imgRatio)), dpi=80) - fig1.canvas.set_window_title('Cluster Preview') - distText = '' - if autoselectClusters == False: - distText = f'@ {clusterTreeCuttingDist}m' - plt.title(f'Cluster Preview {distText}', fontsize=12,loc='center') - noisyTxt = f'{mask_noisy.sum()}/{len(mask_noisy)}' - plt.text(limXMax, limYMax,f'{number_of_clusters} Cluster (Noise: {noisyTxt})',fontsize=10,horizontalalignment='right',verticalalignment='top',fontweight='bold') - - def delete(listbox): - global topTagsList - global lastselectionList - lastselectionList = [] - # Delete from Listbox - selection = listbox.curselection() - for index in selection[::-1]: - listbox.delete(index) - del(topTagsList[index]) - ###################################################################################################################################################### - ###################################################################################################################################################### - ###################################################################################################################################################### - - #A frame is created for each window/part of the gui; after it is used, it is destroyed with frame.destroy() - - listboxFrame = tk.Frame(app.floater) - canvas = tk.Canvas(listboxFrame, width=150, height=200, highlightthickness=0,background="gray7") - l = tk.Label(canvas, text="Optional: Exclude tags.", background="gray7",fg="gray80",font="Arial 10 bold") - l.pack(padx=10, pady=10) - l = tk.Label(canvas, text="Select all tags you wish to exclude from analysis \n and click on remove to proceed.", background="gray7",fg="gray80") - l.pack(padx=10, pady=10) - #if DSource == "fromInstagram_PGlbsnEmoji": - # listbox_font = ("twitter Color Emoji", 12, "bold") - # #listbox_font = ("Symbola", 12, "bold") - #else: - listbox_font = None - listbox = tk.Listbox(canvas,selectmode=tk.MULTIPLE, bd=0,background="gray29",fg="gray91",width=30,font=listbox_font) - listbox.bind('<>', onselect) - scroll = tk.Scrollbar(canvas, orient=tk.VERTICAL,background="gray20",borderwidth=0) - scroll.configure(command=listbox.yview) - scroll.pack(side="right", fill="y") - listbox.pack() - listbox.config(yscrollcommand=scroll.set) - for item in topTagsList: #only for first 500 entries: use topTagsList[:500] - try: - listbox.insert(tk.END, f'{item[0]} ({item[1]} user)') - except tk.TclError: - emoji = unicode_name(item[0]) #utils.with_surrogates() - listbox.insert(tk.END, f'{emoji} ({item[1]} user)') - canvas.pack(fill='both',padx=0, pady=0) - listboxFrame.pack(fill='both',padx=0, pady=0) - buttonsFrame = tk.Frame(app.floater) - canvas = tk.Canvas(buttonsFrame, width=150, height=200, highlightthickness=0,background="gray7") - b = tk.Button(canvas, text = "Remove Tag(s)", command = lambda: delete(listbox), background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") - b.pack(padx=10, pady=10) - c = tk.Checkbutton(canvas, text="Map Tags", variable=genPreviewMap,background="gray7",fg="gray80",borderwidth=0,font="Arial 10 bold") - c.pack(padx=10, pady=10) - tkScalebar = tk.Scale(canvas, from_=(clusterTreeCuttingDist/100), to=(clusterTreeCuttingDist*2), orient=tk.HORIZONTAL,resolution=0.1,command=change_clusterDist,length=300,label="Cluster Cut Distance (in Meters)",background="gray20",borderwidth=0,fg="gray80",font="Arial 10 bold") - tkScalebar.set(clusterTreeCuttingDist)#(clusterTreeCuttingDist*10) - (clusterTreeCuttingDist/10)/2) #set position of slider to center - tkScalebar.pack() - b = tk.Button(canvas, text = "Cluster Preview", command=cluster_currentDisplayTag, background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") - b.pack(padx=10, pady=10,side="left") - b = tk.Button(canvas, text = "Scale Test", command=scaletest_currentDisplayTag, background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") - b.pack(padx=10, pady=10,side="left") - b = tk.Button(canvas, text = "Proceed..", command=proceedWithCluster, background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") - b.pack(padx=10, pady=10,side="left") - b = tk.Button(canvas, text = "Quit", command=quitTkinter, background="gray20",fg="gray80",borderwidth=0,font="Arial 10 bold") - b.pack(padx=10, pady=10,side="left") - canvas.pack(fill='both',padx=0, pady=0) - buttonsFrame.pack(fill='both',padx=0, pady=0) - - #END OF TKINTER LOOP, welcome back to command line interface - app.mainloop() - plt.close("all") - - noClusterPhotos_perTag_DictOfLists = defaultdict(list) - clustersPerTag = defaultdict(list) - if proceedClusting: - #Proceed with clustering all tags - crs_wgs = pyproj.Proj(init='epsg:4326') #data always in lat/lng WGS1984 - if overrideCRS is None: - #Calculate best UTM Zone SRID/EPSG Code - input_lon_center = bound_points_shapely.centroid.coords[0][0] #True centroid (coords may be multipoint) - input_lat_center = bound_points_shapely.centroid.coords[0][1] - epsg_code = utils.convert_wgs_to_utm(input_lon_center, input_lat_center) - crs_proj = pyproj.Proj(init=f'epsg:{epsg_code}') - project = lambda x, y: pyproj.transform(pyproj.Proj(init='epsg:4326'), pyproj.Proj(init=f'epsg:{epsg_code}'), x, y) - #geom_proj = transform(project, alphaShapeAndMeta[0]) - - if localSaturationCheck: - clusters, selectedPhotoList_Guids = cluster_tag(singleMostUsedtag, None, True) - numpy_selectedPhotoList_Guids = np.asarray(selectedPhotoList_Guids) - mask_noisy = (clusters == -1) - number_of_clusters = len(np.unique(clusters[~mask_noisy])) - print_store_log(f'--> {number_of_clusters} cluster.') - clusterPhotosGuidsList = [] - for x in range(number_of_clusters): - currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x] - clusterPhotosGuidsList.append(currentClusterPhotoGuids) - noClusterPhotos_perTag_DictOfLists[singleMostUsedtag[0]] = list(numpy_selectedPhotoList_Guids[clusters==-1]) - # Sort descending based on size of cluster: https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists - clusterPhotosGuidsList.sort(key=len, reverse=True) - if not len(clusterPhotosGuidsList) == 0: - clustersPerTag[singleMostUsedtag[0]] = clusterPhotosGuidsList - tnum = 1 - for toptag in topTagsList: - if localSaturationCheck and tnum == 1 and toptag[0] in clustersPerTag: - #skip toptag if already clustered due to local saturation - continue - clusters, selectedPhotoList_Guids = cluster_tag(toptag, None, True) - #print("baseDataList: ") - #print(str(type(selectedPhotoList))) - #for s in selectedPhotoList[:2]: - # print(*s) - #print("resultData: ") - ##for s in clusters[:2]: - ## print(*s) - #print(str(type(clusters))) - #print(clusters) - #clusters contains the cluster values (-1 = no cluster, 0 maybe, >0 = cluster - # in the same order, selectedPhotoList contains all original photo data, thus clusters[10] and selectedPhotoList[10] refer to the same photo - - numpy_selectedPhotoList_Guids = np.asarray(selectedPhotoList_Guids) - mask_noisy = (clusters == -1) - if len(selectedPhotoList_Guids) == 1: - number_of_clusters = 0 - else: - number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0) - #if number_of_clusters > 200: - # print_store_log("--> Too many, skipped for this scale.") - # continue - if not number_of_clusters == 0: - print_store_log(f'--> {number_of_clusters} cluster.') - tnum += 1 - photo_num = 0 - #clusternum_photolist = zip(clusters,selectedPhotoList) - #clusterPhotosList = [[] for x in range(number_of_clusters)] - clusterPhotosGuidsList = [] - for x in range(number_of_clusters): - currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x] - clusterPhotosGuidsList.append(currentClusterPhotoGuids) - noClusterPhotos_perTag_DictOfLists[toptag[0]] = list(numpy_selectedPhotoList_Guids[clusters==-1]) - # Sort descending based on size of cluster: https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists - clusterPhotosGuidsList.sort(key=len, reverse=True) - if not len(clusterPhotosGuidsList) == 0: - clustersPerTag[toptag[0]] = clusterPhotosGuidsList - else: - print_store_log("--> No cluster.") - noClusterPhotos_perTag_DictOfLists[toptag[0]] = list(numpy_selectedPhotoList_Guids) - #for x in clusters: - # #photolist = [] - # if x >= 0: # no clusters: x = -1 - # clusterPhotosList[x].append([selectedPhotoList[photo_num]]) - # #clusterPhotosArray_dict[x].add(selectedPhotoList[photo_num]) - # else: - # noClusterPhotos_perTag_DictOfLists[toptag[0]].append(selectedPhotoList[photo_num]) - # photo_num+=1 - - #print("resultList: ") - #for s in clusterPhotosList[:2]: - # print(*s) - #print(str(toptag) + " - Number of clusters: " + str(len(clusterPhotosList)) + " Photo num: " + str(photo_num)) - - #plt.autoscale(enable=True) - - #if tnum == 50: - # break - #plt.savefig('foo.png') - #sys.exit() - print_store_log("########## STEP 4 of 6: Generating Alpha Shapes ##########") - #if (tnum % 50 == 0):#modulo: if division has no remainder, force update cmd output - sys.stdout.flush() - #for each cluster of points, calculate boundary shape and add statistics (HImpTag etc.) - listOfAlphashapesAndMeta = [] - tnum = 0 - if localSaturationCheck: - #calculate total area of Top1-Tag for 80% saturation check for lower level tags - saturationExcludeCount = 0 - clusterPhotoGuidList = clustersPerTag.get(singleMostUsedtag[0], None) - #print("Toptag: " + str(singleMostUsedtag[0])) - if clusterPhotoGuidList is None: - sys.exit(f'No Photos found for toptag: {singleMostUsedtag}') - toptagArea = utils.generateClusterShape(toptag,clusterPhotoGuidList,cleanedPhotoDict,crs_wgs,crs_proj,clusterTreeCuttingDist,localSaturationCheck)[1] - for toptag in topTagsList: - tnum += 1 - clusterPhotoGuidList = clustersPerTag.get(toptag[0], None) - #Generate tag Cluster Shapes - if clusterPhotoGuidList: - listOfAlphashapesAndMeta_tmp,tagArea = utils.generateClusterShape(toptag,clusterPhotoGuidList,cleanedPhotoDict,crs_wgs,crs_proj,clusterTreeCuttingDist,localSaturationCheck) - if localSaturationCheck and not tagArea == 0 and not tnum == 1: - localSaturation = tagArea/(toptagArea/100) - #print("Local Saturation for Tag " + toptag[0] + ": " + str(round(localSaturation,0))) - if localSaturation > 60: - #skip tag entirely due to saturation (if total area > 80% of total area of toptag clusters) - #print("Skipped: " + toptag[0] + " due to saturation (" + str(round(localSaturation,0)) + "%).") - saturationExcludeCount += 1 - continue #next toptag - - if len(listOfAlphashapesAndMeta_tmp) > 0: - listOfAlphashapesAndMeta.extend(listOfAlphashapesAndMeta_tmp) - - singlePhotoGuidList = noClusterPhotos_perTag_DictOfLists.get(toptag[0], None) - if singlePhotoGuidList: - shapetype = "Single cluster" - #print("Single: " + str(len(singlePhotoGuidList))) - photos = [cleanedPhotoDict[x] for x in singlePhotoGuidList] - for single_photo in photos: - #project lat/lng to UTM - x, y = pyproj.transform(crs_wgs, crs_proj, single_photo.lng, single_photo.lat) - pcoordinate = geometry.Point(x, y) - result_polygon = pcoordinate.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area - #result_polygon = pcoordinate.buffer(min(distXLng,distYLat)/100,resolution=3) - if result_polygon is not None and not result_polygon.is_empty: - listOfAlphashapesAndMeta.append((result_polygon,1,max(single_photo.photo_views,single_photo.photo_likes),1,str(toptag[0]),toptag[1],1,1,1,shapetype)) - print_store_log(f'{len(listOfAlphashapesAndMeta)} Alpha Shapes. Done.') - if localSaturationCheck and not saturationExcludeCount == 0: - print_store_log(f'Excluded {saturationExcludeCount} Tags on local saturation check.') - ##Output Boundary Shapes in merged Shapefile## - print_store_log("########## STEP 5 of 6: Writing Results to Shapefile ##########") - - ##Calculate best UTM Zone SRID/EPSG Code - #input_lon_center = bound_points_shapely.centroid.coords[0][0] #True centroid (coords may be multipoint) - #input_lat_center = bound_points_shapely.centroid.coords[0][1] - #epsg_code = utils.convert_wgs_to_utm(input_lon_center, input_lat_center) - #project = lambda x, y: pyproj.transform(pyproj.Proj(init='epsg:4326'), pyproj.Proj(init='epsg:{0}'.format(epsg_code)), x, y) - - # Define polygon feature geometry - schema = { - 'geometry': 'Polygon', - 'properties': {'Join_Count': 'int', - 'Views': 'int', - 'COUNT_User': 'int', - 'ImpTag': 'str', - 'TagCountG': 'int', - 'HImpTag': 'int', - 'Weights': 'float', - 'WeightsV2': 'float', - 'WeightsV3': 'float', - #'shapetype': 'str', - 'emoji': 'int'}, - } - - #Normalization of Values (1-1000 Range), precalc Step: - ####################################### - weightsv1_range = [x[6] for x in listOfAlphashapesAndMeta] #get the n'th column out for calculating the max/min - weightsv2_range = [x[7] for x in listOfAlphashapesAndMeta] - weightsv3_range = [x[8] for x in listOfAlphashapesAndMeta] - weightsv1_min = min(weightsv1_range) - weightsv1_max = max(weightsv1_range) - weightsv2_min = min(weightsv2_range) - weightsv2_max = max(weightsv2_range) - weightsv3_min = min(weightsv3_range) - weightsv3_max = max(weightsv3_range) - #precalc - #https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range - weightsv1_mod_a = (1000-1)/(weightsv1_max-weightsv1_min) - weightsv1_mod_b = 1000 - weightsv1_mod_a * weightsv1_max - weightsv2_mod_a = (1000-1)/(weightsv2_max-weightsv2_min) - weightsv2_mod_b = 1000 - weightsv2_mod_a * weightsv2_max - weightsv3_mod_a = (1000-1)/(weightsv3_max-weightsv3_min) - weightsv3_mod_b = 1000 - weightsv3_mod_a * weightsv3_max - ####################################### - # Write a new Shapefile - # WGS1984 - if (clusterTags == False and clusterEmojis == True): - shapefileName = "allEmojiCluster" - else: - shapefileName = "allTagCluster" - with fiona.open(f'02_Output/{shapefileName}.shp', mode='w', encoding='UTF-8', driver='ESRI Shapefile', schema=schema,crs=from_epsg(epsg_code)) as c: - # Normalize Weights to 0-1000 Range - idx = 0 - for alphaShapeAndMeta in listOfAlphashapesAndMeta: - if idx == 0: - HImP = 1 - else: - if listOfAlphashapesAndMeta[idx][4] != listOfAlphashapesAndMeta[idx-1][4]: - HImP = 1 - else: - HImP = 0 - #emoName = unicode_name(alphaShapeAndMeta[4]) - #Calculate Normalized Weights Values based on precalc Step - if not alphaShapeAndMeta[6] == 1: - weight1_normalized = weightsv1_mod_a * alphaShapeAndMeta[6] + weightsv1_mod_b - else: - weight1_normalized = 1 - if not alphaShapeAndMeta[7] == 1: - weight2_normalized = weightsv2_mod_a * alphaShapeAndMeta[7] + weightsv2_mod_b - else: - weight2_normalized = 1 - if not alphaShapeAndMeta[8] == 1: - weight3_normalized = weightsv3_mod_a * alphaShapeAndMeta[8] + weightsv3_mod_b - else: - weight3_normalized = 1 - idx += 1 - #project data - #geom_proj = transform(project, alphaShapeAndMeta[0]) - #c.write({ - # 'geometry': geometry.mapping(geom_proj), - if clusterEmojis and alphaShapeAndMeta[4] in globalEmojiSet: - emoji = 1 - ImpTagText = "" - else: - emoji = 0 - ImpTagText = f'{alphaShapeAndMeta[4]}' - c.write({ - 'geometry': geometry.mapping(alphaShapeAndMeta[0]), - 'properties': {'Join_Count': alphaShapeAndMeta[1], - 'Views': alphaShapeAndMeta[2], - 'COUNT_User': alphaShapeAndMeta[3], - 'ImpTag': ImpTagText, - 'TagCountG': alphaShapeAndMeta[5], - 'HImpTag': HImP, - 'Weights': weight1_normalized, - 'WeightsV2': weight2_normalized, - 'WeightsV3': weight3_normalized, - #'shapetype': alphaShapeAndMeta[9], - 'emoji': emoji}, - }) - if clusterEmojis: - with open("02_Output/emojiTable.csv", "w", encoding='utf-8') as emojiTable: - emojiTable.write("FID,Emoji\n") - idx = 0 - for alphaShapeAndMeta in listOfAlphashapesAndMeta: - if alphaShapeAndMeta[4] in globalEmojiSet: - ImpTagText = f'{alphaShapeAndMeta[4]}' - else: - ImpTagText = "" - emojiTable.write(f'{idx},{ImpTagText}\n') - idx += 1 - -else: - print(f'\nUser abort.') -if abort == False and clusterPhotos == True: - print_store_log("########## STEP 6 of 6: Calculating Overall Photo Location Clusters ##########") - - if not 'clusterTreeCuttingDist' in locals(): - clusterTreeCuttingDist = int(input("Specify Cluster (Cut) Distance:\n")) - selectedPhotoList_Guids = [] - if not 'cleanedPhotoList' in locals(): - cleanedPhotoList = list(cleanedPhotoDict.values()) - for cleanedPhotoLocation in cleanedPhotoList: - selectedPhotoList_Guids.append(cleanedPhotoLocation.photo_guid) - selectedPhotoList = cleanedPhotoList - df = pd.DataFrame(selectedPhotoList) - points = df.as_matrix(['lng','lat']) - tagRadiansData = np.radians(points) - clusterer = hdbscan.HDBSCAN(min_cluster_size=2,gen_min_span_tree=False,allow_single_cluster=False,min_samples=1) - #clusterer.fit(tagRadiansData) - with warnings.catch_warnings(): - #disable joblist multithread warning - warnings.simplefilter('ignore', UserWarning) - async_result = pool.apply_async(utils.fit_cluster, (clusterer, tagRadiansData)) - clusterer = async_result.get() - clusters = clusterer.single_linkage_tree_.get_clusters(utils.getRadiansFromMeters(clusterTreeCuttingDist/8), min_cluster_size=2) - listOfPhotoClusters = [] - numpy_selectedPhotoList_Guids = np.asarray(selectedPhotoList_Guids) - mask_noisy = (clusters == -1) - number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0) - print_store_log(f'--> {number_of_clusters} Photo cluster.') - #clusternum_photolist = zip(clusters,selectedPhotoList) - #clusterPhotosList = [[] for x in range(number_of_clusters)] - clusterPhotosGuidsList = [] - for x in range(number_of_clusters): - currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x] - clusterPhotosGuidsList.append(currentClusterPhotoGuids) - noClusterPhotos = list(numpy_selectedPhotoList_Guids[clusters==-1]) - clusterPhotosGuidsList.sort(key=len,reverse=True) - if clusterTags is False: - #detect projection if not already - limYMin,limYMax,limXMin,limXMax = utils.getRectangleBounds(points) - bound_points_shapely = geometry.MultiPoint([(limXMin, limYMin), (limXMax, limYMax)]) - crs_wgs = pyproj.Proj(init='epsg:4326') #data always in lat/lng WGS1984 - if overrideCRS is None: - #Calculate best UTM Zone SRID/EPSG Code - input_lon_center = bound_points_shapely.centroid.coords[0][0] #True centroid (coords may be multipoint) - input_lat_center = bound_points_shapely.centroid.coords[0][1] - epsg_code = utils.convert_wgs_to_utm(input_lon_center, input_lat_center) - crs_proj = pyproj.Proj(init=f'epsg:{epsg_code}') - for photo_cluster in clusterPhotosGuidsList: - photos = [cleanedPhotoDict[x] for x in photo_cluster] - uniqueUserCount = len(set([photo.userid for photo in photos])) - #get points and project coordinates to suitable UTM - points = [geometry.Point(pyproj.transform(crs_wgs, crs_proj, photo.lng, photo.lat)) - for photo in photos] - point_collection = geometry.MultiPoint(list(points)) - result_polygon = point_collection.convex_hull #convex hull - result_centroid = result_polygon.centroid - if result_centroid is not None and not result_centroid.is_empty: - listOfPhotoClusters.append((result_centroid,uniqueUserCount)) - #clusterPhotoGuidList = clustersPerTag.get(None, None) - #noclusterphotos = [cleanedPhotoDict[x] for x in singlePhotoGuidList] - for photoGuid_noCluster in noClusterPhotos: - photo = cleanedPhotoDict[photoGuid_noCluster] - x, y = pyproj.transform(crs_wgs, crs_proj, photo.lng, photo.lat) - pcenter = geometry.Point(x, y) - if pcenter is not None and not pcenter.is_empty: - listOfPhotoClusters.append((pcenter,1)) - # Define a polygon feature geometry with one attribute - schema = { - 'geometry': 'Point', - 'properties': {'Join_Count': 'int'}, - } - - # Write a new Shapefile - # WGS1984 - with fiona.open('02_Output/allPhotoCluster.shp', mode='w', driver='ESRI Shapefile', schema=schema,crs=from_epsg(epsg_code)) as c: - ## If there are multiple geometries, put the "for" loop here - idx = 0 - for photoCluster in listOfPhotoClusters: - idx += 1 - c.write({ - 'geometry': geometry.mapping(photoCluster[0]), - 'properties': {'Join_Count': photoCluster[1]}, - }) -print("Writing log file..") -later = time.time() -hours, rem = divmod(later-now, 3600) -minutes, seconds = divmod(rem, 60) -difference = int(later - now) -reportMsg = f'\nDone.\n{int(hours):0>2} Hours {int(minutes):0>2} Minutes and {seconds:05.2f} Seconds passed.' -log_texts_list.append(reportMsg) -with open(log_file, "w", encoding='utf-8') as logfile_a: - for logtext in log_texts_list: - logfile_a.write(logtext) -print(reportMsg) -input("Press any key to exit...") -sys.exit() diff --git a/tagmaps/utils.py b/tagmaps/utils.py deleted file mode 100644 index 2ce3d1c..0000000 --- a/tagmaps/utils.py +++ /dev/null @@ -1,411 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -# TimeTransponse_from_json definition of functions file -import sys -import tkinter as tk -import emoji -import unicodedata -import math - -def query_yes_no(question, default="yes"): - """Ask a yes/no question via raw_input() and return their answer. - - "question" is a string that is presented to the user. - "default" is the presumed answer if the user just hits . - It must be "yes" (the default), "no" or None (meaning - an answer is required of the user). - - The "answer" return value is True for "yes" or False for "no". - """ - valid = {"yes": True, "y": True, "ye": True, - "no": False, "n": False} - if default is None: - prompt = " [y/n] " - elif default == "yes": - prompt = " [Y/n] " - elif default == "no": - prompt = " [y/N] " - else: - raise ValueError("invalid default answer: '%s'" % default) - - while True: - sys.stdout.write(question + prompt) - choice = input().lower() - if default is not None and choice == '': - return valid[default] - elif choice in valid: - return valid[choice] - else: - sys.stdout.write("'yes' or 'no' " - "(or 'y' or 'n').\n") - -def daterange(start_date, end_date): - for n in range(int ((end_date - start_date).days)): - yield start_date + timedelta(n) - -from math import radians, cos, sin, asin, sqrt -def haversine(lon1, lat1, lon2, lat2): - """ - Calculate the great circle distance between two points - on the earth (specified in decimal degrees) - """ - # convert decimal degrees to radians - lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) - # haversine formula - dlon = lon2 - lon1 - dlat = lat2 - lat1 - a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 - c = 2 * asin(sqrt(a)) - # Radius of earth in kilometers is 6371 - km = 6371* c - m = km*1000 - return m - -def getRadiansFromMeters(dist): - dist = dist/1000 - degreesDist = dist/111.325 - radiansDist = degreesDist/57.2958 - return radiansDist - #https://www.mathsisfun.com/geometry/radians.html - #1 Radian is about 57.2958 degrees. - #then see https://sciencing.com/convert-distances-degrees-meters-7858322.html - #Multiply the number of degrees by 111.325 - #To convert this to meters, multiply by 1,000. So, 2 degrees is 222,65 meters. -def getMetersFromRadians(dist): - dist = dist * 57.2958 - dist = dist * 111.325 - metersDist = round(dist * 1000,1) - - return metersDist - #1 Radian is about 57.2958 degrees. - #then see https://sciencing.com/convert-distances-degrees-meters-7858322.html - #Multiply the number of degrees by 111.325 - #To convert this to meters, multiply by 1,000. So, 2 degrees is 222,65 meters. - #plt.close('all') #clear memory - -#This function is not really needed, makes no difference! (really?) -def checkEmojiType(strEmo): - if unicodedata.name(strEmo).startswith(("EMOJI MODIFIER","VARIATION SELECTOR","ZERO WIDTH")): - return False - else: - return True - #see https://stackoverflow.com/questions/43852668/using-collections-counter-to-count-emojis-with-different-colors - # we want to ignore fitzpatrick modifiers and treat all differently colored emojis the same - #https://stackoverflow.com/questions/38100329/some-emojis-e-g-have-two-unicode-u-u2601-and-u-u2601-ufe0f-what-does -#COOKING -#OK HAND SIGN -#EMOJI MODIFIER FITZPATRICK TYPE-1-2 -#GRINNING FACE WITH SMILING EYES -#HEAVY BLACK HEART -#WEARY CAT FACE -#SMILING FACE WITH HEART-SHAPED EYES -#OK HAND SIGN -#EMOJI MODIFIER FITZPATRICK TYPE-1-2 -#GRINNING FACE WITH SMILING EYES -#PERSON WITH FOLDED HANDS -#EMOJI MODIFIER FITZPATRICK TYPE-3 -#WEARY CAT FACE - -##Emojitest -#n = '❤️👨‍⚕️' -##n = '👨‍⚕️' #medical emoji with zero-width joiner (http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html) -#nlist = def_functions.extract_emojis(n) -#with open("emojifile.txt", "w", encoding='utf-8') as emojifile: -# emojifile.write("Original: " + n + '\n') -# for xstr in nlist: -# emojifile.write('Emoji Extract: U+%04x' % ord(xstr) + '\n') -# emojifile.write(xstr + '\n') -# for _c in n: -# emojifile.write(str(unicode_name(_c)) + '\n') -# emojifile.write('Each Codepoint: U+%04x' % ord(_c) + '\n') -#def cleanEmoji(c): -# tuple = (u'\ufeff',u'\u200b',u'\u200d') -# for ex in tuple: -# c.replace(ex,"") -# return(c) -#https://github.com/carpedm20/emoji/ -#https://github.com/carpedm20/emoji/issues/75 -def extract_emojis(str): - #str = str.decode('utf-32').encode('utf-32', 'surrogatepass') - #return list(c for c in str if c in emoji.UNICODE_EMOJI) - return list(c for c in str if c in emoji.UNICODE_EMOJI and checkEmojiType(c) is True) -#this class is needed to override tkinter window with drag&drop option when overrideredirect = true -#class App: -# global tk -# def __init__(self): -# self.root = tk.Tk() -# #tk.Tk.__init__(self,master) -# self.root.overrideredirect(True) -# self.root.configure(background='gray7') -# self.root._offsetx = 0 -# self.root._offsety = 0 -# self.root.bind('',self.clickwin) -# self.root.bind('',self.dragwin) -# -# def dragwin(self,event): -# x = self.root.winfo_pointerx() - self._offsetx -# y = self.root.winfo_pointery() - self._offsety -# self.root.geometry('+{x}+{y}'.format(x=x,y=y)) -# -# def clickwin(self,event): -# self.root._offsetx = event.x -# self.root._offsety = event.y - -#tc unicode problem -#https://stackoverflow.com/questions/40222971/python-find-equivalent-surrogate-pair-from-non-bmp-unicode-char -import re -_nonbmp = re.compile(r'[\U00010000-\U0010FFFF]') -def _surrogatepair(match): - char = match.group() - assert ord(char) > 0xffff - encoded = char.encode('utf-16-le') - return ( - chr(int.from_bytes(encoded[:2], 'little')) + - chr(int.from_bytes(encoded[2:], 'little'))) -def with_surrogates(text): - return _nonbmp.sub(_surrogatepair, text) - -#https://stackoverflow.com/questions/40132542/get-a-cartesian-projection-accurate-around-a-lat-lng-pair -def convert_wgs_to_utm(lon, lat): - utm_band = str((math.floor((lon + 180) / 6 ) % 60) + 1) - if len(utm_band) == 1: - utm_band = '0'+utm_band - if lat >= 0: - epsg_code = '326' + utm_band - else: - epsg_code = '327' + utm_band - return epsg_code - -def str2bool(v): - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - -import fiona #Fiona needed for reading Shapefile -from fiona.crs import from_epsg -import shapely.geometry as geometry -import pyproj #import Proj, transform -#https://gis.stackexchange.com/questions/127427/transforming-shapely-polygon-and-multipolygon-objects -from shapely.ops import transform -#from shapely.geometry import Polygon -#from shapely.geometry import shape -#from shapely.geometry import Point -from decimal import Decimal -import numpy as np -def generateClusterShape(toptag,clusterPhotoGuidList,cleanedPhotoDict,crs_wgs,crs_proj,clusterTreeCuttingDist,localSaturationCheck): - #we define a new list of Temp Alpha Shapes outside the loop, so that it is not overwritten each time - listOfAlphashapesAndMeta_tmp = [] - #points = [] - tagArea = 0 - for photo_guids in clusterPhotoGuidList: - #for each cluster for this toptag - photos = [cleanedPhotoDict[x] for x in photo_guids] - photoCount = len(photo_guids) - uniqueUserCount = len(set([photo.userid for photo in photos])) - sumViews = sum([photo.photo_views for photo in photos]) - #calculate different weighting formulas - #weightsv1 = 1+ photoCount *(sqrt(1/( photoCount / uniqueUserCount )**3)) #-> Standard weighting formula (x**y means x raised to the power y); +1 to UserCount: prevent 1-2 Range from being misaligned - #weightsv2 = 1+ photoCount *(sqrt(1/( photoCount / uniqueUserCount )**2)) - weightsv1 = photoCount *(sqrt(1/( photoCount / (uniqueUserCount+1) )**3)) #-> Standard weighting formula (x**y means x raised to the power y); +1 to UserCount: prevent 1-2 Range from being misaligned - weightsv2 = photoCount *(sqrt(1/( photoCount / (uniqueUserCount+1) )**2)) #-> less importance on User_Count in correlation to photo count [Join_Count]; +1 to UserCount: prevent 1-2 Range from being misaligned - weightsv3 = sqrt((photoCount+(2*sqrt(photoCount)))*2) #-> Ignores User_Count, this will emphasize individual and very active users - #points = [geometry.Point(photo.lng, photo.lat) - # for photo in photos] - #instead of lat/lng for each photo, we use photo_locID to identify a list of distinct locations - distinctLocations = set([photo.photo_locID - for photo in photos]) - #simple list comprehension without projection: - #points = [geometry.Point(Decimal(location.split(':')[1]), Decimal(location.split(':')[0])) - # for location in distinctLocations] - points = [geometry.Point(pyproj.transform(crs_wgs, crs_proj, Decimal(location.split(':')[1]), Decimal(location.split(':')[0]))) - for location in distinctLocations] - point_collection = geometry.MultiPoint(list(points)) - result_polygon = None - - if len(points) >= 5: - if len(points) < 10: - result_polygon = point_collection.convex_hull #convex hull - result_polygon = result_polygon.buffer(clusterTreeCuttingDist/4,resolution=3) - shapetype = "between 5 and 10 points_convexHull" - #result_polygon = result_polygon.buffer(min(distXLng,distYLat)/100,resolution=3) - else: - if len(points) > 500: - startalpha = 1000000 - elif len(points) > 200: - startalpha = 10000 - else: - startalpha = 9000 - result_polygon = alpha_shape(points,alpha=clusterTreeCuttingDist/startalpha) #concave hull/alpha shape /50000 - shapetype = "Initial Alpha Shape + Buffer" - if type(result_polygon) is geometry.multipolygon.MultiPolygon or isinstance(result_polygon, bool): - #repeat generating alpha shapes with smaller alpha value if Multigon is generated - #smaller alpha values mean less granularity of resulting polygon - #but too large alpha may result in empty polygon - #(this branch is sometimes executed for larger scales) - for i in range(1,6): - #try decreasing alpha - alpha = startalpha + (startalpha * (i**i)) #** means cube - result_polygon = alpha_shape(points,alpha=clusterTreeCuttingDist/alpha)#/100000 - if not type(result_polygon) is geometry.multipolygon.MultiPolygon and not isinstance(result_polygon, bool): - shapetype = "Multipolygon Alpha Shape /" + str(alpha) - break - if type(result_polygon) is geometry.multipolygon.MultiPolygon or isinstance(result_polygon, bool): - #try increasing alpha - for i in range(1,6): - #try decreasing alpha - alpha = startalpha / (i*i) - result_polygon = alpha_shape(points,alpha=clusterTreeCuttingDist/alpha)#/100000 - if not type(result_polygon) is geometry.multipolygon.MultiPolygon and not isinstance(result_polygon, bool): - shapetype = "Multipolygon Alpha Shape /" + str(alpha) - break - if type(result_polygon) is geometry.multipolygon.MultiPolygon: - shapetype = "Multipolygon Alpha Shape -> Convex Hull" - #if still of type multipolygon, try to remove holes and do a convex_hull - result_polygon = result_polygon.convex_hull - #OR: in case there was a problem with generating alpha shapes (circum_r = a*b*c/(4.0*area) --> ZeroDivisionError: float division by zero) - #this branch is rarely executed for large point clusters where alpha is perhaps set too small - elif isinstance(result_polygon, bool) or result_polygon.is_empty: - shapetype = "BoolAlpha -> Fallback to PointCloud Convex Hull" - result_polygon = point_collection.convex_hull #convex hull - #Finally do a buffer to smooth alpha - result_polygon = result_polygon.buffer(clusterTreeCuttingDist/4,resolution=3) - #result_polygon = result_polygon.buffer(min(distXLng,distYLat)/100,resolution=3) - elif 2 <= len(points) < 5: - shapetype = "between 2 and 5 points_buffer" - #calc distance between points http://www.mathwarehouse.com/algebra/distance_formula/index.php - #bdist = math.sqrt((points[0].coords.xy[0][0]-points[1].coords.xy[0][0])**2 + (points[0].coords.xy[1][0]-points[1].coords.xy[1][0])**2) - #print(str(bdist)) - result_polygon = point_collection.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area - result_polygon = result_polygon.convex_hull - #result_polygon = point_collection.buffer(min(distXLng,distYLat)/100,resolution=3) #single dots are presented as buffer with 0.5% of width-area - elif len(points)==1 or type(result_polygon) is geometry.point.Point or result_polygon is None: - shapetype = "1 point cluster" - result_polygon = point_collection.buffer(clusterTreeCuttingDist/4,resolution=3) #single dots are presented as buffer with 0.5% of width-area - #result_polygon = point_collection.buffer(min(distXLng,distYLat)/100,resolution=3) #single dots are presented as buffer with 0.5% of width-area - #final check for multipolygon - if type(result_polygon) is geometry.multipolygon.MultiPolygon: - #usually not executed - result_polygon = result_polygon.convex_hull - #Geom, Join_Count, Views, COUNT_User,ImpTag,TagCountG,HImpTag - if result_polygon is not None and not result_polygon.is_empty: - if localSaturationCheck: - tagArea += result_polygon.area - listOfAlphashapesAndMeta_tmp.append((result_polygon,photoCount,sumViews,uniqueUserCount,toptag[0],toptag[1],weightsv1,weightsv2,weightsv3,shapetype)) - if len(listOfAlphashapesAndMeta_tmp) > 0: - # finally sort and append all cluster shapes for this tag - listOfAlphashapesAndMeta_tmp = sorted(listOfAlphashapesAndMeta_tmp,key=lambda x: -x[6]) - return listOfAlphashapesAndMeta_tmp, tagArea - -from descartes import PolygonPatch -def plot_polygon(polygon): - fig = plt.figure(figsize=(10,10)) - ax = fig.add_subplot(111) - margin = .3 - x_min, y_min, x_max, y_max = polygon.bounds - ax.set_xlim([x_min-margin, x_max+margin]) - ax.set_ylim([y_min-margin, y_max+margin]) - patch = PolygonPatch(polygon, fc='#999999', - ec='#000000', fill=True, - zorder=-1) - ax.add_patch(patch) - return fig -from shapely.ops import cascaded_union, polygonize -from scipy.spatial import Delaunay -import math -from math import sqrt -def alpha_shape(points, alpha): - """ - Alpha Shapes Code by KEVIN DWYER - see http://blog.thehumangeo.com/2014/05/12/drawing-boundaries-in-python/ - Compute the alpha shape (concave hull) of a set - of points. - @param points: Iterable container of points. - @param alpha: alpha value to influence the - gooeyness of the border. Smaller numbers - don't fall inward as much as larger numbers. - Too large, and you lose everything! - """ - if len(points) < 4: - # When you have a triangle, there is no sense - # in computing an alpha shape. - return geometry.MultiPoint(list(points)).convex_hull - def add_edge(edges, edge_points, coords, i, j): - """ - Add a line between the i-th and j-th points, - if not in the list already - """ - if (i, j) in edges or (j, i) in edges: - # already added - return - edges.add( (i, j) ) - edge_points.append(coords[ [i, j] ]) - coords = np.array([point.coords[0] - for point in points]) - - #print(str(len(coords))) - tri = Delaunay(coords)#,qhull_o}ptions = 'QJ') #To avoid this error, you can joggle the data by specifying the 'QJ' option to the DELAUNAY function. https://de.mathworks.com/matlabcentral/answers/94438-why-does-the-delaunay-function-in-matlab-7-0-r14-produce-an-error-when-passed-colinear-points?s_tid=gn_loc_drop - #tri = Delaunay(coords,{'QJ'}) #Version 3.1 added triangulated output ('Qt'). It should be used for Delaunay triangulations instead of using joggled input ('QJ'). - edges = set() - edge_points = [] - # loop over triangles: - # ia, ib, ic = indices of corner points of the - # triangle - for ia, ib, ic in tri.vertices: - pa = coords[ia] - pb = coords[ib] - pc = coords[ic] - # Lengths of sides of triangle - a = math.sqrt((pa[0]-pb[0])**2 + (pa[1]-pb[1])**2) - b = math.sqrt((pb[0]-pc[0])**2 + (pb[1]-pc[1])**2) - c = math.sqrt((pc[0]-pa[0])**2 + (pc[1]-pa[1])**2) - # Semiperimeter of triangle - s = (a + b + c)/2.0 - # Area of triangle by Heron's formula - try: - area = math.sqrt(s*(s-a)*(s-b)*(s-c)) - except ValueError: - return False - if area == 0: - return False - circum_r = a*b*c/(4.0*area) - # Here's the radius filter. - #print circum_r - if circum_r < 1.0/alpha: - add_edge(edges, edge_points, coords, ia, ib) - add_edge(edges, edge_points, coords, ib, ic) - add_edge(edges, edge_points, coords, ic, ia) - m = geometry.MultiLineString(edge_points) - triangles = list(polygonize(m)) - return cascaded_union(triangles)#, edge_points - #return geometry.polygon.asPolygon(edge_points,holes=None) - -def fit_cluster(clusterer, data): - clusterer.fit(data) - return clusterer -def getRectangleBounds(points): - limYMin = np.min(points.T[1]) - limYMax = np.max(points.T[1]) - limXMin = np.min(points.T[0]) - limXMax = np.max(points.T[0]) - return limYMin,limYMax,limXMin,limXMax -def filterTags(taglist,SortOutAlways_set,SortOutAlways_inStr_set): - count_tags = 0 - count_skipped = 0 - #Filter tags based on two stoplists - photo_tags_filtered = set() - for tag in taglist: - count_tags += 1 - #exclude numbers and those tags that are in SortOutAlways_set - if len(tag) == 1 or tag == '""' or tag.isdigit() or tag in SortOutAlways_set: - count_skipped += 1 - continue - for inStr in SortOutAlways_inStr_set: - if inStr in tag: - count_skipped += 1 - break - else: - photo_tags_filtered.add(tag) - return photo_tags_filtered,count_tags,count_skipped \ No newline at end of file