From 611bef5bf1fdd41a35e4030f8a73b34c164ff084 Mon Sep 17 00:00:00 2001 From: AD Date: Fri, 12 Jan 2018 16:55:33 +0100 Subject: [PATCH] Final data output, tested! --- generateTagClusters.py | 162 ++++++++++++++++++++++++++++++----------- 1 file changed, 119 insertions(+), 43 deletions(-) diff --git a/generateTagClusters.py b/generateTagClusters.py index 4e0bbf8..1b58cf2 100644 --- a/generateTagClusters.py +++ b/generateTagClusters.py @@ -484,9 +484,11 @@ def is_number(s): print("\nTotal distinct locations: " + str(total_distinct_locations)) #boundary: print("Bounds are: Min " + str(float(limLngMin)) + " " + str(float(limLatMin)) + " Max " + str(float(limLngMax)) + " " + str(float(limLatMax))) -cleanedPhotoList = [] +#cleanedPhotoList = [] + #create structure for tuple with naming for easy referencing cleanedPhotoLocation_tuple = namedtuple('cleanedPhotoLocation_tuple', 'source lat lng photo_guid photo_owner userid photo_caption photo_dateTaken photo_uploadDate photo_views photo_tags photo_thumbnail photo_mTags photo_likes photo_comments photo_shortcode photo_mediatype photo_locName photo_locID') +cleanedPhotoDict = defaultdict(cleanedPhotoLocation_tuple) with open("Output/Output_cleaned.txt", 'w', encoding='utf8') as csvfile: csvfile.write("SOURCE,Latitude,Longitude,PhotoID,Owner,UserID,Name,DateTaken,UploadDate,Views,Tags,URL,MTags,Likes,Comments,Shortcode,Type,LocName,LocID," + '\n') datawriter = csv.writer(csvfile, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) @@ -536,7 +538,8 @@ def is_number(s): cleanedPhotoLocation.photo_locName,#photo_locName = 17 cleanedPhotoLocation.photo_locID]#photo_locID = 18 ) - cleanedPhotoList.append(cleanedPhotoLocation) + #cleanedPhotoList.append(cleanedPhotoLocation) + cleanedPhotoDict[cleanedPhotoLocation.photo_guid] = cleanedPhotoLocation print("########## STEP 2 of 5: Tag Ranking ##########") overallNumOfUsersPerTag_global = collections.Counter() @@ -662,6 +665,7 @@ def report_callback_exception(self, exc, val, tb): #Optional: set global plotting bounds #plt.gca().set_xlim([limXMin, limXMax]) #plt.gca().set_ylim([limYMin, limYMax]) +cleanedPhotoList = list(cleanedPhotoDict.values()) df = pd.DataFrame(cleanedPhotoList) points = df.as_matrix(['lng','lat']) limYMin = np.min(points.T[1]) @@ -727,12 +731,12 @@ def proceedWithCluster(): def sel_photos(tag,cleanedPhotoList): #select photos from list based on a specific tag distinctLocalLocationCount = set() - selectedPhotoList = [] + selectedPhotoList_Guids = [] for cleanedPhotoLocation in cleanedPhotoList: if tag in (cleanedPhotoLocation.photo_tags) or (tag in cleanedPhotoLocation.photo_caption): - selectedPhotoList.append(cleanedPhotoLocation) + selectedPhotoList_Guids.append(cleanedPhotoLocation.photo_guid) distinctLocalLocationCount.add(cleanedPhotoLocation.photo_locID) - return selectedPhotoList, len(distinctLocalLocationCount) + return selectedPhotoList_Guids, len(distinctLocalLocationCount) def fit_cluster(clusterer, data): clusterer.fit(data) @@ -757,12 +761,13 @@ def cluster_tag(toptag,preview=None,silent=None): #canvas = tk.Canvas(graphFrame, width=canvasWidth, height=canvasHeight, highlightthickness=0,background="gray7") #l = tk.Label(canvas, text="Preview Map", background="gray7",fg="gray80",font="Arial 10 bold") #l.pack() - selectedPhotoList, distinctLocalLocationCount = sel_photos(toptag[0],cleanedPhotoList) + selectedPhotoList_Guids, distinctLocalLocationCount = sel_photos(toptag[0],cleanedPhotoList) percentageOfTotalLocations = distinctLocalLocationCount/(total_distinct_locations/100) #tkinter.messagebox.showinfo("Num of clusters: ", "(" + str(tnum) + " of " + str(tmax) + ") Found " + str(len(selectedPhotoList)) + " photos for tag " + "'" + toptag[0] + "' (" + str(round(percentageOfTotalLocations,0)) + "% of total distinct locations in area)") if silent: - print("(" + str(tnum) + " of " + str(tmax) + ") Found " + str(len(selectedPhotoList)) + " photos for tag " + "'" + toptag[0] + "' (" + str(round(percentageOfTotalLocations,0)) + "% of total distinct locations in area)", end=" ") + print("(" + str(tnum) + " of " + str(tmax) + ") Found " + str(len(selectedPhotoList_Guids)) + " photos for tag " + "'" + toptag[0] + "' (" + str(round(percentageOfTotalLocations,0)) + "% of total distinct locations in area)", end=" ") #clustering + selectedPhotoList = [cleanedPhotoDict[x] for x in selectedPhotoList_Guids] df = pd.DataFrame(selectedPhotoList) points = df.as_matrix(['lng','lat']) #converts pandas data to numpy array (limit by list of column-names) @@ -799,7 +804,7 @@ def cluster_tag(toptag,preview=None,silent=None): #cluster data tagRadiansData = np.radians(points) #conversion to radians for HDBSCAN (does not support decimal degrees) #for each tag in overallNumOfUsersPerTag_global.most_common(1000) (descending), calculate HDBSCAN Clusters - minClusterSize = max(2,int(((len(selectedPhotoList))/100)*5)) #4% optimum + minClusterSize = max(2,int(((len(selectedPhotoList_Guids))/100)*5)) #4% optimum #minClusterSize = 2 clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize,gen_min_span_tree=createMinimumSpanningTree,allow_single_cluster=True,min_samples=1) #clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize,gen_min_span_tree=True,min_samples=1) @@ -825,7 +830,7 @@ def cluster_tag(toptag,preview=None,silent=None): #exit function in case final processing loop (no figure generating) if silent: - return sel_labels, selectedPhotoList + return sel_labels, selectedPhotoList_Guids mask_noisy = (sel_labels == -1) number_of_clusters = len(np.unique(sel_labels[~mask_noisy])) #len(sel_labels) #palette = sns.color_palette("hls", ) @@ -1147,7 +1152,7 @@ def add_edge(edges, edge_points, coords, i, j): tnum = 1 for toptag in topTagsList: - clusters, selectedPhotoList = cluster_tag(toptag,None,True) + clusters, selectedPhotoList_Guids = cluster_tag(toptag, None, True) #print("baseDataList: ") #print(str(type(selectedPhotoList))) #for s in selectedPhotoList[:2]: @@ -1159,43 +1164,72 @@ def add_edge(edges, edge_points, coords, i, j): #print(clusters) #clusters contains the cluster values (-1 = no cluster, 0 maybe, >0 = cluster # in the same order, selectedPhotoList contains all original photo data, thus clusters[10] and selectedPhotoList[10] refer to the same photo + numpy_selectedPhotoList_Guids = np.asarray(selectedPhotoList_Guids) mask_noisy = (clusters == -1) number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0) print("--> " + str(number_of_clusters) + " cluster.") tnum += 1 photo_num = 0 - clusterPhotosList = [[] for x in range(number_of_clusters)] - for x in clusters: - #photolist = [] - if x >= 0: # no clusters: x = -1 - clusterPhotosList[x].append(selectedPhotoList[photo_num]) - #clusterPhotosArray_dict[x].add(selectedPhotoList[photo_num]) - else: - noClusterPhotos_perTag_DictOfLists[toptag].append(selectedPhotoList[photo_num]) - photo_num+=1 + #clusternum_photolist = zip(clusters,selectedPhotoList) + #clusterPhotosList = [[] for x in range(number_of_clusters)] + clusterPhotosGuidsList = [] + for x in range(number_of_clusters): + currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x] + clusterPhotosGuidsList.append(currentClusterPhotoGuids) + noClusterPhotos_perTag_DictOfLists[toptag[0]] = list(numpy_selectedPhotoList_Guids[clusters==-1]) + #for x in clusters: + # #photolist = [] + # if x >= 0: # no clusters: x = -1 + # clusterPhotosList[x].append([selectedPhotoList[photo_num]]) + # #clusterPhotosArray_dict[x].add(selectedPhotoList[photo_num]) + # else: + # noClusterPhotos_perTag_DictOfLists[toptag[0]].append(selectedPhotoList[photo_num]) + # photo_num+=1 + #print("resultList: ") #for s in clusterPhotosList[:2]: # print(*s) #print(str(toptag) + " - Number of clusters: " + str(len(clusterPhotosList)) + " Photo num: " + str(photo_num)) # Sort descending based on size of cluster: https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists - clusterPhotosList.sort(key=len, reverse=True) - if not len(clusterPhotosList) == 0: - clustersPerTag[toptag] = clusterPhotosList + clusterPhotosGuidsList.sort(key=len, reverse=True) + if not len(clusterPhotosGuidsList) == 0: + clustersPerTag[toptag[0]] = clusterPhotosGuidsList #plt.autoscale(enable=True) - if tnum == 4: - break + #if tnum == 50: + # break #plt.savefig('foo.png') #sys.exit() print("########## STEP 4 of 5: Generating Alpha Shapes ##########") + sys.stdout.flush() #for each cluster of points, calculate boundary shape and add statistics (HImpTag etc.) - listOfPolygons = [] + listOfAlphashapesAndMeta = [] + tnum = 1 for toptag in topTagsList: - if toptag in clustersPerTag: - clusterPhotoList = clustersPerTag[toptag] - for cluster in clusterPhotosList: - points = [geometry.Point(point[2], point[1]) - for point in cluster] + tnum += 1 + clusterPhotoGuidList = clustersPerTag.get(toptag[0], None) + #print(toptag[0]) + if clusterPhotoGuidList: + #clusterPhotoList = clustersPerTag[toptag[0]] + #print(str(len(clusterPhotoGuidList))) + firstCluster = True + for photo_guids in clusterPhotoGuidList: + if firstCluster: + HImP = 1 + firstCluster = False + else: + HImP = 0 + #try: + # print(*photos[:10]) + #except UnicodeEncodeError: + # print("skipped\n") + #get full photo data from dict as list + photos = [cleanedPhotoDict[x] for x in photo_guids] + + uniqueUserCount = len(set([photo.userid for photo in photos])) + sumViews = sum([photo.photo_views for photo in photos]) + points = [geometry.Point(photo.lng, photo.lat) + for photo in photos] x = [p.coords.xy[0] for p in points] y = [p.coords.xy[1] for p in points] point_collection = geometry.MultiPoint(list(points)) @@ -1225,7 +1259,10 @@ def add_edge(edges, edge_points, coords, i, j): #final check for multipolygon if type(result_polygon) is geometry.multipolygon.MultiPolygon: result_polygon = result_polygon.convex_hull - listOfPolygons.append(result_polygon) + #Geom, Join_Count, Views, COUNT_User,ImpTag,TagCountG,HImpTag + + listOfAlphashapesAndMeta.append((result_polygon,len(photo_guids),sumViews,uniqueUserCount,toptag[0],toptag[1],HImP)) + #print(str(listOfPolygons[len(listOfPolygons)-1])+'\n') #plot_polygon(result_polygon) #plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold') #plt.gca().set_xlim([float(limLngMin), float(limLngMax)]) @@ -1233,31 +1270,70 @@ def add_edge(edges, edge_points, coords, i, j): #plt.plot(x,y,'o',ms=5) #plt.waitforbuttonpress() #plt.close() - if toptag in noClusterPhotos_perTag_DictOfLists: - singlePhotoList = noClusterPhotos_perTag_DictOfLists[toptag] - for singlePhoto in singlePhotoList: - point = geometry.Point(singlePhoto[2], singlePhoto[1]) - resultPolygon = point.buffer((limLngMax-limLngMin)/200,resolution=3) - listOfPolygons.append(result_polygon) + singlePhotoGuidList = noClusterPhotos_perTag_DictOfLists.get(toptag[0], None) + if singlePhotoGuidList: + #print("Single: " + str(len(singlePhotoGuidList))) + photos = [cleanedPhotoDict[x] for x in singlePhotoGuidList] + points = [geometry.Point(photo.lng, photo.lat) + for photo in photos] + x = [p.coords.xy[0] for p in points] + y = [p.coords.xy[1] for p in points] + point_collection = geometry.MultiPoint(list(points)) + result_polygon = point_collection.buffer((limLngMax-limLngMin)/200,resolution=3) + if type(result_polygon) is geometry.multipolygon.MultiPolygon: + for polygon in result_polygon: + listOfAlphashapesAndMeta.append((result_polygon,1,0,1,toptag[0],toptag[1],0)) + else: + listOfAlphashapesAndMeta.append((result_polygon,1,0,1,toptag[0],toptag[1],0)) + + #if tnum == 50: + # break + #if toptag in noClusterPhotos_perTag_DictOfLists: + # singlePhotoList = noClusterPhotos_perTag_DictOfLists[toptag] + # #for singlePhoto in singlePhotoList: + # #point = geometry.Point(singlePhoto[2], singlePhoto[1]) + # points = [geometry.Point(point[2], point[1]) + # for point in singlePhotoList] + # x = [p.coords.xy[0] for p in points] + # y = [p.coords.xy[1] for p in points] + # point_collection = geometry.MultiPoint(list(points)) + # result_polygon = point_collection.buffer((limLngMax-limLngMin)/200,resolution=3) + # if type(result_polygon) is geometry.multipolygon.MultiPolygon: + # for polygon in result_polygon: + # listOfPolygons.append(result_polygon) + # else: + # listOfPolygons.append(result_polygon) + + print(str(len(listOfAlphashapesAndMeta)) + " Alpha Shapes. Done.") ##Output Boundary Shapes in merged Shapefile## - print("########## STEP 5 of 5: Generating Output ##########") + print("########## STEP 5 of 5: Writing Results to Shapefile ##########") # Define a polygon feature geometry with one attribute schema = { 'geometry': 'Polygon', - 'properties': {'id': 'int'}, + 'properties': {'Join_Count': 'int', + 'Views': 'int', + 'COUNT_User': 'int', + 'ImpTag': 'str', + 'TagCountG': 'int', + 'HImpTag': 'int'}, } # Write a new Shapefile # WGS1984 - with fiona.open('allTagClusters.shp', mode='w', driver='ESRI Shapefile', schema=schema,crs=from_epsg(4326)) as c: + with fiona.open('Output/allTagClusters.shp', mode='w', driver='ESRI Shapefile', schema=schema,crs=from_epsg(4326)) as c: ## If there are multiple geometries, put the "for" loop here idx = 0 - for polygon in listOfPolygons: + for alphaShapeAndMeta in listOfAlphashapesAndMeta: idx += 1 c.write({ - 'geometry': geometry.mapping(polygon), - 'properties': {'id': 123}, + 'geometry': geometry.mapping(alphaShapeAndMeta[0]), + 'properties': {'Join_Count': alphaShapeAndMeta[1], + 'Views': alphaShapeAndMeta[2], + 'COUNT_User': alphaShapeAndMeta[3], + 'ImpTag': alphaShapeAndMeta[4], + 'TagCountG': alphaShapeAndMeta[5], + 'HImpTag': alphaShapeAndMeta[6]}, }) print("\n" + "Done.")