Skip to content

Commit

Permalink
Final data output, tested!
Browse files Browse the repository at this point in the history
  • Loading branch information
Sieboldianus committed Jan 12, 2018
1 parent 3d5f6ef commit 611bef5
Showing 1 changed file with 119 additions and 43 deletions.
162 changes: 119 additions & 43 deletions generateTagClusters.py
Expand Up @@ -484,9 +484,11 @@ def is_number(s):
print("\nTotal distinct locations: " + str(total_distinct_locations))
#boundary:
print("Bounds are: Min " + str(float(limLngMin)) + " " + str(float(limLatMin)) + " Max " + str(float(limLngMax)) + " " + str(float(limLatMax)))
cleanedPhotoList = []
#cleanedPhotoList = []

#create structure for tuple with naming for easy referencing
cleanedPhotoLocation_tuple = namedtuple('cleanedPhotoLocation_tuple', 'source lat lng photo_guid photo_owner userid photo_caption photo_dateTaken photo_uploadDate photo_views photo_tags photo_thumbnail photo_mTags photo_likes photo_comments photo_shortcode photo_mediatype photo_locName photo_locID')
cleanedPhotoDict = defaultdict(cleanedPhotoLocation_tuple)
with open("Output/Output_cleaned.txt", 'w', encoding='utf8') as csvfile:
csvfile.write("SOURCE,Latitude,Longitude,PhotoID,Owner,UserID,Name,DateTaken,UploadDate,Views,Tags,URL,MTags,Likes,Comments,Shortcode,Type,LocName,LocID," + '\n')
datawriter = csv.writer(csvfile, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
Expand Down Expand Up @@ -536,7 +538,8 @@ def is_number(s):
cleanedPhotoLocation.photo_locName,#photo_locName = 17
cleanedPhotoLocation.photo_locID]#photo_locID = 18
)
cleanedPhotoList.append(cleanedPhotoLocation)
#cleanedPhotoList.append(cleanedPhotoLocation)
cleanedPhotoDict[cleanedPhotoLocation.photo_guid] = cleanedPhotoLocation

print("########## STEP 2 of 5: Tag Ranking ##########")
overallNumOfUsersPerTag_global = collections.Counter()
Expand Down Expand Up @@ -662,6 +665,7 @@ def report_callback_exception(self, exc, val, tb):
#Optional: set global plotting bounds
#plt.gca().set_xlim([limXMin, limXMax])
#plt.gca().set_ylim([limYMin, limYMax])
cleanedPhotoList = list(cleanedPhotoDict.values())
df = pd.DataFrame(cleanedPhotoList)
points = df.as_matrix(['lng','lat'])
limYMin = np.min(points.T[1])
Expand Down Expand Up @@ -727,12 +731,12 @@ def proceedWithCluster():
def sel_photos(tag,cleanedPhotoList):
#select photos from list based on a specific tag
distinctLocalLocationCount = set()
selectedPhotoList = []
selectedPhotoList_Guids = []
for cleanedPhotoLocation in cleanedPhotoList:
if tag in (cleanedPhotoLocation.photo_tags) or (tag in cleanedPhotoLocation.photo_caption):
selectedPhotoList.append(cleanedPhotoLocation)
selectedPhotoList_Guids.append(cleanedPhotoLocation.photo_guid)
distinctLocalLocationCount.add(cleanedPhotoLocation.photo_locID)
return selectedPhotoList, len(distinctLocalLocationCount)
return selectedPhotoList_Guids, len(distinctLocalLocationCount)

def fit_cluster(clusterer, data):
clusterer.fit(data)
Expand All @@ -757,12 +761,13 @@ def cluster_tag(toptag,preview=None,silent=None):
#canvas = tk.Canvas(graphFrame, width=canvasWidth, height=canvasHeight, highlightthickness=0,background="gray7")
#l = tk.Label(canvas, text="Preview Map", background="gray7",fg="gray80",font="Arial 10 bold")
#l.pack()
selectedPhotoList, distinctLocalLocationCount = sel_photos(toptag[0],cleanedPhotoList)
selectedPhotoList_Guids, distinctLocalLocationCount = sel_photos(toptag[0],cleanedPhotoList)
percentageOfTotalLocations = distinctLocalLocationCount/(total_distinct_locations/100)
#tkinter.messagebox.showinfo("Num of clusters: ", "(" + str(tnum) + " of " + str(tmax) + ") Found " + str(len(selectedPhotoList)) + " photos for tag " + "'" + toptag[0] + "' (" + str(round(percentageOfTotalLocations,0)) + "% of total distinct locations in area)")
if silent:
print("(" + str(tnum) + " of " + str(tmax) + ") Found " + str(len(selectedPhotoList)) + " photos for tag " + "'" + toptag[0] + "' (" + str(round(percentageOfTotalLocations,0)) + "% of total distinct locations in area)", end=" ")
print("(" + str(tnum) + " of " + str(tmax) + ") Found " + str(len(selectedPhotoList_Guids)) + " photos for tag " + "'" + toptag[0] + "' (" + str(round(percentageOfTotalLocations,0)) + "% of total distinct locations in area)", end=" ")
#clustering
selectedPhotoList = [cleanedPhotoDict[x] for x in selectedPhotoList_Guids]
df = pd.DataFrame(selectedPhotoList)
points = df.as_matrix(['lng','lat']) #converts pandas data to numpy array (limit by list of column-names)

Expand Down Expand Up @@ -799,7 +804,7 @@ def cluster_tag(toptag,preview=None,silent=None):
#cluster data
tagRadiansData = np.radians(points) #conversion to radians for HDBSCAN (does not support decimal degrees)
#for each tag in overallNumOfUsersPerTag_global.most_common(1000) (descending), calculate HDBSCAN Clusters
minClusterSize = max(2,int(((len(selectedPhotoList))/100)*5)) #4% optimum
minClusterSize = max(2,int(((len(selectedPhotoList_Guids))/100)*5)) #4% optimum
#minClusterSize = 2
clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize,gen_min_span_tree=createMinimumSpanningTree,allow_single_cluster=True,min_samples=1)
#clusterer = hdbscan.HDBSCAN(min_cluster_size=minClusterSize,gen_min_span_tree=True,min_samples=1)
Expand All @@ -825,7 +830,7 @@ def cluster_tag(toptag,preview=None,silent=None):

#return early when called from the final processing loop (silent mode, no figure is generated)
if silent:
return sel_labels, selectedPhotoList
return sel_labels, selectedPhotoList_Guids
mask_noisy = (sel_labels == -1)
number_of_clusters = len(np.unique(sel_labels[~mask_noisy])) #len(sel_labels)
#palette = sns.color_palette("hls", )
Expand Down Expand Up @@ -1147,7 +1152,7 @@ def add_edge(edges, edge_points, coords, i, j):
tnum = 1
for toptag in topTagsList:

clusters, selectedPhotoList = cluster_tag(toptag,None,True)
clusters, selectedPhotoList_Guids = cluster_tag(toptag, None, True)
#print("baseDataList: ")
#print(str(type(selectedPhotoList)))
#for s in selectedPhotoList[:2]:
Expand All @@ -1159,43 +1164,72 @@ def add_edge(edges, edge_points, coords, i, j):
#print(clusters)
#clusters contains the cluster labels (-1 = no cluster, >= 0 = index of the cluster)
# in the same order, selectedPhotoList contains all original photo data, thus clusters[10] and selectedPhotoList[10] refer to the same photo
numpy_selectedPhotoList_Guids = np.asarray(selectedPhotoList_Guids)
mask_noisy = (clusters == -1)
number_of_clusters = len(np.unique(clusters[~mask_noisy])) #mit noisy (=0)
print("--> " + str(number_of_clusters) + " cluster.")
tnum += 1
photo_num = 0
clusterPhotosList = [[] for x in range(number_of_clusters)]
for x in clusters:
#photolist = []
if x >= 0: # no clusters: x = -1
clusterPhotosList[x].append(selectedPhotoList[photo_num])
#clusterPhotosArray_dict[x].add(selectedPhotoList[photo_num])
else:
noClusterPhotos_perTag_DictOfLists[toptag].append(selectedPhotoList[photo_num])
photo_num+=1
#clusternum_photolist = zip(clusters,selectedPhotoList)
#clusterPhotosList = [[] for x in range(number_of_clusters)]
clusterPhotosGuidsList = []
for x in range(number_of_clusters):
currentClusterPhotoGuids = numpy_selectedPhotoList_Guids[clusters==x]
clusterPhotosGuidsList.append(currentClusterPhotoGuids)
noClusterPhotos_perTag_DictOfLists[toptag[0]] = list(numpy_selectedPhotoList_Guids[clusters==-1])
#for x in clusters:
# #photolist = []
# if x >= 0: # no clusters: x = -1
# clusterPhotosList[x].append([selectedPhotoList[photo_num]])
# #clusterPhotosArray_dict[x].add(selectedPhotoList[photo_num])
# else:
# noClusterPhotos_perTag_DictOfLists[toptag[0]].append(selectedPhotoList[photo_num])
# photo_num+=1

#print("resultList: ")
#for s in clusterPhotosList[:2]:
# print(*s)
#print(str(toptag) + " - Number of clusters: " + str(len(clusterPhotosList)) + " Photo num: " + str(photo_num))
# Sort descending based on size of cluster: https://stackoverflow.com/questions/30346356/how-to-sort-list-of-lists-according-to-length-of-sublists
clusterPhotosList.sort(key=len, reverse=True)
if not len(clusterPhotosList) == 0:
clustersPerTag[toptag] = clusterPhotosList
clusterPhotosGuidsList.sort(key=len, reverse=True)
if not len(clusterPhotosGuidsList) == 0:
clustersPerTag[toptag[0]] = clusterPhotosGuidsList
#plt.autoscale(enable=True)

if tnum == 4:
break
#if tnum == 50:
# break
#plt.savefig('foo.png')
#sys.exit()
print("########## STEP 4 of 5: Generating Alpha Shapes ##########")
sys.stdout.flush()
#for each cluster of points, calculate boundary shape and add statistics (HImpTag etc.)
listOfPolygons = []
listOfAlphashapesAndMeta = []
tnum = 1
for toptag in topTagsList:
if toptag in clustersPerTag:
clusterPhotoList = clustersPerTag[toptag]
for cluster in clusterPhotosList:
points = [geometry.Point(point[2], point[1])
for point in cluster]
tnum += 1
clusterPhotoGuidList = clustersPerTag.get(toptag[0], None)
#print(toptag[0])
if clusterPhotoGuidList:
#clusterPhotoList = clustersPerTag[toptag[0]]
#print(str(len(clusterPhotoGuidList)))
firstCluster = True
for photo_guids in clusterPhotoGuidList:
if firstCluster:
HImP = 1
firstCluster = False
else:
HImP = 0
#try:
# print(*photos[:10])
#except UnicodeEncodeError:
# print("skipped\n")
#get full photo data from dict as list
photos = [cleanedPhotoDict[x] for x in photo_guids]

uniqueUserCount = len(set([photo.userid for photo in photos]))
sumViews = sum([photo.photo_views for photo in photos])
points = [geometry.Point(photo.lng, photo.lat)
for photo in photos]
x = [p.coords.xy[0] for p in points]
y = [p.coords.xy[1] for p in points]
point_collection = geometry.MultiPoint(list(points))
Expand Down Expand Up @@ -1225,39 +1259,81 @@ def add_edge(edges, edge_points, coords, i, j):
#final check for multipolygon
if type(result_polygon) is geometry.multipolygon.MultiPolygon:
result_polygon = result_polygon.convex_hull
listOfPolygons.append(result_polygon)
#Geom, Join_Count, Views, COUNT_User,ImpTag,TagCountG,HImpTag

listOfAlphashapesAndMeta.append((result_polygon,len(photo_guids),sumViews,uniqueUserCount,toptag[0],toptag[1],HImP))
#print(str(listOfPolygons[len(listOfPolygons)-1])+'\n')
#plot_polygon(result_polygon)
#plt.suptitle(toptag[0].upper(), fontsize=18, fontweight='bold')
#plt.gca().set_xlim([float(limLngMin), float(limLngMax)])
#plt.gca().set_ylim([float(limLatMin), float(limLatMax)])
#plt.plot(x,y,'o',ms=5)
#plt.waitforbuttonpress()
#plt.close()
if toptag in noClusterPhotos_perTag_DictOfLists:
singlePhotoList = noClusterPhotos_perTag_DictOfLists[toptag]
for singlePhoto in singlePhotoList:
point = geometry.Point(singlePhoto[2], singlePhoto[1])
resultPolygon = point.buffer((limLngMax-limLngMin)/200,resolution=3)
listOfPolygons.append(result_polygon)
singlePhotoGuidList = noClusterPhotos_perTag_DictOfLists.get(toptag[0], None)
if singlePhotoGuidList:
#print("Single: " + str(len(singlePhotoGuidList)))
photos = [cleanedPhotoDict[x] for x in singlePhotoGuidList]
points = [geometry.Point(photo.lng, photo.lat)
for photo in photos]
x = [p.coords.xy[0] for p in points]
y = [p.coords.xy[1] for p in points]
point_collection = geometry.MultiPoint(list(points))
result_polygon = point_collection.buffer((limLngMax-limLngMin)/200,resolution=3)
if type(result_polygon) is geometry.multipolygon.MultiPolygon:
for polygon in result_polygon:
listOfAlphashapesAndMeta.append((result_polygon,1,0,1,toptag[0],toptag[1],0))
else:
listOfAlphashapesAndMeta.append((result_polygon,1,0,1,toptag[0],toptag[1],0))

#if tnum == 50:
# break
#if toptag in noClusterPhotos_perTag_DictOfLists:
# singlePhotoList = noClusterPhotos_perTag_DictOfLists[toptag]
# #for singlePhoto in singlePhotoList:
# #point = geometry.Point(singlePhoto[2], singlePhoto[1])
# points = [geometry.Point(point[2], point[1])
# for point in singlePhotoList]
# x = [p.coords.xy[0] for p in points]
# y = [p.coords.xy[1] for p in points]
# point_collection = geometry.MultiPoint(list(points))
# result_polygon = point_collection.buffer((limLngMax-limLngMin)/200,resolution=3)
# if type(result_polygon) is geometry.multipolygon.MultiPolygon:
# for polygon in result_polygon:
# listOfPolygons.append(result_polygon)
# else:
# listOfPolygons.append(result_polygon)

print(str(len(listOfAlphashapesAndMeta)) + " Alpha Shapes. Done.")
##Output Boundary Shapes in merged Shapefile##
print("########## STEP 5 of 5: Generating Output ##########")
print("########## STEP 5 of 5: Writing Results to Shapefile ##########")

# Define a polygon feature geometry and its attribute schema
schema = {
'geometry': 'Polygon',
'properties': {'id': 'int'},
'properties': {'Join_Count': 'int',
'Views': 'int',
'COUNT_User': 'int',
'ImpTag': 'str',
'TagCountG': 'int',
'HImpTag': 'int'},
}

# Write a new Shapefile
# WGS1984
with fiona.open('allTagClusters.shp', mode='w', driver='ESRI Shapefile', schema=schema,crs=from_epsg(4326)) as c:
with fiona.open('Output/allTagClusters.shp', mode='w', driver='ESRI Shapefile', schema=schema,crs=from_epsg(4326)) as c:
## If there are multiple geometries, put the "for" loop here
idx = 0
for polygon in listOfPolygons:
for alphaShapeAndMeta in listOfAlphashapesAndMeta:
idx += 1
c.write({
'geometry': geometry.mapping(polygon),
'properties': {'id': 123},
'geometry': geometry.mapping(alphaShapeAndMeta[0]),
'properties': {'Join_Count': alphaShapeAndMeta[1],
'Views': alphaShapeAndMeta[2],
'COUNT_User': alphaShapeAndMeta[3],
'ImpTag': alphaShapeAndMeta[4],
'TagCountG': alphaShapeAndMeta[5],
'HImpTag': alphaShapeAndMeta[6]},
})

print("\n" + "Done.")
Expand Down

0 comments on commit 611bef5

Please sign in to comment.