Skip to content

Commit

Permalink
Refactoring, better implementation of emoji code (TU Campus tested)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sieboldianus committed Feb 12, 2018
1 parent 7a4572e commit 5e62c7c
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 27 deletions.
15 changes: 14 additions & 1 deletion 00_Config/SortOutAlways.txt
Expand Up @@ -294,4 +294,17 @@ and
st
at
pic
photography
photography
photooftheday
für
latergram
sind
zum
done
back
not
wieder
auf
von
ist
kein
Expand Up @@ -3,5 +3,6 @@
#-p cluster Photos
#-c localSaturationCheck (will exclude any tags that are used over the whole area)
#-s Source
python "D:\03_EvaVGI\05_Code\Py\standalone_tag_cluster_hdbscan\generateTagClusters.py" -s 'fromFlickr_CSV' -c 'True' -p 'False' -j 'False'
#-o emoji cluster
python "D:\03_EvaVGI\05_Code\Py\standalone_tag_cluster_hdbscan\generateTagClusters.py" -s 'fromFlickr_CSV' -c 'True' -p 'False' -j 'False' -o 'True' -t 'True'
$SHELL
14 changes: 8 additions & 6 deletions def_functions.py
Expand Up @@ -388,19 +388,21 @@ def getRectangleBounds(points):
limXMin = np.min(points.T[0])
limXMax = np.max(points.T[0])
return limYMin,limYMax,limXMin,limXMax
def filterTags(taglist, SortOutAlways_set, SortOutAlways_inStr_set):
    """Filter a photo's tags against two stoplists and count the outcome.

    A tag is skipped when it is shorter than two characters, is the literal
    two-character string '""', consists only of digits, is a member of
    ``SortOutAlways_set`` (exact match), or contains any entry of
    ``SortOutAlways_inStr_set`` as a substring.

    The counters are returned rather than passed in: integers are immutable,
    so incrementing counter parameters inside this function would never be
    visible to the caller. Callers add the returned counts to their own
    running totals.

    Args:
        taglist: iterable of tag strings for one photo.
        SortOutAlways_set: tags to drop on exact match.
        SortOutAlways_inStr_set: substrings; any tag containing one is dropped.

    Returns:
        A tuple ``(photo_tags_filtered, count_tags, count_skipped)``: the set
        of tags that passed, the number of tags examined and the number of
        tags skipped.
    """
    count_tags = 0
    count_skipped = 0
    photo_tags_filtered = set()
    for tag in taglist:
        count_tags += 1
        # Exclude empty/one-character tags, the empty-quotes placeholder,
        # pure numbers and exact stoplist matches.
        if len(tag) <= 1 or tag == '""' or tag.isdigit() or tag in SortOutAlways_set:
            count_skipped += 1
            continue
        # Exclude tags containing any stoplisted substring; each tag is
        # counted as skipped at most once.
        if any(inStr in tag for inStr in SortOutAlways_inStr_set):
            count_skipped += 1
            continue
        photo_tags_filtered.add(tag)
    return photo_tags_filtered, count_tags, count_skipped
66 changes: 47 additions & 19 deletions generateTagClusters.py
Expand Up @@ -183,7 +183,7 @@ def print_store_log(text,end=None):
if (len(filelist) == 0):
sys.exit("No *.json/csv/txt files found.")
else:
if clusterTags:
if clusterTags or clusterEmojis:
inputtext = input("Files to process: " + str(len(filelist)) + ". \nOptional: Enter a Number for the variety of Tags to process (Default is 1000)\nPress Enter to proceed.. \n")
if inputtext == "" or not inputtext.isdigit():
tmax = 1000
Expand All @@ -195,6 +195,7 @@ def print_store_log(text,end=None):
count_non_geotagged = 0
count_outside_shape = 0
count_tags_global = 0
count_emojis_global = 0
count_tags_skipped = 0
shapeFileExcludelocIDhash = set()
shapeFileIncludedlocIDhash = set()
Expand Down Expand Up @@ -224,6 +225,8 @@ def is_number(s):
UserLocationTagList_dict = defaultdict(set)
UserLocationWordList_dict = defaultdict(set)
UserLocationsFirstPhoto_dict = defaultdict(set)
if clusterEmojis:
overallNumOfEmojis_global = collections.Counter()

#UserDict_TagCounters = defaultdict(set)
UserDict_TagCounters_global = defaultdict(set)
Expand Down Expand Up @@ -381,7 +384,9 @@ def is_number(s):
if clusterTags:
photo_tags = set(filter(None, item[11].lower().split(";"))) #filter empty strings from photo_tags list and convert to set (hash) with unique values
#Filter tags based on two stoplists
photo_tags = def_functions.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set,count_tags_global,count_tags_skipped)
photo_tags, count_tags, count_skipped = def_functions.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set)
count_tags_global += count_tags
count_tags_skipped += count_skipped
else:
photo_tags = set()
#if not "water" in photo_tags:
Expand Down Expand Up @@ -553,17 +558,24 @@ def is_number(s):
photo_shortcode = None#item[18]
photo_uploadDate = item[8] #guid
photo_idDate = None#photo_uploadDate #use upload date as sorting ID
if clusterTags:
if clusterTags or clusterEmojis:
photo_caption = item[9]
else:
photo_caption = ""
photo_likes = None#item[13]
photo_tags = set()
if clusterTags:
photo_tags = set(filter(None, item[11][1:-1].lower().split(","))) #[1:-1] removes curly brackets, second [1:-1] removes quotes
#Filter tags based on two stoplists
photo_tags = def_functions.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set,count_tags_global,count_tags_skipped)
else:
photo_tags = set()
photo_tags,count_tags,count_skipped = def_functions.filterTags(photo_tags,SortOutAlways_set,SortOutAlways_inStr_set)
count_tags_global += count_tags
count_tags_skipped += count_skipped
if clusterEmojis:
emojis_filtered = set(def_functions.extract_emojis(photo_caption))
if not len(emojis_filtered) == 0:
count_emojis_global += len(emojis_filtered)
overallNumOfEmojis_global.update(emojis_filtered)
photo_tags = set.union(emojis_filtered)
#photo_tags = ";" + item[11] + ";"
photo_thumbnail = None#item[17]
photo_comments = None#item[14]
Expand Down Expand Up @@ -672,7 +684,12 @@ def is_number(s):

log_texts_list.append("Cleaned output to " + "%02d" % (count_loc,) + " photolocations from " + "%02d" % (count_glob,)+ " (File " + str(partcount) + " of " + str(len(filelist)) + ") - Skipped Media: " + str(skippedCount) + " - Skipped Tags: " + str(count_tags_skipped) +" of " + str(count_tags_global))
total_distinct_locations = len(distinctLocations_set)
print_store_log("\nTotal distinct locations: " + str(total_distinct_locations))
print_store_log("\nTotal users: " + str(len(LocationsPerUserID_dict)))
print_store_log("Total photos: " + str(count_glob))
print_store_log("Total distinct locations: " + str(total_distinct_locations))
print_store_log("Total tags: " + str(count_tags_global))
print_store_log("Total emojis: " + str(count_emojis_global))

#boundary:
print_store_log("Bounds are: Min " + str(float(limLngMin)) + " " + str(float(limLatMin)) + " Max " + str(float(limLngMax)) + " " + str(float(limLatMax)))
#cleanedPhotoList = []
Expand Down Expand Up @@ -732,7 +749,7 @@ def is_number(s):
)
cleanedPhotoDict[cleanedPhotoLocation.photo_guid] = cleanedPhotoLocation
now = time.time()
if clusterTags:
if clusterTags or clusterEmojis:
print_store_log("########## STEP 2 of 6: Tag Ranking ##########")
overallNumOfUsersPerTag_global = collections.Counter()
for user_key, taghash in UserDict_TagCounters_global.items():
Expand All @@ -752,12 +769,19 @@ def is_number(s):
if not lenBefore == lenAfter:
print("Filtered " + str(lenBefore - lenAfter) + " Tags that were only used by less than 2 users.")
singleMostUsedtag = topTagsList[0]

#optional write toptags to file
toptags = ''.join("%s,%i" % v + '\n' for v in topTagsList)
with open("02_Output/Output_toptags.txt", 'w', encoding="utf8") as file: #overwrite
file.write(toptags)

if clusterTags:
#optional write toptags to file
toptags = ''.join("%s,%i" % v + '\n' for v in topTagsList)
with open("02_Output/Output_toptags.txt", 'w', encoding="utf8") as file: #overwrite
file.write(toptags)
#optional write topemojis to file
if clusterEmojis:
topEmojisList = overallNumOfEmojis_global.most_common()
globalEmojiSet = {tuple[0] for tuple in topEmojisList}
topemojis = ''.join("%s,%i" % v + '\n' for v in topEmojisList)
with open("02_Output/Output_topemojis.txt", 'w', encoding="utf8") as file: #overwrite
file.write(topemojis)

print_store_log("########## STEP 3 of 6: Tag Location Clustering ##########")
#prepare some variables
tnum = 0
Expand Down Expand Up @@ -1491,8 +1515,8 @@ def delete(listbox):
'Weights': 'float',
'WeightsV2': 'float',
'WeightsV3': 'float',
'shapetype': 'str'},
#'EmojiName': 'str'},
'shapetype': 'str',
'emoji': 'int'},
}

#Normalization of Values (1-1000 Range), precalc Step:
Expand Down Expand Up @@ -1546,7 +1570,11 @@ def delete(listbox):
#project data
#geom_proj = transform(project, alphaShapeAndMeta[0])
#c.write({
# 'geometry': geometry.mapping(geom_proj),
# 'geometry': geometry.mapping(geom_proj),
if clusterEmojis and alphaShapeAndMeta[4] in globalEmojiSet:
emoji = 1
else:
emoji = 0
c.write({
'geometry': geometry.mapping(alphaShapeAndMeta[0]),
'properties': {'Join_Count': alphaShapeAndMeta[1],
Expand All @@ -1558,8 +1586,8 @@ def delete(listbox):
'Weights': weight1_normalized,
'WeightsV2': weight2_normalized,
'WeightsV3': weight3_normalized,
'shapetype': alphaShapeAndMeta[9]},
#'EmojiName': emoName},
'shapetype': alphaShapeAndMeta[9],
'emoji': int(emoji)},
})
else:
print("\n" + "User abort.")
Expand Down
1 change: 1 addition & 0 deletions log.txt
@@ -0,0 +1 @@
"Path","Size","LastRead"

0 comments on commit 5e62c7c

Please sign in to comment.