Skip to content

Commit 4821403

Browse files
committed
commit
1 parent 5786dec commit 4821403

File tree

4 files changed

+142
-15
lines changed

4 files changed

+142
-15
lines changed

propertysuggester/evaluator/FirstLetterResultEvaluation.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
11
from propertysuggester.evaluator.ResultEvaluation import ResultEvaluation
22
from propertysuggester.parser import CsvReader
3+
from propertysuggester.utils.WikidataApi import WikidataApi
34
import csv
45
class FirstLetterResultEvaluation(ResultEvaluation):
56

67
def __init__(self):
7-
ResultEvaluation.__init__(self, "Y:\Documents\GitHub\PropertySuggester-Python\wikidatawiki-20140526-pages-articles.csv", 1000, "20130526_dump_1000_first_letter.csv")
8+
ResultEvaluation.__init__(self, "C:\Users\Virginia\Documents\GitHub\PropertySuggester-Python\propertysuggester\evaluator\wikidatawiki-20140526-pages-articles.csv", 1000, "20130526_dump_1000_first_letter_threshold_0_6.csv")
89
self.property_dic = {}
10+
self.wikidata_api = WikidataApi("http://wikidata.org/w/")
911

1012

1113
def get_first_letter(self, property_id):
1214
if property_id in self.property_dic:
1315
first_letter = self.property_dic[property_id]
1416
else:
15-
property_json = self.api.wb_getentities(entityid=property_id, language="en")
17+
property_json = self.wikidata_api.wb_getentities(entityid=property_id, language="en")
1618
property_description = property_json["entities"][str(property_id)]["labels"]["en"]["value"]
1719
first_letter = property_description[0]
1820
self.property_dic[property_id] = first_letter
@@ -24,7 +26,7 @@ def process_entities(self, entity):
2426
removed_property_id = propertyIds[-1]
2527
property_id = "P" + str(removed_property_id)
2628
print "\nItem " + entity.title + ":"
27-
first_letter = self.get_first_letter(property_id) #api = WikidataApi("http://suggester.wmflabs.org/wiki")
29+
first_letter = self.get_first_letter(property_id) # api = WikidataApi("http://suggester.wmflabs.org/wiki")
2830
suggestions = self.api.wbs_getsuggestions(properties=removed_list, limit=50, cont=0, search=first_letter)
2931
self.rank_suggestions( propertyIds, suggestions)
3032

@@ -61,6 +63,6 @@ def get_all_letters(self):
6163

6264

6365
x = FirstLetterResultEvaluation()
64-
x.get_all_letters()
65-
66+
#x.get_all_letters()
67+
x.evaluate()
6668
#importfile = "Wikidata-20131129161111.xml.gz.csv"

propertysuggester/evaluator/ResultEvaluation.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,18 @@ def __init__(self, inputfile, samplesize, outputfile):
99
self.samplesize = samplesize
1010
self.outputfile = outputfile
1111
self.itemsBeyond50Count = 0
12-
self.itemsOnFirst50PositionSum = 0
13-
self.itemsOnFirst50Count = 0
12+
self.foundMissingPropertiesRankSum = 0
13+
self.foundMissingProperties = 0
1414
self.appearsWithinFirst50 = False
1515
self.ranking_amounts = {i: 0 for i in range(1,51)}
1616
self.random_ids = []
1717
self.foundEntities = 0
18-
self.api = WikidataApi("http://wikidata.org/w/")
18+
self.api = WikidataApi("http://127.0.0.1/devrepo")
1919

2020
def generate_random_integers(self):
2121
random.seed(2)
2222
counter = 0
23-
while counter < 5 * self.samplesize:
23+
while counter < 100 * self.samplesize:
2424
self.random_ids.append(random.randint(0, 10957764))
2525
counter += 1
2626

@@ -49,8 +49,8 @@ def rank_suggestions(self, propertyIds, suggestions):
4949
def handle_found_suggestion(self, rank):
5050
print "Found suggestion at rank " + str(rank)
5151
self.ranking_amounts[rank] += 1
52-
self.itemsOnFirst50PositionSum += rank
53-
self.itemsOnFirst50Count += 1
52+
self.foundMissingPropertiesRankSum += rank
53+
self.foundMissingProperties += 1
5454
self.foundEntities += 1
5555
self.appearsWithinFirst50 = True
5656

@@ -64,7 +64,7 @@ def determine_ranks(self):
6464
processed_items = 0
6565
for entity in CsvReader.read_csv(f):
6666
if self.foundEntities < self.samplesize:
67-
if current_item_count in self.random_ids and len(entity.claims) > 2 and len(entity.claims) < 6:
67+
if current_item_count in self.random_ids and len(entity.claims) > 6:
6868
print "Processed Items: {0}".format(processed_items)
6969
self.process_entities(entity)
7070
processed_items += 1
@@ -77,9 +77,9 @@ def print_output(self, current_item_number):
7777
print " samplesize the user requested: " + str(self.samplesize)
7878
print "Total amount of items in dump: " + str(current_item_number)
7979
print "We extract the last added property and check at which position it would have been suggested (based on remaining properties)."
80-
if self.itemsOnFirst50Count > 0 and self.itemsOnFirst50PositionSum > 0:
80+
if self.foundMissingProperties > 0 and self.foundMissingPropertiesRankSum > 0:
8181
print "Average rank of extracted properties positioned within the first 50 suggestions: " + str(
82-
float(self.itemsOnFirst50PositionSum) / self.itemsOnFirst50Count)
82+
float(self.foundMissingPropertiesRankSum) / self.foundMissingProperties)
8383
print "Count of extracted properties not positioned within the first 50 suggestions: " + str(
8484
self.itemsBeyond50Count)
8585

propertysuggester/evaluator/SearchResultEvaluation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ class SearchResultEvaluation(ResultEvaluation):
77
def __init__(self):
88
self.filename = "C:\Users\Virginia\Documents\GitHub\PropertySuggester-Python\propertysuggester\evaluator\wikidatawiki-20140526-pages-articles.csv"
99
self.samplesize = 1000
10-
self.outputfile = "20130526_dump" + str(self.samplesize) + "2-5.csv"
10+
self.outputfile = "20130526_dump" + str(self.samplesize) + "6_threshold_0_.csv"
1111

1212
ResultEvaluation.__init__(self, self.filename,
1313
self.samplesize , self.outputfile)
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
__author__ = 'Virginia'
2+
from propertysuggester.evaluator.ResultEvaluation import ResultEvaluation
3+
from propertysuggester.utils.WikidataApi import WikidataApi
4+
from Entity import Entity
5+
import csv
6+
from manuelle_auswertung import UserFeedbackEvaluation
7+
import json
8+
9+
10+
class Missing_Evaluation(ResultEvaluation):
    """Check at which rank the suggester proposes properties reported as missing.

    The input is a list of feedback records (each exposing ``entity`` and
    ``missing``).  For every entity with reported missing properties the
    suggestions are requested from the API and the rank of each missing
    property among them is recorded in ``ranked_dict``.
    """

    def __init__(self):
        """Set up the API client, file names and result counters."""
        # NOTE(review): ResultEvaluation.__init__ (called at the end) appears to
        # assign self.api and self.ranking_amounts itself, which would replace
        # the two values set here -- confirm which endpoint is intended.
        self.api = WikidataApi("http://suggester.wmflabs.org/w/")
        self.ranking_amounts = None
        # Raw string so the backslashes of the Windows path are never
        # interpreted as escape sequences; the resulting value is unchanged.
        self.filename = r"C:\Users\Virginia\Documents\GitHub\PropertySuggester-Python\propertysuggester\evaluator\wikidatawiki-20140526-pages-articles.csv"
        self.samplesize = 1000
        self.outputfile = "20130526_dump" + str(self.samplesize) + ".csv"
        # entity id -> list of {"missing": property id, "rank": rank}
        self.ranked_dict = {}
        # Reported missing properties that did / did not show up among the suggestions.
        self.foundMissing = 0
        self.notFoundMissing = 0
        ResultEvaluation.__init__(self, self.filename, self.samplesize, self.outputfile)

    def get_missing(self, eval_list):
        """Collect the entities that have at least one reported missing property.

        :param eval_list: iterable of records with ``entity`` and ``missing`` attributes
        :return: dict mapping ``record.entity`` to ``str(record.missing)``
        """
        entity_dict = {}
        for entity in eval_list:
            # "[]" is an empty JSON list: nothing was reported for this entity.
            if entity.missing == "[]":
                continue
            entity_dict[entity.entity] = str(entity.missing)
        print(str(len(entity_dict)))
        return entity_dict

    def find_missing(self, entity_dict):
        """Rank the missing properties of every entity and print a summary.

        :param entity_dict: dict mapping an entity id to a JSON string holding
            the list of its missing properties (as returned by ``get_missing``)
        """
        for entity, missing in entity_dict.items():
            entity_json = self.api.wb_getentities(entityid=entity, language="en")
            entity_description = entity_json["entities"][str(entity)]
            self.process_entities(entity_description, json.loads(missing))

        # Guard the average: nothing may have been found (or the input was empty),
        # in which case the plain division would raise ZeroDivisionError.
        if self.foundMissingProperties > 0:
            average_rank = str(float(self.foundMissingPropertiesRankSum) / self.foundMissingProperties)
        else:
            average_rank = "n/a"
        print("AVG of found ones " + average_rank)
        print("Total amount of found(dealed with) missing properties " + str(self.foundEntities))
        print("Total amount entities: " + str(len(entity_dict)))
        print(" Found with property_suggester: " + str(self.foundMissingProperties) + " " + str(self.foundMissing) + " not found: " + str(self.notFoundMissing))

    def process_entities(self, entity, missing_dict):
        """Fetch the suggestions for one entity and rank its missing properties.

        :param entity: entity description as returned by the API; its ``"title"``
            is read and is expected to start with a 5-character ``"Item:"`` prefix
        :param missing_dict: list of dicts keyed by property id (e.g. ``"P1082"``);
            the first key of each dict is taken as the missing property
        """
        # Strip the leading "P" and keep the numeric part of each property id.
        missing_properties = [int(str(next(iter(item)))[1:]) for item in missing_dict]
        print("\nMissing properties {0} for entity {1}".format(missing_properties, entity["title"]))
        print("Item {0} - properties".format(entity["title"]))
        entity_string = entity["title"][5:]
        suggestions = self.api.wbs_getsuggestions(entity=entity_string, limit=50, cont=0)
        self.rank_suggestions(entity["title"], suggestions, missing_properties)

    def rank_suggestions(self, entity, suggestions, missing_properties):
        """Record at which rank each missing property appears among the suggestions.

        For each property, look up where it shows up in the suggestion list.

        :param entity: entity title including the ``"Item:"`` prefix
        :param suggestions: API response whose ``"search"`` list holds the suggestions
        :param missing_properties: numeric ids of the properties reported as missing
        """
        self.appearsWithinFirst50 = False
        entity = entity[5:]  # cut off "Item:"
        local_found_missing = 0
        for rank, suggestion in enumerate(suggestions["search"], 1):
            numeric_id = suggestion["id"][1:]
            print("currently checking: " + numeric_id)
            if int(numeric_id) in missing_properties:
                print("Found missing property " + str(numeric_id) + " at rank " + str(rank))
                local_found_missing += 1
                self.ranked_dict.setdefault(entity, []).append(
                    {"missing": suggestion["id"], "rank": rank})
                self.foundMissingPropertiesRankSum += rank
                self.foundMissingProperties += 1

        self.foundMissing += local_found_missing
        self.notFoundMissing += len(missing_properties) - local_found_missing

    def print_result(self):
        """Print ``ranked_dict`` and write it to ``missing_analysis___2.csv``."""
        print(str(self.ranked_dict))
        with open("missing_analysis___2.csv", "wb") as csv_file:
            missing_writer = csv.writer(csv_file, delimiter=';',
                                        quotechar='|')
            missing_writer.writerow(["item", "missing", "rank"])
            for item, result in self.ranked_dict.items():
                # result looks like [{'rank': 12, 'missing': u'P1082'}, ...]
                for entry in result:
                    missing_writer.writerow([item, entry["missing"], entry["rank"]])

    def make_readableoutput(self, entity_missing_dict):
        """Write entity id, English label, property id and description to ``missing_readable.csv``.

        :param entity_missing_dict: dict mapping an entity id to a JSON string
            holding a list of ``{property id: description}`` dicts
        """
        with open("missing_readable.csv", "wb") as csv_file:
            missing_writer = csv.writer(csv_file, delimiter=';',
                                        quotechar='|')
            for entity, missing in entity_missing_dict.items():
                entity_json = self.api.wb_getentities(entityid=entity, language="en")
                label = entity_json["entities"][entity]["labels"]["en"]["value"]
                for reported in json.loads(missing):
                    for property_id, description in reported.items():
                        missing_writer.writerow([entity, label.encode("utf-8"), property_id, description.encode("utf-8")])
111+
112+
113+
114+
115+
116+
117+
118+
119+
# Script driver (runs at import time): read the user feedback records, keep
# the entities that have reported missing properties, rank those properties
# against the suggester output and write the result CSV.
luser = UserFeedbackEvaluation()
eval_list = luser.preprocess_file()
m = Missing_Evaluation()
entity_dict = m.get_missing(eval_list)
# Optional: enable to also write the human-readable CSV of the missing properties.
#m.make_readableoutput(entity_dict)
m.find_missing(entity_dict)
m.print_result()

0 commit comments

Comments
 (0)