commit

ValueError · ValueError · commit 482140382fff · 2014-06-22T01:26:30.000+02:00
diff --git a/propertysuggester/evaluator/FirstLetterResultEvaluation.py b/propertysuggester/evaluator/FirstLetterResultEvaluation.py
@@ -1,18 +1,20 @@
 from propertysuggester.evaluator.ResultEvaluation import ResultEvaluation
 from propertysuggester.parser import CsvReader
+from propertysuggester.utils.WikidataApi import WikidataApi
 import csv
 class FirstLetterResultEvaluation(ResultEvaluation):
 
     def __init__(self):
-        ResultEvaluation.__init__(self, "Y:\Documents\GitHub\PropertySuggester-Python\wikidatawiki-20140526-pages-articles.csv", 1000, "20130526_dump_1000_first_letter.csv")
+        ResultEvaluation.__init__(self, "C:\Users\Virginia\Documents\GitHub\PropertySuggester-Python\propertysuggester\evaluator\wikidatawiki-20140526-pages-articles.csv", 1000, "20130526_dump_1000_first_letter_threshold_0_6.csv")
         self.property_dic = {}
+        self.wikidata_api = WikidataApi("http://wikidata.org/w/")
 
 
     def get_first_letter(self, property_id):
         if property_id in self.property_dic:
             first_letter = self.property_dic[property_id]
         else:
-            property_json = self.api.wb_getentities(entityid=property_id, language="en")
+            property_json = self.wikidata_api.wb_getentities(entityid=property_id, language="en")
             property_description = property_json["entities"][str(property_id)]["labels"]["en"]["value"]
             first_letter = property_description[0]
             self.property_dic[property_id] = first_letter
@@ -24,7 +26,7 @@ def process_entities(self, entity):
         removed_property_id = propertyIds[-1]
         property_id = "P" + str(removed_property_id)
         print "\nItem " + entity.title + ":"
-        first_letter = self.get_first_letter(property_id)  #api = WikidataApi("http://suggester.wmflabs.org/wiki")
+        first_letter = self.get_first_letter(property_id)  # api = WikidataApi("http://suggester.wmflabs.org/wiki")
         suggestions = self.api.wbs_getsuggestions(properties=removed_list, limit=50, cont=0, search=first_letter)
         self.rank_suggestions( propertyIds, suggestions)
 
@@ -61,6 +63,6 @@ def get_all_letters(self):
 
 
 x = FirstLetterResultEvaluation()
-x.get_all_letters()
-
+#x.get_all_letters()
+x.evaluate()
 #importfile = "Wikidata-20131129161111.xml.gz.csv"
diff --git a/propertysuggester/evaluator/ResultEvaluation.py b/propertysuggester/evaluator/ResultEvaluation.py
@@ -9,18 +9,18 @@ def __init__(self, inputfile, samplesize, outputfile):
         self.samplesize = samplesize
         self.outputfile = outputfile
         self.itemsBeyond50Count = 0
-        self.itemsOnFirst50PositionSum = 0
-        self.itemsOnFirst50Count = 0
+        self.foundMissingPropertiesRankSum = 0
+        self.foundMissingProperties = 0
         self.appearsWithinFirst50 = False
         self.ranking_amounts = {i: 0 for i in range(1,51)}
         self.random_ids = []
         self.foundEntities = 0
-        self.api = WikidataApi("http://wikidata.org/w/")
+        self.api = WikidataApi("http://127.0.0.1/devrepo")
 
     def generate_random_integers(self):
         random.seed(2)
         counter = 0
-        while counter < 5 * self.samplesize:
+        while counter < 100 * self.samplesize:
             self.random_ids.append(random.randint(0, 10957764))
             counter += 1
 
@@ -49,8 +49,8 @@ def rank_suggestions(self, propertyIds, suggestions):
     def handle_found_suggestion(self, rank):
         print "Found suggestion at rank " + str(rank)
         self.ranking_amounts[rank] += 1
-        self.itemsOnFirst50PositionSum += rank
-        self.itemsOnFirst50Count += 1
+        self.foundMissingPropertiesRankSum += rank
+        self.foundMissingProperties += 1
         self.foundEntities += 1
         self.appearsWithinFirst50 = True
 
@@ -64,7 +64,7 @@ def determine_ranks(self):
             processed_items = 0
             for entity in CsvReader.read_csv(f):
                 if self.foundEntities < self.samplesize:
-                    if current_item_count in self.random_ids and len(entity.claims) > 2 and len(entity.claims) < 6:
+                    if current_item_count in self.random_ids and len(entity.claims) > 6:
                         print "Processed Items: {0}".format(processed_items)
                         self.process_entities(entity)
                         processed_items += 1
@@ -77,9 +77,9 @@ def print_output(self, current_item_number):
         print " samplesize the user requested: " + str(self.samplesize)
         print "Total amount of items in dump: " + str(current_item_number)
         print "We extract the last added property and check at which position it would have been suggested (based on remaining properties)."
-        if self.itemsOnFirst50Count > 0 and self.itemsOnFirst50PositionSum > 0:
+        if self.foundMissingProperties > 0 and self.foundMissingPropertiesRankSum > 0:
             print "Average rank of extracted properties positioned within the first 50 suggestions: " + str(
-                float(self.itemsOnFirst50PositionSum) / self.itemsOnFirst50Count)
+                float(self.foundMissingPropertiesRankSum) / self.foundMissingProperties)
         print "Count of extracted properties not positioned within the first 50 suggestions: " + str(
             self.itemsBeyond50Count)
 
diff --git a/propertysuggester/evaluator/SearchResultEvaluation.py b/propertysuggester/evaluator/SearchResultEvaluation.py
@@ -7,7 +7,7 @@ class SearchResultEvaluation(ResultEvaluation):
     def __init__(self):
         self.filename = "C:\Users\Virginia\Documents\GitHub\PropertySuggester-Python\propertysuggester\evaluator\wikidatawiki-20140526-pages-articles.csv"
         self.samplesize = 1000
-        self.outputfile = "20130526_dump" + str(self.samplesize) + "2-5.csv"
+        self.outputfile = "20130526_dump" + str(self.samplesize) + "6_threshold_0_.csv"
 
         ResultEvaluation.__init__(self, self.filename,
 								  self.samplesize , self.outputfile)
diff --git a/propertysuggester/evaluator/manuell/missing_auswertung.py b/propertysuggester/evaluator/manuell/missing_auswertung.py
@@ -0,0 +1,125 @@
+__author__ = 'Virginia'
+from propertysuggester.evaluator.ResultEvaluation import ResultEvaluation
+from propertysuggester.utils.WikidataApi import WikidataApi
+from Entity import Entity
+import csv
+from manuelle_auswertung import UserFeedbackEvaluation
+import json
+
+
+class Missing_Evaluation(ResultEvaluation):
+    def __init__(self):
+        self.api = WikidataApi("http://suggester.wmflabs.org/w/")
+        self.ranking_amounts = None
+        self.filename = "C:\Users\Virginia\Documents\GitHub\PropertySuggester-Python\propertysuggester\evaluator\wikidatawiki-20140526-pages-articles.csv"
+        self.samplesize = 1000
+        self.outputfile = "20130526_dump" + str(self.samplesize) + ".csv"
+        self.ranked_dict = {}
+        self.foundMissing = 0
+        self.notFoundMissing = 0
+        ResultEvaluation.__init__(self, self.filename, self.samplesize , self.outputfile)
+
+    # input: entity, missing_properties
+    # fuer jede property schauen, wo sie in Liste auftaucht
+
+    def get_missing(self, eval_list):
+        entity_dict= {}
+        for entity in eval_list:
+            if entity.missing == "[]":
+                continue
+            entity_dict[entity.entity] = str(entity.missing)
+        #print str(entitity_dict)
+        print str(len(entity_dict))
+        return entity_dict
+
+    def find_missing(self, entity_dict):
+        for entity, missing in entity_dict.items():
+            entity_json = self.api.wb_getentities(entityid=entity, language="en")
+            entity_description = entity_json["entities"][str(entity)]
+            claims = entity_description["claims"]
+            self.process_entities(entity_description, json.loads(missing))
+        print "AVG of found ones " + str(float(self.foundMissingPropertiesRankSum)/self.foundMissingProperties)
+        print "Total amount of found(dealed with) missing properties " + str(self.foundEntities)
+        print "Total amount entities: " +str(len(entity_dict))
+        print " Found with property_suggester: "+  str(self.foundMissingProperties) + " " + str(self.foundMissing) +"  not found: " + str(self.notFoundMissing)
+
+    def process_entities(self, entity, missing_dict):
+
+            missing_properties = [int(str(item.keys()[0])[1:]) for item in missing_dict]
+            print "\nMissing properties {0} for entity {1}".format(missing_properties,entity["title"])
+           # propertyIds = [int(prop_id[1:]) for prop_id in entity["claims"].keys()]  # get ids from claims
+            #print "Property Ids: {0}".format(propertyIds)
+            print "Item {0} -  properties".format(entity["title"])
+            entity_string = entity["title"][5:]
+            suggestions = self.api.wbs_getsuggestions(entity=entity_string, limit=50, cont=0)
+            self.rank_suggestions(entity["title"],suggestions, missing_properties)
+
+    def rank_suggestions(self, entity, suggestions, missing_properties):
+        rank = 0
+        self.appearsWithinFirst50 = False
+        entity = entity[5:]  # cut off "Item:"
+        local_found_missing = 0
+        for suggestion in suggestions["search"]:
+            rank += 1
+            print "currently checking: " + suggestion["id"][1:]
+            #print str(missing_properties)
+            if int(suggestion["id"][1:]) in missing_properties:
+                print "Found missing property " + str(suggestion["id"][1:]) + " at rank " + str(rank)
+                local_found_missing += 1
+                if entity not in self.ranked_dict:
+                    self.ranked_dict[entity] = [{"missing": suggestion["id"], "rank": rank}]
+                else:
+                    self.ranked_dict[entity].append({"missing": suggestion["id"], "rank": rank})
+
+                self.foundMissingPropertiesRankSum += rank
+                self.foundMissingProperties += 1
+
+
+        not_found_local = len(missing_properties)- local_found_missing
+        self.foundMissing += local_found_missing
+        self.notFoundMissing += not_found_local
+
+    def print_result(self):
+        print str(self.ranked_dict)
+        with open("missing_analysis___2.csv","wb") as csv_file:
+            missing_writer = csv.writer(csv_file, delimiter=';',
+                                       quotechar='|')
+            missing_writer.writerow(["item","missing","rank"])
+            for item, result in self.ranked_dict.items():
+                for entry in result:
+                   # [{'rank': 12, 'missing': u'P1082'},x,y]
+                    missing = entry["missing"]
+                    rank =entry["rank"]
+                    missing_writer.writerow([item, missing, rank])
+
+
+
+
+
+    def make_readableoutput(self, entity_missing_dict):
+        result_dict = {}
+        with open("missing_readable.csv","wb") as csv_file:
+            missing_writer = csv.writer(csv_file, delimiter=';',
+                                       quotechar='|')
+            for entity, missing in entity_missing_dict.items():
+                entity_json = self.api.wb_getentities(entityid=entity, language="en")
+                label =  entity_json["entities"][entity]["labels"]["en"]["value"]
+                missing = json.loads(missing)
+                for i in missing:
+                    for property, description in i.items():
+                        missing_writer.writerow([entity, label.encode("utf-8"), property, description.encode("utf-8")])
+
+
+
+
+
+
+
+
+luser = UserFeedbackEvaluation()  # project class imported from manuelle_auswertung
+eval_list = luser.preprocess_file()  # items are read below via .entity and .missing
+m = Missing_Evaluation()
+entity_dict = m.get_missing(eval_list)  # entity -> missing string; "[]" entries are dropped
+#m.make_readableoutput(entity_dict)
+m.find_missing(entity_dict)  # fetches each entity, ranks its missing properties, prints totals
+m.print_result()  # prints ranked_dict and writes missing_analysis___2.csv