Skip to content

Commit

Permalink
v2.6: PageMetadata includes wikiDataQid, siteId, and PageTags
Browse files Browse the repository at this point in the history
  • Loading branch information
Laura Dietz committed Feb 1, 2022
1 parent 2c759d2 commit 4a24526
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 5 deletions.
2 changes: 1 addition & 1 deletion python3/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

setup(
name='trec-car-tools',
version='2.5.4',
version='2.6',
packages=['trec_car'],
url='https://github.com/TREMA-UNH/trec-car-tools/python3',
# download_url='https://github.com/TREMA-UNH/trec-car-tools/archive/2.0.tar.gz',
Expand Down
42 changes: 38 additions & 4 deletions python3/trec_car/read_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,20 +244,42 @@ class PageMetadata(object):
:rtype: str
(Anchor text, frequency) of pages containing inlinks
.. attribute:: wikiDataQid
:rtype: str
Language- and time-independent Wikidata ID (e.g. Q12345)
.. attribute:: siteId
:rtype: str
Site ID (e.g. enwiki). The combination of wikiDataQid and siteId identifies a page in a Wikipedia across timestamps. Note that PageName and PageId can change over time.
.. attribute:: pageTags
:rtype: list of str
Template tags of pages, e.g. "Good article" or "Vital article"
"""
def __init__(self, redirectNames, disambiguationNames, disambiguationIds, categoryNames, categoryIds, inlinkIds,
inlinkAnchors):
inlinkAnchors, wikiDataQid, siteId, pageTags):
self.inlinkAnchors = inlinkAnchors
self.inlinkIds = inlinkIds
self.categoryIds = categoryIds
self.categoryNames = categoryNames
self.disambiguationIds = disambiguationIds
self.disambiguationNames = disambiguationNames
self.redirectNames = redirectNames
self.wikiDataQid = wikiDataQid
self.siteId = siteId
self.pageTags = pageTags

@staticmethod
def default():
return PageMetadata(None, None, None, None, None, None, None)
return PageMetadata(None, None, None, None, None, None, None, None, None, None)

def __str__(self):
redirStr = ("" if self.redirectNames is None else (" redirected = "+", ".join([name for name in self.redirectNames])))
Expand All @@ -270,7 +292,10 @@ def __str__(self):
[ ("%s: %d" % (name, freq)) for (name, freq) in self.inlinkAnchors]
# [ ("%s: " % (name)) for (name, freq) in self.inlinkAnchors] \
)))
return "%s \n%s \n%s \n%s \n%s\n" % (redirStr, disamStr, catStr, inlinkStr, inlinkAnchorStr)
wikiDataStr = ("" if self.wikiDataQid is None else (" wikiDataQid = "+self.wikiDataQid))
siteIdStr = ("" if self.siteId is None else (" siteId = "+self.siteId))
pageTagsStr = ("" if self.pageTags is None else (" pageTags = "+", ".join([name for name in (self.pageTags or [])])))
return "%s \n%s \n%s \n%s \n%s\n%s \n%s \n%s \n" % (redirStr, disamStr, catStr, inlinkStr, inlinkAnchorStr, wikiDataStr, siteIdStr, pageTagsStr)

@staticmethod
def from_cbor(cbor):
Expand All @@ -281,6 +306,9 @@ def from_cbor(cbor):
categoryIds=None
inlinkIds=None
inlinkAnchors=None
wikiDataQid=None
siteId=None
pageTags=None

def decodeListOfIdList(cbor):
if len(cbor)==0: return None
Expand Down Expand Up @@ -321,9 +349,15 @@ def decodeListOfNameIntList(cbor):
elif tag == 7:
# compatibility with v2.0
inlinkAnchors = decodeListOfNameIntList(cbor_data)
elif tag == 8:
wikiDataQid=cbor_data
elif tag == 9:
siteId=cbor_data
elif tag == 10:
pageTags=decodeListOfNameList(cbor_data)
i+=2

return PageMetadata(redirectNames, disambiguationNames, disambiguationIds, categoryNames, categoryIds, inlinkIds, inlinkAnchors)
return PageMetadata(redirectNames, disambiguationNames, disambiguationIds, categoryNames, categoryIds, inlinkIds, inlinkAnchors, wikiDataQid, siteId, pageTags)

class PageSkeleton(object):
"""
Expand Down

0 comments on commit 4a24526

Please sign in to comment.