Skip to content

Commit

Permalink
#8: Partial implementation of author parsing.
Browse files Browse the repository at this point in the history
  • Loading branch information
Bystroushaak committed Jul 27, 2015
1 parent d22695e commit 74545e7
Showing 1 changed file with 38 additions and 0 deletions.
38 changes: 38 additions & 0 deletions src/wa_kat/analyzers/author_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
# Imports =====================================================================
import dhtmlparser

from shared import parse_meta


# Functions & classes =========================================================
def _get_html_authors(index_page):
    """
    Collect `authors` declared in the HTML ``<meta>`` tags of the page.

    Args:
        index_page: Page object forwarded unchanged to ``parse_meta()``
            (``get_author()`` passes the DOM parsed by `dhtmlparser`).

    Returns:
        The result of ``parse_meta()`` for the ``author`` meta name, with
        ``"HTML"`` as the third argument (presumably a source label - see
        ``shared.parse_meta`` to confirm). ``get_author()`` expects a list.
    """
    html_authors = parse_meta(index_page, "author", "HTML")

    return html_authors


def _get_dc_authors(index_page):
    """
    Collect `authors` declared by the dublin core ``DC.Creator`` meta tag.

    Args:
        index_page: Page object forwarded unchanged to ``parse_meta()``
            (``get_author()`` passes the DOM parsed by `dhtmlparser`).

    Returns:
        The result of ``parse_meta()`` for the ``DC.Creator`` meta name, with
        ``"DC"`` as the third argument (presumably a source label - see
        ``shared.parse_meta`` to confirm). ``get_author()`` expects a list.
    """
    dc_authors = parse_meta(index_page, "DC.Creator", "DC")

    return dc_authors


def get_author(index_page):
    """
    Gather `authors` from both the HTML ``<meta>`` tags and dublin core.

    Args:
        index_page: Content of the page; it is handed to
            ``dhtmlparser.parseString()`` to build the DOM.

    Returns:
        list: Single flat list - the HTML ``<meta>`` authors first, followed
        by the dublin core authors.
    """
    dom = dhtmlparser.parseString(index_page)

    # The DOM is parsed once and shared by both detectors.
    html_authors = _get_html_authors(dom)
    dc_authors = _get_dc_authors(dom)

    # Concatenate the per-source lists into one flattened list.
    return [] + html_authors + dc_authors

0 comments on commit 74545e7

Please sign in to comment.