Permalink
Browse files

Added HTML5 Parser and transforming microdata by direct input.

  • Loading branch information...
1 parent 8a8c6ea commit 1463fabca7bfad95abe2d13be72ad27d6030f6e2 @WebOrganics committed Mar 18, 2012
View
@@ -1,4 +1,4 @@
-# TransFormr version 2.5
+# TransFormr version 2.6
TransFormr is a simple toolkit that uses PHP and XSLT for extracting and transforming microformats <http://microformats.org/>.
@@ -124,6 +124,7 @@ Work is in progress supporting the Species Microformat <http://microformats.org/
## Notes
+* 2.6 Added an HTML5 parser and support for parsing microdata by direct input.
* 2.5 Added microdata to JSON transformation.
* 2.4 Added value-title parsing for all microformats.
* 2.3 Transformr is now faster thanks to a new caching action. Added support for transforming by direct input; some XSLT bugs fixed.
View
@@ -21,10 +21,10 @@ class Transformr
function __construct()
{
!defined('_Transformr') ? define('_Transformr', true) : '' ;
- ini_set('display_errors', 0 );
+ ini_set('display_errors', 1 );
$this->path = $this->set_path();
- $this->version = '2.5';
- $this->updated = array('Saturday, 3rd March 2012', '2012-03-17T08:00:00+01:00');
+ $this->version = '2.6';
+ $this->updated = array('Sunday, 18th March 2012', '2012-03-18T13:30:00+01:00');
$this->check_php_version('5.2.0', 'Transformr');
$params = array_merge($_GET, $_POST);
@@ -35,7 +35,7 @@ function __construct()
$this->query = isset($params['q']) ? stripslashes($params["q"]) : '';
$this->template = dirname(__FILE__).'/template/';
$this->xsl = dirname(__FILE__).'/xsl/';
- $this->required = array('arc/ARC2', 'include/class.hqr', 'include/function.encoded', 'include/MicrodataPHP');
+ $this->required = array('arc/ARC2', 'include/class.hqr', 'include/function.encoded', 'include/MicrodataPHP', 'include/HTML5/Parser');
header("X-Application: Transformr ".$this->version );
}
@@ -45,6 +45,7 @@ public function transform($settings = '')
$this->a = $this->config_ns();
foreach ( $this->required as $require ) require_once(dirname(__FILE__).DIRECTORY_SEPARATOR.$require.'.php');
$this->ARC2 = ARC2::getComponent('RDFTranformrPlugin', $this->a);
+ $this->HTML5 = new HTML5_Parser;
return ( $this->query !='' ? $this->json_query($this->query) : $this->transformr_types() );
}
@@ -360,10 +361,27 @@ private function return_qrcode($url)
/**
 * Extracts microdata from a source and returns the result of MicrodataPhp::json().
 *
 * When $this->text is non-empty it is run through parseHTML5() in fragment
 * mode, and the HTML string that comes back replaces $url before it is handed
 * to MicrodataPhp.
 * NOTE(review): this relies on MicrodataPhp accepting raw markup as well as a
 * URL - confirm against include/MicrodataPHP.
 *
 * @param mixed $url Source passed to MicrodataPhp (presumably a URL string);
 *                   ignored when $this->text is non-empty.
 * @return mixed Whatever MicrodataPhp::json() returns.
 */
private function return_microdata($url)
{
+ if($this->text !='') {
+ $url = $this->parseHTML5($this->text, true);
+ }
$md = new MicrodataPhp($url);
return $md->json();
}
+	/**
+	 * Parses markup with the HTML5 parser and returns it re-serialised as an
+	 * HTML string.
+	 *
+	 * @param string $text     Markup to parse.
+	 * @param mixed  $fragment null selects a full-document parse; ANY non-null
+	 *                         value (including false) selects fragment parsing.
+	 *                         This mirrors the original is_null() test and is
+	 *                         kept for backward compatibility.
+	 * @return string Serialised HTML produced by DOMDocument::saveHTML().
+	 */
+	private function parseHTML5($text, $fragment = null)
+	{
+		$parsed = is_null($fragment)
+			? $this->HTML5->Parse($text)
+			: $this->HTML5->ParseFragment($text);
+
+		// NOTE(review): a full-document parse presumably yields a DOMDocument
+		// rather than a node list. A DOMDocument cannot be iterated node by
+		// node, so serialise it directly instead of feeding it to the loop.
+		if ($parsed instanceof DOMDocument) {
+			return $parsed->saveHTML();
+		}
+
+		// DOMDocument::__construct() takes (version, encoding). The previous
+		// call passed 'utf-8' in the version slot and never set the encoding.
+		$newDom = new DOMDocument('1.0', 'utf-8');
+		$root = $newDom->appendChild($newDom->createElement('html'));
+
+		// Fragment nodes belong to the parser's document, so each one has to
+		// be imported (deep copy) before it can be attached to the new root.
+		foreach ($parsed as $domElement) {
+			$root->appendChild($newDom->importNode($domElement, true));
+		}
+
+		return $newDom->saveHTML();
+	}
+
private function tidy_html($html, $url='', $tidy_option='', $output ='')
{
$output = $output == '' ? 'output-xhtml' : $output ;
@@ -380,7 +398,7 @@ private function tidy_html($html, $url='', $tidy_option='', $output ='')
'logical-emphasis' => true,
"$output" => true,
'wrap' => 200,
- 'clean' =>true
+ 'clean' =>true,
);
$tidy = new tidy;
$tidy->parseString($html, $config, 'utf8');
View
@@ -0,0 +1,120 @@
+<?php
+
+// warning: this file is encoded in UTF-8!
+
+class HTML5_Data
+{
+
+    // Replacement table used by getRealCodepoint(). Apart from the 0x0D entry,
+    // the keys cover 0x80-0x9F and map to the codepoint named in each trailing
+    // comment (slots with no character map to U+FFFD).
+    // TODO carried over from the original: consider loading this from a .ser
+    // file, or storing UTF-8 bytes instead of Unicode codepoints.
+    protected static $realCodepointTable = array(
+        0x0D => 0x000A, // LINE FEED (LF)
+        0x80 => 0x20AC, // EURO SIGN ('€')
+        0x81 => 0xFFFD, // REPLACEMENT CHARACTER
+        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
+        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
+        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
+        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
+        0x86 => 0x2020, // DAGGER ('†')
+        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
+        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
+        0x89 => 0x2030, // PER MILLE SIGN ('‰')
+        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
+        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
+        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
+        0x8D => 0xFFFD, // REPLACEMENT CHARACTER
+        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
+        0x8F => 0xFFFD, // REPLACEMENT CHARACTER
+        0x90 => 0xFFFD, // REPLACEMENT CHARACTER
+        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
+        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
+        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
+        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
+        0x95 => 0x2022, // BULLET ('•')
+        0x96 => 0x2013, // EN DASH ('–')
+        0x97 => 0x2014, // EM DASH ('—')
+        0x98 => 0x02DC, // SMALL TILDE ('˜')
+        0x99 => 0x2122, // TRADE MARK SIGN ('™')
+        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
+        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
+        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
+        0x9D => 0xFFFD, // REPLACEMENT CHARACTER
+        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
+        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
+    );
+
+    // Cache for the unserialized named-character-reference data.
+    protected static $namedCharacterReferences;
+
+    // Cache for the longest top-level key length of the data above.
+    protected static $namedCharacterReferenceMaxLength;
+
+    /**
+     * Looks up the substitute codepoint for a malformed numeric character
+     * reference.
+     *
+     * @param int $ref Codepoint taken from the reference.
+     * @return int|false The mapped codepoint, or false when $ref has no entry.
+     */
+    public static function getRealCodepoint($ref) {
+        return isset(self::$realCodepointTable[$ref])
+            ? self::$realCodepointTable[$ref]
+            : false;
+    }
+
+    /**
+     * Returns the named character reference data, reading and unserializing
+     * named-character-references.ser (next to this file) on first use.
+     *
+     * @return mixed The unserialized contents of the .ser file.
+     */
+    public static function getNamedCharacterReferences() {
+        if (self::$namedCharacterReferences) {
+            return self::$namedCharacterReferences;
+        }
+
+        $serialized = file_get_contents(dirname(__FILE__) . '/named-character-references.ser');
+        self::$namedCharacterReferences = unserialize($serialized);
+
+        return self::$namedCharacterReferences;
+    }
+
+    /**
+     * Returns the byte length of the longest top-level key in the named
+     * character reference data; the value is cached after the first call.
+     *
+     * @return int
+     */
+    public static function getNamedCharacterReferenceMaxLength() {
+        if (self::$namedCharacterReferenceMaxLength) {
+            return self::$namedCharacterReferenceMaxLength;
+        }
+
+        $names = array_keys(self::getNamedCharacterReferences());
+        self::$namedCharacterReferenceMaxLength = max(array_map('strlen', $names));
+
+        return self::$namedCharacterReferenceMaxLength;
+    }
+
+
+    /**
+     * Encodes a Unicode codepoint as its UTF-8 byte sequence.
+     *
+     * Codepoints below zero, above U+10FFFF, or in the surrogate range
+     * U+D800..U+DFFF produce the UTF-8 bytes of U+FFFD instead.
+     *
+     * @note The original credits HTML Purifier, and Feyd (public domain)
+     *       before that, for this routine.
+     *
+     * @param int $code Unicode codepoint.
+     * @return string One to four bytes.
+     */
+    public static function utf8chr($code) {
+        $outsideUnicode = $code < 0x0 || $code > 0x10FFFF;
+        $isSurrogate = $code >= 0xD800 && $code <= 0xDFFF;
+        if ($outsideUnicode || $isSurrogate) {
+            return "\xEF\xBF\xBD";
+        }
+
+        // One byte: plain ASCII.
+        if ($code < 0x80) {
+            return chr($code);
+        }
+
+        // Every multi-byte form ends with a continuation byte that carries
+        // the low six bits.
+        $last = chr(($code & 0x3F) | 0x80);
+
+        // Two bytes.
+        if ($code < 0x800) {
+            return chr((($code & 0x7FF) >> 6) | 0xC0) . $last;
+        }
+
+        // Three- and four-byte forms share the continuation byte for bits 6-11.
+        $middle = chr((($code & 0xFC0) >> 6) | 0x80);
+
+        // Three bytes.
+        if ($code < 0x10000) {
+            return chr((($code >> 12) & 0x0F) | 0xE0) . $middle . $last;
+        }
+
+        // Four bytes.
+        return chr((($code >> 18) & 0x07) | 0xF0)
+            . chr((($code >> 12) & 0x3F) | 0x80)
+            . $middle
+            . $last;
+    }
+
+}
Oops, something went wrong.

0 comments on commit 1463fab

Please sign in to comment.