Skip to content
Browse files

initial import of ElementParser library

  • Loading branch information...
0 parents commit 3190f505112640e2f333929e486c1b9ab0e1e282 Blake Watters committed May 7, 2009
Showing with 5,336 additions and 0 deletions.
  1. +35 −0 Classes/CDataChunk.h
  2. +42 −0 Classes/CDataChunk.m
  3. +49 −0 Classes/CSSPartMatcher.h
  4. +109 −0 Classes/CSSPartMatcher.m
  5. +68 −0 Classes/CSSSelector.h
  6. +97 −0 Classes/CSSSelector.m
  7. +48 −0 Classes/CSSSelectorMatcher.h
  8. +60 −0 Classes/CSSSelectorMatcher.m
  9. +67 −0 Classes/CSSSelectorPart.h
  10. +160 −0 Classes/CSSSelectorPart.m
  11. +117 −0 Classes/Chunk.h
  12. +85 −0 Classes/Chunk.m
  13. +36 −0 Classes/CommentChunk.h
  14. +41 −0 Classes/CommentChunk.m
  15. +36 −0 Classes/DoctypeChunk.h
  16. +42 −0 Classes/DoctypeChunk.m
  17. +30 −0 Classes/DocumentRoot.h
  18. +48 −0 Classes/DocumentRoot.m
  19. +235 −0 Classes/Element.h
  20. +263 −0 Classes/Element.m
  21. +115 −0 Classes/ElementParser.h
  22. +265 −0 Classes/ElementParser.m
  23. +37 −0 Classes/EntityChunk.h
  24. +42 −0 Classes/EntityChunk.m
  25. +172 −0 Classes/NSString_HTML.h
  26. +624 −0 Classes/NSString_HTML.m
  27. +36 −0 Classes/ProcessingInstructionChunk.h
  28. +42 −0 Classes/ProcessingInstructionChunk.m
  29. +91 −0 Classes/TagChunk.h
  30. +113 −0 Classes/TagChunk.m
  31. +36 −0 Classes/TxtChunk.h
  32. +38 −0 Classes/TxtChunk.m
  33. +51 −0 Classes/URLParser.h
  34. +110 −0 Classes/URLParser.m
  35. +43 −0 Code Overview.txt
  36. BIN GPL v3 Liscense.pdf
  37. +48 −0 Read Me.txt
  38. BIN Test/.DS_Store
  39. +33 −0 Test/CSSSelectorTest.h
  40. +91 −0 Test/CSSSelectorTest.m
  41. +34 −0 Test/ElementParserTest.h
  42. +105 −0 Test/ElementParserTest.m
  43. +38 −0 Test/Element_Test.h
  44. +175 −0 Test/Element_Test.m
  45. +35 −0 Test/NSString_HTML_Test.h
  46. +305 −0 Test/NSString_HTML_Test.m
  47. +989 −0 Test/gizmodo.xml
35 Classes/CDataChunk.h
@@ -0,0 +1,35 @@
+//
+// CDataChunk.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Chunk.h"
+
+/**
+ CDataChunk is a Chunk corresponding to a CDATA section of the source text.
+ It declares no instance variables or methods of its own beyond those
+ inherited from Chunk.
+ */
+@interface CDataChunk : Chunk {
+
+}
+
+@end
42 Classes/CDataChunk.m
@@ -0,0 +1,42 @@
+//
+// CDataChunk.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "CDataChunk.h"
+
+
+@implementation CDataChunk
+
+/** Human readable name for this kind of chunk. */
++(NSString*)humanName{
+    return @"cdata";
+}
+
+/** Identifies this chunk as a CDATA section. */
+-(NSString*)kind{
+    return ChunkKindCData;
+}
+
+/**
+ The chunk's range with 9 characters trimmed from the front and 3 from the
+ back -- the lengths of the "<![CDATA[" and "]]>" delimiters.
+ */
+-(NSRange)interiorRange{
+    const NSUInteger openingLength = 9;
+    const NSUInteger closingLength = 3;
+    NSUInteger interiorStart = range.location + openingLength;
+    NSUInteger interiorLength = range.length - (openingLength + closingLength);
+    return NSMakeRange(interiorStart, interiorLength);
+}
+
+@end
49 Classes/CSSPartMatcher.h
@@ -0,0 +1,49 @@
+//
+// CSSPartMatcher.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/19/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Element.h"
+@class CSSSelectorMatcher;
+
+/**
+ * Responsible for representing a successful match on a part.
+ * It is presented elements in an attempt to complete the next part of the match
+ *
+ * Each matcher owns the list of matchers created for the part that follows it,
+ * so the matchers for one selector form a tree.
+ */
+@interface CSSPartMatcher : NSObject {
+ CSSSelectorMatcher* selectorMatcher; // not retained
+ // element compared against in pruneMatchesForElement: to discard this matcher
+ Element* scopeElement;
+ // element that matched the selector part at matchedPartIndex
+ Element* matchedElement;
+ // index (within the selector) of the part that matchedElement matched
+ int matchedPartIndex;
+ // matchers for elements that matched the following part; created lazily
+ NSMutableArray* matchersForNextPart;
+}
+@property (nonatomic, retain) Element* scopeElement;
+@property (nonatomic, retain) Element* matchedElement;
+@property int matchedPartIndex;
+
+/**
+ * Creates a matcher whose matched element and scope element are both anElement.
+ * aSelectorMatcher is stored but not retained.
+ */
+-(id)initWithElement:(Element*) anElement selectorMatcher:(CSSSelectorMatcher*)aSelectorMatcher;
+
+/**
+ * Removes, at every level below this matcher, the matchers whose scope element is anElement.
+ */
+-(void)pruneMatchesForElement:(Element*)anElement;
+
+/**
+ * Tests nextElement against the selector part at index (and, through the matchers
+ * for the next part, against later parts). Returns YES when the whole selector matched.
+ */
+-(BOOL)matchNextElement:(Element*) nextElement forIndex: (int) index;
+
+@end
109 Classes/CSSPartMatcher.m
@@ -0,0 +1,109 @@
+//
+// CSSPartMatcher.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/19/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "CSSPartMatcher.h"
+#import "CSSSelectorPart.h"
+#import "CSSSelectorMatcher.h"
+
+@implementation CSSPartMatcher
+
+@synthesize matchedElement, scopeElement, matchedPartIndex;
+
+/**
+ * Creates a matcher recording that anElement matched a part of the selector.
+ * The scope element starts out equal to the matched element. The selector
+ * matcher is stored without being retained.
+ */
+-(id)initWithElement:(Element*) anElement selectorMatcher:(CSSSelectorMatcher*)aSelectorMatcher{
+    self = [super init];
+    if (self){
+        matchedElement = [anElement retain];
+        scopeElement = [anElement retain];
+        selectorMatcher = aSelectorMatcher; // not retained
+    }
+    return self;
+}
+
+-(void)dealloc{
+//  NSLog(@"pruned: %@", [self description]);
+    [matchedElement release];
+    [scopeElement release];
+    [matchersForNextPart release];
+    [super dealloc];
+}
+
+/**
+ * Discards every matcher below this one whose scope element is anElement and
+ * asks the surviving matchers to do the same for their own children.
+ */
+-(void)pruneMatchesForElement: (Element*)anElement{
+    if (!matchersForNextPart) return;
+    // Walk backwards by index: removing from an NSMutableArray while fast-enumerating
+    // it raises a "mutated while being enumerated" exception.
+    NSUInteger i = [matchersForNextPart count];
+    while (i-- > 0){
+        CSSPartMatcher* match = [matchersForNextPart objectAtIndex: i];
+        if ([match scopeElement] == anElement)
+            [matchersForNextPart removeObjectAtIndex: i];
+        else
+            [match pruneMatchesForElement: anElement];
+    }
+}
+
+/**
+ * Records nextElement as a match for the part at index by adding a child matcher.
+ * When the verb following that part is the successor verb, the child's scope
+ * becomes this matcher's matched element instead of nextElement.
+ */
+-(void)addNextMatch:(Element*)nextElement withIndex:(int)index{
+    CSSPartMatcher* nextMatch = [[CSSPartMatcher alloc] initWithElement: nextElement selectorMatcher: selectorMatcher];
+    nextMatch.matchedPartIndex = index;
+    CSSVerb nextNextVerb = [[selectorMatcher selector] verbAfterIndex: index];
+    // compare verbs by value rather than relying on string literals sharing one address
+    if ([nextNextVerb isEqualToString: CSSVerbSuccessor])
+        [nextMatch setScopeElement: self.matchedElement];
+    if (!matchersForNextPart)
+        matchersForNextPart = [[NSMutableArray alloc] initWithCapacity: 4];
+    [matchersForNextPart addObject: nextMatch];
+    [nextMatch release];
+}
+
+/**
+ * Tests nextElement against the selector part at index, relative to this
+ * matcher's matched element, and forwards it to the matchers for the next part
+ * with index + 1.
+ *
+ * Returns YES when nextElement completes the selector, either here (index is the
+ * last part) or in one of the child matchers. Otherwise returns NO, after
+ * recording a new child matcher if the part and its verb matched.
+ */
+-(BOOL)matchNextElement:(Element*) nextElement forIndex: (int) index{
+    CSSSelector* selector = [selectorMatcher selector];
+    CSSSelectorPart* nextPart = [selector partAtIndex: index];
+    CSSVerb nextVerb = [selector verbAtIndex: index];
+    BOOL verbMatches = NO;
+    if ([nextPart matchesElement: nextElement]){
+        if ([nextVerb isEqualToString: CSSVerbAny])
+            verbMatches = YES;
+        else if ([nextVerb isEqualToString: CSSVerbDescendant])
+            verbMatches = YES;//because we prune
+        else if ([nextVerb isEqualToString: CSSVerbChild])
+            verbMatches = nextElement.parent == self.matchedElement;
+        else if ([nextVerb isEqualToString: CSSVerbSuccessor])
+            verbMatches = nextElement == self.matchedElement.nextSybling;
+    }
+
+    BOOL completeMatch = verbMatches && (index == [selector countOfParts] - 1);
+    if (matchersForNextPart){
+        // || short-circuits: once a complete match is found the remaining
+        // child matchers are not consulted for this element
+        for (CSSPartMatcher* match in matchersForNextPart)
+            completeMatch = completeMatch || [match matchNextElement: nextElement forIndex: index + 1];
+    }
+
+    if (completeMatch)
+        return YES;
+
+    if (verbMatches)
+        [self addNextMatch: nextElement withIndex: index];
+
+    return NO;
+}
+
+/** The selector part that matchedElement matched. */
+-(CSSSelectorPart*)matchedPart{
+    return [[selectorMatcher selector] partAtIndex: matchedPartIndex];
+}
+
+-(NSString*)description{
+    // -count is NSUInteger, so format it as unsigned long; messaging a nil array yields 0
+    return [NSString stringWithFormat: @"%@ matched %@ -- %lu matchersForNextPart", [[self matchedPart] description], [matchedElement description], (unsigned long)[matchersForNextPart count]];
+}
+
+@end
68 Classes/CSSSelector.h
@@ -0,0 +1,68 @@
+//
+// CSSSelector.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/17/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Element.h"
+
+@class CSSSelectorMatcher;
+@class CSSSelectorPart;
+
+// Verbs joining two selector parts. Each verb is the string that is
+// stored in the selector's chain between the parts it joins.
+#define CSSVerbChild @" > "
+#define CSSVerbSuccessor @" + "
+#define CSSVerbDescendant @" "
+#define CSSVerbAny @""
+// A typedef (rather than a macro) so that "CSSVerb a, b;" declares two pointers.
+typedef NSString* CSSVerb;
+
+/**
+ * CSSSelector is responsible for modeling a chain of CSSSelectorParts. For example
+ *
+ * body a.link
+ *
+ * is a chain of two parts "body" and "a.link"
+ *
+ * Parts are joined by "verbs" which correspond to symbols " ", "+", and ">"
+ * These verbs define the relative position of the second part to the first
+ * Supported verbs are:
+ * space within - the second part must match an Element within the
+ * Element matching the first part
+ *
+ * > child - the second part must match an Element whose parent is
+ * the Element matching the first part
+ *
+ * + successor - the second part must match an Element whose previous
+ * sibling was the Element matching the first part
+ */
+
+@interface CSSSelector : NSObject {
+ // alternating sequence: part, verb, part, verb, ..., part
+ NSMutableArray* chain;
+}
+/** Parses string into a chain of parts and verbs. */
+-(id)initWithString:(NSString*)string;
+/** Concatenation of the descriptions of every part and verb in the chain. */
+-(NSString*)description;
+
+/** Number of parts in the chain (verbs are not counted). */
+-(int)countOfParts;
+/** The part at the given part index. */
+-(CSSSelectorPart*)partAtIndex:(int)index;
+/** The verb preceding the part at index; CSSVerbAny for the first part. */
+-(CSSVerb)verbAtIndex:(int)index;
+/** The verb following the part at index; CSSVerbAny for the last part. */
+-(CSSVerb)verbAfterIndex:(int)index;
+
+@end
97 Classes/CSSSelector.m
@@ -0,0 +1,97 @@
+//
+// CSSSelector.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/17/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "CSSSelector.h"
+#import "CSSSelectorPart.h"
+#import "NSString_HTML.h"
+#import "CSSSelectorMatcher.h"
+
+
+@implementation CSSSelector
+
+
+/**
+ * Parses string into the chain: a CSSSelectorPart, then the verb that joins it
+ * to the next part, then the next part, and so on.
+ *
+ * A nil or empty string produces an empty chain. Parsing stops at the end of
+ * the string, or at a position where no part can be read.
+ */
+-(id)initWithString:(NSString*)string{
+    self = [super init];
+    if (!self) return nil;
+
+    chain = [[NSMutableArray alloc] initWithCapacity: 10];
+    // nothing to parse; also keeps a nil string away from CFStringInitInlineBuffer
+    if ([string length] == 0) return self;
+
+    CFStringInlineBuffer buffer;
+    CFRange range = CFRangeMake(0, [string length]);
+    CFStringInitInlineBuffer((CFStringRef)string, &buffer, range);
+
+    CFIndex index = 0;
+    // skipWhitespace appears to return the character at the updated index (0 at the end) -- TODO confirm
+    while (skipWhitespace(&buffer, &index)){
+        CFIndex partStart = index;
+        CSSSelectorPart* part = [[CSSSelectorPart alloc] initWithIndex: &index inBuffer: &buffer];
+        if (index == partStart){
+            // the part consumed nothing; looping again could never make progress
+            [part release];
+            break;
+        }
+        [chain addObject: part];
+        [part release];
+
+        unichar c = skipWhitespace(&buffer, &index);
+        if (!c) break;
+
+        if (c=='+'){
+            [chain addObject: CSSVerbSuccessor];
+            index++;
+        }
+        else if (c=='>'){
+            [chain addObject: CSSVerbChild];
+            index++;
+        }
+        else
+            [chain addObject: CSSVerbDescendant];
+    }
+
+    return self;
+}
+
+-(void)dealloc{
+//  NSLog(@"disposing of %@", [self description]);
+    [chain release];
+    [super dealloc];
+}
+
+/** Concatenates the descriptions of every part and verb, in chain order. */
+-(NSString*)description{
+    NSMutableString* result = [NSMutableString string];
+    for (id item in chain){
+        [result appendString: [item description]];
+    }
+    return result;
+}
+
+/** Number of parts: the chain alternates part, verb, part, ... */
+-(int)countOfParts{
+    return (int)(([chain count] + 1) / 2);
+}
+
+/** Parts sit at the even positions of the chain. */
+-(CSSSelectorPart*)partAtIndex:(int)index{
+    return [chain objectAtIndex: index * 2];
+}
+
+/** The verb stored just before the part at index; CSSVerbAny for the first part. */
+-(CSSVerb)verbAtIndex:(int)index{
+    return (index > 0) ? [chain objectAtIndex: index * 2 - 1] : CSSVerbAny;
+}
+
+// sometime we need to access the next verb after an index... see scopingElement
+-(CSSVerb)verbAfterIndex:(int)index{
+    return (index < [self countOfParts] - 1) ? [self verbAtIndex: index + 1] : CSSVerbAny;
+}
+
+@end
+
+
48 Classes/CSSSelectorMatcher.h
@@ -0,0 +1,48 @@
+//
+// CSSSelectorMatcher.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/19/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Element.h"
+#import "CSSSelector.h"
+#import "CSSPartMatcher.h"
+
+/**
+ * Responsible for matching a CSSSelector.
+ * It does this by minting matching parts and creating
+ * CSSPartMatchers for all intermediate potential matches
+ *
+ */
+
+@interface CSSSelectorMatcher : NSObject {
+ // the selector being matched
+ CSSSelector* selector;
+ // root of the CSSPartMatcher tree; created with a nil element
+ CSSPartMatcher* rootMatch;
+ // elements that completed the selector, in the order they were presented
+ NSMutableArray* matches;
+}
+@property (nonatomic, retain) CSSSelector* selector;
+@property (nonatomic, retain) NSMutableArray* matches;
+
+/** Creates a matcher for selector with an empty list of matches. */
+-(id)initWithSelector:(CSSSelector*)selector;
+/**
+ * Presents element to the matcher. Returns YES, and appends element to matches,
+ * when it completes the selector. Elements for which isCloseTag is true are ignored.
+ */
+-(BOOL)matchElement:(Element*) element;
+/** The first element in matches, or nil when there are none. */
+-(Element*)firstMatch;
+@end
60 Classes/CSSSelectorMatcher.m
@@ -0,0 +1,60 @@
+//
+// CSSSelectorMatcher.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/19/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "CSSSelectorMatcher.h"
+#import "CSSSelectorPart.h"
+
+@implementation CSSSelectorMatcher
+
+@synthesize selector, matches;
+
+/**
+ * Creates a matcher for aSelector. The root part matcher is created with a nil
+ * element and a non-retained back-reference to this object.
+ */
+-(id)initWithSelector:(CSSSelector*)aSelector{
+    self = [super init];
+    if (self){
+        selector = [aSelector retain];
+        rootMatch = [[CSSPartMatcher alloc] initWithElement: nil selectorMatcher: self];
+        matches = [[NSMutableArray alloc] initWithCapacity: 1];
+    }
+    return self;
+}
+
+-(void)dealloc{
+    [selector release];
+    [rootMatch release];
+    [matches release];
+    [super dealloc];
+}
+
+/** The first element that completed the selector, or nil if none has. */
+-(Element*)firstMatch{
+    if ([matches count] == 0)
+        return nil;
+    return [matches objectAtIndex: 0];
+}
+
+/**
+ * Presents element to the tree of part matchers, starting at part 0.
+ * Close tags are skipped. Returns YES, and records element in matches,
+ * when it completes the selector.
+ */
+-(BOOL)matchElement:(Element*) element{
+    if ([element isCloseTag]) return NO;
+    BOOL matchComplete = [rootMatch matchNextElement: element forIndex: 0];
+    if (matchComplete)
+        [matches addObject: element];
+    return matchComplete;
+}
+
+
+@end
67 Classes/CSSSelectorPart.h
@@ -0,0 +1,67 @@
+//
+// CSSSelectorPart.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/17/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Element.h"
+
+/**
+ * CSSSelectorPart is responsible for modeling one part of CSSSelector. For example
+ *
+ * a.link[target]
+ *
+ * is a part which matches <a> tags which have a link class name and an attribute 'target'
+ *
+ * A part can consist of one or more of the following:
+ *
+ * * All elements match (used when no tagname is supplied)
+ * tagname Matching elements have this tag name
+ * #id Matching elements have this as their id attribute
+ * .class Matching elements have this as one of their class names
+ * [attr] Matching elements have this attribute (regardless of its value)
+ * [attr=val] Matching elements have this attribute with this value
+ *
+ */
+
+
+@interface CSSSelectorPart : NSObject {
+ // value required for the element's id attribute (from "#id"); nil when unconstrained
+ NSString* identifier;
+ // required tag name; nil when any tag matches
+ NSString* tag;
+ // class names that must all be present on the element (from ".class"); nil when none
+ NSMutableArray* classNames;
+ // attribute that must be present (from "[attr]" or "[attr=val]"); nil when none
+ NSString* attrName;
+ // value required for attrName (from "[attr=val]"); nil when any value matches
+ NSString* attrValue;
+}
+
+
+@property (nonatomic, retain) NSString* identifier;
+@property (nonatomic, retain) NSString* tag;
+@property (nonatomic, retain) NSArray* classNames;
+@property (nonatomic, retain) NSString* attrName;
+@property (nonatomic, retain) NSString* attrValue;
+
+/**
+ * Parses a part from string. On return *index holds the position where parsing stopped.
+ */
+-(id)initWithIndex:(int*) index inString:(NSString*)string;
+/**
+ * Parses a part from buffer starting at *index, advancing *index past the characters consumed.
+ */
+-(id)initWithIndex:(CFIndex*) index inBuffer:(CFStringInlineBuffer*)buffer;
+/** The part written back out in selector syntax, e.g. tag#id.class[attr='val']. */
+-(NSString*)description;
+/** YES when element satisfies every constraint held by this part. */
+-(BOOL)matchesElement:(Element*)element;
+
+@end
160 Classes/CSSSelectorPart.m
@@ -0,0 +1,160 @@
+//
+// CSSSelectorPart.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/17/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "CSSSelectorPart.h"
+#import "NSString_HTML.h"
+
+@implementation CSSSelectorPart
+
+@synthesize identifier, tag, classNames, attrName, attrValue;
+
+/**
+ * Convenience initializer that parses a part from string.
+ * Scanning always begins at character 0 (the incoming value of *index is not
+ * read); on return *index holds the position where parsing stopped.
+ */
+-(id)initWithIndex:(int*) index inString:(NSString*)string{
+    CFStringInlineBuffer buffer;
+    CFRange range = CFRangeMake(0, [string length]);
+    CFStringInitInlineBuffer((CFStringRef)string, &buffer, range);
+    CFIndex i = 0;
+    self = [self initWithIndex: &i inBuffer: &buffer];
+    if (index)
+        *index = (int)i;
+    return self;
+}
+
+/**
+ * Parses one selector part from buffer starting at *index and advances *index
+ * past everything consumed. Parsing continues while the current character is
+ * greater than 32, so it stops at whitespace or at the end of the buffer.
+ *
+ * Recognised pieces: "#id", ".class" (may repeat), "[attr]", "[attr=val]" with
+ * val bare or quoted with ' or ", "*", and a bare tag name.
+ *
+ * The strings returned by createStringFromBuffer are treated as owned by the
+ * caller: each is released here once a setter or the classNames array holds it.
+ */
+-(id)initWithIndex:(CFIndex*) index inBuffer:(CFStringInlineBuffer*)buffer{
+    self = [super init];
+    if (!self) return nil;
+
+    CFIndex len;
+    unichar c = skipWhitespace(buffer, index);
+    while (c > 32){
+        if (c=='#'){
+            len = lenToken(buffer, *index + 1);
+            assert(len);
+            NSString* newIdentifier = createStringFromBuffer(buffer, *index + 1, len);
+            self.identifier = newIdentifier;
+            [newIdentifier release];//retained by property setter
+            (*index) += len + 1;
+        }
+        else if (c == '.'){
+            len = lenToken(buffer, *index + 1);
+            assert(len);
+            NSString* className = createStringFromBuffer(buffer, *index + 1, len);
+            if (!classNames)
+                classNames = [[NSMutableArray alloc] initWithObjects: className, nil];
+            else
+                [classNames addObject: className];
+            [className release];
+            (*index) += len + 1;
+        }
+        else if (c == '['){
+            (*index)++;
+            c = skipWhitespace(buffer, index);
+            len = lenToken(buffer, *index);
+            assert(len);
+            NSString* newAttrName = createStringFromBuffer(buffer, *index, len);
+            self.attrName = newAttrName;
+            [newAttrName release];//retained by property setter
+            (*index) += len;
+
+            c = skipWhitespace(buffer, index);
+
+            if (c == '='){
+                (*index)++;
+                c = skipWhitespace(buffer, index);
+                NSString* newAttrValue = nil;
+                if (c == '\'' || c == '"'){
+                    // len runs through the closing quote, so the value is len-1 long
+                    len = lenThru(buffer, (*index) + 1, (c == '\'') ? "'" : "\"");
+                    assert(len);
+                    newAttrValue = createStringFromBuffer(buffer, *index + 1, len-1);
+                    (*index)++;// step over the opening quote
+                }
+                else{
+                    len = lenToken(buffer, (*index));
+                    assert(len);
+                    newAttrValue = createStringFromBuffer(buffer, *index, len);
+                }
+                self.attrValue = newAttrValue;
+                [newAttrValue release];//retained by property setter
+                (*index) += len;
+                c = skipWhitespace(buffer, index);
+            }
+            assert(c==']');
+            (*index) += 1;
+        }
+        else if (c == '*')
+            (*index)++;
+        else{
+            len = lenToken(buffer, (*index));
+            // A zero-length token would leave *index unchanged and spin forever on
+            // the same character; stop and leave that character for the caller.
+            if (len == 0) break;
+            NSString* newTag = createStringFromBuffer(buffer, *index, len);
+            self.tag = newTag;
+            [newTag release];//retained by property setter
+            (*index) += len;
+        }
+        c = CFStringGetCharacterFromInlineBuffer(buffer, *index);
+    }
+    return self;
+}
+
+-(void)dealloc{
+    [identifier release];
+    [tag release];
+    [classNames release];
+    [attrName release];
+    [attrValue release];
+    [super dealloc];
+}
+
+/**
+ * YES when element satisfies every constraint this part holds: tag name, id,
+ * all class names, and the attribute (with its value when one was given).
+ * An element whose range has zero length never matches.
+ */
+-(BOOL)matchesElement:(Element*)element{
+    if(element.range.length == 0) return NO; //ElementParser's root
+    if (tag && ![element tagNameEquals: tag]) return NO;
+    if (identifier && ![identifier isEqualToString: [element attribute: @"id"]]) return NO;
+    if (classNames){
+        for (NSString* className in classNames)
+            if (![element hasClassName: className])
+                return NO;
+    }
+    if (attrName && attrValue && ![attrValue isEqualToString: [element attribute: attrName]]) return NO;
+    if (attrName && ![element hasAttribute: attrName]) return NO;
+    return YES;
+}
+
+/** Writes the part back out in selector syntax: tag#id.class[attr='val']. */
+-(NSString*)description{
+    NSMutableString* result = [NSMutableString string];
+    if (tag)
+        [result appendString: tag];
+    if (identifier)
+        [result appendFormat: @"#%@", identifier];
+    if (classNames){
+        for (NSString* className in classNames)
+            [result appendFormat: @".%@", className];
+    }
+    if (attrName){
+        [result appendFormat: @"[%@", attrName];
+        if (attrValue)
+            [result appendFormat: @"='%@']", attrValue];
+        else
+            [result appendString: @"]"];
+    }
+    return result;
+}
+
+@end
117 Classes/Chunk.h
@@ -0,0 +1,117 @@
+//
+// Chunk.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+
+// String constants naming each kind of chunk; a chunk reports one of these from -kind.
+#define ChunkKindDocument @"ChunkKindDocument"
+#define ChunkKindTag @"ChunkKindTag"
+#define ChunkKindPI @"ChunkKindPI"
+#define ChunkKindComment @"ChunkKindComment"
+#define ChunkKindEntity @"ChunkKindEntity"
+#define ChunkKindCData @"ChunkKindCData"
+#define ChunkKindDoctype @"ChunkKindDoctype"
+#define ChunkKindText @"ChunkKindText"
+
+/**
+ Chunk is a range of source text that has been divided into a meaningful "chunk" by
+ the NSString_HTML parser. Examples of a chunk include an element, a cdata section, an entity,
+ character data, etc. It is an abstract base class that handles basic housekeeping.
+ Subclasses include TagChunk, TxtChunk, CommentChunk, EntityChunk, etc.
+ */
+@interface Chunk : NSObject {
+ CFStringInlineBuffer* buffer;
+ NSString* source;
+ NSRange range;
+}
+
+
+
+/**
+ A human readable name for the chunk. Used for debugging purposes.
+ */
++(NSString*)humanName;
+
+
+/**
+ The string that contains the whole source being parsed.
+ */
+@property (nonatomic, retain) NSString* source;
+
+
+/**
+ The range within the source of this chunk. Includes delimiters like '<' and '>'
+ */
+@property NSRange range;
+
+
+/** During the parse (only) this buffer provides faster access to individual characters */
+@property CFStringInlineBuffer* buffer;
+
+
+/**
+ Only some of the whole string is buffered... when this chunk is delivered by the parser
+ the whole chunk will be available in the buffer.
+ When no buffer is set the location is kCFNotFound and the length is 0.
+ */
+@property (readonly) CFRange rangeInBuffer;
+
+
+/**
+ The interior of a chunk usually excludes the delimiters. This method does the index
+ math to point inside the buffer. Currently only used to access the character
+ data within a cdata section.
+ When no buffer is set the location is kCFNotFound and the length is 0.
+ */
+@property (readonly) CFRange interiorRangeInBuffer;
+
+
+/**
+ Creates a new chunk from the range aRange in aSource string
+ */
+-(id)initWithString: (NSString*)aSource range:(NSRange)aRange;
+
+
+/**
+ The interior of a chunk usually excludes the delimiters of the chunk.
+ The base implementation returns the whole range; subclasses narrow it.
+ */
+-(NSRange)interiorRange;
+
+
+/**
+ Convenience method that returns a string corresponding to the interior of the chunk.
+ */
+-(NSString*)interiorString;
+
+
+/**
+ Each chunk has a kind that denotes what type of chunk it is.
+ Subclasses must override this; the base implementation calls doesNotRecognizeSelector:.
+ */
+-(NSString*)kind;
+
+
+/**
+ Convenience method to test kind
+ */
+-(BOOL)isKind:(NSString*)aKind;
+
+
+@end
85 Classes/Chunk.m
@@ -0,0 +1,85 @@
+//
+// Chunk.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "Chunk.h"
+
+
+@implementation Chunk
+
+@synthesize source, range, buffer;
+
+/**
+ Creates a chunk covering aRange of aSource. The source string is retained.
+ */
+-(id)initWithString: (NSString*)aSource range:(NSRange)aRange{
+    self = [super init];
+    if (self){
+        source = [aSource retain];
+        range = aRange;
+    }
+    return self;
+}
+
+-(void)dealloc{
+    [source release];
+    [super dealloc];
+}
+
+/**
+ The chunk's range offset by the location of the buffer's rangeToBuffer.
+ Returns {kCFNotFound, 0} when no buffer is set.
+ */
+-(CFRange)rangeInBuffer{
+    if (!buffer)
+        return CFRangeMake(kCFNotFound, 0);
+    // NOTE(review): the offset is added, not subtracted -- confirm this is intended
+    // for buffers whose rangeToBuffer does not start at 0.
+    return CFRangeMake(range.location + buffer->rangeToBuffer.location, range.length);
+}
+
+/**
+ Same as rangeInBuffer but based on interiorRange.
+ Returns {kCFNotFound, 0} when no buffer is set.
+ */
+-(CFRange)interiorRangeInBuffer{
+    if (!buffer)
+        return CFRangeMake(kCFNotFound, 0);
+    NSRange inRange = self.interiorRange;
+    return CFRangeMake(inRange.location + buffer->rangeToBuffer.location, inRange.length);
+}
+
+/** The base class has no delimiters to strip, so the interior is the whole range. */
+-(NSRange)interiorRange{
+    return range;
+}
+
+/** The substring of source covered by interiorRange. */
+-(NSString*)interiorString{
+    return [source substringWithRange: [self interiorRange]];
+}
+
+/** Abstract: subclasses return one of the ChunkKind constants. */
+-(NSString*)kind{
+    [self doesNotRecognizeSelector: _cmd];
+    return nil;
+}
+
+/**
+ YES when aKind names this chunk's kind. Identical pointers match immediately;
+ otherwise the strings are compared by value, so an equal string that lives at
+ a different address still matches.
+ */
+-(BOOL)isKind:(NSString*)aKind{
+    NSString* myKind = [self kind];
+    return aKind == myKind || [aKind isEqualToString: myKind];
+}
+
+/** The substring of source covered by the chunk's full range. */
+-(NSString*)description{
+    return [source substringWithRange: range];
+}
+
++(NSString*)humanName{
+    return @"generic";
+}
+
+
+@end
36 Classes/CommentChunk.h
@@ -0,0 +1,36 @@
+//
+// CommentChunk.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Chunk.h"
+
+
+/**
+ CommentChunk is a Chunk corresponding to a comment section of the source text.
+ It declares no instance variables or methods of its own beyond those
+ inherited from Chunk.
+ */
+@interface CommentChunk : Chunk {
+
+}
+
+@end
41 Classes/CommentChunk.m
@@ -0,0 +1,41 @@
+//
+// CommentChunk.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "CommentChunk.h"
+
+
+@implementation CommentChunk
+
+/**
+ * The range of the comment's contents: the chunk's range without its first 4 and last 3
+ * characters (presumably the "<!--" and "-->" delimiters).
+ * A chunk shorter than those 7 characters yields an empty range at the chunk's location;
+ * subtracting from the unsigned length would otherwise wrap around to a huge value.
+ */
+-(NSRange)interiorRange{
+    if (range.length < 7)
+        return NSMakeRange(range.location, 0);
+    return NSMakeRange(range.location + 4, range.length - 7);
+}
+
+/**
+ * Identifies the receiver as a comment chunk.
+ */
+-(NSString*)kind{
+    return ChunkKindComment;
+}
+
+/**
+ * A readable name for this class of chunk.
+ */
++(NSString*)humanName{
+    return @"comment";
+}
+
+@end
36 Classes/DoctypeChunk.h
@@ -0,0 +1,36 @@
+//
+// DoctypeChunk.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/25/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Chunk.h"
+
+/**
+ * A Chunk that corresponds to a Doctype section.
+ * Note that the parser does not extract declarations within an internal subset,
+ * and it ignores references to system and external ids.
+ * It declares no instance variables or methods beyond those inherited from Chunk.
+ */
+@interface DoctypeChunk : Chunk
+@end
42 Classes/DoctypeChunk.m
@@ -0,0 +1,42 @@
+//
+// DoctypeChunk.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/25/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "DoctypeChunk.h"
+
+
+@implementation DoctypeChunk
+
+/**
+ * Identifies the receiver as a doctype chunk.
+ */
+-(NSString*)kind{
+    return ChunkKindDoctype;
+}
+
+/**
+ * The range of the declaration's contents: the chunk's range without its first 9
+ * characters and its last character (presumably "<!DOCTYPE" and ">").
+ * A chunk shorter than those 10 characters yields an empty range at the chunk's location;
+ * subtracting from the unsigned length would otherwise wrap around to a huge value.
+ */
+-(NSRange)interiorRange{
+    if (range.length < 10)
+        return NSMakeRange(range.location, 0);
+    return NSMakeRange(range.location + 9, range.length - 10);
+}
+
+/**
+ * A readable name for this class of chunk.
+ */
++(NSString*)humanName{
+    return @"doctype";
+}
+
+@end
30 Classes/DocumentRoot.h
@@ -0,0 +1,30 @@
+//
+// DocumentRoot.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Element.h"
+
+/**
+ * The Element that ElementParser creates as the root of a parsed document.
+ * It declares no instance variables or methods beyond those inherited from Element.
+ */
+@interface DocumentRoot : Element
+@end
48 Classes/DocumentRoot.m
@@ -0,0 +1,48 @@
+//
+// DocumentRoot.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "DocumentRoot.h"
+
+@implementation DocumentRoot
+
+/**
+ * Initializes the root over aSource with the fixed tag name "DOCUMENT ROOT" and sets
+ * its contentsLength to the length of aSource.
+ */
+-(id)initWithString: (NSString*)aSource range:(NSRange)aRange{
+    self = [super initWithString: aSource range: aRange tagName: @"DOCUMENT ROOT"];
+    [self setContentsLength: [aSource length]];
+    return self;
+}
+
+/**
+ * The root is never a close tag.
+ */
+-(BOOL)isCloseTag{
+    return NO;
+}
+
+/**
+ * The root is never an empty tag.
+ */
+-(BOOL)isEmptyTag{
+    return NO;
+}
+
+/**
+ * Identifies the receiver as the document chunk.
+ */
+-(NSString*)kind{
+    return ChunkKindDocument;
+}
+
+@end
235 Classes/Element.h
@@ -0,0 +1,235 @@
+//
+// Element.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/18/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "TagChunk.h"
+
+@class CSSSelector;
+@class DocumentRoot;
+
+/**
+ * An Element is the fundamental building block for ElementParser.
+ */
+@interface Element : TagChunk {
+ NSMutableDictionary* attributes;
+ BOOL attributesParsed;
+ Element* nextElement;
+ Element* nextSybling;
+ Element* parent;
+ int contentsLength;
+ NSString* contentsText;
+ NSString* key;
+ BOOL containsMarkup; // includes entities
+
+ NSObject* domainObject;
+}
+
+/**
+ * The attributes in the element
+ */
+@property (nonatomic, retain) NSDictionary* attributes;
+
+
+/**
+ * The character data inside the element. This text is stripped of tags, whitespace, etc.
+ * by stripTags. To see the actual source within the element, use contentsSource.
+ */
+@property (nonatomic, retain) NSString* contentsText;
+
+
+/**
+ * A case-normalized version of the tagName when appropriate. Used in situations
+ * where the tag name might need to serve as a key into a dictionary.
+ */
+@property (nonatomic, retain) NSString* key;
+
+/**
+ * One or more chunks were encountered within this element.
+ * Used for more efficient return of contentsText.
+ */
+@property BOOL containsMarkup;
+
+
+/**
+ * The length of the text from the end of the start tag to the start of the end tag
+ */
+@property int contentsLength;
+
+
+/**
+ * The next Element encountered in the document
+ */
+@property (nonatomic, retain) Element* nextElement;
+
+
+/**
+ * The next sibling Element (i.e. the Element at the same depth with the same parent)
+ */
+@property (nonatomic, retain) Element* nextSybling;
+
+
+/**
+ * The parent Element to this Element
+ */
+@property (nonatomic, assign) Element* parent;
+
+
+/**
+ * Available for developer's use to hang an object onto this Element
+ */
+@property (nonatomic, retain) NSObject* domainObject;
+
+
+/**
+ * Parses the supplied source and returns an Element tree with a Document element serving as the root
+ * of all top-level elements. As HTML, Elements shall be considered case insensitive and tag
+ * specific heuristics will be used to close tags intelligently. See ElementParser for details.
+ */
++(DocumentRoot*)parseHTML:(NSString*)source;
+
+
+/**
+ * Parses the supplied source and returns an Element tree with a Document element serving as the root
+ * of all top-level elements. As XML, Elements shall be considered case sensitive. See ElementParser for details.
+ */
++(DocumentRoot*)parseXML:(NSString*)source;
+
+/**
+ * Initializer used by ElementParser. See TagChunk for other initializers.
+ */
+-(id)initWithTag:(TagChunk*)tag caseSensative:(BOOL)aCaseSensative;
+
+
+/**
+ * Returns true if the element contains the specified attribute.
+ * If the attributes have not yet been parsed, this will parse them first.
+ */
+-(BOOL)hasAttribute:(NSString*)attr;
+
+
+/**
+ * Returns a dictionary of attribute names/values.
+ * If an attribute had no value in the source (e.g. <table noborders>) then the value will be NSNull.
+ * If the attributes have not yet been parsed, this will parse them first.
+ */
+-(NSDictionary*)attributes;
+
+
+/**
+ * Returns the value of a particular attribute (or nil if it doesn't exist).
+ * Note: ElementParser does not support default attributes.
+ * If the attributes have not yet been parsed, this will parse them first.
+ */
+-(NSString*)attribute:(NSString*)attr;
+
+
+/**
+ * Convenience method to compare an element's tag name.
+ * Comparison will be case sensitive for XML elements and insensitive for HTML elements.
+ * NOTE(review): the implementation in Element.m compares the string against
+ * [self description] rather than the tag name; confirm which is intended.
+ */
+-(BOOL)isEqualToString:(NSString*)string;
+
+
+/**
+ * An array of child Elements in document order
+ */
+-(NSArray*)childElements;
+
+
+/**
+ * The first child Element for this element (or nil if none).
+ */
+-(Element*)firstChild;
+
+
+/**
+ * A dictionary containing the tag names of children as keys
+ * and the contentsText of the children as values.
+ * If duplicate children tag names are encountered, only the last will
+ * appear in the dictionary.
+ */
+-(NSDictionary*)contentsOfChildren;
+
+/**
+ * Returns true if the supplied Element is a parent of the receiver or of one of its parents
+ */
+-(BOOL)hasAncestor:(Element*)ancestor;
+
+
+/**
+ * Returns the nextElement but only if it has the scope Element as an ancestor
+ */
+-(Element*)nextElementWithinScope:(Element*)scope;
+
+
+/**
+ * Returns true if the class attribute contains the class name (perhaps as one of multiple classes).
+ */
+-(BOOL)hasClassName:(NSString*)aClassName;
+
+
+/**
+ * Returns true if the receiver can be a child of aParent. Used by ElementParser to prevent inappropriate
+ * nesting in HTML (e.g. <p><p>)
+ */
+-(BOOL)acceptsParent:(Element*)aParent;
+
+/**
+ * Debugging method
+ */
+-(NSString*)dumpTree;
+
+
+/**
+ * The source between the end of the open tag and the beginning of the close tag
+ */
+-(NSString*)contentsSource;
+
+
+/**
+ * Convenience method for using a selector to find the first element within the receiver that matches.
+ * See CSSSelector for details.
+ */
+-(Element*)elementWithCSSSelector:(CSSSelector*)selector;
+
+/**
+ * Convenience method for using a selector string to find the first element within the receiver that matches.
+ * See CSSSelector for details.
+ */
+-(Element*)selectElement:(NSString*)cssSelectorString;
+
+
+/**
+ * Convenience method for using a selector to find elements within the receiver that match.
+ * See CSSSelector for details.
+ */
+-(NSArray*)elementsWithCSSSelector:(CSSSelector*)selector;
+
+/**
+ * Convenience method for using a selector string to find elements within the receiver that match.
+ * See CSSSelector for details.
+ */
+-(NSArray*)selectElements:(NSString*)cssSelectorString;
+
+@end
263 Classes/Element.m
@@ -0,0 +1,263 @@
+//
+// Element.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/18/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "Element.h"
+#import "NSString_HTML.h"
+#import "CSSSelectorMatcher.h"
+#import "ElementParser.h"
+
+// Category reserved for private declarations; it currently declares nothing.
+@interface Element (Private)
+@end
+
+@implementation Element
+
+@synthesize attributes, nextElement, nextSybling, parent, contentsLength, contentsText, key, containsMarkup, domainObject;
+
+
+/**
+ * Parses source as HTML with a temporary ElementParser and returns the resulting root.
+ * The root is retained and autoreleased before the parser is released.
+ */
++(DocumentRoot*)parseHTML:(NSString*)source{
+    ElementParser* htmlParser = [[ElementParser alloc] init];
+    DocumentRoot* document = [[[htmlParser parseHTML: source] retain] autorelease];
+    [htmlParser release];
+    return document;
+}
+
+/**
+ * Parses source as XML with a temporary ElementParser and returns the resulting root.
+ * The root is retained and autoreleased before the parser is released.
+ */
++(DocumentRoot*)parseXML:(NSString*)source{
+    ElementParser* xmlParser = [[ElementParser alloc] init];
+    DocumentRoot* document = [[[xmlParser parseXML: source] retain] autorelease];
+    [xmlParser release];
+    return document;
+}
+
+/**
+ * Initializes the receiver over the whole of string, i.e. the range {0, [string length]}.
+ */
+-(id)initWithString:(NSString*)string{
+    NSRange everything = NSMakeRange(0, [string length]);
+    return [self initWithString: string range: everything];
+}
+
+/**
+ * Initializes the receiver with the source, range and tag name of tag, then records
+ * whether it is case sensitive.
+ */
+-(id)initWithTag:(TagChunk*)tag caseSensative:(BOOL)aCaseSensative{
+    self = [self initWithString: [tag source]
+                          range: [tag range]
+                        tagName: [tag tagName]];
+    [self setCaseSensative: aCaseSensative];
+    return self;
+}
+
+/**
+ * Releases every object the receiver retains. parent is an assign property and is
+ * therefore not released.
+ */
+-(void)dealloc{
+    [attributes release];
+    [contentsText release];
+    [nextElement release];
+    [nextSybling release];
+    [key release];
+    [domainObject release]; // retain property; previously leaked
+    [super dealloc];
+}
+
+
+/**
+ * Changes the receiver's range and invalidates the cached attributes, which are parsed
+ * again on the next call to -attributes.
+ * The cached dictionary is dropped rather than emptied in place: it may already have
+ * been handed to callers through -attributes, and the property accepts any NSDictionary,
+ * which need not respond to removeAllObjects.
+ */
+-(void)setRange: (NSRange)aRange{
+    attributesParsed = NO;
+    [attributes release];
+    attributes = nil;
+    [super setRange: aRange];
+}
+
+/**
+ * Cleans up nested p tags: a "p" element refuses a "p" parent.
+ * Every other combination is accepted.
+ */
+-(BOOL)acceptsParent:(Element*)aParent{
+    BOOL nestedParagraph = [self tagNameEquals: @"p"] && [aParent tagNameEquals: @"p"];
+    return !nestedParagraph;
+}
+
+
+/**
+ * An element always closes itself (the case used when it should be an empty tag);
+ * an empty tag closes nothing else; otherwise the decision is left to super.
+ */
+-(BOOL)closesTag:(TagChunk*)aTag{
+    if (self == aTag)
+        return YES;
+    if ([self isEmptyTag])
+        return NO;
+    return [super closesTag: aTag];
+}
+
+/**
+ * Returns YES if the element has an attribute named attr, parsing the attributes first
+ * if needed.
+ * Uses a direct dictionary lookup instead of building and scanning the array of all keys;
+ * the dictionary holds an object for every attribute present, so a non-nil lookup means
+ * the attribute exists.
+ */
+-(BOOL)hasAttribute:(NSString*)attr{
+    return [[self attributes] objectForKey: attr] != nil;
+}
+
+/**
+ * Returns whatever the attributes dictionary stores under attr, or nil when there is
+ * no such key. Attributes are parsed first if needed.
+ */
+-(NSString*)attribute:(NSString*)attr{
+    NSDictionary* parsed = [self attributes];
+    return [parsed objectForKey: attr];
+}
+
+/**
+ * The individual names listed in the class attribute.
+ * Warning: may contain empty class names (consecutive separators produce empty strings).
+ * Returns an empty array when there is no class attribute or when its stored value is
+ * not a string (e.g. the placeholder stored for an attribute written without a value).
+ * Names are split on any whitespace, including newlines.
+ */
+-(NSArray*)classNames{
+    NSString* classValue = [self attribute: @"class"];
+    if (![classValue isKindOfClass: [NSString class]])
+        return [NSArray array];
+    return [classValue componentsSeparatedByCharactersInSet: [NSCharacterSet whitespaceAndNewlineCharacterSet]];
+}
+
+/**
+ * Returns YES if aClassName is one of the names in the element's class attribute,
+ * and NO if it is not or if the element has no class attribute.
+ */
+-(BOOL)hasClassName:(NSString*)aClassName{
+    if ([self attribute: @"class"] == nil)
+        return NO;
+    return [[self classNames] containsObject: aClassName];
+}
+
+/**
+ * The element's attributes. They are parsed from the source on first use and the
+ * result is kept until the range changes.
+ */
+-(NSDictionary*)attributes{
+    if (attributesParsed)
+        return attributes;
+    self.attributes = [source parseElementAttributesWithRange: range caseSensative: [self caseSensative]];
+    attributesParsed = YES;
+    return attributes;
+}
+
+/**
+ * The next element in the document if its parent is the receiver, otherwise nil.
+ */
+-(Element*)firstChild{
+    return ([nextElement parent] == self) ? nextElement : nil;
+}
+
+/**
+ * Walks up the chain of parents and returns YES as soon as ancestor is found,
+ * NO once the chain is exhausted.
+ */
+-(BOOL)hasAncestor:(Element*)ancestor{
+    Element* candidate = parent;
+    while (candidate){
+        if (candidate == ancestor)
+            return YES;
+        candidate = [candidate parent];
+    }
+    return NO;
+}
+
+/**
+ * Returns the next element in document order if it lies within scope, otherwise nil.
+ * The receiver is assumed to be scope itself or one of its descendants.
+ *
+ * Two shortcuts avoid walking the ancestor chain:
+ *  - the next element is a child of the receiver, so it is in scope whenever the receiver is;
+ *  - the receiver has a sibling and is not the scope itself, so whatever comes next shares
+ *    an ancestor that is within the scope.
+ * The sibling shortcut must not be taken when the receiver is the scope: its sibling
+ * lies outside of it.
+ */
+-(Element*)nextElementWithinScope:(Element*)scope{
+    if (nextElement.parent == self)
+        return nextElement;
+    if (nextSybling && self != scope)
+        return nextElement;
+    return ([nextElement hasAncestor: scope]) ? nextElement : nil;
+}
+
+/**
+ * The text inside the element, computed on first use and then kept.
+ * When markup was seen inside the element the contents source is passed through
+ * stripTags; otherwise the contents source is used as is.
+ */
+-(NSString*)contentsText{
+    if (contentsText)
+        return contentsText;
+    NSString* rawContents = [self contentsSource];
+    self.contentsText = containsMarkup ? [rawContents stripTags] : rawContents;
+    return contentsText;
+}
+
+/**
+ * The contentsLength characters of source that start right after the receiver's range.
+ */
+-(NSString*)contentsSource{
+    NSUInteger contentsStart = NSMaxRange(range);
+    return [source substringWithRange: NSMakeRange(contentsStart, contentsLength)];
+}
+
+/**
+ * Builds a CSSSelector from cssSelectorString and returns the result of
+ * elementsWithCSSSelector:. A nil string yields an empty array.
+ */
+-(NSArray*)selectElements:(NSString*)cssSelectorString{
+    if (cssSelectorString == nil)
+        return [NSArray array];
+    CSSSelector* parsedSelector = [[CSSSelector alloc] initWithString: cssSelectorString];
+    NSArray* matches = [self elementsWithCSSSelector: parsedSelector];
+    [parsedSelector release];
+    return matches;
+}
+
+/**
+ * Builds a CSSSelector from cssSelectorString and returns the result of
+ * elementWithCSSSelector:. A nil string yields nil.
+ */
+-(Element*)selectElement:(NSString*)cssSelectorString{
+    if (cssSelectorString == nil)
+        return nil;
+    CSSSelector* parsedSelector = [[CSSSelector alloc] initWithString: cssSelectorString];
+    Element* match = [self elementWithCSSSelector: parsedSelector];
+    [parsedSelector release];
+    return match;
+}
+
+/**
+ * Offers the receiver and the elements that follow it within its scope to a
+ * CSSSelectorMatcher, in document order, and returns the matcher's matches.
+ * The walk uses nextElementWithinScope: (as elementWithCSSSelector: does) so that it
+ * stops at the end of the receiver instead of running on to the end of the document.
+ */
+-(NSArray*)elementsWithCSSSelector:(CSSSelector*)selector{
+    CSSSelectorMatcher* matcher = [[CSSSelectorMatcher alloc] initWithSelector: selector];
+    for (Element* e = self; e; e = [e nextElementWithinScope: self])
+        [matcher matchElement: e];
+    // keep the matches alive past the release of the matcher that owns them
+    NSArray* result = [[[matcher matches] retain] autorelease];
+    [matcher release];
+    return result;
+}
+
+/**
+ * Offers the receiver and the elements that follow it within its scope to a
+ * CSSSelectorMatcher until matchElement: reports success, then returns the
+ * matcher's first match.
+ */
+-(Element*)elementWithCSSSelector:(CSSSelector*)selector{
+    CSSSelectorMatcher* matcher = [[CSSSelectorMatcher alloc] initWithSelector: selector];
+    BOOL found = NO;
+    for (Element* cursor = self; cursor && !found; cursor = [cursor nextElementWithinScope: self])
+        found = [matcher matchElement: cursor];
+    Element* firstMatch = [matcher firstMatch];
+    [matcher release];
+    return firstMatch;
+}
+
+/**
+ * Collects the first child and each of its following siblings, in that order.
+ */
+-(NSArray*)childElements{
+    NSMutableArray* children = [NSMutableArray array];
+    for (Element* child = [self firstChild]; child; child = [child nextSybling])
+        [children addObject: child];
+    return children;
+}
+
+/**
+ * Maps the key of each child to that child's contentsText. A later child with the
+ * same key replaces an earlier one.
+ */
+-(NSDictionary*)contentsOfChildren{
+    NSMutableDictionary* textByKey = [NSMutableDictionary dictionary];
+    for (Element* child = [self firstChild]; child; child = [child nextSybling])
+        [textByKey setObject: [child contentsText] forKey: [child key]];
+    return textByKey;
+}
+
+/**
+ * Returns YES if the receiver's description equals string.
+ * NOTE(review): the header describes this as a tag name comparison; confirm which
+ * behaviour is intended.
+ */
+-(BOOL)isEqualToString:(NSString*)string{
+    NSString* rendered = [self description];
+    return [rendered isEqualToString: string];
+}
+
+/**
+ * The tag name as is for case sensitive elements, lowercased otherwise.
+ * Computed on first use and then kept.
+ */
+-(NSString*)key{
+    if (key)
+        return key;
+    if ([self caseSensative])
+        self.key = [self tagName];
+    else
+        self.key = [[self tagName] lowercaseString];
+    return key;
+}
+
+/**
+ * Renders the open tag: the tag name, each attribute as name='value', and a closing
+ * " />" for empty tags or ">" otherwise. An element without source renders as an
+ * empty string.
+ */
+-(NSString*)description{
+    NSMutableString* markup = [NSMutableString string];
+    if (source == nil)
+        return markup; //root element has no source
+    [markup appendString: @"<"];
+    [markup appendString: [self tagName]];
+    NSArray* attributeNames = [self.attributes allKeys];
+    for (NSString* attributeName in attributeNames)
+        [markup appendFormat: @" %@='%@'", attributeName, [attributes objectForKey: attributeName]];
+    [markup appendString: ([self isEmptyTag] ? @" />" : @">")];
+    return markup;
+}
+
+/**
+ * Debugging aid: one line per element from the receiver onwards (following nextElement),
+ * each made of one indent unit for the element itself and for each of its ancestors,
+ * followed by the element's description.
+ */
+-(NSString*)dumpTree{
+    NSMutableString* dump = [NSMutableString string];
+    for (Element* current = self; current; current = [current nextElement]){
+        Element* level = current;
+        while (level){
+            [dump appendString: @" "];
+            level = [level parent];
+        }
+        [dump appendString: [current description]];
+        [dump appendString: @"\n"];
+    }
+    return dump;
+}
+
+@end
115 Classes/ElementParser.h
@@ -0,0 +1,115 @@
+//
+// ElementParser.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/20/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Element.h"
+#import "DocumentRoot.h"
+
+// The two kinds of document the parser can be asked to parse.
+typedef enum{
+    ElementParserModeHTML = 0,
+    ElementParserModeXML = 1
+} ElementParserMode;
+
+#define ElementParserErrorDomain 1022
+// Codes passed to warning:description:chunk:.
+typedef enum{
+    ElementParserGeneralError = -2,
+    ElementParserTagNotClosedError = -1
+}ElementParserErrors;
+
+@interface ElementParser : NSObject {
+ NSMutableArray* tagStack;
+ DocumentRoot* root;
+ Element* lastOpened; //assigned
+ Element* lastClosedBeforeOpen;
+ Chunk* lastChunk;
+
+ CFMutableArrayRef callbackMethods;
+ NSMutableArray* callbackMatchers;
+ id delegate;
+ ElementParserMode mode;
+}
+
+/**
+ * The delegate that is called when selectors match
+ */
+@property (nonatomic, assign) id delegate;
+
+/**
+ * HTML or XML
+ */
+@property ElementParserMode mode;
+
+
+/**
+ * The source being parsed.
+ */
+@property (readonly) NSString* source;
+
+
+/**
+ * Parse an HTML document and return a tree of Elements corresponding to the document.
+ * The DocumentRoot is a special Element that contains all the top-level Elements in the
+ * source.
+ */
+-(DocumentRoot*)parseHTML:(NSString*)source;
+
+
+/**
+ * Parse an XML document and return a tree of Elements corresponding to the document.
+ * The DocumentRoot is a special Element that contains all the top-level Elements in the
+ * source.
+ */
+-(DocumentRoot*)parseXML:(NSString*)source;
+
+/**
+ * When parsing a document incrementally, begin with a single call to beginParsing,
+ * followed by multiple calls to continueParsingString: as text arrives and finally a single
+ * call to finishParsing.
+ */
+-(DocumentRoot*)beginParsing;
+-(void)continueParsingString:(NSString*)string;
+-(void)finishParsing;
+
+/**
+ * Registers a callback to be performed whenever the supplied selector matches
+ */
+-(void)performSelector:(SEL)method forElementsMatching:(NSString*)cssSelector;
+
+/**
+ * Returns true for HTML elements like <img>
+ */
+ -(BOOL)shouldBeEmptyElement:(Element*)element;
+
+/**
+ * Internal callback when a warning condition occurs. May be overridden to surface an
+ * NSError
+ */
+-(void)warning:(int)code description:(NSString*)description chunk: (Chunk*)chunk;
+
+/**
+ * Internal callback when an info condition occurs. May be overridden for debugging purposes
+ */
+-(void)info:(NSString*)info atIndex:(int)sourceIndex;
+
+@end
265 Classes/ElementParser.m
@@ -0,0 +1,265 @@
+//
+// ElementParser.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/20/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "ElementParser.h"
+#import "NSString_HTML.h"
+#import "Chunk.h"
+#import "TagChunk.h"
+#import "CSSSelectorMatcher.h"
+
+// Keys of the HTML elements that shouldBeEmptyElement: reports as empty; filled in by +initialize.
+static NSSet* HTML_TAGS_THAT_SHOULD_BE_EMPTY;
+
+
+// Private state and helpers of the parser.
+@interface ElementParser()
+
+// the element most recently passed to openElement: (not retained)
+@property (nonatomic, assign) Element* lastOpened;
+// the element most recently closed since the last open; cleared by openElement: (not retained)
+@property (nonatomic, assign) Element* lastClosedBeforeOpen;
+// the root of the document being built
+@property (nonatomic, retain) DocumentRoot* root;
+// the chunk most recently received from the chunk callback
+@property (nonatomic, retain) Chunk* lastChunk;
+
+-(void)closeAllTags;
+-(void)prepareParseWithString:(NSString*)string;
+-(void)parseMoreWithPartial:(BOOL)partial;
+
+@end
+
+
+@implementation ElementParser
+
+@synthesize root, lastOpened, lastClosedBeforeOpen, lastChunk, delegate, mode;
+
+/**
+ * Builds the set of HTML tag names that should be treated as empty elements.
+ * +initialize is also sent on behalf of subclasses that do not implement it themselves,
+ * so the work is limited to ElementParser itself; otherwise the set would be rebuilt
+ * and the previous one leaked.
+ */
++(void)initialize{
+    if (self != [ElementParser class])
+        return;
+    HTML_TAGS_THAT_SHOULD_BE_EMPTY = [[NSSet alloc] initWithObjects: @"img", @"meta", @"br", @"hr", @"area", @"base", @"basefont", @"col", @"frame", @"input", @"isindex", @"link", @"param", nil];
+}
+
+/**
+ * Creates a parser with an empty tag stack, in HTML mode.
+ * The instance variables are only touched when super's initializer succeeded.
+ */
+-(id)init{
+    self = [super init];
+    if (self){
+        tagStack = [[NSMutableArray alloc] initWithCapacity: 24];
+        mode = ElementParserModeHTML;
+    }
+    return self;
+}
+
+/**
+ * Releases everything the parser owns. lastOpened, lastClosedBeforeOpen and delegate
+ * are assign references and are therefore not released.
+ */
+-(void)dealloc{
+    [tagStack release];
+    [root release];
+    [lastChunk release]; // retain property; previously leaked
+    if (callbackMethods){
+        CFRelease(callbackMethods);
+        [callbackMatchers release];
+    }
+    [super dealloc];
+}
+
+/**
+ * Switches to HTML mode, parses the whole of source in one pass, closes any tags
+ * left open and returns the root.
+ */
+-(DocumentRoot*)parseHTML:(NSString*)source{
+    [self setMode: ElementParserModeHTML];
+    [self prepareParseWithString: source];
+    [self parseMoreWithPartial: NO];
+    [self closeAllTags];
+    return root;
+}
+
+/**
+ * Switches to XML mode, parses the whole of source in one pass, closes any tags
+ * left open and returns the root.
+ */
+-(DocumentRoot*)parseXML:(NSString*)source{
+    [self setMode: ElementParserModeXML];
+    [self prepareParseWithString: source];
+    [self parseMoreWithPartial: NO];
+    [self closeAllTags];
+    return root;
+}
+
+
+/**
+ * Starts an incremental parse over an empty mutable string and returns the new root.
+ */
+-(DocumentRoot*)beginParsing{
+    [self prepareParseWithString: [NSMutableString string]];
+    return root;
+}
+
+/**
+ * Appends moreString to the source (treated as the mutable string installed by
+ * beginParsing) and parses on in partial mode.
+ */
+-(void)continueParsingString:(NSString*)moreString{
+    NSMutableString* accumulated = (NSMutableString*)[self source];
+    [accumulated appendString: moreString];
+    [self parseMoreWithPartial: YES];
+}
+
+/**
+ * Ends an incremental parse: parses what remains in non-partial mode, then closes
+ * any tags left open.
+ */
+-(void)finishParsing{
+    BOOL moreToCome = NO;
+    [self parseMoreWithPartial: moreToCome];
+    [self closeAllTags];
+}
+
+/**
+ * The source being parsed, which is the source of the root.
+ */
+-(NSString*)source{
+    return [root source];
+}
+
+/**
+ * Resets the parser for a new document over string: installs a fresh root as the only
+ * entry of the tag stack and as the last opened element.
+ * State left over from a previous parse is discarded so that a parser can be reused:
+ *  - the previous root is autoreleased (rather than leaked), so a caller still holding
+ *    it keeps a valid object for the remainder of the current autorelease pool;
+ *  - lastChunk is cleared, otherwise the next parse would resume at the end of the
+ *    previous document's last chunk;
+ *  - lastClosedBeforeOpen is cleared, otherwise the first element opened would be
+ *    linked as the sibling of an element of the previous document.
+ */
+-(void)prepareParseWithString:(NSString*)string{
+    [root autorelease];
+    root = [[DocumentRoot alloc] initWithString: string range: NSMakeRange(0,0)];
+    self.lastChunk = nil;
+    lastClosedBeforeOpen = nil;
+    lastOpened = root;
+    [tagStack removeAllObjects];
+    [tagStack addObject: root];
+}
+
+/**
+ * Parses the source from the end of the last chunk received (or from the start when
+ * there is none), after updating the root's contentsLength to the current length of
+ * the source. Chunks are delivered to buildElementTreeWithChunk:context:.
+ */
+-(void)parseMoreWithPartial:(BOOL)partial{
+    int index = 0;
+    if (lastChunk)
+        index = NSMaxRange(lastChunk.range);
+    NSString* text = [root source];
+    [root setContentsLength: [text length]];
+    [NSString parseHTML: text
+               delegate: self
+               selector: @selector(buildElementTreeWithChunk:context:)
+                context: self
+                  index: &index
+                partial: partial];
+}
+
+
+/**
+ * The element on top of the tag stack.
+ */
+-(Element*)parentElement{
+    NSUInteger depth = [tagStack count];
+    return [tagStack objectAtIndex: depth - 1];
+}
+
+/**
+ * Offers element to every registered matcher. For each matcher that reports a match,
+ * the corresponding callback is performed on the delegate with the element; a non-nil
+ * result becomes the element's domainObject.
+ */
+-(void)matchElement:(Element*)element{
+    for (int i = 0; i < [callbackMatchers count]; i++){
+        CSSSelectorMatcher* matcher = [callbackMatchers objectAtIndex: i];
+        if (![matcher matchElement: element])
+            continue;
+        SEL callback = (SEL)CFArrayGetValueAtIndex(callbackMethods, i);
+        NSObject* produced = [delegate performSelector: callback withObject: element];
+        if (produced)
+            [element setDomainObject: produced];
+    }
+}
+
+/**
+ * Closes the open element that tag closes, together with every element opened after it.
+ * nil is a valid value for tag: it closes only the element on top of the stack.
+ * The root (stack index 0) is never closed here; a tag that closes nothing on the stack
+ * is ignored.
+ */
+-(void)closeElementWithTag:(TagChunk*) tag{
+ int depthIndex;
+ for (depthIndex = [tagStack count] - 1; depthIndex > 0; depthIndex--){
+ // crawl up stack to find matching element
+ Element* stackElement = [tagStack objectAtIndex: depthIndex];
+ if (!tag || [tag closesTag: stackElement])
+ break;
+ }
+ if (depthIndex > 0){
+ Element* closedElement;
+ // close everything up to found element
+ while ([tagStack count] > depthIndex){//int ii=[tagStack count] - 1; ii >= depth; ii--
+ closedElement = [tagStack lastObject];
+ // contents run from the end of the element's own tag up to the closing tag
+ // (or up to the last chunk seen when there is no closing tag);
+ // an element closed by itself has no contents
+ closedElement.contentsLength =
+ (tag == nil) ? lastChunk.range.location - NSMaxRange(closedElement.range) :
+ (tag == closedElement) ? 0 :
+ tag.range.location - NSMaxRange(closedElement.range);
+ if(!tag && closedElement.contentsLength == 0)
+ [self warning: ElementParserGeneralError description:@"Contents may not be right" chunk: closedElement];
+// NSLog(@"Close %@", [closedElement description]);
+ // remembered so that the next element opened becomes its sibling
+ self.lastClosedBeforeOpen = closedElement;
+ [tagStack removeObjectsInRange: NSMakeRange([tagStack count] - 1, 1)];
+ // callbacks are matched once the element is closed
+ if (delegate && callbackMatchers)
+ [self matchElement: closedElement];
+ }
+// self.lastClosedBeforeOpen = closedElement;
+// [tagStack removeObjectsInRange: NSMakeRange(i, [tagStack count] - i)];
+ }
+ else{
+ // orphan close tag - ignore
+ }
+}
+
+/**
+ * Links element into the tree and pushes it on the tag stack: its parent is the current
+ * top of the stack, it becomes the nextElement of the element opened last and the
+ * sibling of the element closed since then (if any).
+ */
+-(void)openElement:(Element*) element{
+    [element setParent: [self parentElement]];
+    [lastOpened setNextElement: element];
+    [[self lastClosedBeforeOpen] setNextSybling: element];
+    [tagStack addObject: element];
+    [self setLastOpened: element];
+    [self setLastClosedBeforeOpen: nil];
+}
+
+/**
+ * Closes whatever is still open, from the top of the stack down, issuing a
+ * "tag not closed" warning for every element above the root.
+ */
+-(void)closeAllTags{
+    int level = [tagStack count];
+    while (--level >= 0){
+        Element* unclosed = [tagStack objectAtIndex: level];
+        if (level > 0)
+            [self warning: ElementParserTagNotClosedError description:@"document left tag open" chunk: unclosed];
+        [self closeElementWithTag: nil];
+    }
+}
+
+/**
+ * Logs an informational message together with the source index it refers to.
+ */
+-(void)info:(NSString*)info atIndex:(int)sourceIndex{
+ NSLog(@"INFO [index: %i]: %@", sourceIndex, info);
+}
+
+/**
+ * Logs a warning with the location of the chunk it refers to, the description and the
+ * chunk's own description. code is not used by this implementation.
+ * The location is an NSUInteger; it is printed with %lu through an explicit cast
+ * because its width does not match %i on every platform.
+ */
+-(void)warning:(int)code description:(NSString*)description chunk: (Chunk*)chunk{
+    NSLog(@"WARN [index: %lu]: %@\n%@", (unsigned long)chunk.range.location, description, [chunk description]);
+    /* subclasses should do this work if they want to do something with the warnings
+    NSMutableDictionary* info = [NSMutableDictionary dictionaryWithCapacity: 2];
+    if (description)
+        [info addObject: description forKey: NSLocalizedDescriptionKey];
+    if (chunk)
+        [info addObject: chunk forKey: ElementParserErrorChunk];
+    NSError* error = [NSError errorWithDomain: ElementParserErrorDomain code: code userInfo: info];
+    */
+}
+
+/**
+ * In XML mode no element is forced to be empty. Otherwise an element should be empty
+ * when its key is in the set of HTML tags that should be empty.
+ */
+-(BOOL)shouldBeEmptyElement:(Element*)element{
+    if (mode == ElementParserModeXML)
+        return NO;
+    return [HTML_TAGS_THAT_SHOULD_BE_EMPTY containsObject: [element key]];
+}
+
+/**
+ * Callback invoked for each chunk found in the source.
+ * Every chunk is remembered as the last chunk; any chunk that is not text marks the
+ * last opened element as containing markup. Only tags go further: a close tag closes
+ * the element it matches, any other tag becomes a new Element that is opened and, when
+ * it is (or should be) an empty tag, closed again at once. A non-empty element that
+ * does not accept the current parent closes that parent first.
+ * Returns self to continue parsing.
+ */
+-(id)buildElementTreeWithChunk:(Chunk*)chunk context:(void*)builder{
+    self.lastChunk = chunk;
+    if (![chunk isKind: ChunkKindText])
+        lastOpened.containsMarkup = YES;
+    if (![chunk isKind: ChunkKindTag])
+        return self;
+
+    TagChunk* tag = (TagChunk*) chunk;
+    if ([tag isCloseTag]){
+        [self closeElementWithTag: tag];
+        return self;
+    }
+
+    Element* element = [[Element alloc] initWithTag: tag caseSensative: mode == ElementParserModeXML];
+    BOOL closesAtOnce = [element isEmptyTag] || [self shouldBeEmptyElement: element];
+    if (!closesAtOnce && ![element acceptsParent: [self parentElement]])
+        [self closeElementWithTag: [self parentElement]];
+    [self openElement: element];
+    if (closesAtOnce)
+        [self closeElementWithTag: element];
+    [element release];
+    return self;//to continue parsing
+}
+
+/**
+ * Registers method as the callback for elements matching cssSelector. The callback
+ * and its matcher are stored at the same index of two parallel arrays, which are
+ * created on first use.
+ */
+-(void)performSelector:(SEL)method forElementsMatching:(NSString*)cssSelector{
+    if (callbackMethods == NULL){
+        callbackMethods = CFArrayCreateMutable(NULL, 0, NULL);
+        callbackMatchers = [[NSMutableArray alloc] initWithCapacity: 10];
+    }
+    CFArrayAppendValue(callbackMethods, method);
+
+    CSSSelector* parsedSelector = [[CSSSelector alloc] initWithString: cssSelector];
+    CSSSelectorMatcher* selectorMatcher = [[CSSSelectorMatcher alloc] initWithSelector: parsedSelector];
+    [parsedSelector release];
+    [callbackMatchers addObject: selectorMatcher];
+    [selectorMatcher release];
+}
+
+/**
+ * The descriptions of all elements after the root, concatenated in document order.
+ */
+-(NSString*)description{
+    NSMutableString* rendered = [NSMutableString string];
+    for (Element* current = [root nextElement]; current; current = [current nextElement])
+        [rendered appendString: [current description]];
+    return rendered;
+}
+
+@end
37 Classes/EntityChunk.h
@@ -0,0 +1,37 @@
+//
+// EntityChunk.h
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "Chunk.h"
+
+
+/**
+ * EntityChunk is the Chunk subclass that corresponds to an entity
+ * in the source text (e.g. &amp;). It declares no instance variables
+ * or methods beyond those of Chunk.
+ */
+
+@interface EntityChunk : Chunk
+
+@end
42 Classes/EntityChunk.m
@@ -0,0 +1,42 @@
+//
+// EntityChunk.m
+// Thumbprint
+//
+// Created by Lee Buck on 4/21/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "EntityChunk.h"
+
+
+@implementation EntityChunk
+
+/**
+ * The chunk's range with its first and last characters dropped
+ * (presumably the leading '&' and the trailing ';' -- confirm against the parser).
+ */
+-(NSRange)interiorRange{
+    NSRange interior;
+    interior.location = range.location + 1;
+    interior.length = range.length - 2;
+    return interior;
+}
+
+/** Identifies this chunk with the ChunkKindEntity kind. */
+-(NSString*)kind{
+    return ChunkKindEntity;
+}
+
+/** Name of this kind of chunk for use in messages: "entity". */
++(NSString*)humanName{
+    return @"entity";
+}
+
+@end
172 Classes/NSString_HTML.h
@@ -0,0 +1,172 @@
+//
+// NSString_HTML.h
+// Thumbprint
+//
+// Created by Lee Buck on 3/27/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import <Foundation/Foundation.h>
+#import "CSSSelector.h"
+@class Element;
+
+/**
+ * Scans the string buffer forward from index until character a or b is
+ * encountered (or the end of the buffer is reached).
+ * Returns the number of characters scanned before that point; the
+ * terminating a or b itself is not included in the count.
+ */
+CFIndex lenThruOr(CFStringInlineBuffer* buffer, CFIndex index, const char a, const char b);
+
+
+/**
+ * Advances *index through the string buffer until a whitespace character is encountered.
+ * Any character <= 32 (space and everything below it) is treated as whitespace.
+ * Returns the whitespace character found, leaving *index pointing at it,
+ * or 0 if the end of the buffer is encountered first.
+ */
+unichar skipNonWhitespace(CFStringInlineBuffer* buffer, CFIndex* index);
+
+
+/**
+ * Advances *index through the string buffer until a non-whitespace character is encountered.
+ * Any character <= 32 (space and everything below it) is treated as whitespace.
+ * Returns the non-whitespace character found, leaving *index pointing at it,
+ * or 0 if the end of the buffer is encountered first.
+ */
+unichar skipWhitespace(CFStringInlineBuffer* buffer, CFIndex* index);
+
+
+/**
+ * Scans the string buffer from index until a non-token character is encountered.
+ * Returns the length of the token. Used for attributes, class names, identifiers and tag names.
+ * Token characters are the ASCII letters A-Z and a-z, the digits 0-9, '-', '_' and ':'.
+ * Does not accommodate non-Latin characters.
+ * Accepts '-', '_', ':' even when in the first character position.
+ * Also permits '/' to begin the token (simplifies parsing close tags).
+ */
+CFIndex lenToken(CFStringInlineBuffer* buffer, CFIndex index);
+
+
+/**
+ * Returns true if the characters in the buffer at index begin with the supplied string
+ */
+CFIndex startsWithStr(CFStringInlineBuffer* buffer, CFIndex index, const char* prefix);
+
+/**
+ * Parses an entity name and returns its length. Returns 0 if end of buffer or
+ * an invalid entity is encountered.
+ */
+CFIndex lenEntityName(CFStringInlineBuffer* buffer, CFIndex index);
+
+
+/**
+ * Spins through buffer until the supplied suffix is encountered. Returns
+ * 0 if end of buffer is encountered before the suffix.
+ */
+CFIndex lenThru(CFStringInlineBuffer* buffer, CFIndex index, const char* suffix);
+
+
+/**
+ * Returns the character corresponding to the entity at the supplied index in the buffer
+ */
+unichar parseEntity(CFStringInlineBuffer* buffer, CFIndex index, CFIndex* len);
+
+
+/**
+ * Parses the doctype at the supplied index in the buffer and returns its length.
+ * Returns 0 if the end of the buffer is encountered first
+ */
+CFIndex lenDoctype(CFStringInlineBuffer* buffer, CFIndex index);
+
+
+/**
+ * Convenience method that creates a string from a range in the buffer
+ */
+NSString* createStringFromBuffer(CFStringInlineBuffer* buffer, CFIndex index, CFIndex length);
+
+@interface NSString (HTML)
+
+/**
+ * Converts the string assuming it is a hex number
+ */
+-(int)hexValue;
+
+
+/**
+ * Returns a string in which
+ * a) all the tags have been removed
+ * b) entities are resolved
+ * c) cdata sections are processed
+ * d) whitespace is compressed
+ * e) html markup like <br> and <p> is used to provide minimal formatting
+ */
+-(NSString*)stripTags;
+
+
+/**
+ * Convenience method to url encode a string
+ */
+-(NSString*)stringByAddingPercentEscaping;
+
+
+/**
+ * Convenience method to url decode a string
+ */
+-(NSString*)stringByRemovingPercentEscaping;
+
+
+/**
+ * Resolves entities in the string
+ */
+-(NSString*)stringByReplacingEntities;
+
+
+/**
+ * Convenience method that replaces entities for a range
+ */
+-(NSString*)stringByReplacingEntitiesInRange:(NSRange)range;
+
+
+/**
+ * Convenience method to create an element
+ */
+-(Element*)element;
+
+
+/**
+ * Parses an element returning its attributes.
+ */
+-(NSDictionary*)parseElementAttributesWithRange:(NSRange) range caseSensative:(BOOL)caseSensative;
+
+
+/**
+ * Very simpleminded parsing out of character encoding based on an http header contentType
+ */
++ (NSStringEncoding) encodingForContentType:(NSString *)contentType;
+
+
+/**
+ * The base parser that spins through a string and calls a delegate for each chunk encountered.
+ * Chunks include: tags, entities, comments, cdata, characters and others.
+ * ElementParser uses this low level parser to build an Element tree.
+ */
++(void)parseHTML:(NSString*) source delegate:(id)delegate selector:(SEL)selector context: (void*) context;
+
+/**
+ * The base parser that spins through a string and calls a delegate for each chunk encountered.
+ * This version of the method permits partial parsing, i.e. the parser will stop if
+ * it encounters a chunk that extends beyond the end of the string. It can be called
+ * repeatedly as more text arrives and is appended to the string.
+ */
++(void)parseHTML:(NSString*)source delegate:(id)delegate selector:(SEL)selector context: (void*) context index:(int*)sourceIndex partial:(BOOL)partial;
+
+@end
624 Classes/NSString_HTML.m
@@ -0,0 +1,624 @@
+//
+// NSString_HTML.m
+// Thumbprint
+//
+// Created by Lee Buck on 3/27/09.
+// Copyright 2009 Blue Bright Ventures. All rights reserved.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// Commercial licences without many of the obligations of GPL
+// are available for a nomial fee at sales@touchtankapps.com.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#import "NSString_HTML.h"
+#import "Element.h"
+#import "CSSSelectorMatcher.h"
+#import "ElementParser.h"
+#import "TagChunk.h"
+#import "CommentChunk.h"
+#import "EntityChunk.h"
+#import "ProcessingInstructionChunk.h"
+#import "CDataChunk.h"
+#import "DoctypeChunk.h"
+#import "TxtChunk.h"
+
+#define MAX_OUT_BUFFER_LENGTH 20000
+#define MAX_READ_BUFFER_LENGTH 60000
+static const NSDictionary* ENTITIES_MAP;
+
+
+
+/**
+ * Counts the characters from index up to, but not including, the first
+ * occurrence of a or b. A zero character (which is what reading past the
+ * end of the buffer yields) also ends the scan.
+ */
+CFIndex lenThruOr(CFStringInlineBuffer* buffer, CFIndex index, const char a, const char b){
+    CFIndex cursor;
+    for (cursor = index; ; cursor++){
+        unichar ch = CFStringGetCharacterFromInlineBuffer(buffer, cursor);
+        if (ch == 0 || ch == a || ch == b)
+            break;
+    }
+    return cursor - index;
+}
+
+
+/**
+ * Moves *index forward past every character greater than 32.
+ * Returns the character at which it stopped: a character <= 32 (treated
+ * as whitespace), or 0 when the end of the buffer was reached.
+ */
+unichar skipNonWhitespace(CFStringInlineBuffer* buffer, CFIndex* index){
+    unichar current = CFStringGetCharacterFromInlineBuffer(buffer, *index);
+    // 0 marks the end of the buffer and is itself <= 32, so one test ends both cases
+    while (current > 32){
+        *index += 1;
+        current = CFStringGetCharacterFromInlineBuffer(buffer, *index);
+    }
+    return current;
+}
+
+
+/**
+ * Moves *index forward past every character that is <= 32 (treated as
+ * whitespace). Returns the first character greater than 32, with *index
+ * left pointing at it, or 0 when the end of the buffer was reached.
+ */
+unichar skipWhitespace(CFStringInlineBuffer* buffer, CFIndex* index){
+    unichar current;
+    while ((current = CFStringGetCharacterFromInlineBuffer(buffer, *index)) != 0){
+        if (current > 32)
+            break;
+        *index += 1;
+    }
+    return current;
+}
+
+
+/**
+ * Returns the length of the token that starts at index.
+ * A token is a run of ASCII letters, digits, '-', '_' and ':'. A '/' is
+ * additionally accepted in the first position only, so that the name of a
+ * close tag can be read as a single token.
+ * Returns 0 when the character at index cannot start a token or index is
+ * at or beyond the end of the buffer.
+ */
+CFIndex lenToken(CFStringInlineBuffer* buffer, CFIndex index){
+    // Indexes given to CFStringGetCharacterFromInlineBuffer are relative to the
+    // start of the buffer's range, so the bound is the range length alone
+    // (adding rangeToBuffer.location would mix in absolute string coordinates).
+    CFIndex maxIndex = buffer->rangeToBuffer.length;
+    CFIndex i;
+    for (i = index; i < maxIndex; i++){
+        unichar c = CFStringGetCharacterFromInlineBuffer(buffer, i);
+        BOOL valid = ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')) || ((c >= '0') && (c <= '9')) || (c=='-') || (c=='_') || (c == ':');
+        // a leading '/' is tolerated so that close tags parse as one token
+        if (!valid && (i == index) && (c == '/'))
+            valid = YES;
+        if (!valid)
+            break;
+    }
+    return i - index;
+}
+
+
+CFIndex startsWithStr(CFStringInlineBuffer* buffer, CFIndex index, const char* prefix){
+ CFIndex startIndex = index;
+ while (*prefix){
+ unichar c = CFStringGetCharacterFromInlineBuffer(buffer, index);