v0.4.9-alpha

Tjatse · Apr 7, 2016 · c77738e · c77738e
1 parent 9e92ed0
commit c77738e
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 31 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,3 +1,11 @@
+# 2016/04/07
+- remove useless anchors, e.g.: pagination, contact links...
+- remove related links if necessary
+- improve link density algorithm
+- options recognition
+- cheerio object can be passed in
+- more test cases
+
 # 2016/03/23
 - improve `selectors` option
 

diff --git a/README.md b/README.md
@@ -54,18 +54,19 @@ npm install read-art --production
 <a name="usage" />
 ## Usage
 ```javascript
-read(html/uri [, options], callback)
+read(<html|uri|cheerio> [, options], [callback])
 ```
 
 It supports the definitions such as:
-  * **html/uri** Html or Uri string.
+  * **html|uri|cheerio** Html, Uri string or Cheerio instance.
   * **options** An optional options object, including:
     - **output** The data type of article content, head over to [Output](#output) to get more information.
     - **killBreaks** A value indicating whether or not kill breaks, blanks, tab symbols(\r\t\n) into one `<br />`, `true` by default.
     - **betterTitle** Defined how to extract the title, [read more](#better_title).
     - **minTextLength** If the content is less than `[minTextLength]` characters, don't even count it, `25` by default.
     - **minParagraphs** A number indicating whether or not to take the top candidate as the article candidate, `3` by default, i.e.: if the `topCandidate` dom has more than `3` `<p>` children, `topCandidate` will be considered as the article dom; otherwise, it will be the parent of `topCandidate` (not `<body>`).
     - **tidyAttrs** Remove all the attributes on elements, `false` by default.
+    - **keepAllLinks** A value indicating whether or not to keep all the links, especially the useless anchors such as pagination, print, email and so on, `false` by default.
     - **forceDecode** A value indicating whether or not to decode the full text/html with [entities](https://github.com/fb55/entities), `false` by default.
     - **dom** Will return the whole cheerio dom (processed) when this property is set to `true`, `false` by default, try to use `art.dom` to get the dom object in callback function (uses the `$_` to get the original).
     - **damping** The damping to calculate score of parent node, `1/2` by default. e.g.: the score of current document node is `20`, the score of parent will be `20 * damping`.
@@ -90,30 +91,36 @@ It supports the definitions such as:
 var read = require('read-art');
 // read from google:
 read('http://google.com', function(err, art, options, resp){
-    if(err){
-      throw err;
-    }
-    var title = art.title,      // title of article
-        content = art.content,  // content of article
-        html = art.html;        // whole original innerHTML
+  if(err){
+    throw err;
+  }
+  var title = art.title,      // title of article
+      content = art.content,  // content of article
+      html = art.html;        // whole original innerHTML
 
-    console.log('[STATUS CODE]', resp && resp.statusCode);
+  console.log('[STATUS CODE]', resp && resp.statusCode);
 });
 // or:
 read({
-    uri: 'http://google.com',
-    charset: 'utf8'
-  }, function(err, art, options, resp){
+  uri: 'http://google.com',
+  charset: 'utf8'
+}, function(err, art, options, resp){
 
 });
 // what about html?
 read('<title>node-art</title><body><div><p>hello, read-art!</p></div></body>', function(err, art, options, resp){
 
 });
 // of course could be
+var $ = cheerio.load('<title>node-art</title><body><div><p>hello, read-art!</p></div></body>')
+read({
+  cheerio: $
+}, function(err, art, options, resp){
+
+});
 read({
-    uri: '<title>node-art</title><body><div><p>hello, read-art!</p></div></body>'
-  }, function(err, art, options, resp){
+  uri: '<title>node-art</title><body><div><p>hello, read-art!</p></div></body>'
+}, function(err, art, options, resp){
 
 });
 ```
@@ -352,6 +359,12 @@ The `[usage]` could be one of following:
   /com(?:bx|ment|-)|contact|comment|captcha|foot(?:er|note)?|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|util|shopping|tags|tool|widget|tip|dialog|copyright|bottom|dv101|dv102/i
   ```
 
+- `this.regexps.uselessAnchors([re], [override])`
+  If the `uselessAnchors` regexp matches the `text content` of adjacent sibling anchors (`a+a+a`), and `keepAllLinks` is set to `false`, the parent node of the anchors will be removed - normally it is a pagination or print/email node. `[re]` is a regexp, e.g. `/next\spage|previous\spage/i` will match elements like `<a href="2.htm">Next Page</a>` or `<a href="1.htm">Previous Page</a>`. If `[override]` is set to `true`, `uselessAnchors` will be `/next\spage|previous\spage/i`; otherwise it will be appended to the original, i.e.:
+  ```
+  /(\d+|next|prev|first|last|print|comment|mail|font|about|contact|(下|下|前|后)一|(首|尾)页)|打印|评论|邮件|信箱|转发|关于|联系|^(大|中|小)$|next\spage|previous\spage/i
+  ```
+
 - `this.regexps.unlikely([re], [override])`
   If the `unlikely` regexp matches the `id` + `className` of a node, it probably will not be taken as a candidate. `[re]` is a regexp, e.g. `/dv101|dv102/` will match elements like `<div class="dv101">...` or `<div id="dv102">...`. If `[override]` is set to `true`, `unlikely` will be `/dv101|dv102/i`; otherwise it will be appended to the original, i.e.:
   ```

diff --git a/examples/simple.js b/examples/simple.js
@@ -1,27 +1,14 @@
 var read = require('../')
 
-read('http://news.163.com/16/0224/23/BGKI6D0M00014PRF.html', {
+var uri = 'http://www.cq.xinhuanet.com/2016-03/28/c_1118467794.htm'
+// 'http://media.china.com.cn/gdxw/2016-03-16/665392.html'
+// 'http://www.cq.xinhuanet.com/2016-03/28/c_1118465265.htm'
+read(uri, {
   timeout: 15000,
   output: {
     type: 'text',
     stripSpaces: true,
     break: true
-  },
-  selectors: {
-    quote: {
-      selector: '#ne_article_source',
-      extract: {
-        link: 'href',
-        label: 'text'
-      }
-    }
-  },
-  minTextLength: 0,
-  scoreRule: function (node) {
-    if (node.hasClass('w740')) {
-      return 100
-    }
-    return 0
   }
 }, function (err, art, options, resp) {
   if (err) {