# 2016/04/07

- remove useless anchors, e.g. pagination and contact links - remove related links if necessary - improve link density algorithm - options recognition - cheerio object can be passed in - more test cases
Tjatse · Apr 7, 2016 · 9e92ed0 · 9e92ed0
1 parent e565310
commit 9e92ed0
Show file tree

Hide file tree

Showing 6 changed files with 371 additions and 121 deletions.
diff --git a/index.js b/index.js
@@ -1,6 +1,7 @@
 'use strict'
 
 var req = require('req-fast')
+var cheerio = require('cheerio')
 var util = require('util')
 var debug = require('debug')('read-art.main')
 var Article = require('./lib/article')
@@ -15,18 +16,40 @@ module.exports = read
  * 1: error
  * 2: article
  */
-function read (uri, options, callback) {
+function read () {
+  return handle.apply(null, arguments) // eslint-disable-line no-useless-call
+}
+
+/**
+ * Custom settings.
+ * @type {readArt.use}
+ */
+read.use = Article.use
+
+function handle (uri, options, callback) {
+  if (arguments.length === 0) {
+    return new Error('Incorrect arguments.')
+  }
   // organize parameters
   if ((typeof options === 'function') && !callback) {
     callback = options
   }
   if (options && typeof options === 'object') {
     options.uri = uri
-  } else if (typeof uri === 'string') {
+  } else if (typeof uri === 'string' || isCheerio(uri)) {
     options = { uri: uri }
   } else {
     options = uri
-    uri = options.uri || options.html
+  }
+
+  if (typeof options !== 'object') {
+    return new Error('options are required!')
+  }
+
+  uri = options.cheerio || options.html || options.uri
+
+  if ((typeof uri !== 'string' && !isCheerio(uri)) || !uri) {
+    return new Error('only accept cheerio, url or HTML as article content.')
   }
 
   options = util._extend({
@@ -52,12 +75,37 @@ function read (uri, options, callback) {
   if (isNaN(options.minParagraphs)) {
     options.minParagraphs = 3
   }
-
+  if (typeof callback !== 'function') {
+    callback = false
+  }
+  // indicating whether uri is a cheerio object or not.
+  if (typeof uri !== 'string') {
+    if (isCheerio(uri)) {
+      if (!isHtml(options.html)) {
+        delete options.html
+      }
+      if (isHtml(options.uri)) {
+        if (!options.html) {
+          options.html = options.uri
+        }
+        delete options.uri
+      }
+      options.cheerio = uri
+      return parse({
+        cheerio: uri,
+        options: options,
+        callback: callback
+      })
+    }
+    var err = new Error('only accept cheerio, url or HTML as article content.')
+    return callback ? callback(err) : err
+  }
   // indicating uri is html or url.
-  var isHTML = uri.match(/^\s*</)
-  if (isHTML && options.uri && !options.html) {
-    options.html = options.uri
-    delete options.uri
+  if (isHtml(uri) && options.uri && !options.html) {
+    options.html = uri
+    if (isHtml(options.uri)) {
+      delete options.uri
+    }
   }
 
   var parsingData = {
@@ -75,12 +123,12 @@ function read (uri, options, callback) {
         if (debug.enabled) {
           debug('     ∟ Error: ' + (err ? err.message : 'no response'))
         }
-        return callback(err || new Error('Response is empty.'))
+        return callback && callback(err || new Error('Response is empty.'))
       }
       if (!resp.body) {
         var errMsg = 'No body was found, status code: ' + resp.statusCode
         debug('     ∟ Warning: ' + errMsg)
-        return callback(new Error(errMsg))
+        return callback && callback(new Error(errMsg))
       }
       debug('     ∟ succeed')
 
@@ -93,30 +141,55 @@ function read (uri, options, callback) {
   }
 }
 
-/**
- * Custom settings.
- * @type {readArt.use}
- */
-read.use = Article.use
-
 /**
  * Parse html to cheerio dom.
  * @param o options
  * @param e extra data
  * @return {String}
  */
 function parse (o, e) {
-  debug('   analyzing HTML')
-  if (!o.html) {
-    debug('   ∟ HTML content could not be found, simply returned')
-    return ''
+  debug('   parsing...')
+  if (!o.html && !o.cheerio) {
+    var errMsg = 'Article content could not be found.'
+    debug('   ∟ ' + errMsg)
+    return o.callback && o.callback(new Error(errMsg))
   }
-  if (o.options.killBreaks) {
+  if (o.html && o.options.killBreaks) {
     // replace <br />(blanks goes here) to <br />.
     o.html = o.html.replace(/<br[^\/>]*\/?>/ig, '<br />')
     // remove tab symbols like \r\t\n
     o.html = o.html.replace(/[\n\r\t]{2,}/gi, ' ')
   }
 
-  o.callback(null, new Article(o), o.options, e)
+  o.callback && o.callback(null, new Article(o), o.options, e)
+}
+
+// from cheerio module.
+var quickExpr = /^(?:[^#<]*(<[\w\W]+>)[^>]*$|#([\w\-]*)$)/
+/**
+ * Check if string is HTML
+ * @param  {String}  str
+ * @return {Boolean}
+ */
+function isHtml (str) {
+  if (typeof str !== 'string') {
+    return false
+  }
+  // Faster than running regex, if str starts with `<` and ends with `>`, assume it's HTML
+  if (str.charAt(0) === '<' && str.charAt(str.length - 1) === '>' && str.length >= 3) {
+    return true
+  }
+
+  // Run the regex
+  var match = quickExpr.exec(str)
+  return !!(match && match.length > 1)
+}
+
+/**
+ * Check if object is an instance of Cheerio
+ * @param  {Object}  o
+ * @return {Boolean}
+ */
+function isCheerio (o) {
+  return o && typeof o.root === 'function' && o.root() instanceof cheerio
 }
diff --git a/lib/article.js b/lib/article.js
@@ -15,15 +15,19 @@ module.exports = Article
  * @constructor
  */
 function Article (o) {
-  var co = {
-    decodeEntities: false
+  var $
+  if (o.html) {
+    var co = {
+      decodeEntities: false
+    }
+    var cheerioOptions = ['normalizeWhitespace', 'xmlMode', 'lowerCaseTags']
+    cheerioOptions.forEach(function (n) {
+      co[n] = !!o.options[n]
+    })
+    $ = cheerio.load(o.html, co)
+  } else {
+    $ = o.cheerio
   }
-  var cheerioOptions = ['normalizeWhitespace', 'xmlMode', 'lowerCaseTags']
-  cheerioOptions.forEach(function (n) {
-    co[n] = !!o.options[n]
-  })
-
-  var $ = cheerio.load(o.html, co)
   this.$ = $
 
   this.caches = {}

diff --git a/lib/reader.js b/lib/reader.js
@@ -21,6 +21,7 @@ var extRegexps = {
   unlikely: /agegate|auth?or|bookmark|cat|com(?:bx|ment|munity)|date|disqus|extra|foot|header|ignore|link|menu|nav|pag(?:er|ination)|popup|related|remark|rss|share|shoutbox|sidebar|similar|social|sponsor|teaserlist|time|tweet|twitter|\bad[\s_-]?\b/i,
   maybe: /and|article|body|column|main|column/i,
   div2p: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul|span|font|label)/i,
+  uselessAnchors: /(\d+|next|prev|first|last|print|comment|mail|font|about|contact|(下|下|前|后)一|(首|尾)页)|打印|评论|邮件|信箱|转发|关于|联系|^(大|中|小)$/i,
   images: /\.(gif|jpe?g|png)$/i
 }
 var tagsToSkip = ''
@@ -503,11 +504,13 @@ function grabArticle ($, topCandidate, options) {
       }
     }
   }
+  var shouldRemoveRelatedLinks = false
   if (shouldUseParent(topCandidate, options) && (parent = getNBParent(topCandidate))) {
     // 1. topCandidate has not enough [P] children
     // 2. parent exist and not [BODY]
     debugRd(' ∟ top candidate has not enough <p /> children, take the parent node (not <body />) instead of it')
     siblings = parent.children()
+    shouldRemoveRelatedLinks = true
   } else {
     // self children.
     siblings = topCandidate.children()
@@ -539,6 +542,19 @@ function grabArticle ($, topCandidate, options) {
         append = true
       }
     }
+
+    var extraDebugInfo
+    var ignoreMedias = false
+    if (append && shouldRemoveRelatedLinks) {
+      var linkDensity = getLinkDensity($, node, true) // eslint-disable-line no-redeclare
+      if (linkDensity >= 0.5) {
+        append = false
+        ignoreMedias = true
+        if (debugDo.enabled) {
+          extraDebugInfo = 'Probably related links which scores ' + linkDensity.toFixed(2)
+        }
+      }
+    }
     if (append) {
       // remove comments.
       node.contents().filter(function (index, ele) {
@@ -549,16 +565,36 @@ function grabArticle ($, topCandidate, options) {
     }
     if (debugDo.enabled) {
       debugDo('    ∟ <' + tagName + ' /> ' + na.classAndId)
-      debugDo('      ∟ ' + (append ? 'append' : 'remove'))
+      debugDo('      ∟ ' + (append ? 'append' : 'remove') + (extraDebugInfo ? ' (' + extraDebugInfo + ')' : ''))
     }
-    // append medias.
-    if (!append) {
+    // append medias if necessary
+    if (!append && !ignoreMedias) {
       var medias = node.find(tagsOfMedia)
       if (medias.length > 0) {
         article.append(medias)
       }
     }
   })
+
+  if (!options.keepAllLinks) {
+    article.find('a+a+a').each(function () {
+      var node = $(this)
+      var parent = node.parent()
+      if (!parent || parent.length === 0 || parent.is(article)) {
+        return
+      }
+      var reg = regexps.uselessAnchors
+      var prev
+      if (reg.test(node.text() || '') || reg.test((prev = node.prev()).text() || '') || reg.test(prev.prev().text() || '')) {
+        parent.remove()
+        if (debugDo.enabled) {
+          var na = getNodeAttr(parent)
+          debugDo('    ∟ <' + na.tagName + ' /> ' + na.classAndId)
+          debugDo('      ∟ removed \'Cause contains useless anchors')
+        }
+      }
+    })
+  }
   // fix links
   readArt.fixLink($, options.uri, article, options)
   return article
@@ -782,9 +818,10 @@ function getClassWeight (node) {
  * Get the density of links as a percentage of the content.
  * @param $ dom
  * @param node the node element.
+ * @param strictMode when true, an anchor with no text or title is measured by its single image's alt/title, or else by its href length.
  * @return {Number}
  */
-function getLinkDensity ($, node) {
+function getLinkDensity ($, node, strictMode) {
   var textLen = node.text().length
   if (textLen === 0) {
     return 0
@@ -796,7 +833,18 @@ function getLinkDensity ($, node) {
     if (!href || href[0] === '#') {
       return
     }
-    linkLen += anchor.text().length
+    var len = (anchor.text() || anchor.attr('title') || '').length
+    if (strictMode && len === 0) {
+      var children
+      if ((children = anchor.children()).length === 1 && children.get(0).tagName === 'img') {
+        len = (children.attr('alt') || children.attr('title') || '').length
+      }
+      if (len === 0) {
+        len = href.length
+      }
+      textLen += len
+    }
+    linkLen += len
   })
   return linkLen / textLen
 }

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "read-art",
-  "version": "0.4.8",
+  "version": "0.4.9-alpha",
   "description": "Scrape/Crawl article from any site automatically. Make any web page readable, no matter Chinese or English.",
   "main": "index.js",
   "scripts": {

diff --git a/test/anchors.js b/test/anchors.js
@@ -0,0 +1,24 @@
+var read = require('../')
+var chai = require('chai')
+var expect = chai.expect
+var should = chai.should()
+
+describe('grab content', function () {
+  describe('by requesting url', function () {
+    it('should without related links and useless anchors', function (done) {
+      read({
+        uri: 'http://www.cq.xinhuanet.com/2016-03/28/c_1118467794.htm',
+        output: 'text'
+      }, function (err, art) {
+        should.not.exist(err)
+        expect(art).to.be.an('object')
+        art.content.should.not.contains('打印')
+        art.content.should.not.contains('下一页')
+        art.content.should.not.contains('评论')
+        art.content.should.not.contains('信箱')
+        art.content.should.not.contains('推荐')
+        done()
+      })
+    })
+  })
+})