From ce0e89332c6a6f0c2571023a5ecbd8c0c61574d3 Mon Sep 17 00:00:00 2001 From: Bruno Skvorc Date: Sat, 7 Nov 2015 19:38:30 +0000 Subject: [PATCH] Fixes #19 and optimizes tests a bit --- src/Entity/Article.php | 58 +++++++-- src/Traits/StandardEntity.php | 6 + tests/Entity/AbstractTest.php | 20 +--- tests/Entity/ArticleTest.php | 112 ++++++++++++++---- tests/Entity/CrawlJobTest.php | 19 +-- tests/Entity/DiscussionTest.php | 22 +--- tests/Entity/ImageTest.php | 23 +--- tests/Entity/PostTest.php | 23 +--- tests/Entity/ProductTest.php | 23 +--- tests/Entity/WildCardTest.php | 25 +--- .../15-11-07/diffbot-sitepoint-basic.json | 2 + tests/ResponseProvider.php | 69 +++++++++-- 12 files changed, 214 insertions(+), 188 deletions(-) create mode 100644 tests/Mocks/Articles/15-11-07/diffbot-sitepoint-basic.json diff --git a/src/Entity/Article.php b/src/Entity/Article.php index 2c62e0e..6a2ad4d 100644 --- a/src/Entity/Article.php +++ b/src/Entity/Article.php @@ -66,7 +66,7 @@ public function getDate() */ public function getAuthor() { - return (isset($this->data['author'])) ? $this->data['author'] : null; + return $this->getOrDefault('author'); } /** @@ -75,7 +75,7 @@ public function getAuthor() */ public function getAuthorUrl() { - return (isset($this->data['authorUrl'])) ? $this->data['authorUrl'] : null; + return $this->getOrDefault('authorUrl'); } /** @@ -118,7 +118,7 @@ public function getTags() */ public function getNumPages() { - return (isset($this->data['numPages'])) ? $this->data['numPages'] : 1; + return $this->getOrDefault('numPages', 1); } /** @@ -129,7 +129,7 @@ public function getNumPages() */ public function getNextPages() { - return (isset($this->data['nextPages'])) ? $this->data['nextPages'] : []; + return $this->getOrDefault('nextPages', []); } /** @@ -139,7 +139,7 @@ public function getNextPages() */ public function getSentiment() { - return (isset($this->data['sentiment'])) ? 
$this->data['sentiment'] : null; + return $this->getOrDefault('sentiment'); } /** @@ -172,7 +172,7 @@ public function getSentiment() */ public function getImages() { - return (isset($this->data['images'])) ? $this->data['images'] : []; + return $this->getOrDefault('images', []); } /** @@ -199,7 +199,7 @@ public function getImages() */ public function getVideos() { - return (isset($this->data['videos'])) ? $this->data['videos'] : []; + return $this->getOrDefault('videos', []); } /** @@ -210,4 +210,48 @@ public function getDiscussion() { return $this->discussion; } + + /** + * The plain-text name of the site (e.g. The New York Times or Diffbot). + * + * If no site name is automatically determined, the root domain (diffbot.com) will be returned. + * + * @return string | null + */ + public function getSiteName() + { + return $this->getOrDefault('siteName'); + } + + /** + * If known, the country of the article publication. + * + * @return string | null + */ + public function getPublisherCountry() + { + return $this->getOrDefault('publisherCountry', null); + } + + /** + * If known, the region of the article publication. + * + * @return string | null + */ + public function getPublisherRegion() + { + return $this->getOrDefault('publisherRegion', null); + } + + /** + * If an article's date is ambiguous, Diffbot will attempt to estimate a + * more specific timestamp using various factors. This will not be + * generated for articles older than two days, or articles without an identified date. 
+ * + * @return string | null + */ + public function getEstimatedDate() + { + return $this->getOrDefault('estimatedDate', $this->getDate()); + } } \ No newline at end of file diff --git a/src/Traits/StandardEntity.php b/src/Traits/StandardEntity.php index d3f24c4..10ca851 100644 --- a/src/Traits/StandardEntity.php +++ b/src/Traits/StandardEntity.php @@ -113,4 +113,10 @@ public function getDiffbotUri() return $this->data['diffbotUri']; } + protected function getOrDefault($key, $default = null, $data = null) + { + $data = ($data !== null) ? $data : $this->data; + return (isset($data[$key]) ? $data[$key] : $default); + } + } \ No newline at end of file diff --git a/tests/Entity/AbstractTest.php b/tests/Entity/AbstractTest.php index c68db35..f125be9 100644 --- a/tests/Entity/AbstractTest.php +++ b/tests/Entity/AbstractTest.php @@ -9,28 +9,10 @@ class AbstractTest extends ResponseProvider { - /** @var array */ - protected $responses = []; - - protected $files = [ + protected static $staticFiles = [ 'Products/dogbrush.json' ]; - protected function ei($file) - { - $ef = new Entity(); - return $ef->createAppropriateIterator($this->prepareResponses()[$file]); - } - - public function returnFiles() - { - $files = []; - foreach ($this->files as $file) { - $files[] = [$file]; - } - return $files; - } - public function queryStringProvider() { return [ diff --git a/tests/Entity/ArticleTest.php b/tests/Entity/ArticleTest.php index 6f5a54b..cb30eba 100644 --- a/tests/Entity/ArticleTest.php +++ b/tests/Entity/ArticleTest.php @@ -3,42 +3,23 @@ namespace Swader\Diffbot\Test\Entity; use Swader\Diffbot\Entity\Article; -use Swader\Diffbot\Factory\Entity; use Swader\Diffbot\Test\ResponseProvider; class ArticleTest extends ResponseProvider { - /** @var array */ - protected $responses = []; - - protected $files = [ + protected static $staticFiles = [ 'Articles/diffbot-sitepoint-basic.json', // http%3A%2F%2Fwww.sitepoint.com%2Fdiffbot-crawling-visual-machine-learning
'Articles/diffbot-sitepoint-extended.json', 'Articles/apple-watch-verge-basic.json', // http%3A%2F%2Fwww.theverge.com%2Fa%2Fapple-watch-review - 'Articles/apple-watch-verge-extended.json' + 'Articles/apple-watch-verge-extended.json', + 'Articles/15-11-07/diffbot-sitepoint-basic.json', ]; - protected function ei($file) - { - $ef = new Entity(); - - return $ef->createAppropriateIterator($this->prepareResponses()[$file]); - } - - public function returnFiles() - { - $files = []; - foreach ($this->files as $file) { - $files[] = [$file]; - } - - return $files; - } - /** * @dataProvider returnFiles + * @param $file */ public function testType($file) { @@ -303,4 +284,89 @@ public function testDiscussion($file, $articles) } } } + + public function siteNameProvider() + { + return [ + ['Articles/15-11-07/diffbot-sitepoint-basic.json', 'SitePoint'], + ]; + } + + /** + * @dataProvider siteNameProvider + * @param $file + * @param $value1 + */ + public function testSiteName($file, $value1) + { + $value1 = (is_array($value1)) ? $value1 : [$value1]; + /** @var Article $entity */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($value1[$i], $entity->getSiteName()); + } + } + + public function publisherCountryProvider() + { + return [ + ['Articles/15-11-07/diffbot-sitepoint-basic.json', 'Australia'], + ]; + } + + /** + * @dataProvider publisherCountryProvider + * @param $file + * @param $value1 + */ + public function testPublisherCountry($file, $value1) + { + $value1 = (is_array($value1)) ? 
$value1 : [$value1]; + /** @var Article $entity */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($value1[$i], $entity->getPublisherCountry()); + } + } + + public function publisherRegionProvider() + { + return [ + ['Articles/15-11-07/diffbot-sitepoint-basic.json', 'Australia and New Zealand'], + ]; + } + + /** + * @dataProvider publisherRegionProvider + * @param $file + * @param $value1 + */ + public function testPublisherRegion($file, $value1) + { + $value1 = (is_array($value1)) ? $value1 : [$value1]; + /** @var Article $entity */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($value1[$i], $entity->getPublisherRegion()); + } + } + + public function estimatedDateProvider() + { + return [ + ['Articles/15-11-07/diffbot-sitepoint-basic.json', 'Sun, 27 Jul 2014 00:00:00 GMT'], + ]; + } + + /** + * @dataProvider estimatedDateProvider + * @param $file + * @param $value1 + */ + public function testEstimatedDate($file, $value1) + { + $value1 = (is_array($value1)) ? 
$value1 : [$value1]; + /** @var Article $entity */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($value1[$i], $entity->getEstimatedDate()); + } + } + } diff --git a/tests/Entity/CrawlJobTest.php b/tests/Entity/CrawlJobTest.php index 37262e3..37f2550 100644 --- a/tests/Entity/CrawlJobTest.php +++ b/tests/Entity/CrawlJobTest.php @@ -10,19 +10,16 @@ class CrawlJobTest extends ResponseProvider { - /** @var array */ - protected $responses = []; - - protected $files = [ + protected static $staticFiles = [ 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 'Crawlbot/15-05-20/multiplejobs01.json' ]; protected function ei($file) { - $this->prepareResponses(); + $responses = parent::prepareResponsesStatic(); /** @var ResponseInterface $response */ - $response = $this->responses[$file]; + $response = $responses[$file]; $jobs = []; foreach (json_decode($response->getBody(), true)['jobs'] as $data) { $jobs[] = new Job($data); @@ -31,16 +28,6 @@ protected function ei($file) return new EntityIterator($jobs, $response); } - public function returnFiles() - { - $files = []; - foreach ($this->files as $file) { - $files[] = [$file]; - } - - return $files; - } - /** * @dataProvider returnFiles */ diff --git a/tests/Entity/DiscussionTest.php b/tests/Entity/DiscussionTest.php index 67034f0..52c1c5e 100644 --- a/tests/Entity/DiscussionTest.php +++ b/tests/Entity/DiscussionTest.php @@ -8,31 +8,11 @@ class DiscussionTest extends ResponseProvider { - /** @var array */ - protected $responses = []; - - protected $files = [ + protected static $staticFiles = [ 'Discussions/15-05-01/sp_discourse_php7_recap.json', //http%3A%2F%2Fcommunity.sitepoint.com%2Ft%2Fphp7-resource-recap%2F174325%2F14 ]; - protected function ei($file) - { - $ef = new Entity(); - - return $ef->createAppropriateIterator($this->prepareResponses()[$file]); - } - - public function returnFiles() - { - $files = []; - foreach ($this->files as $file) { - $files[] = [$file]; - } - - return $files; - } - /** * 
@dataProvider returnFiles * @param $file diff --git a/tests/Entity/ImageTest.php b/tests/Entity/ImageTest.php index 2fbccdc..8a024ec 100644 --- a/tests/Entity/ImageTest.php +++ b/tests/Entity/ImageTest.php @@ -3,36 +3,15 @@ namespace Swader\Diffbot\Test\Entity; use Swader\Diffbot\Entity\Image; -use Swader\Diffbot\Factory\Entity; use Swader\Diffbot\Test\ResponseProvider; class ImageTest extends ResponseProvider { - /** @var array */ - protected $responses = []; - - protected $files = [ + protected static $staticFiles = [ 'Images/multi_images_smittenkitchen.json', 'Images/one_image_zola.json', ]; - protected function ei($file) - { - $ef = new Entity(); - - return $ef->createAppropriateIterator($this->prepareResponses()[$file]); - } - - public function returnFiles() - { - $files = []; - foreach ($this->files as $file) { - $files[] = [$file]; - } - - return $files; - } - /** * @dataProvider returnFiles */ diff --git a/tests/Entity/PostTest.php b/tests/Entity/PostTest.php index 776d52e..8bcadb0 100644 --- a/tests/Entity/PostTest.php +++ b/tests/Entity/PostTest.php @@ -3,37 +3,16 @@ namespace Swader\Diffbot\Test\Entity; use Swader\Diffbot\Entity\Discussion; -use Swader\Diffbot\Factory\Entity; use Swader\Diffbot\Test\ResponseProvider; use Swader\Diffbot\Entity\Post; class PostTest extends ResponseProvider { - /** @var array */ - protected $responses = []; - - protected $files = [ + protected static $staticFiles = [ 'Discussions/15-05-01/sp_discourse_php7_recap.json', //http%3A%2F%2Fcommunity.sitepoint.com%2Ft%2Fphp7-resource-recap%2F174325%2F14 ]; - protected function ei($file) - { - $ef = new Entity(); - - return $ef->createAppropriateIterator($this->prepareResponses()[$file]); - } - - public function returnFiles() - { - $files = []; - foreach ($this->files as $file) { - $files[] = [$file]; - } - - return $files; - } - /** * @dataProvider returnFiles */ diff --git a/tests/Entity/ProductTest.php b/tests/Entity/ProductTest.php index 8ba1846..0f36127 100644 --- 
a/tests/Entity/ProductTest.php +++ b/tests/Entity/ProductTest.php @@ -3,36 +3,15 @@ namespace Swader\Diffbot\Test\Entity; use Swader\Diffbot\Entity\Product; -use Swader\Diffbot\Factory\Entity; use Swader\Diffbot\Test\ResponseProvider; class ProductTest extends ResponseProvider { - /** @var array */ - protected $responses = []; - - protected $files = [ + protected static $staticFiles = [ 'Products/dogbrush.json', 'Products/15-05-03/shoes-sportsdirect.json' ]; - protected function ei($file) - { - $ef = new Entity(); - - return $ef->createAppropriateIterator($this->prepareResponses()[$file]); - } - - public function returnFiles() - { - $files = []; - foreach ($this->files as $file) { - $files[] = [$file]; - } - - return $files; - } - /** * @dataProvider returnFiles */ diff --git a/tests/Entity/WildCardTest.php b/tests/Entity/WildCardTest.php index e1be13b..1cd0c55 100644 --- a/tests/Entity/WildCardTest.php +++ b/tests/Entity/WildCardTest.php @@ -3,35 +3,14 @@ namespace Swader\Diffbot\Test\Entity; use Swader\Diffbot\Entity\Wildcard; -use Swader\Diffbot\Factory\Entity; use Swader\Diffbot\Test\ResponseProvider; class WildCardTest extends ResponseProvider { - /** @var array */ - protected $responses = []; - - protected $files = [ + protected static $staticFiles = [ 'Custom/AuthorFolioNew/15-05-03/bskvorc.json', ]; - protected function ei($file) - { - $ef = new Entity(); - - return $ef->createAppropriateIterator($this->prepareResponses()[$file]); - } - - public function returnFiles() - { - $files = []; - foreach ($this->files as $file) { - $files[] = [$file]; - } - - return $files; - } - public function customFieldProvider() { return [ @@ -57,7 +36,7 @@ public function testCustomFields($file, $posts) foreach ($this->ei($file) as $i => $entity) { $property = $posts[$i][0]; - $method = 'get'.ucfirst($property); + $method = 'get' . 
ucfirst($property); $value = $posts[$i][1]; if (!isset($posts[$i][2])) { diff --git a/tests/Mocks/Articles/15-11-07/diffbot-sitepoint-basic.json b/tests/Mocks/Articles/15-11-07/diffbot-sitepoint-basic.json new file mode 100644 index 0000000..9ec617b --- /dev/null +++ b/tests/Mocks/Articles/15-11-07/diffbot-sitepoint-basic.json @@ -0,0 +1,2 @@ + +{"request":{"options":["mentos"],"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","api":"article","version":3},"objects":[{"date":"Sun, 27 Jul 2014 00:00:00 GMT","images":[{"naturalHeight":727,"width":749,"diffbotUri":"image|3|-851701004","url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624455201.png","naturalWidth":1063,"primary":true,"height":512},{"naturalHeight":216,"width":523,"diffbotUri":"image|3|762494522","url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624487602.png","naturalWidth":523,"height":216},{"naturalHeight":184,"width":664,"diffbotUri":"image|3|302236938","url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624509003.png","naturalWidth":664,"height":184},{"naturalHeight":972,"width":749,"diffbotUri":"image|3|-1836356546","url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624552704.png","naturalWidth":966,"height":753},{"naturalHeight":184,"width":749,"diffbotUri":"image|3|1297360030","url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624650505.png","naturalWidth":918,"height":150},{"naturalHeight":188,"width":749,"diffbotUri":"image|3|502449852","url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624665606.png","naturalWidth":929,"height":151},{"naturalHeight":237,"width":749,"diffbotUri":"image|3|-2007985802","url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624803507.png","naturalWidth":1053,"height":168},{"naturalHeight":604,"width":749,"diffbotUri":"image|3|-140134863","url":"http://dab1nmslvvntp.cloudfront.net/wp-conte
nt/uploads/2014/07/140624809508.png","naturalWidth":906,"height":499},{"naturalHeight":727,"width":749,"diffbotUri":"image|3|1129235416","url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624834309.png","naturalWidth":1063,"height":512}],"author":"Bruno Skvorc","estimatedDate":"Sun, 27 Jul 2014 00:00:00 GMT","publisherRegion":"Australia and New Zealand","icon":"http://www.sitepoint.com/wp-content/themes/sitepoint/assets/images/apple-touch-icon-144x144-precomposed.png","diffbotUri":"article|3|-1226129584","siteName":"SitePoint","type":"article","title":"Diffbot: Crawling with Visual Machine Learning","publisherCountry":"Australia","humanLanguage":"en","authorUrl":"http://www.sitepoint.com/author/bskvorc/","metaTags":[{"name":"ai"},{"name":"Artificial Intelligence"},{"name":"crawling"},{"name":"Diffbot"},{"name":"framework"},{"name":"laravel"},{"name":"machine learning"},{"name":"OOPHP"},{"name":"PHP"},{"name":"scraping"},{"name":"visual learning"}],"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","html":"

Have you ever wondered how social networks do URL previews so well when you share links? How do they know which images to grab, whom to cite as an author, or which tags to attach to the preview? Is it all crawling with complex regexes over source code? Actually, more often than not, it isn’t. Meta information defined in the source can be unreliable, and sites with less than stellar reputation often use them as keyword carriers, attempting to get search engines to rank them higher. Isn’t what we, the humans, see in front of us what matters anyway?<\/p>\n

If you want to build a URL preview snippet or a news aggregator, there are many automatic crawlers available online, both proprietary and open source, but you seldom find something as niche as visual machine learning. This is exactly what Diffbot<\/a> is – a “visual learning robot” which renders a URL you request in full and then visually extracts data, helping itself with some metadata from the page source as needed.<\/p>\n

After covering some theory, in this post we’ll do a demo API call at one of SitePoint’s posts.<\/p>\n

PHP Library<\/h2>\n

The PHP library for Diffbot is somewhat out of date, and as such we won’t be using it in this demo. We’ll be performing raw API calls, and in some future posts we’ll build our own library for API interaction.<\/p>\n

If you’d like to take a look at the PHP library nonetheless, see here<\/a>, and if you’re interested in libraries for other languages, Diffbot has a directory<\/a>.<\/p>\n

Update, July 2015<\/strong>: A PHP library has been developed since this article was published. See its entire development process here<\/a>, or the source code here<\/a>.<\/em><\/p>\n

JavaScript Content<\/h2>\n

We said in the introductory section that Diffbot renders the request in full and then analyzes it. But, what about JavaScript content? Nowadays, websites often render some HTML above the fold, and then finish the CSS, JS, and dynamic content loading afterwards. Can the Diffbot API see that?<\/p>\n

As a matter of fact, yes. Diffbot literally renders the page in full, and then inspects it visually, as explained in my StackOverflow Q&A here<\/a>. There are some caveats, though, so make sure you read the answer carefully.<\/p>\n

Pricing and API Health<\/h2>\n

Diffbot has several usage tiers. There’s a free trial tier which kills your API token after 7 days or 10000 calls, whichever comes first. The commercial tokens can be purchased at various prices<\/a>, and never expire, but do have limitations. A special case by case approach is afforded to open source and/or educational projects which provides an older model of the free token – 10k calls per month, once per second max, but never expires. You need to contact them directly if you think you qualify.<\/p>\n

Diffbot guarantees a high uptime, but failures sometimes do happen – especially in the most resource intensive API of the bunch: Crawlbot. Crawlbot is used to crawl entire domains, not just individual pages, and as such has a lower reliability rate than other APIs. Not by a lot, but enough to be noticeable in the API Health<\/a> screen – the screen you can check to see if an API is up and running or currently unavailable if your calls run into issues or return error 500.<\/p>\n

Demo<\/h2>\n

To prepare your environment, please boot up a Homestead Improved<\/a> instance.<\/p>\n

Create Project<\/h3>\n

Create a starter Laravel project by SSHing into the VM with vagrant ssh<\/code>, going into the Code<\/code> folder, and executing composer create-project laravel/laravel Laravel --prefer-dist<\/code>. This will let you access the Laravel greeting page via http://homestead.app:8000<\/code> from the host’s browser.<\/p>\n

Add a Route and Action<\/h3>\n

In app/routes.php<\/code> add the following route:<\/p>\n

Route::get('/diffbot', 'HomeController@diffbotDemo');<\/code><\/pre>\n

In app/controllers/HomeController<\/code> add the following action:<\/p>\n

public function diffbotDemo() {\n        die(\"hi\");\n    }<\/code><\/pre>\n

If http://homestead.app:8000/diffbot<\/code> now outputs “hi” on the screen, we’re ready to start playing with the API.<\/p>\n

Get a Token<\/h3>\n

To interact with the Diffbot API, you need a token. Sign up for one on their pricing page<\/a>. For the sake of this demo, let’s call our token $TOKEN<\/code>, and we’ll refer to it as such in URLs. Replace $TOKEN<\/code> with your own value where appropriate.<\/p>\n

Install Guzzle<\/h3>\n

We’ll be using Guzzle as our HTTP client. It’s not required, but I do recommend you get familiar with it through a past article of ours<\/a>.<\/p>\n

Add the "guzzlehttp/guzzle": "4.1.*@dev"<\/code> to your composer.json<\/code> so the require block looks like this:<\/p>\n

\"require\": {\n\t\t\"laravel/framework\": \"4.2.*\",\n        \"guzzlehttp/guzzle\": \"4.1.*@dev\"\n\t},<\/code><\/pre>\n

In the project root, run composer update<\/code>.<\/p>\n

Fetch Article Data<\/h3>\n

In the first example, we’ll crawl a SitePoint post with the default Article API from Diffbot. To do this, we refer to the docs<\/a> which do an excellent job at explaining the workflow. Change the body of the diffbotDemo<\/code> action to the following code:<\/p>\n

public function diffbotDemo() {\n\n        $token = \"$TOKEN\";\n        $version = 'v3';\n\n        $client = new GuzzleHttp\\Client(['base_url' => 'http://api.diffbot.com/']);\n\n        $response = $client->get($version.'/article', ['query' => [\n            'token' => $token,\n            'url' => 'http://www.sitepoint.com/7-mistakes-commonly-made-php-developers/'\n        ]]);\n\n        die(var_dump($response->json()));\n    }<\/code><\/pre>\n

First, we set our token. Then, we define a variable that’ll hold the API version. Next, it’s up to us to create a new Guzzle client, and we also give it a base URL so we don’t have to type it in every time we make another request.<\/p>\n

Next up, we create a response object by sending a GET request to the API’s URL, and we add in an array of query parameters in key => value format. In this case, we only pass in the token and the URL, the most basic of parameters.<\/p>\n

Finally, since the Diffbot API returns JSON data, we use Guzzle’s json()<\/code> method to automatically decode it into an array. We then pretty-print this data:<\/p>\n

\"\"<\/img><\/figure>\n

As you can see, we got some information back rather quickly. There’s the icon that was used, a preview of the text, the title, even the language, date and HTML have been returned. You’ll notice there’s no author, however. Let’s change this and request some more values.<\/p>\n

If we add the “fields” parameter to the query params list and give it a value of “tags”, Diffbot will attempt to extract tags/categories from the URL provided. Add this line to the query<\/code> array:<\/p>\n

'fields' => 'tags'<\/code><\/pre>\n

and then change the die<\/code> part to this:<\/p>\n

$data = $response->json();\ndie(var_dump($data['objects'][0]['tags']));<\/code><\/pre>\n

Refreshing the screen now gives us this:<\/p>\n

\"\"<\/img><\/figure>\n

But, the source code of the article notes several other tags:<\/p>\n

\"\"<\/img><\/figure>\n

Why is the result so very different? It’s precisely due to the reason we mentioned at the end of the very first paragraph of this post: what we humans see takes precedence. Diffbot is a visual learning robot, and as such its AI deducts the tags from the actual rendered content – what it can see – rather than from looking at the source code which is far too easily spiced up for SEO purposes.<\/p>\n

Is there a way to get the tags from the source code, though, if one really needs them? Furthermore, can we make Diffbot recognize the author on SitePoint articles? Yes. With the Custom API.<\/p>\n

Meta Tags and Author with Custom API<\/h3>\n

The Custom API is a feature which allows you to not only tweak existing Diffbot API to your liking by adding new fields and rules for content extraction, but also allows you to create completely new APIs (accessed via a dedicated URL, too) for custom content processing.<\/p>\n

Go to the dev dashboard<\/a> and log in with your token. Then, go into “Custom API”. Activate the “Create a Rule” tab at the bottom, and input the URL of the article we’re crawling into the URL box, then click Test. Your screen should look something like this:<\/p>\n

\"\"<\/img><\/figure>\n

You’ll immediately notice the Author field is empty. You can tweak the author-searching rule by clicking Edit next to it, and finding the Author element in the live preview window that opens, then click on it to get the desired result. However, due to some, well, less than perfect CSS on SitePoint’s end, it’s very difficult to provide Diffbot’s API with a consistent path to the author name, especially by clicking on elements. Instead, add the following rule manually: .contributor--large .contributor_name a<\/code> and click Save.<\/p>\n

You’ll notice the Preview window now correctly populates the Author field:<\/p>\n

\"\"<\/img><\/figure>\n

In fact, this new rule is automatically applied to all SitePoint links for your token. If you try to preview another SitePoint article, like this one<\/a>, you’ll notice Peter Nijssen is successfully extracted:<\/p>\n

\"\"<\/img><\/figure>\n

Ok, let’s modify the API further. We need the article:tag<\/code> values that are visible in source code. Doing this requires a two-step process.<\/p>\n

A collection is exactly what it sounds like – a collection of values grabbed via a specific ruleset. We’ll call our collection “MetaTags”, and give it the following selector: meta[property=article:tag]<\/code>. This means “find all meta elements in the HTML that have the property<\/code> attribute with the value article:tag<\/code>“.<\/p>\n

Collection fields are individual entries in a collection – in our case, the various tags. Click on “Add a custom field to this collection”, and add the following values:<\/p>\n

\"\"<\/img><\/figure>\n

Click Save. You’ll immediately have access to the list of Tags in the result window:<\/p>\n

\"\"<\/img><\/figure>\n

Change the final output of the diffbotDemo()<\/code> action to this:<\/p>\n

die(var_dump($data['objects'][0]['metaTags']));<\/code><\/pre>\n

If you now refresh the URL we tested with (http://homestead.app:8000/diffbot<\/code>), you’ll notice the author and meta tags values are there. Here’s the output the above line of code produces:<\/p>\n

\"\"<\/img><\/figure>\n

We have our tags!<\/p>\n

Conclusion<\/h2>\n

Diffbot is a powerful data extractor for the web – whether you need to consolidate many sites into a single search index without combining their back-ends, want to build a news aggregator, have an idea for a URL preview web component, or want to regularly harvest the contents of competitors’ public pricing lists, Diffbot can help. With dead simple API calls and highly structured responses, you’ll be up and running in next to no time. In a later article, we’ll build a brand new API for using Diffbot with PHP, and redo the calls above with it. We’ll also host the library on Packagist, so you can easily install it with Composer. Stay tuned!<\/p>","text":"Have you ever wondered how social networks do URL previews so well when you share links? How do they know which images to grab, whom to cite as an author, or which tags to attach to the preview? Is it all crawling with complex regexes over source code? Actually, more often than not, it isn\u2019t. Meta information defined in the source can be unreliable, and sites with less than stellar reputation often use them as keyword carriers, attempting to get search engines to rank them higher. Isn\u2019t what we, the humans, see in front of us what matters anyway?\nIf you want to build a URL preview snippet or a news aggregator, there are many automatic crawlers available online, both proprietary and open source, but you seldom find something as niche as visual machine learning. This is exactly what Diffbot is \u2013 a \u201cvisual learning robot\u201d which renders a URL you request in full and then visually extracts data, helping itself with some metadata from the page source as needed.\nAfter covering some theory, in this post we\u2019ll do a demo API call at one of SitePoint\u2019s posts.\nPHP Library\nThe PHP library for Diffbot is somewhat out of date, and as such we won\u2019t be using it in this demo. 
We\u2019ll be performing raw API calls, and in some future posts we\u2019ll build our own library for API interaction.\nIf you\u2019d like to take a look at the PHP library nonetheless, see here, and if you\u2019re interested in libraries for other languages, Diffbot has a directory.\nUpdate, July 2015: A PHP library has been developed since this article was published. See its entire development process here, or the source code here.\nJavaScript Content\nWe said in the introductory section that Diffbot renders the request in full and then analyzes it. But, what about JavaScript content? Nowadays, websites often render some HTML above the fold, and then finish the CSS, JS, and dynamic content loading afterwards. Can the Diffbot API see that?\nAs a matter of fact, yes. Diffbot literally renders the page in full, and then inspects it visually, as explained in my StackOverflow Q&A here. There are some caveats, though, so make sure you read the answer carefully.\nPricing and API Health\nDiffbot has several usage tiers. There\u2019s a free trial tier which kills your API token after 7 days or 10000 calls, whichever comes first. The commercial tokens can be purchased at various prices, and never expire, but do have limitations. A special case by case approach is afforded to open source and/or educational projects which provides an older model of the free token \u2013 10k calls per month, once per second max, but never expires. You need to contact them directly if you think you qualify.\nDiffbot guarantees a high uptime, but failures sometimes do happen \u2013 especially in the most resource intensive API of the bunch: Crawlbot. Crawlbot is used to crawl entire domains, not just individual pages, and as such has a lower reliability rate than other APIs. 
Not by a lot, but enough to be noticeable in the API Health screen \u2013 the screen you can check to see if an API is up and running or currently unavailable if your calls run into issues or return error 500.\nDemo\nTo prepare your environment, please boot up a Homestead Improved instance.\nCreate Project\nCreate a starter Laravel project by SSHing into the VM with vagrant ssh, going into the Code folder, and executing composer create-project laravel/laravel Laravel --prefer-dist. This will let you access the Laravel greeting page via http://homestead.app:8000 from the host\u2019s browser.\nAdd a Route and Action\nIn app/routes.php add the following route:\nRoute::get('/diffbot', 'HomeController@diffbotDemo');\nIn app/controllers/HomeController add the following action:\npublic function diffbotDemo() {\n die(\"hi\");\n }\nIf http://homestead.app:8000/diffbot now outputs \u201chi\u201d on the screen, we\u2019re ready to start playing with the API.\nGet a Token\nTo interact with the Diffbot API, you need a token. Sign up for one on their pricing page. For the sake of this demo, let\u2019s call our token $TOKEN, and we\u2019ll refer to it as such in URLs. Replace $TOKEN with your own value where appropriate.\nInstall Guzzle\nWe\u2019ll be using Guzzle as our HTTP client. It\u2019s not required, but I do recommend you get familiar with it through a past article of ours.\nAdd the \"guzzlehttp/guzzle\": \"4.1.*@dev\" to your composer.json so the require block looks like this:\n\"require\": {\n\t\t\"laravel/framework\": \"4.2.*\",\n \"guzzlehttp/guzzle\": \"4.1.*@dev\"\n\t},\nIn the project root, run composer update.\nFetch Article Data\nIn the first example, we\u2019ll crawl a SitePoint post with the default Article API from Diffbot. To do this, we refer to the docs which do an excellent job at explaining the workflow. 
Change the body of the diffbotDemo action to the following code:\npublic function diffbotDemo() {\n\n $token = \"$TOKEN\";\n $version = 'v3';\n\n $client = new GuzzleHttp\\Client(['base_url' => 'http://api.diffbot.com/']);\n\n $response = $client->get($version.'/article', ['query' => [\n 'token' => $token,\n 'url' => 'http://www.sitepoint.com/7-mistakes-commonly-made-php-developers/'\n ]]);\n\n die(var_dump($response->json()));\n }\nFirst, we set our token. Then, we define a variable that\u2019ll hold the API version. Next, it\u2019s up to us to create a new Guzzle client, and we also give it a base URL so we don\u2019t have to type it in every time we make another request.\nNext up, we create a response object by sending a GET request to the API\u2019s URL, and we add in an array of query parameters in key => value format. In this case, we only pass in the token and the URL, the most basic of parameters.\nFinally, since the Diffbot API returns JSON data, we use Guzzle\u2019s json() method to automatically decode it into an array. We then pretty-print this data:\nAs you can see, we got some information back rather quickly. There\u2019s the icon that was used, a preview of the text, the title, even the language, date and HTML have been returned. You\u2019ll notice there\u2019s no author, however. Let\u2019s change this and request some more values.\nIf we add the \u201cfields\u201d parameter to the query params list and give it a value of \u201ctags\u201d, Diffbot will attempt to extract tags/categories from the URL provided. Add this line to the query array:\n'fields' => 'tags'\nand then change the die part to this:\n$data = $response->json();\ndie(var_dump($data['objects'][0]['tags']));\nRefreshing the screen now gives us this:\nBut, the source code of the article notes several other tags:\nWhy is the result so very different? 
It\u2019s precisely due to the reason we mentioned at the end of the very first paragraph of this post: what we humans see takes precedence. Diffbot is a visual learning robot, and as such its AI deducts the tags from the actual rendered content \u2013 what it can see \u2013 rather than from looking at the source code which is far too easily spiced up for SEO purposes.\nIs there a way to get the tags from the source code, though, if one really needs them? Furthermore, can we make Diffbot recognize the author on SitePoint articles? Yes. With the Custom API.\nMeta Tags and Author with Custom API\nThe Custom API is a feature which allows you to not only tweak existing Diffbot API to your liking by adding new fields and rules for content extraction, but also allows you to create completely new APIs (accessed via a dedicated URL, too) for custom content processing.\nGo to the dev dashboard and log in with your token. Then, go into \u201cCustom API\u201d. Activate the \u201cCreate a Rule\u201d tab at the bottom, and input the URL of the article we\u2019re crawling into the URL box, then click Test. Your screen should look something like this:\nYou\u2019ll immediately notice the Author field is empty. You can tweak the author-searching rule by clicking Edit next to it, and finding the Author element in the live preview window that opens, then click on it to get the desired result. However, due to some, well, less than perfect CSS on SitePoint\u2019s end, it\u2019s very difficult to provide Diffbot\u2019s API with a consistent path to the author name, especially by clicking on elements. Instead, add the following rule manually: .contributor--large .contributor_name a and click Save.\nYou\u2019ll notice the Preview window now correctly populates the Author field:\nIn fact, this new rule is automatically applied to all SitePoint links for your token. 
If you try to preview another SitePoint article, like this one, you\u2019ll notice Peter Nijssen is successfully extracted:\nOk, let\u2019s modify the API further. We need the article:tag values that are visible in source code. Doing this requires a two-step process.\nA collection is exactly what it sounds like \u2013 a collection of values grabbed via a specific ruleset. We\u2019ll call our collection \u201cMetaTags\u201d, and give it the following selector: meta[property=article:tag]. This means \u201cfind all meta elements in the HTML that have the property attribute with the value article:tag\u201c.\nCollection fields are individual entries in a collection \u2013 in our case, the various tags. Click on \u201cAdd a custom field to this collection\u201d, and add the following values:\nClick Save. You\u2019ll immediately have access to the list of Tags in the result window:\nChange the final output of the diffbotDemo() action to this:\ndie(var_dump($data['objects'][0]['metaTags']));\nIf you now refresh the URL we tested with (http://homestead.app:8000/diffbot), you\u2019ll notice the author and meta tags values are there. Here\u2019s the output the above line of code produces:\nWe have our tags!\nConclusion\nDiffbot is a powerful data extractor for the web \u2013 whether you need to consolidate many sites into a single search index without combining their back-ends, want to build a news aggregator, have an idea for a URL preview web component, or want to regularly harvest the contents of competitors\u2019 public pricing lists, Diffbot can help. With dead simple API calls and highly structured responses, you\u2019ll be up and running in next to no time. In a later article, we\u2019ll build a brand new API for using Diffbot with PHP, and redo the calls above with it. We\u2019ll also host the library on Packagist, so you can easily install it with Composer. 
Stay tuned!"}]} \ No newline at end of file diff --git a/tests/ResponseProvider.php b/tests/ResponseProvider.php index 28421e5..1589f1c 100644 --- a/tests/ResponseProvider.php +++ b/tests/ResponseProvider.php @@ -4,6 +4,7 @@ use GuzzleHttp\Client; use GuzzleHttp\Psr7\Response; +use Swader\Diffbot\Factory\Entity; /** * @property $files array @@ -15,28 +16,70 @@ class ResponseProvider extends \PHPUnit_Framework_TestCase { protected $folder = '/Mocks/'; + protected static $staticResponses = []; + protected static $staticFiles = []; + protected function prepareResponses() { if (empty($this->responses)) { - $mockInput = []; foreach ($this->files as $file) { - //$mockInput[] = file_get_contents(__DIR__ . '/Mocks/' . $file); - $this->responses[$file] = new Response(200, [], - file_get_contents(__DIR__ . '/Mocks/' . $file)); + + $path = __DIR__ . '/Mocks/' . $file; + if (!is_readable($path)) { + throw new \InvalidArgumentException("Test will error because mock file '$path' ($file) not readable!"); + } + $contents = file_get_contents($path); + + $this->responses[$file] = new Response(200, [], $contents); } unset($file); -// -// $mock = new Mock($mockInput); -// $client = new Client(); -// $client->getEmitter()->attach($mock); -// -// foreach ($this->files as $file) { -// $this->responses[$file] = $client->get('sampleurl.com'); -// } -// unset($file); } return $this->responses; } + protected static function prepareResponsesStatic() + { + if (empty(self::$staticResponses)) { + foreach (static::$staticFiles as $file) { + $path = __DIR__ . '/Mocks/' . 
$file; + if (!is_readable($path)) { + throw new \InvalidArgumentException("Test will error because mock file '$path' ($file) not readable!"); + } + $contents = file_get_contents($path); + self::$staticResponses[$file] = new Response(200, [], $contents); + } + } + + return self::$staticResponses; + } + + public static function setUpBeforeClass() + { + self::prepareResponsesStatic(); + } + + public static function tearDownAfterClass() + { + self::$staticResponses = []; + } + + protected function ei($file) + { + $ef = new Entity(); + return $ef->createAppropriateIterator(self::prepareResponsesStatic()[$file]); + } + + public function returnFiles() + { + $files = []; + foreach (static::$staticFiles as $file) { + $files[] = [$file]; + } + + return $files; + } + + + } \ No newline at end of file