From bc498c7d7f572717606c3063f68beb0c8ae67280 Mon Sep 17 00:00:00 2001 From: Bruno Skvorc Date: Sun, 1 Nov 2015 10:08:33 +0000 Subject: [PATCH 1/2] Converted to PHP-HTTP and PSR-7. PHP-HTTP was used to abstract away the dependency on Guzzle 5, and Guzzle 6 was used for testing. The end user can use any supported PHP-HTTP adapter implementation, does not have to be Guzzle 6. --- composer.json | 11 +- src/Api/Crawl.php | 2 +- src/Api/Search.php | 2 +- src/Diffbot.php | 10 +- src/Entity/EntityIterator.php | 3 +- src/Factory/Entity.php | 7 +- src/Interfaces/EntityFactory.php | 2 +- tests/Abstracts/EntityIteratorTest.php | 39 +- tests/Api/AnalyzeApiTest.php | 26 +- tests/Api/ArticleApiTest.php | 26 +- tests/Api/CrawlCustomMocksTest.php | 42 +- tests/Api/CrawlTest.php | 21 +- tests/Api/CustomApiTest.php | 26 +- tests/Api/DiscussionApiTest.php | 25 +- tests/Api/ImageApiTest.php | 29 +- tests/Api/ProductApiTest.php | 43 +- tests/Api/SearchCustomMocksTest.php | 14 +- tests/Api/SearchTest.php | 23 +- tests/Api/setterUpper.php | 54 ++ tests/DiffbotTest.php | 32 +- tests/Entity/CrawlJobTest.php | 4 +- tests/Factory/EntityTest.php | 18 +- .../Articles/sitepoint_diffbot_basic-old.json | 10 + .../Articles/sitepoint_diffbot_basic.json | 9 - .../15-04-19/Images/500px_zola_basic-old.json | 9 + .../15-04-19/Images/500px_zola_basic.json | 8 - .../Products/hobbit_amazon_basic-old.json | 10 + .../Products/hobbit_amazon_basic.json | 9 - .../Articles/apple-watch-verge-basic-old.json | 10 + .../Articles/apple-watch-verge-basic.json | 9 - .../apple-watch-verge-extended-old.json | 10 + .../Articles/apple-watch-verge-extended.json | 9 - .../Articles/diffbot-sitepoint-basic-old.json | 10 + .../Articles/diffbot-sitepoint-basic.json | 9 - .../diffbot-sitepoint-extended-old.json | 10 + .../Articles/diffbot-sitepoint-extended.json | 9 - .../Mocks/Articles/hi_quicktip_basic-old.json | 10 + tests/Mocks/Articles/hi_quicktip_basic.json | 9 - .../15-05-18/sitepoint_01_deleted-old.json | 11 + 
.../15-05-18/sitepoint_01_deleted.json | 10 - .../15-05-18/sitepoint_01_maxCrawled.json | 92 +-- .../Crawlbot/15-05-20/deletedSuccess-old.json | 11 + .../Crawlbot/15-05-20/deletedSuccess.json | 10 - .../Crawlbot/15-05-20/invalid_name-old.json | 10 + .../Mocks/Crawlbot/15-05-20/invalid_name.json | 11 +- .../15-05-20/invalid_response-old.json | 11 + .../Crawlbot/15-05-20/invalid_response.json | 10 - .../Crawlbot/15-05-20/multiplejobs01.json | 170 ++--- .../15-05-20/sitepoint_01_paused.json | 98 ++- .../15-05-20/sitepoint_01_restart.json | 101 ++- .../15-05-20/sitepoint_01_roundstart.json | 98 ++- .../15-05-20/sitepoint_01_unpaused.json | 98 ++- .../AuthorFolioNew/15-05-03/bskvorc-old.json | 10 + .../AuthorFolioNew/15-05-03/bskvorc.json | 9 - .../15-05-01/sp_discourse_php7_recap-old.json | 10 + .../15-05-01/sp_discourse_php7_recap.json | 9 - .../multi_images_smittenkitchen-old.json | 10 + .../Images/multi_images_smittenkitchen.json | 9 - tests/Mocks/Images/one_image_zola-old.json | 9 + tests/Mocks/Images/one_image_zola.json | 8 - .../15-05-03/shoes-sportsdirect-old.json | 10 + .../Products/15-05-03/shoes-sportsdirect.json | 9 - tests/Mocks/Products/dogbrush-old.json | 10 + tests/Mocks/Products/dogbrush.json | 9 - tests/Mocks/Search/15-05-24/test.json | 684 ++++++++++++++++-- tests/ResponseProvider.php | 28 +- tests/jsonconvert.php | 122 ++++ 67 files changed, 1452 insertions(+), 844 deletions(-) create mode 100644 tests/Api/setterUpper.php create mode 100644 tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic-old.json create mode 100644 tests/Mocks/Analyze/15-04-19/Images/500px_zola_basic-old.json create mode 100644 tests/Mocks/Analyze/15-04-19/Products/hobbit_amazon_basic-old.json create mode 100644 tests/Mocks/Articles/apple-watch-verge-basic-old.json create mode 100644 tests/Mocks/Articles/apple-watch-verge-extended-old.json create mode 100644 tests/Mocks/Articles/diffbot-sitepoint-basic-old.json create mode 100644 
tests/Mocks/Articles/diffbot-sitepoint-extended-old.json create mode 100644 tests/Mocks/Articles/hi_quicktip_basic-old.json create mode 100644 tests/Mocks/Crawlbot/15-05-18/sitepoint_01_deleted-old.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/deletedSuccess-old.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/invalid_name-old.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/invalid_response-old.json create mode 100644 tests/Mocks/Custom/AuthorFolioNew/15-05-03/bskvorc-old.json create mode 100644 tests/Mocks/Discussions/15-05-01/sp_discourse_php7_recap-old.json create mode 100644 tests/Mocks/Images/multi_images_smittenkitchen-old.json create mode 100644 tests/Mocks/Images/one_image_zola-old.json create mode 100644 tests/Mocks/Products/15-05-03/shoes-sportsdirect-old.json create mode 100644 tests/Mocks/Products/dogbrush-old.json create mode 100644 tests/jsonconvert.php diff --git a/composer.json b/composer.json index 9e49f6a..752b314 100644 --- a/composer.json +++ b/composer.json @@ -16,11 +16,14 @@ ], "require": { "php" : ">=5.4.0", - "guzzlehttp/guzzle": "~5.0" + "php-http/client-implementation": "^1.0", + "php-http/utils": "^0.1.0@dev", + "php-http/discovery": "^0.2.0@dev" }, "require-dev": { "symfony/var-dumper": "~2", - "phpunit/phpunit": "^5.0" + "phpunit/phpunit": "^5.0", + "php-http/guzzle6-adapter": "~0.2@dev" }, "autoload": { "psr-4": { @@ -36,5 +39,7 @@ "branch-alias": { "dev-master": "0.5-dev" } - } + }, + "prefer-stable": true, + "minimum-stability": "dev" } diff --git a/src/Api/Crawl.php b/src/Api/Crawl.php index a6d47e1..92d9f1f 100644 --- a/src/Api/Crawl.php +++ b/src/Api/Crawl.php @@ -442,7 +442,7 @@ public function call() { $response = $this->diffbot->getHttpClient()->get($this->buildUrl()); - $array = $response->json(); + $array = json_decode($response->getBody(), true); if (isset($array['jobs'])) { $jobs = []; diff --git a/src/Api/Search.php b/src/Api/Search.php index fccdad7..08c9980 100644 --- a/src/Api/Search.php +++ 
b/src/Api/Search.php @@ -138,7 +138,7 @@ public function call($info = false) $ei = parent::call(); set_error_handler(function() { /* ignore errors */ }); - $arr = $ei->getResponse()->json(['big_int_strings' => true]); + $arr = json_decode((string)$ei->getResponse()->getBody(), true, 512, JSON_BIGINT_AS_STRING); restore_error_handler(); unset($arr['request']); diff --git a/src/Diffbot.php b/src/Diffbot.php index 79dfe11..3d001d8 100644 --- a/src/Diffbot.php +++ b/src/Diffbot.php @@ -2,6 +2,8 @@ namespace Swader\Diffbot; +use Http\Discovery\HttpClientDiscovery; +use Http\Discovery\MessageFactoryDiscovery; use Swader\Diffbot\Api\Crawl; use Swader\Diffbot\Api\Custom; use Swader\Diffbot\Api\Search; @@ -11,7 +13,7 @@ use Swader\Diffbot\Api\Analyze; use Swader\Diffbot\Api\Article; use Swader\Diffbot\Api\Discussion; -use GuzzleHttp\Client; +use Http\Client\Utils\HttpMethodsClient as Client; use Swader\Diffbot\Factory\Entity; use Swader\Diffbot\Interfaces\Api; use Swader\Diffbot\Interfaces\EntityFactory; @@ -90,12 +92,16 @@ public function getToken() * Sets the client to be used for querying the API endpoints * * @param Client $client + * @see http://php-http.readthedocs.org/en/latest/utils/#httpmethodsclient * @return $this */ public function setHttpClient(Client $client = null) { if ($client === null) { - $client = new Client(); + $client = new Client( + HttpClientDiscovery::find(), + MessageFactoryDiscovery::find() + ); } $this->client = $client; return $this; diff --git a/src/Entity/EntityIterator.php b/src/Entity/EntityIterator.php index 05d0de9..a15820f 100644 --- a/src/Entity/EntityIterator.php +++ b/src/Entity/EntityIterator.php @@ -2,7 +2,7 @@ namespace Swader\Diffbot\Entity; -use GuzzleHttp\Message\ResponseInterface as Response; +use Psr\Http\Message\ResponseInterface as Response; use Swader\Diffbot\Abstracts\Entity; class EntityIterator implements \Countable, \Iterator, \ArrayAccess @@ -134,6 +134,7 @@ public function offsetGet($offset) if ($this->offsetExists($offset)) { return 
$this->data[$offset]; } + throw new \OutOfBoundsException("Offset '$offset' not present"); } /** diff --git a/src/Factory/Entity.php b/src/Factory/Entity.php index 65b2252..e4ce237 100644 --- a/src/Factory/Entity.php +++ b/src/Factory/Entity.php @@ -2,7 +2,7 @@ namespace Swader\Diffbot\Factory; -use GuzzleHttp\Message\ResponseInterface as Response; +use Psr\Http\Message\ResponseInterface as Response; use Swader\Diffbot\Entity\EntityIterator; use Swader\Diffbot\Exceptions\DiffbotException; use Swader\Diffbot\Interfaces\EntityFactory; @@ -30,9 +30,8 @@ public function createAppropriateIterator(Response $response) { $this->checkResponseFormat($response); - set_error_handler(function() { /* ignore errors */ }); - $arr = $response->json(['big_int_strings' => true]); + $arr = json_decode((string)$response->getBody(), true, 512, JSON_BIGINT_AS_STRING); restore_error_handler(); $objects = []; @@ -58,7 +57,7 @@ public function createAppropriateIterator(Response $response) protected function checkResponseFormat(Response $response) { set_error_handler(function() { /* ignore errors */ }); - $arr = $response->json(['big_int_strings' => true]); + $arr = json_decode((string)$response->getBody(), true, 512, JSON_BIGINT_AS_STRING); restore_error_handler(); if (isset($arr['error'])) { diff --git a/src/Interfaces/EntityFactory.php b/src/Interfaces/EntityFactory.php index 7e1b8fb..21c4d85 100644 --- a/src/Interfaces/EntityFactory.php +++ b/src/Interfaces/EntityFactory.php @@ -2,7 +2,7 @@ namespace Swader\Diffbot\Interfaces; -use GuzzleHttp\Message\ResponseInterface as Response; +use Psr\Http\Message\ResponseInterface as Response; use Swader\Diffbot\Entity\EntityIterator; interface EntityFactory diff --git a/tests/Abstracts/EntityIteratorTest.php b/tests/Abstracts/EntityIteratorTest.php index 3545e90..a3e2e3a 100644 --- a/tests/Abstracts/EntityIteratorTest.php +++ b/tests/Abstracts/EntityIteratorTest.php @@ -2,13 +2,9 @@ namespace Swader\Diffbot\Test; -use GuzzleHttp\Client; -use GuzzleHttp\Message\Response; -use 
GuzzleHttp\Stream\Stream; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Abstracts\Entity; +use Swader\Diffbot\Factory\Entity; -class EntityIteratorTest extends \PHPUnit_Framework_TestCase +class EntityIteratorTest extends ResponseProvider { /** @var array */ @@ -19,28 +15,11 @@ class EntityIteratorTest extends \PHPUnit_Framework_TestCase 'Images/multi_images_smittenkitchen.json' ]; - protected function prepareResponses() - { - if (empty($this->responses)) { - $mockInput = []; - foreach ($this->files as $file) { - $mockInput[] = file_get_contents(__DIR__ . '/../Mocks/' . $file); - } - unset($file); - $mock = new Mock($mockInput); - $client = new Client(); - $client->getEmitter()->attach($mock); - foreach ($this->files as $file) { - $this->responses[$file] = $client->get('sampleurl.com'); - } - unset($file); - } - return $this->responses; - } + protected $folder = '/../Mocks/'; public function testBadMethodCall() { - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); $ei = $ef->createAppropriateIterator($this->prepareResponses()['Images/one_image_zola.json']); $this->setExpectedException('BadMethodCallException'); @@ -49,7 +28,7 @@ public function testBadMethodCall() public function testMagic() { - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); $ei = $ef->createAppropriateIterator($this->prepareResponses()['Images/one_image_zola.json']); $this->assertEquals('image', $ei->type); @@ -63,7 +42,7 @@ public function testCount() 'Images/multi_images_smittenkitchen.json' => 9 ]; - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); foreach ($fileExpectations as $fileName => $expectation) { $ei = $ef->createAppropriateIterator($this->prepareResponses()[$fileName]); @@ -73,17 +52,17 @@ public function testCount() public function testGetResponse() { - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); foreach ($this->files as $fileName) { $ei = 
$ef->createAppropriateIterator($this->prepareResponses()[$fileName]); - $this->assertInstanceOf('GuzzleHttp\Message\Response', $ei->getResponse()); + $this->assertInstanceOf('Psr\Http\Message\ResponseInterface', $ei->getResponse()); } } public function testIteration() { - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); foreach ($this->files as $fileName) { $ei = $ef->createAppropriateIterator($this->prepareResponses()[$fileName]); diff --git a/tests/Api/AnalyzeApiTest.php b/tests/Api/AnalyzeApiTest.php index abecf8e..78778bc 100644 --- a/tests/Api/AnalyzeApiTest.php +++ b/tests/Api/AnalyzeApiTest.php @@ -2,13 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; -use Swader\Diffbot\Entity\Article; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; class AnalyzeApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -19,27 +18,18 @@ class AnalyzeApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createAnalyzeAPI('https://article-mock.com'); } - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); - } - protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/../Mocks/Articles/hi_quicktip_basic.json')) + ]); } return $this->validMock; diff --git a/tests/Api/ArticleApiTest.php b/tests/Api/ArticleApiTest.php index 0d02351..6f042d0 100644 --- a/tests/Api/ArticleApiTest.php +++ b/tests/Api/ArticleApiTest.php @@ -2,13 +2,13 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Entity\Article; class ArticleApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -19,27 +19,17 @@ class ArticleApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); - + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createArticleAPI('https://article-mock.com'); } - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); - } - protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/../Mocks/Articles/hi_quicktip_basic.json')) + ]); } return $this->validMock; diff --git a/tests/Api/CrawlCustomMocksTest.php b/tests/Api/CrawlCustomMocksTest.php index e880c03..b517ab2 100644 --- a/tests/Api/CrawlCustomMocksTest.php +++ b/tests/Api/CrawlCustomMocksTest.php @@ -2,13 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; use Swader\Diffbot\Diffbot; use Swader\Diffbot\Entity\JobCrawl; class CrawlCustomMocksTest extends \PHPUnit_Framework_TestCase { + use setterUpper; /** @var Diffbot */ protected $diffbot; @@ -17,16 +16,13 @@ public function setUp() { $diffbot = new Diffbot('demo'); $diffbot->setEntityFactory(); - $fakeClient = new Client(); - $diffbot->setHttpClient($fakeClient); $this->diffbot = $diffbot; } public function testRoundStart() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -39,9 +35,8 @@ public function testRoundStart() public function testRestart() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -60,9 +55,8 @@ public function testRestart() public function testPauseOn() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json')] - )); + $filepath = __DIR__ . 
'/../Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -75,9 +69,8 @@ public function testPauseOn() public function testPauseOff() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -90,9 +83,8 @@ public function testPauseOff() public function testDelete() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/deletedSuccess.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/deletedSuccess.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -101,21 +93,19 @@ public function testDelete() public function test500() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_name.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_name.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath, 500)); $c = $this->diffbot->crawl('sitepoint_01'); - $this->setExpectedException('GuzzleHttp\Exception\ServerException'); + $this->setExpectedException('Http\Client\Exception\HttpException'); $c->call(); } public function testOtherError() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_response.json')] - )); + $filepath = __DIR__ . 
'/../Mocks/Crawlbot/15-05-20/invalid_response.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); diff --git a/tests/Api/CrawlTest.php b/tests/Api/CrawlTest.php index d0af2e1..da44460 100644 --- a/tests/Api/CrawlTest.php +++ b/tests/Api/CrawlTest.php @@ -2,12 +2,13 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Diffbot; class CrawlTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -16,22 +17,16 @@ class CrawlTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = new Diffbot('demo'); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); - - $this->diffbot = $diffbot; + $this->diffbot = $this->preSetUp(); } protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/../Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json')) + ]); } return $this->validMock; diff --git a/tests/Api/CustomApiTest.php b/tests/Api/CustomApiTest.php index 3c36e97..8814a36 100644 --- a/tests/Api/CustomApiTest.php +++ b/tests/Api/CustomApiTest.php @@ -2,12 +2,13 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Diffbot; class CustomApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -16,27 +17,16 @@ class CustomApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); - - $this->diffbot = $diffbot; - } - - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); + $this->diffbot = $this->preSetUp(); } protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/../Mocks/Articles/hi_quicktip_basic.json')) + ]); } return $this->validMock; diff --git a/tests/Api/DiscussionApiTest.php b/tests/Api/DiscussionApiTest.php index 1fc467b..3b84371 100644 --- a/tests/Api/DiscussionApiTest.php +++ b/tests/Api/DiscussionApiTest.php @@ -2,12 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; class DiscussionApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -18,27 +18,18 @@ class DiscussionApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createDiscussionAPI('https://discussion-mock.com'); } - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); - } - protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Discussions/15-05-01/sp_discourse_php7_recap.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/../Mocks/Discussions/15-05-01/sp_discourse_php7_recap.json')) + ]); } return $this->validMock; diff --git a/tests/Api/ImageApiTest.php b/tests/Api/ImageApiTest.php index 5242c8a..90c8bbc 100644 --- a/tests/Api/ImageApiTest.php +++ b/tests/Api/ImageApiTest.php @@ -2,13 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; -use Swader\Diffbot\Entity\Image; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; class ImageApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -19,27 +18,18 @@ class ImageApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createImageAPI('https://article-mock.com'); } - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); - } - protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Images/one_image_zola.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/../Mocks/Images/one_image_zola.json')) + ]); } return $this->validMock; @@ -47,8 +37,7 @@ protected function getValidMock() public function testCall() { - /** @var Image $image */ - $image = $this->apiWithMock->call(); + $this->apiWithMock->call(); } public function testBuildUrlNoCustomFields() diff --git a/tests/Api/ProductApiTest.php b/tests/Api/ProductApiTest.php index 9d84ace..8f857ca 100644 --- a/tests/Api/ProductApiTest.php +++ b/tests/Api/ProductApiTest.php @@ -2,13 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; -use Swader\Diffbot\Entity\Product; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; class ProductApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -17,32 +16,27 @@ class ProductApiTest extends \PHPUnit_Framework_TestCase */ protected $apiWithMock; - protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); + protected function setUp() + { + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createProductAPI('https://dogbrush-mock.com'); } - protected function getValidDiffbotInstance() + protected function getValidMock() { - return new Diffbot('demo'); - } - - protected function getValidMock(){ if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__.'/../Mocks/Products/dogbrush.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/../Mocks/Products/dogbrush.json')) + ]); } + return $this->validMock; } - public function testCall() { + public function testCall() + { $products = $this->apiWithMock->call(); foreach ($products as $product) { @@ -50,7 +44,8 @@ public function testCall() { } } - public function testBuildUrlNoCustomFields() { + public function testBuildUrlNoCustomFields() + { $url = $this ->apiWithMock ->buildUrl(); @@ -58,7 +53,8 @@ public function testBuildUrlNoCustomFields() { $this->assertEquals($expectedUrl, $url); } - public function testBuildUrlMultipleCustomFields() { + public function testBuildUrlMultipleCustomFields() + { $url = $this ->apiWithMock ->setColors(true) @@ -69,7 +65,8 @@ public function testBuildUrlMultipleCustomFields() { $this->assertEquals($expectedUrl, $url); } - public function testBuildUrlMultipleCustomFieldsAndOtherOptions() { + public function testBuildUrlMultipleCustomFieldsAndOtherOptions() + { $url = $this ->apiWithMock ->setColors(true) diff --git a/tests/Api/SearchCustomMocksTest.php b/tests/Api/SearchCustomMocksTest.php index 4a805aa..810b185 100644 --- a/tests/Api/SearchCustomMocksTest.php +++ b/tests/Api/SearchCustomMocksTest.php @@ -2,13 +2,11 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; use Swader\Diffbot\Diffbot; -use Swader\Diffbot\Entity\JobCrawl; class SearchCustomMocksTest extends \PHPUnit_Framework_TestCase { + use setterUpper; /** @var Diffbot */ protected $diffbot; @@ -20,8 +18,6 @@ public function setUp() { $diffbot = new Diffbot('demo'); $diffbot->setEntityFactory(); - $fakeClient = new Client(); - $diffbot->setHttpClient($fakeClient); $this->diffbot = $diffbot; } @@ -45,9 +41,7 @@ public function resultCountProvider() */ public function testResultCount($case, $expectations) { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents($this->mockPrefix . 
$case['file'])] - )); + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($this->mockPrefix . $case['file'])); $search = $this->diffbot->search($case['q'])->call(); @@ -115,9 +109,7 @@ public function searchInfoProvider() public function testSearchInfo($case, $expectations) { $this->markTestSkipped('Bugged due to JSONC: https://github.com/Swader/diffbot-php-client/issues/12'); - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents($this->mockPrefix . $case['file'])] - )); + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($this->mockPrefix . $case['file'])); $searchInfo = $this->diffbot->search($case['q'])->call(true); diff --git a/tests/Api/SearchTest.php b/tests/Api/SearchTest.php index 8fe7ff9..0e044d8 100644 --- a/tests/Api/SearchTest.php +++ b/tests/Api/SearchTest.php @@ -2,13 +2,14 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Api\Search; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Diffbot; +use Swader\Diffbot\Api\Search; class SearchTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -17,22 +18,16 @@ class SearchTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = new Diffbot('demo'); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); - - $this->diffbot = $diffbot; + $this->diffbot = $this->preSetUp(); } protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Search/15-05-24/test.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/../Mocks/Search/15-05-24/test.json')) + ]); } return $this->validMock; diff --git a/tests/Api/setterUpper.php b/tests/Api/setterUpper.php new file mode 100644 index 0000000..00cf453 --- /dev/null +++ b/tests/Api/setterUpper.php @@ -0,0 +1,54 @@ +getValidDiffbotInstance(); + + $handler = HandlerStack::create($this->getValidMock()); + $guzzleClient = new Client(['handler' => $handler]); + + $methodsClient = new HttpMethodsClient( + new Guzzle6HttpAdapter($guzzleClient), + new GuzzleFactory()); + + $diffbot->setHttpClient($methodsClient); + $diffbot->setEntityFactory(); + + return $diffbot; + } + + public function getCustomMockFakeClient($filepath, $code = 200) + { + $handler = HandlerStack::create(new MockHandler([ + new Response($code, [], + file_get_contents($filepath)) + ])); + + $guzzleClient = new Client(['handler' => $handler]); + + return new HttpMethodsClient( + new Guzzle6HttpAdapter($guzzleClient), + new GuzzleFactory()); + + } +} \ No newline at end of file diff --git a/tests/DiffbotTest.php b/tests/DiffbotTest.php index 0533566..dda75d3 100644 --- a/tests/DiffbotTest.php +++ b/tests/DiffbotTest.php @@ -3,7 +3,12 @@ namespace Swader\Diffbot\Test; use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\HandlerStack; +use GuzzleHttp\Psr7\Response; +use Http\Adapter\Guzzle6HttpAdapter; +use Http\Client\Utils\HttpMethodsClient; +use Http\Discovery\MessageFactory\GuzzleFactory; use Swader\Diffbot\Diffbot; class DiffbotTest extends \PHPUnit_Framework_TestCase @@ -93,14 +98,19 @@ public function testGetToken() public function testSetHttpClient() { $bot = new Diffbot('token'); - $validMock = new Mock( - [file_get_contents(__DIR__ . '/Mocks/Products/dogbrush.json')] - ); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($validMock); + $mock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . 
'/Mocks/Products/dogbrush.json')) + ]); + $handler = HandlerStack::create($mock); + $guzzleClient = new Client(['handler' => $handler]); + + $methodsClient = new HttpMethodsClient( + new Guzzle6HttpAdapter($guzzleClient), + new GuzzleFactory()); try { - $bot->setHttpClient($fakeClient); + $bot->setHttpClient($methodsClient); } catch (\Exception $e) { $this->fail("Could not set fake client: " . $e->getMessage()); } @@ -108,7 +118,13 @@ public function testSetHttpClient() public function methodnameProvider() { - return [['product'], ['image'], ['analyze'], ['article'], ['discussion']]; + return [ + ['product'], + ['image'], + ['analyze'], + ['article'], + ['discussion'] + ]; } /** diff --git a/tests/Entity/CrawlJobTest.php b/tests/Entity/CrawlJobTest.php index 3faa36a..37262e3 100644 --- a/tests/Entity/CrawlJobTest.php +++ b/tests/Entity/CrawlJobTest.php @@ -2,7 +2,7 @@ namespace Swader\Diffbot\Test\Entity; -use GuzzleHttp\Message\ResponseInterface; +use Psr\Http\Message\ResponseInterface; use Swader\Diffbot\Entity\EntityIterator; use Swader\Diffbot\Entity\Image; use Swader\Diffbot\Entity\JobCrawl as Job; @@ -24,7 +24,7 @@ protected function ei($file) /** @var ResponseInterface $response */ $response = $this->responses[$file]; $jobs = []; - foreach ($response->json()['jobs'] as $data) { + foreach (json_decode($response->getBody(), true)['jobs'] as $data) { $jobs[] = new Job($data); } diff --git a/tests/Factory/EntityTest.php b/tests/Factory/EntityTest.php index d925727..f5911a5 100644 --- a/tests/Factory/EntityTest.php +++ b/tests/Factory/EntityTest.php @@ -2,8 +2,7 @@ namespace Swader\Diffbot\Test\Factory; -use GuzzleHttp\Message\Response; -use GuzzleHttp\Stream\Stream; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Diffbot; use Swader\Diffbot\Factory\Entity; @@ -21,23 +20,16 @@ public function setUp() $this->ef = new Entity(); } - public function testInvalidResponseBodyFail() - { - $this->responseOk->setBody(Stream::factory('Pure text content')); - 
$this->setExpectedException('GuzzleHttp\Exception\ParseException'); - $this->ef->createAppropriateIterator($this->responseOk); - } - public function testMissingObjectsFail() { - $this->responseOk->setBody(Stream::factory(json_encode(['foo' => 'bar']))); + $this->responseOk = $this->responseOk->withBody(\GuzzleHttp\Psr7\stream_for(json_encode(['foo' => 'bar']))); $this->setExpectedException('Swader\Diffbot\Exceptions\DiffbotException'); $this->ef->createAppropriateIterator($this->responseOk); } public function testMissingRequestFail() { - $this->responseOk->setBody(Stream::factory(json_encode([ + $this->responseOk = $this->responseOk->withBody(\GuzzleHttp\Psr7\stream_for(json_encode([ 'objects' => 'foo', 'req' => 'bar' ]))); @@ -47,7 +39,7 @@ public function testMissingRequestFail() public function testProductEntityPass() { - $this->responseOk->setBody(Stream::factory(json_encode([ + $this->responseOk = $this->responseOk->withBody(\GuzzleHttp\Psr7\stream_for(json_encode([ 'objects' => [['type' => 'product']], 'request' => ['api' => 'product', 'foo' => 2] ]))); @@ -56,7 +48,7 @@ public function testProductEntityPass() public function testWildCardEntityPass() { - $this->responseOk->setBody(Stream::factory(json_encode([ + $this->responseOk = $this->responseOk->withBody(\GuzzleHttp\Psr7\stream_for(json_encode([ 'objects' => [['type' => 'mysterious_api']], 'request' => ['api' => 'mysterious_api', 'foo' => 2] ]))); diff --git a/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic-old.json b/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic-old.json new file mode 100644 index 0000000..e21fac8 --- /dev/null +++ b/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic-old.json @@ -0,0 +1,10 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Sun, 19 Apr 2015 20:09:22 GMT +Content-Type: application/json;charset=utf-8 +Transfer-Encoding: chunked +Connection: keep-alive +Vary: Accept-Encoding +Access-Control-Allow-Origin: * + +{"title":"Diffbot: Crawling 
with Visual Machine Learning","request":{"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning","api":"analyze","resolvedPageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","options":["mentos"],"version":3},"humanLanguage":"en","type":"article","objects":[{"tags":[{"id":4585348,"count":9,"prevalence":0.6428571428571428,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"},{"id":175464,"count":13,"prevalence":0.9285714285714286,"label":"Application programming interface","uri":"http://dbpedia.org/resource/Application_programming_interface"},{"id":1936869,"count":3,"prevalence":0.21428571428571427,"label":"Laravel","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/Laravel"},{"id":208652,"count":3,"prevalence":0.21428571428571427,"label":"PHP","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/PHP"},{"id":91320,"count":2,"prevalence":0.14285714285714285,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"}],"icon":"http://www.sitepoint.com/wp-content/themes/sitepoint/assets/images/apple-touch-icon-144x144-precomposed.png","text":"Have you ever wondered how social networks do URL previews so well when you share links? How do they know which images to grab, whom to cite as an author, or which tags to attach to the preview? Is it all crawling with complex regexes over source code? Actually, more often than not, it isn\u2019t. Meta information defined in the source can be unreliable, and sites with less than stellar reputation often use them as keyword carriers, attempting to get search engines to rank them higher. 
Isn\u2019t what we, the humans, see in front of us what matters anyway?\nIf you want to build a URL preview snippet or a news aggregator, there are many automatic crawlers available online, both proprietary and open source, but you seldom find something as niche as visual machine learning. This is exactly what Diffbot is \u2013 a \u201cvisual learning robot\u201d which renders a URL you request in full and then visually extracts data, helping itself with some metadata from the page source as needed.\nAfter covering some theory, in this post we\u2019ll do a demo API call at one of SitePoint\u2019s posts.\nPHP Library\nThe PHP library for Diffbot is somewhat out of date, and as such we won\u2019t be using it in this demo. We\u2019ll be performing raw API calls, and in some future posts we\u2019ll build our own library for API interaction.\nIf you\u2019d like to take a look at the PHP library nonetheless, see here, and if you\u2019re interested in libraries for other languages, Diffbot has a directory.\nJavaScript Content\nWe said in the introductory section that Diffbot renders the request in full and then analyzes it. But, what about JavaScript content? Nowadays, websites often render some HTML above the fold, and then finish the CSS, JS, and dynamic content loading afterwards. Can the Diffbot API see that?\nAs a matter of fact, yes. Diffbot literally renders the page in full, and then inspects it visually, as explained in my StackOverflow Q&A here. There are some caveats, though, so make sure you read the answer carefully.\nPricing and API Health\nDiffbot has several usage tiers. There\u2019s a free trial tier which kills your API token after 7 days or 10000 calls, whichever comes first. The commercial tokens can be purchased at various prices, and never expire, but do have limitations. 
A special case by case approach is afforded to open source and/or educational projects which provides an older model of the free token \u2013 10k calls per month, once per second max, but never expires. You need to contact them directly if you think you qualify.\nDiffbot guarantees a high uptime, but failures sometimes do happen \u2013 especially in the most resource intensive API of the bunch: Crawlbot. Crawlbot is used to crawl entire domains, not just individual pages, and as such has a lower reliability rate than other APIs. Not by a lot, but enough to be noticeable in the API Health screen \u2013 the screen you can check to see if an API is up and running or currently unavailable if your calls run into issues or return error 500.\nDemo\nTo prepare your environment, please boot up a Homestead Improved instance.\nCreate Project\nCreate a starter Laravel project by SSHing into the VM with vagrant ssh, going into the Code folder, and executing composer create-project laravel/laravel Laravel --prefer-dist. This will let you access the Laravel greeting page via http://homestead.app:8000 from the host\u2019s browser.\nAdd a Route and Action\nIn app/routes.php add the following route:\n1\nRoute::get('/diffbot', 'HomeController@diffbotDemo');\nIn app/controllers/HomeController add the following action:\n1\n2\n3\npublic function diffbotDemo() {\ndie(\"hi\");\n}\nIf http://homestead.app:8000/diffbot now outputs \u201chi\u201d on the screen, we\u2019re ready to start playing with the API.\nGet a Token\nTo interact with the Diffbot API, you need a token. Sign up for one on their pricing page. For the sake of this demo, let\u2019s call our token $TOKEN, and we\u2019ll refer to it as such in URLs. Replace $TOKEN with your own value where appropriate.\nInstall Guzzle\nWe\u2019ll be using Guzzle as our HTTP client. 
It\u2019s not required, but I do recommend you get familiar with it through a past article of ours.\nAdd the \"guzzlehttp/guzzle\": \"4.1.*@dev\" to your composer.json so the require block looks like this:\n1\n2\n3\n4\n\"require\": {\n\"laravel/framework\": \"4.2.*\",\n\"guzzlehttp/guzzle\": \"4.1.*@dev\"\n},\nIn the project root, run composer update.\nFetch Article Data\nIn the first example, we\u2019ll crawl a SitePoint post with the default Article API from Diffbot. To do this, we refer to the docs which do an excellent job at explaining the workflow. Change the body of the diffbotDemo action to the following code:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\npublic function diffbotDemo() {\n$token = \"$TOKEN\";\n$version = 'v3';\n$response = $client->get($version.'/article', ['query' => [\n'token' => $token,\n]]);\ndie(var_dump($response->json()));\n}\nFirst, we set our token. Then, we define a variable that\u2019ll hold the API version. Next, it\u2019s up to us to create a new Guzzle client, and we also give it a base URL so we don\u2019t have to type it in every time we make another request.\nNext up, we create a response object by sending a GET request to the API\u2019s URL, and we add in an array of query parameters in key => value format. In this case, we only pass in the token and the URL, the most basic of parameters.\nFinally, since the Diffbot API returns JSON data, we use Guzzle\u2019s json() method to automatically decode it into an array. We then pretty-print this data:\nAs you can see, we got some information back rather quickly. There\u2019s the icon that was used, a preview of the text, the title, even the language, date and HTML have been returned. You\u2019ll notice there\u2019s no author, however. Let\u2019s change this and request some more values.\nIf we add the \u201cfields\u201d parameter to the query params list and give it a value of \u201ctags\u201d, Diffbot will attempt to extract tags/categories from the URL provided. 
Add this line to the query array:\n1\n'fields' => 'tags'\nand then change the die part to this:\n1\n2\n$data = $response->json();\ndie(var_dump($data['objects'][0]['tags']));\nRefreshing the screen now gives us this:\nBut, the source code of the article notes several other tags:\nWhy is the result so very different? It\u2019s precisely due to the reason we mentioned at the end of the very first paragraph of this post: what we humans see takes precedence. Diffbot is a visual learning robot, and as such its AI deducts the tags from the actual rendered content \u2013 what it can see \u2013 rather than from looking at the source code which is far too easily spiced up for SEO purposes.\nIs there a way to get the tags from the source code, though, if one really needs them? Furthermore, can we make Diffbot recognize the author on SitePoint articles? Yes. With the Custom API.\nMeta Tags and Author with Custom API\nThe Custom API is a feature which allows you to not only tweak existing Diffbot API to your liking by adding new fields and rules for content extraction, but also allows you to create completely new APIs (accessed via a dedicated URL, too) for custom content processing.\nGo to the dev dashboard and log in with your token. Then, go into \u201cCustom API\u201d. Activate the \u201cCreate a Rule\u201d tab at the bottom, and input the URL of the article we\u2019re crawling into the URL box, then click Test. Your screen should look something like this:\nYou\u2019ll immediately notice the Author field is empty. You can tweak the author-searching rule by clicking Edit next to it, and finding the Author element in the live preview window that opens, then click on it to get the desired result. However, due to some, well, less than perfect CSS on SitePoint\u2019s end, it\u2019s very difficult to provide Diffbot\u2019s API with a consistent path to the author name, especially by clicking on elements. 
Instead, add the following rule manually: .contributor--large .contributor_name a and click Save.\nYou\u2019ll notice the Preview window now correctly populates the Author field:\nIn fact, this new rule is automatically applied to all SitePoint links for your token. If you try to preview another SitePoint article, like this one, you\u2019ll notice Peter Nijssen is successfully extracted:\nOk, let\u2019s modify the API further. We need the article:tag values that are visible in source code. Doing this requires a two-step process.\nStep 1: Define a Collection\nA collection is exactly what it sounds like \u2013 a collection of values grabbed via a specific ruleset. We\u2019ll call our collection \u201cMetaTags\u201d, and give it the following selector: meta[property=article:tag]. This means \u201cfind all meta elements in the HTML that have the property attribute with the value article:tag\u201c.\nStep 2: Define Collection Fields\nCollection fields are individual entries in a collection \u2013 in our case, the various tags. Click on \u201cAdd a custom field to this collection\u201d, and add the following values:\nClick Save. You\u2019ll immediately have access to the list of Tags in the result window:\n1\ndie(var_dump($data['objects'][0]['metaTags']));\nIf you now refresh the URL we tested with (http://homestead.app:8000/diffbot), you\u2019ll notice the author and meta tags values are there. Here\u2019s the output the above line of code produces:\nWe have our tags!\nConclusion\nDiffbot is a powerful data extractor for the web \u2013 whether you need to consolidate many sites into a single search index without combining their back-ends, want to build a news aggregator, have an idea for a URL preview web component, or want to regularly harvest the contents of competitors\u2019 public pricing lists, Diffbot can help. With dead simple API calls and highly structured responses, you\u2019ll be up and running in next to no time. 
In a later article, we\u2019ll build a brand new API for using Diffbot with PHP, and redo the calls above with it. We\u2019ll also host the library on Packagist, so you can easily install it with Composer. Stay tuned!","discussion":{"tags":[{"id":78932,"count":2,"prevalence":0.2857142857142857,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"},{"id":91320,"count":1,"prevalence":0.14285714285714285,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"},{"id":5062,"count":1,"prevalence":0.14285714285714285,"label":"Nice","type":"place","uri":"http://dbpedia.org/resource/Nice"},{"id":491384,"count":1,"prevalence":0.14285714285714285,"label":"IOS","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/IOS"},{"id":4585348,"count":1,"prevalence":0.14285714285714285,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"}],"numPosts":7,"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","posts":[{"id":0,"tags":[{"id":5062,"count":1,"label":"Nice","type":"place","uri":"http://dbpedia.org/resource/Nice"},{"id":4585348,"count":1,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"},{"id":78932,"count":1,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"}],"author":"Peter Nijssen","text":"Nice article! Just wondering; since diffbot is unable to grab the author, can you conclude that it is not actually represented correctly within the website? I mean, you would think that should be an easy field to grab if HTML has been formatted correctly.","diffbotUri":"post|3|-459867678","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":1,"authorUrl":"https://disqus.com/by/peter_nijssen/","humanLanguage":"en","html":"

Nice article! Just wondering; since diffbot is unable to grab the author, can you conclude that it is not actually represented correctly within the website? I mean, you would think that should be an easy field to grab if HTML has been formatted correctly.<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"tags":[{"id":91320,"count":1,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"}],"text":"Correct - there's definitely more that could be done in terms of element declaration in SitePoint's design. Also, notice this: http://www.quora.com/APIs/Are-...","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT","id":1,"parentId":0,"author":"Bruno Skvorc","diffbotUri":"post|3|1753418734","authorUrl":"https://disqus.com/by/brunoskvorc/","html":"

Correct - there's definitely more that could be done in terms of element declaration in SitePoint's design. Also, notice this: http://www.quora.com/APIs/Are-...<\/a><\/p>"},{"id":2,"tags":[{"id":491384,"count":1,"label":"IOS","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/IOS"},{"id":78932,"count":1,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"}],"author":"Stefan Sturm","text":"Great article, but after scraping the article we need to display it somewhere...\nFor me I want to display it on iOS devices.\nDo you know any good libs or HTML templates to use the diffbot text in?\nThanks for your help:)","diffbotUri":"post|3|959497238","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"authorUrl":"https://disqus.com/by/stefansturm/","humanLanguage":"en","html":"

Great article, but after scraping the article we need to display it somewhere...
\nFor me I want to display it on iOS devices.
\nDo you know any good libs or HTML templates to use the diffbot text in?<\/p>\n

Thanks for your help:)<\/p>","type":"post","date":"Sun, 19 Oct 2014 00:00:00 GMT"},{"id":3,"author":"Taher","text":"Is there any open source projects as good as diffbots?","diffbotUri":"post|3|-197836079","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"authorUrl":"https://disqus.com/by/disqus_CWGq6zNflN/","humanLanguage":"en","html":"

Is there any open source projects as good as diffbots?<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"id":4,"parentId":3,"author":"Bruno Skvorc","text":"The answers here might help you out: https://www.quora.com/Web-Scra...","diffbotUri":"post|3|-1403587915","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":1,"authorUrl":"https://disqus.com/by/brunoskvorc/","humanLanguage":"en","html":"

The answers here might help you out: https://www.quora.com/Web-Scra...<\/a><\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"id":5,"author":"anonymous","text":"i just tried out the demo on diffbots website, and out of 14 pages i feeded it, it could only properly process 3. with most of 'em it was just showing me the copyright notice / legal bla foo embeded in those pages","diffbotUri":"post|3|-2120915058","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","html":"

i just tried out the demo on diffbots website, and out of 14 pages i feeded it, it could only properly process 3. with most of 'em it was just showing me the copyright notice / legal bla foo embeded in those pages<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"tags":[{"id":175464,"count":1,"label":"Application programming interface","uri":"http://dbpedia.org/resource/Application_programming_interface"}],"text":"That's where custom API comes in to save the day. Out of curiosity, though, which URLs did you try, and which information was missing?","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT","id":6,"parentId":5,"author":"Bruno Skvorc","diffbotUri":"post|3|1316965997","authorUrl":"https://disqus.com/by/brunoskvorc/","html":"

That's where custom API comes in to save the day. Out of curiosity, though, which URLs did you try, and which information was missing?<\/p>"}],"provider":"Disqus","humanLanguage":"en","confidence":1,"type":"discussion","participants":5,"rssUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/feed/","title":"Diffbot: Crawling with Visual Machine Learning","diffbotUri":"discussion|3|-1039854465","numPages":1},"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning","metaTags":[{"name":"ai"},{"name":"Artificial Intelligence"},{"name":"crawling"},{"name":"Diffbot"},{"name":"framework"},{"name":"laravel"},{"name":"machine learning"},{"name":"OOPHP"},{"name":"PHP"},{"name":"scraping"},{"name":"visual learning"}],"humanLanguage":"en","type":"article","date":"Sun, 27 Jul 2014 00:00:00 GMT","resolvedPageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","author":"Bruno Skvorc","title":"Diffbot: Crawling with Visual Machine Learning","diffbotUri":"article|3|-938093421","images":[{"height":533,"naturalHeight":727,"diffbotUri":"image|3|-851701004","primary":true,"width":780,"naturalWidth":1063,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624455201.png"},{"height":216,"naturalHeight":216,"diffbotUri":"image|3|762494522","width":523,"naturalWidth":523,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624487602.png"},{"height":184,"naturalHeight":184,"diffbotUri":"image|3|302236938","width":664,"naturalWidth":664,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624509003.png"},{"height":784,"naturalHeight":972,"diffbotUri":"image|3|-1836356546","width":780,"naturalWidth":966,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624552704.png"},{"height":156,"naturalHeight":184,"diffbotUri":"image|3|1297360030","width":780,"naturalWidth":918,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624650505.png"
},{"height":157,"naturalHeight":188,"diffbotUri":"image|3|502449852","width":780,"naturalWidth":929,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624665606.png"},{"height":175,"naturalHeight":237,"diffbotUri":"image|3|-2007985802","width":780,"naturalWidth":1053,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624803507.png"},{"title":"Change the final output of the diffbotDemo() action to this:","height":520,"naturalHeight":604,"diffbotUri":"image|3|-140134863","width":780,"naturalWidth":906,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624809508.png"},{"height":533,"naturalHeight":727,"diffbotUri":"image|3|1129235416","width":780,"naturalWidth":1063,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624834309.png"}],"html":"

Have you ever wondered how social networks do URL previews so well when you share links? How do they know which images to grab, whom to cite as an author, or which tags to attach to the preview? Is it all crawling with complex regexes over source code? Actually, more often than not, it isn’t. Meta information defined in the source can be unreliable, and sites with less than stellar reputation often use them as keyword carriers, attempting to get search engines to rank them higher. Isn’t what we, the humans, see in front of us what matters anyway?<\/p>\n

If you want to build a URL preview snippet or a news aggregator, there are many automatic crawlers available online, both proprietary and open source, but you seldom find something as niche as visual machine learning. This is exactly what Diffbot<\/a> is – a “visual learning robot” which renders a URL you request in full and then visually extracts data, helping itself with some metadata from the page source as needed.<\/p>\n

After covering some theory, in this post we’ll do a demo API call at one of SitePoint’s posts.<\/p>\n

PHP Library<\/h2>\n

The PHP library for Diffbot is somewhat out of date, and as such we won’t be using it in this demo. We’ll be performing raw API calls, and in some future posts we’ll build our own library for API interaction.<\/p>\n

If you’d like to take a look at the PHP library nonetheless, see here<\/a>, and if you’re interested in libraries for other languages, Diffbot has a directory<\/a>.<\/p>\n

JavaScript Content<\/h2>\n

We said in the introductory section that Diffbot renders the request in full and then analyzes it. But, what about JavaScript content? Nowadays, websites often render some HTML above the fold, and then finish the CSS, JS, and dynamic content loading afterwards. Can the Diffbot API see that?<\/p>\n

As a matter of fact, yes. Diffbot literally renders the page in full, and then inspects it visually, as explained in my StackOverflow Q&A here<\/a>. There are some caveats, though, so make sure you read the answer carefully.<\/p>\n

Pricing and API Health<\/h2>\n

Diffbot has several usage tiers. There’s a free trial tier which kills your API token after 7 days or 10000 calls, whichever comes first. The commercial tokens can be purchased at various prices<\/a>, and never expire, but do have limitations. A special case by case approach is afforded to open source and/or educational projects which provides an older model of the free token – 10k calls per month, once per second max, but never expires. You need to contact them directly if you think you qualify.<\/p>\n

Diffbot guarantees a high uptime, but failures sometimes do happen – especially in the most resource intensive API of the bunch: Crawlbot. Crawlbot is used to crawl entire domains, not just individual pages, and as such has a lower reliability rate than other APIs. Not by a lot, but enough to be noticeable in the API Health<\/a> screen – the screen you can check to see if an API is up and running or currently unavailable if your calls run into issues or return error 500.<\/p>\n

Demo<\/h2>\n

To prepare your environment, please boot up a Homestead Improved<\/a> instance.<\/p>\n

Create Project<\/h3>\n

Create a starter Laravel project by SSHing into the VM with vagrant ssh<\/code>, going into the Code<\/code> folder, and executing composer create-project laravel/laravel Laravel --prefer-dist<\/code>. This will let you access the Laravel greeting page via http://homestead.app:8000<\/code> from the host’s browser.<\/p>\n

Add a Route and Action<\/h3>\n

In app/routes.php<\/code> add the following route:<\/p>\n
1<\/td>Route::get(<\/code>'/diffbot'<\/code>, <\/code>'HomeController@diffbotDemo'<\/code>);<\/code><\/td><\/tr><\/tbody><\/table>\n

In app/controllers/HomeController<\/code> add the following action:<\/p>\n
123<\/td>public<\/code> function<\/code> diffbotDemo() {<\/code>die<\/code>(<\/code>"hi"<\/code>);<\/code>}<\/code><\/td><\/tr><\/tbody><\/table>\n

If http://homestead.app:8000/diffbot<\/code> now outputs “hi” on the screen, we’re ready to start playing with the API.<\/p>\n

Get a Token<\/h3>\n

To interact with the Diffbot API, you need a token. Sign up for one on their pricing page<\/a>. For the sake of this demo, let’s call our token $TOKEN<\/code>, and we’ll refer to it as such in URLs. Replace $TOKEN<\/code> with your own value where appropriate.<\/p>\n

Install Guzzle<\/h3>\n

We’ll be using Guzzle as our HTTP client. It’s not required, but I do recommend you get familiar with it through a past article of ours<\/a>.<\/p>\n

Add the "guzzlehttp/guzzle": "4.1.*@dev"<\/code> to your composer.json<\/code> so the require block looks like this:<\/p>\n
1234<\/td>"require": {<\/code>"laravel/framework": "4.2.*",<\/code>"guzzlehttp/guzzle": "4.1.*@dev"<\/code>},<\/code><\/td><\/tr><\/tbody><\/table>\n

In the project root, run composer update<\/code>.<\/p>\n

Fetch Article Data<\/h3>\n

In the first example, we’ll crawl a SitePoint post with the default Article API from Diffbot. To do this, we refer to the docs<\/a> which do an excellent job at explaining the workflow. Change the body of the diffbotDemo<\/code> action to the following code:<\/p>\n
1234567891011121314<\/td>public<\/code> function<\/code> diffbotDemo() {<\/code>$token<\/code> = <\/code>"$TOKEN"<\/code>;<\/code>$version<\/code> = <\/code>'v3'<\/code>;<\/code>$response<\/code> = <\/code>$client<\/code>->get(<\/code>$version<\/code>.<\/code>'/article'<\/code>, [<\/code>'query'<\/code> => [<\/code>'token'<\/code> => <\/code>$token<\/code>,<\/code>]]);<\/code>die<\/code>(var_dump(<\/code>$response<\/code>->json()));<\/code>}<\/code><\/td><\/tr><\/tbody><\/table>\n

First, we set our token. Then, we define a variable that’ll hold the API version. Next, it’s up to us to create a new Guzzle client, and we also give it a base URL so we don’t have to type it in every time we make another request.<\/p>\n

Next up, we create a response object by sending a GET request to the API’s URL, and we add in an array of query parameters in key => value format. In this case, we only pass in the token and the URL, the most basic of parameters.<\/p>\n

Finally, since the Diffbot API returns JSON data, we use Guzzle’s json()<\/code> method to automatically decode it into an array. We then pretty-print this data:<\/p>\n

\"\"<\/img><\/figure>\n

As you can see, we got some information back rather quickly. There’s the icon that was used, a preview of the text, the title, even the language, date and HTML have been returned. You’ll notice there’s no author, however. Let’s change this and request some more values.<\/p>\n

If we add the “fields” parameter to the query params list and give it a value of “tags”, Diffbot will attempt to extract tags/categories from the URL provided. Add this line to the query<\/code> array:<\/p>\n
1<\/td>'fields'<\/code> => <\/code>'tags'<\/code><\/td><\/tr><\/tbody><\/table>\n

and then change the die<\/code> part to this:<\/p>\n
12<\/td>$data<\/code> = <\/code>$response<\/code>->json();<\/code>die<\/code>(var_dump(<\/code>$data<\/code>[<\/code>'objects'<\/code>][0][<\/code>'tags'<\/code>]));<\/code><\/td><\/tr><\/tbody><\/table>\n

Refreshing the screen now gives us this:<\/p>\n

\"\"<\/img><\/figure>\n

But, the source code of the article notes several other tags:<\/p>\n

\"\"<\/img><\/figure>\n

Why is the result so very different? It’s precisely due to the reason we mentioned at the end of the very first paragraph of this post: what we humans see takes precedence. Diffbot is a visual learning robot, and as such its AI deducts the tags from the actual rendered content – what it can see – rather than from looking at the source code which is far too easily spiced up for SEO purposes.<\/p>\n

Is there a way to get the tags from the source code, though, if one really needs them? Furthermore, can we make Diffbot recognize the author on SitePoint articles? Yes. With the Custom API.<\/p>\n

Meta Tags and Author with Custom API<\/h3>\n

The Custom API is a feature which allows you to not only tweak existing Diffbot API to your liking by adding new fields and rules for content extraction, but also allows you to create completely new APIs (accessed via a dedicated URL, too) for custom content processing.<\/p>\n

Go to the dev dashboard<\/a> and log in with your token. Then, go into “Custom API”. Activate the “Create a Rule” tab at the bottom, and input the URL of the article we’re crawling into the URL box, then click Test. Your screen should look something like this:<\/p>\n

\"\"<\/img><\/figure>\n

You’ll immediately notice the Author field is empty. You can tweak the author-searching rule by clicking Edit next to it, and finding the Author element in the live preview window that opens, then click on it to get the desired result. However, due to some, well, less than perfect CSS on SitePoint’s end, it’s very difficult to provide Diffbot’s API with a consistent path to the author name, especially by clicking on elements. Instead, add the following rule manually: .contributor--large .contributor_name a<\/code> and click Save.<\/p>\n

You’ll notice the Preview window now correctly populates the Author field:<\/p>\n

\"\"<\/img><\/figure>\n

In fact, this new rule is automatically applied to all SitePoint links for your token. If you try to preview another SitePoint article, like this one<\/a>, you’ll notice Peter Nijssen is successfully extracted:<\/p>\n

\"\"<\/img><\/figure>\n

Ok, let’s modify the API further. We need the article:tag<\/code> values that are visible in source code. Doing this requires a two-step process.<\/p>\n

Step 1: Define a Collection<\/h4>\n

A collection is exactly what it sounds like – a collection of values grabbed via a specific ruleset. We’ll call our collection “MetaTags”, and give it the following selector: meta[property=article:tag]<\/code>. This means “find all meta elements in the HTML that have the property<\/code> attribute with the value article:tag<\/code>“.<\/p>\n

Step 2: Define Collection Fields<\/h4>\n

Collection fields are individual entries in a collection – in our case, the various tags. Click on “Add a custom field to this collection”, and add the following values:<\/p>\n

\"\"<\/img><\/figure>\n

Click Save. You’ll immediately have access to the list of Tags in the result window:<\/p>\n

\"\"<\/img>
Change the final output of the diffbotDemo() action to this:<\/figcaption><\/figure>\n
1<\/td>die<\/code>(var_dump(<\/code>$data<\/code>[<\/code>'objects'<\/code>][0][<\/code>'metaTags'<\/code>]));<\/code><\/td><\/tr><\/tbody><\/table>\n

If you now refresh the URL we tested with (http://homestead.app:8000/diffbot<\/code>), you’ll notice the author and meta tags values are there. Here’s the output the above line of code produces:<\/p>\n

\"\"<\/img><\/figure>\n

We have our tags!<\/p>\n

Conclusion<\/h2>\n

Diffbot is a powerful data extractor for the web – whether you need to consolidate many sites into a single search index without combining their back-ends, want to build a news aggregator, have an idea for a URL preview web component, or want to regularly harvest the contents of competitors’ public pricing lists, Diffbot can help. With dead simple API calls and highly structured responses, you’ll be up and running in next to no time. In a later article, we’ll build a brand new API for using Diffbot with PHP, and redo the calls above with it. We’ll also host the library on Packagist, so you can easily install it with Composer. Stay tuned!<\/p>"}]} \ No newline at end of file diff --git a/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic.json b/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic.json index e21fac8..1da0f60 100644 --- a/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic.json +++ b/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic.json @@ -1,10 +1 @@ -HTTP/1.1 200 OK -Server: nginx/1.6.3 -Date: Sun, 19 Apr 2015 20:09:22 GMT -Content-Type: application/json;charset=utf-8 -Transfer-Encoding: chunked -Connection: keep-alive -Vary: Accept-Encoding -Access-Control-Allow-Origin: * - {"title":"Diffbot: Crawling with Visual Machine Learning","request":{"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning","api":"analyze","resolvedPageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","options":["mentos"],"version":3},"humanLanguage":"en","type":"article","objects":[{"tags":[{"id":4585348,"count":9,"prevalence":0.6428571428571428,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"},{"id":175464,"count":13,"prevalence":0.9285714285714286,"label":"Application programming 
interface","uri":"http://dbpedia.org/resource/Application_programming_interface"},{"id":1936869,"count":3,"prevalence":0.21428571428571427,"label":"Laravel","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/Laravel"},{"id":208652,"count":3,"prevalence":0.21428571428571427,"label":"PHP","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/PHP"},{"id":91320,"count":2,"prevalence":0.14285714285714285,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"}],"icon":"http://www.sitepoint.com/wp-content/themes/sitepoint/assets/images/apple-touch-icon-144x144-precomposed.png","text":"Have you ever wondered how social networks do URL previews so well when you share links? How do they know which images to grab, whom to cite as an author, or which tags to attach to the preview? Is it all crawling with complex regexes over source code? Actually, more often than not, it isn\u2019t. Meta information defined in the source can be unreliable, and sites with less than stellar reputation often use them as keyword carriers, attempting to get search engines to rank them higher. Isn\u2019t what we, the humans, see in front of us what matters anyway?\nIf you want to build a URL preview snippet or a news aggregator, there are many automatic crawlers available online, both proprietary and open source, but you seldom find something as niche as visual machine learning. This is exactly what Diffbot is \u2013 a \u201cvisual learning robot\u201d which renders a URL you request in full and then visually extracts data, helping itself with some metadata from the page source as needed.\nAfter covering some theory, in this post we\u2019ll do a demo API call at one of SitePoint\u2019s posts.\nPHP Library\nThe PHP library for Diffbot is somewhat out of date, and as such we won\u2019t be using it in this demo. 
We\u2019ll be performing raw API calls, and in some future posts we\u2019ll build our own library for API interaction.\nIf you\u2019d like to take a look at the PHP library nonetheless, see here, and if you\u2019re interested in libraries for other languages, Diffbot has a directory.\nJavaScript Content\nWe said in the introductory section that Diffbot renders the request in full and then analyzes it. But, what about JavaScript content? Nowadays, websites often render some HTML above the fold, and then finish the CSS, JS, and dynamic content loading afterwards. Can the Diffbot API see that?\nAs a matter of fact, yes. Diffbot literally renders the page in full, and then inspects it visually, as explained in my StackOverflow Q&A here. There are some caveats, though, so make sure you read the answer carefully.\nPricing and API Health\nDiffbot has several usage tiers. There\u2019s a free trial tier which kills your API token after 7 days or 10000 calls, whichever comes first. The commercial tokens can be purchased at various prices, and never expire, but do have limitations. A special case by case approach is afforded to open source and/or educational projects which provides an older model of the free token \u2013 10k calls per month, once per second max, but never expires. You need to contact them directly if you think you qualify.\nDiffbot guarantees a high uptime, but failures sometimes do happen \u2013 especially in the most resource intensive API of the bunch: Crawlbot. Crawlbot is used to crawl entire domains, not just individual pages, and as such has a lower reliability rate than other APIs. 
Not by a lot, but enough to be noticeable in the API Health screen \u2013 the screen you can check to see if an API is up and running or currently unavailable if your calls run into issues or return error 500.\nDemo\nTo prepare your environment, please boot up a Homestead Improved instance.\nCreate Project\nCreate a starter Laravel project by SSHing into the VM with vagrant ssh, going into the Code folder, and executing composer create-project laravel/laravel Laravel --prefer-dist. This will let you access the Laravel greeting page via http://homestead.app:8000 from the host\u2019s browser.\nAdd a Route and Action\nIn app/routes.php add the following route:\n1\nRoute::get('/diffbot', 'HomeController@diffbotDemo');\nIn app/controllers/HomeController add the following action:\n1\n2\n3\npublic function diffbotDemo() {\ndie(\"hi\");\n}\nIf http://homestead.app:8000/diffbot now outputs \u201chi\u201d on the screen, we\u2019re ready to start playing with the API.\nGet a Token\nTo interact with the Diffbot API, you need a token. Sign up for one on their pricing page. For the sake of this demo, let\u2019s call our token $TOKEN, and we\u2019ll refer to it as such in URLs. Replace $TOKEN with your own value where appropriate.\nInstall Guzzle\nWe\u2019ll be using Guzzle as our HTTP client. It\u2019s not required, but I do recommend you get familiar with it through a past article of ours.\nAdd the \"guzzlehttp/guzzle\": \"4.1.*@dev\" to your composer.json so the require block looks like this:\n1\n2\n3\n4\n\"require\": {\n\"laravel/framework\": \"4.2.*\",\n\"guzzlehttp/guzzle\": \"4.1.*@dev\"\n},\nIn the project root, run composer update.\nFetch Article Data\nIn the first example, we\u2019ll crawl a SitePoint post with the default Article API from Diffbot. To do this, we refer to the docs which do an excellent job at explaining the workflow. 
Change the body of the diffbotDemo action to the following code:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\npublic function diffbotDemo() {\n$token = \"$TOKEN\";\n$version = 'v3';\n$response = $client->get($version.'/article', ['query' => [\n'token' => $token,\n]]);\ndie(var_dump($response->json()));\n}\nFirst, we set our token. Then, we define a variable that\u2019ll hold the API version. Next, it\u2019s up to us to create a new Guzzle client, and we also give it a base URL so we don\u2019t have to type it in every time we make another request.\nNext up, we create a response object by sending a GET request to the API\u2019s URL, and we add in an array of query parameters in key => value format. In this case, we only pass in the token and the URL, the most basic of parameters.\nFinally, since the Diffbot API returns JSON data, we use Guzzle\u2019s json() method to automatically decode it into an array. We then pretty-print this data:\nAs you can see, we got some information back rather quickly. There\u2019s the icon that was used, a preview of the text, the title, even the language, date and HTML have been returned. You\u2019ll notice there\u2019s no author, however. Let\u2019s change this and request some more values.\nIf we add the \u201cfields\u201d parameter to the query params list and give it a value of \u201ctags\u201d, Diffbot will attempt to extract tags/categories from the URL provided. Add this line to the query array:\n1\n'fields' => 'tags'\nand then change the die part to this:\n1\n2\n$data = $response->json();\ndie(var_dump($data['objects'][0]['tags']));\nRefreshing the screen now gives us this:\nBut, the source code of the article notes several other tags:\nWhy is the result so very different? It\u2019s precisely due to the reason we mentioned at the end of the very first paragraph of this post: what we humans see takes precedence. 
Diffbot is a visual learning robot, and as such its AI deducts the tags from the actual rendered content \u2013 what it can see \u2013 rather than from looking at the source code which is far too easily spiced up for SEO purposes.\nIs there a way to get the tags from the source code, though, if one really needs them? Furthermore, can we make Diffbot recognize the author on SitePoint articles? Yes. With the Custom API.\nMeta Tags and Author with Custom API\nThe Custom API is a feature which allows you to not only tweak existing Diffbot API to your liking by adding new fields and rules for content extraction, but also allows you to create completely new APIs (accessed via a dedicated URL, too) for custom content processing.\nGo to the dev dashboard and log in with your token. Then, go into \u201cCustom API\u201d. Activate the \u201cCreate a Rule\u201d tab at the bottom, and input the URL of the article we\u2019re crawling into the URL box, then click Test. Your screen should look something like this:\nYou\u2019ll immediately notice the Author field is empty. You can tweak the author-searching rule by clicking Edit next to it, and finding the Author element in the live preview window that opens, then click on it to get the desired result. However, due to some, well, less than perfect CSS on SitePoint\u2019s end, it\u2019s very difficult to provide Diffbot\u2019s API with a consistent path to the author name, especially by clicking on elements. Instead, add the following rule manually: .contributor--large .contributor_name a and click Save.\nYou\u2019ll notice the Preview window now correctly populates the Author field:\nIn fact, this new rule is automatically applied to all SitePoint links for your token. If you try to preview another SitePoint article, like this one, you\u2019ll notice Peter Nijssen is successfully extracted:\nOk, let\u2019s modify the API further. We need the article:tag values that are visible in source code. 
Doing this requires a two-step process.\nStep 1: Define a Collection\nA collection is exactly what it sounds like \u2013 a collection of values grabbed via a specific ruleset. We\u2019ll call our collection \u201cMetaTags\u201d, and give it the following selector: meta[property=article:tag]. This means \u201cfind all meta elements in the HTML that have the property attribute with the value article:tag\u201c.\nStep 2: Define Collection Fields\nCollection fields are individual entries in a collection \u2013 in our case, the various tags. Click on \u201cAdd a custom field to this collection\u201d, and add the following values:\nClick Save. You\u2019ll immediately have access to the list of Tags in the result window:\n1\ndie(var_dump($data['objects'][0]['metaTags']));\nIf you now refresh the URL we tested with (http://homestead.app:8000/diffbot), you\u2019ll notice the author and meta tags values are there. Here\u2019s the output the above line of code produces:\nWe have our tags!\nConclusion\nDiffbot is a powerful data extractor for the web \u2013 whether you need to consolidate many sites into a single search index without combining their back-ends, want to build a news aggregator, have an idea for a URL preview web component, or want to regularly harvest the contents of competitors\u2019 public pricing lists, Diffbot can help. With dead simple API calls and highly structured responses, you\u2019ll be up and running in next to no time. In a later article, we\u2019ll build a brand new API for using Diffbot with PHP, and redo the calls above with it. We\u2019ll also host the library on Packagist, so you can easily install it with Composer. 
Stay tuned!","discussion":{"tags":[{"id":78932,"count":2,"prevalence":0.2857142857142857,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"},{"id":91320,"count":1,"prevalence":0.14285714285714285,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"},{"id":5062,"count":1,"prevalence":0.14285714285714285,"label":"Nice","type":"place","uri":"http://dbpedia.org/resource/Nice"},{"id":491384,"count":1,"prevalence":0.14285714285714285,"label":"IOS","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/IOS"},{"id":4585348,"count":1,"prevalence":0.14285714285714285,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"}],"numPosts":7,"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","posts":[{"id":0,"tags":[{"id":5062,"count":1,"label":"Nice","type":"place","uri":"http://dbpedia.org/resource/Nice"},{"id":4585348,"count":1,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"},{"id":78932,"count":1,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"}],"author":"Peter Nijssen","text":"Nice article! Just wondering; since diffbot is unable to grab the author, can you conclude that it is not actually represented correctly within the website? I mean, you would think that should be an easy field to grab if HTML has been formatted correctly.","diffbotUri":"post|3|-459867678","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":1,"authorUrl":"https://disqus.com/by/peter_nijssen/","humanLanguage":"en","html":"

Nice article! Just wondering; since diffbot is unable to grab the author, can you conclude that it is not actually represented correctly within the website? I mean, you would think that should be an easy field to grab if HTML has been formatted correctly.<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"tags":[{"id":91320,"count":1,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"}],"text":"Correct - there's definitely more that could be done in terms of element declaration in SitePoint's design. Also, notice this: http://www.quora.com/APIs/Are-...","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT","id":1,"parentId":0,"author":"Bruno Skvorc","diffbotUri":"post|3|1753418734","authorUrl":"https://disqus.com/by/brunoskvorc/","html":"

Correct - there's definitely more that could be done in terms of element declaration in SitePoint's design. Also, notice this: http://www.quora.com/APIs/Are-...<\/a><\/p>"},{"id":2,"tags":[{"id":491384,"count":1,"label":"IOS","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/IOS"},{"id":78932,"count":1,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"}],"author":"Stefan Sturm","text":"Great article, but after scraping the article we need to display it somewhere...\nFor me I want to display it on iOS devices.\nDo you know any good libs or HTML templates to use the diffbot text in?\nThanks for your help:)","diffbotUri":"post|3|959497238","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"authorUrl":"https://disqus.com/by/stefansturm/","humanLanguage":"en","html":"

Great article, but after scraping the article we need to display it somewhere...
\nFor me I want to display it on iOS devices.
\nDo you know any good libs or HTML templates to use the diffbot text in?<\/p>\n

Thanks for your help:)<\/p>","type":"post","date":"Sun, 19 Oct 2014 00:00:00 GMT"},{"id":3,"author":"Taher","text":"Is there any open source projects as good as diffbots?","diffbotUri":"post|3|-197836079","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"authorUrl":"https://disqus.com/by/disqus_CWGq6zNflN/","humanLanguage":"en","html":"

Is there any open source projects as good as diffbots?<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"id":4,"parentId":3,"author":"Bruno Skvorc","text":"The answers here might help you out: https://www.quora.com/Web-Scra...","diffbotUri":"post|3|-1403587915","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":1,"authorUrl":"https://disqus.com/by/brunoskvorc/","humanLanguage":"en","html":"

The answers here might help you out: https://www.quora.com/Web-Scra...<\/a><\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"id":5,"author":"anonymous","text":"i just tried out the demo on diffbots website, and out of 14 pages i feeded it, it could only properly process 3. with most of 'em it was just showing me the copyright notice / legal bla foo embeded in those pages","diffbotUri":"post|3|-2120915058","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","html":"

i just tried out the demo on diffbots website, and out of 14 pages i feeded it, it could only properly process 3. with most of 'em it was just showing me the copyright notice / legal bla foo embeded in those pages<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"tags":[{"id":175464,"count":1,"label":"Application programming interface","uri":"http://dbpedia.org/resource/Application_programming_interface"}],"text":"That's where custom API comes in to save the day. Out of curiosity, though, which URLs did you try, and which information was missing?","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT","id":6,"parentId":5,"author":"Bruno Skvorc","diffbotUri":"post|3|1316965997","authorUrl":"https://disqus.com/by/brunoskvorc/","html":"

That's where custom API comes in to save the day. Out of curiosity, though, which URLs did you try, and which information was missing?<\/p>"}],"provider":"Disqus","humanLanguage":"en","confidence":1,"type":"discussion","participants":5,"rssUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/feed/","title":"Diffbot: Crawling with Visual Machine Learning","diffbotUri":"discussion|3|-1039854465","numPages":1},"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning","metaTags":[{"name":"ai"},{"name":"Artificial Intelligence"},{"name":"crawling"},{"name":"Diffbot"},{"name":"framework"},{"name":"laravel"},{"name":"machine learning"},{"name":"OOPHP"},{"name":"PHP"},{"name":"scraping"},{"name":"visual learning"}],"humanLanguage":"en","type":"article","date":"Sun, 27 Jul 2014 00:00:00 GMT","resolvedPageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","author":"Bruno Skvorc","title":"Diffbot: Crawling with Visual Machine Learning","diffbotUri":"article|3|-938093421","images":[{"height":533,"naturalHeight":727,"diffbotUri":"image|3|-851701004","primary":true,"width":780,"naturalWidth":1063,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624455201.png"},{"height":216,"naturalHeight":216,"diffbotUri":"image|3|762494522","width":523,"naturalWidth":523,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624487602.png"},{"height":184,"naturalHeight":184,"diffbotUri":"image|3|302236938","width":664,"naturalWidth":664,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624509003.png"},{"height":784,"naturalHeight":972,"diffbotUri":"image|3|-1836356546","width":780,"naturalWidth":966,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624552704.png"},{"height":156,"naturalHeight":184,"diffbotUri":"image|3|1297360030","width":780,"naturalWidth":918,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624650505.png"
},{"height":157,"naturalHeight":188,"diffbotUri":"image|3|502449852","width":780,"naturalWidth":929,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624665606.png"},{"height":175,"naturalHeight":237,"diffbotUri":"image|3|-2007985802","width":780,"naturalWidth":1053,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624803507.png"},{"title":"Change the final output of the diffbotDemo() action to this:","height":520,"naturalHeight":604,"diffbotUri":"image|3|-140134863","width":780,"naturalWidth":906,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624809508.png"},{"height":533,"naturalHeight":727,"diffbotUri":"image|3|1129235416","width":780,"naturalWidth":1063,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624834309.png"}],"html":"

Have you ever wondered how social networks do URL previews so well when you share links? How do they know which images to grab, whom to cite as an author, or which tags to attach to the preview? Is it all crawling with complex regexes over source code? Actually, more often than not, it isn’t. Meta information defined in the source can be unreliable, and sites with less than stellar reputation often use them as keyword carriers, attempting to get search engines to rank them higher. Isn’t what we, the humans, see in front of us what matters anyway?<\/p>\n

If you want to build a URL preview snippet or a news aggregator, there are many automatic crawlers available online, both proprietary and open source, but you seldom find something as niche as visual machine learning. This is exactly what Diffbot<\/a> is – a “visual learning robot” which renders a URL you request in full and then visually extracts data, helping itself with some metadata from the page source as needed.<\/p>\n

After covering some theory, in this post we’ll do a demo API call at one of SitePoint’s posts.<\/p>\n

PHP Library<\/h2>\n

The PHP library for Diffbot is somewhat out of date, and as such we won’t be using it in this demo. We’ll be performing raw API calls, and in some future posts we’ll build our own library for API interaction.<\/p>\n

If you’d like to take a look at the PHP library nonetheless, see here<\/a>, and if you’re interested in libraries for other languages, Diffbot has a directory<\/a>.<\/p>\n

JavaScript Content<\/h2>\n

We said in the introductory section that Diffbot renders the request in full and then analyzes it. But, what about JavaScript content? Nowadays, websites often render some HTML above the fold, and then finish the CSS, JS, and dynamic content loading afterwards. Can the Diffbot API see that?<\/p>\n

As a matter of fact, yes. Diffbot literally renders the page in full, and then inspects it visually, as explained in my StackOverflow Q&A here<\/a>. There are some caveats, though, so make sure you read the answer carefully.<\/p>\n

Pricing and API Health<\/h2>\n

Diffbot has several usage tiers. There’s a free trial tier which kills your API token after 7 days or 10000 calls, whichever comes first. The commercial tokens can be purchased at various prices<\/a>, and never expire, but do have limitations. A special case by case approach is afforded to open source and/or educational projects which provides an older model of the free token – 10k calls per month, once per second max, but never expires. You need to contact them directly if you think you qualify.<\/p>\n

Diffbot guarantees a high uptime, but failures sometimes do happen – especially in the most resource intensive API of the bunch: Crawlbot. Crawlbot is used to crawl entire domains, not just individual pages, and as such has a lower reliability rate than other APIs. Not by a lot, but enough to be noticeable in the API Health<\/a> screen – the screen you can check to see if an API is up and running or currently unavailable if your calls run into issues or return error 500.<\/p>\n

Demo<\/h2>\n

To prepare your environment, please boot up a Homestead Improved<\/a> instance.<\/p>\n

Create Project<\/h3>\n

Create a starter Laravel project by SSHing into the VM with vagrant ssh<\/code>, going into the Code<\/code> folder, and executing composer create-project laravel/laravel Laravel --prefer-dist<\/code>. This will let you access the Laravel greeting page via http://homestead.app:8000<\/code> from the host’s browser.<\/p>\n

Add a Route and Action<\/h3>\n

In app/routes.php<\/code> add the following route:<\/p>\n
1<\/td>Route::get(<\/code>'/diffbot'<\/code>, <\/code>'HomeController@diffbotDemo'<\/code>);<\/code><\/td><\/tr><\/tbody><\/table>\n

In app/controllers/HomeController<\/code> add the following action:<\/p>\n
123<\/td>public<\/code> function<\/code> diffbotDemo() {<\/code>die<\/code>(<\/code>"hi"<\/code>);<\/code>}<\/code><\/td><\/tr><\/tbody><\/table>\n

If http://homestead.app:8000/diffbot<\/code> now outputs “hi” on the screen, we’re ready to start playing with the API.<\/p>\n

Get a Token<\/h3>\n

To interact with the Diffbot API, you need a token. Sign up for one on their pricing page<\/a>. For the sake of this demo, let’s call our token $TOKEN<\/code>, and we’ll refer to it as such in URLs. Replace $TOKEN<\/code> with your own value where appropriate.<\/p>\n

Install Guzzle<\/h3>\n

We’ll be using Guzzle as our HTTP client. It’s not required, but I do recommend you get familiar with it through a past article of ours<\/a>.<\/p>\n

Add the "guzzlehttp/guzzle": "4.1.*@dev"<\/code> to your composer.json<\/code> so the require block looks like this:<\/p>\n
1234<\/td>"require": {<\/code>"laravel/framework": "4.2.*",<\/code>"guzzlehttp/guzzle": "4.1.*@dev"<\/code>},<\/code><\/td><\/tr><\/tbody><\/table>\n

In the project root, run composer update<\/code>.<\/p>\n

Fetch Article Data<\/h3>\n

In the first example, we’ll crawl a SitePoint post with the default Article API from Diffbot. To do this, we refer to the docs<\/a> which do an excellent job at explaining the workflow. Change the body of the diffbotDemo<\/code> action to the following code:<\/p>\n
1234567891011121314<\/td>public<\/code> function<\/code> diffbotDemo() {<\/code>$token<\/code> = <\/code>"$TOKEN"<\/code>;<\/code>$version<\/code> = <\/code>'v3'<\/code>;<\/code>$response<\/code> = <\/code>$client<\/code>->get(<\/code>$version<\/code>.<\/code>'/article'<\/code>, [<\/code>'query'<\/code> => [<\/code>'token'<\/code> => <\/code>$token<\/code>,<\/code>]]);<\/code>die<\/code>(var_dump(<\/code>$response<\/code>->json()));<\/code>}<\/code><\/td><\/tr><\/tbody><\/table>\n

First, we set our token. Then, we define a variable that’ll hold the API version. Next, it’s up to us to create a new Guzzle client, and we also give it a base URL so we don’t have to type it in every time we make another request.<\/p>\n

Next up, we create a response object by sending a GET request to the API’s URL, and we add in an array of query parameters in key => value format. In this case, we only pass in the token and the URL, the most basic of parameters.<\/p>\n

Finally, since the Diffbot API returns JSON data, we use Guzzle’s json()<\/code> method to automatically decode it into an array. We then pretty-print this data:<\/p>\n

\"\"<\/img><\/figure>\n

As you can see, we got some information back rather quickly. There’s the icon that was used, a preview of the text, the title, even the language, date and HTML have been returned. You’ll notice there’s no author, however. Let’s change this and request some more values.<\/p>\n

If we add the “fields” parameter to the query params list and give it a value of “tags”, Diffbot will attempt to extract tags/categories from the URL provided. Add this line to the query<\/code> array:<\/p>\n
1<\/td>'fields'<\/code> => <\/code>'tags'<\/code><\/td><\/tr><\/tbody><\/table>\n

and then change the die<\/code> part to this:<\/p>\n
12<\/td>$data<\/code> = <\/code>$response<\/code>->json();<\/code>die<\/code>(var_dump(<\/code>$data<\/code>[<\/code>'objects'<\/code>][0][<\/code>'tags'<\/code>]));<\/code><\/td><\/tr><\/tbody><\/table>\n

Refreshing the screen now gives us this:<\/p>\n

\"\"<\/img><\/figure>\n

But, the source code of the article notes several other tags:<\/p>\n

\"\"<\/img><\/figure>\n

Why is the result so very different? It’s precisely due to the reason we mentioned at the end of the very first paragraph of this post: what we humans see takes precedence. Diffbot is a visual learning robot, and as such its AI deducts the tags from the actual rendered content – what it can see – rather than from looking at the source code which is far too easily spiced up for SEO purposes.<\/p>\n

Is there a way to get the tags from the source code, though, if one really needs them? Furthermore, can we make Diffbot recognize the author on SitePoint articles? Yes. With the Custom API.<\/p>\n

Meta Tags and Author with Custom API<\/h3>\n

The Custom API is a feature which allows you to not only tweak existing Diffbot API to your liking by adding new fields and rules for content extraction, but also allows you to create completely new APIs (accessed via a dedicated URL, too) for custom content processing.<\/p>\n

Go to the dev dashboard<\/a> and log in with your token. Then, go into “Custom API”. Activate the “Create a Rule” tab at the bottom, and input the URL of the article we’re crawling into the URL box, then click Test. Your screen should look something like this:<\/p>\n

\"\"<\/img><\/figure>\n

You’ll immediately notice the Author field is empty. You can tweak the author-searching rule by clicking Edit next to it, and finding the Author element in the live preview window that opens, then click on it to get the desired result. However, due to some, well, less than perfect CSS on SitePoint’s end, it’s very difficult to provide Diffbot’s API with a consistent path to the author name, especially by clicking on elements. Instead, add the following rule manually: .contributor--large .contributor_name a<\/code> and click Save.<\/p>\n

You’ll notice the Preview window now correctly populates the Author field:<\/p>\n

\"\"<\/img><\/figure>\n

In fact, this new rule is automatically applied to all SitePoint links for your token. If you try to preview another SitePoint article, like this one<\/a>, you’ll notice Peter Nijssen is successfully extracted:<\/p>\n

\"\"<\/img><\/figure>\n

Ok, let’s modify the API further. We need the article:tag<\/code> values that are visible in source code. Doing this requires a two-step process.<\/p>\n

Step 1: Define a Collection<\/h4>\n

A collection is exactly what it sounds like – a collection of values grabbed via a specific ruleset. We’ll call our collection “MetaTags”, and give it the following selector: meta[property=article:tag]<\/code>. This means “find all meta elements in the HTML that have the property<\/code> attribute with the value article:tag<\/code>“.<\/p>\n

Step 2: Define Collection Fields<\/h4>\n

Collection fields are individual entries in a collection – in our case, the various tags. Click on “Add a custom field to this collection”, and add the following values:<\/p>\n

\"\"<\/img><\/figure>\n

Click Save. You’ll immediately have access to the list of Tags in the result window:<\/p>\n

\"\"<\/img>
Change the final output of the diffbotDemo() action to this:<\/figcaption><\/figure>\n
1<\/td>die<\/code>(var_dump(<\/code>$data<\/code>[<\/code>'objects'<\/code>][0][<\/code>'metaTags'<\/code>]));<\/code><\/td><\/tr><\/tbody><\/table>\n

If you now refresh the URL we tested with (http://homestead.app:8000/diffbot<\/code>), you’ll notice the author and meta tags values are there. Here’s the output the above line of code produces:<\/p>\n

\"\"<\/img><\/figure>\n

We have our tags!<\/p>\n

Conclusion<\/h2>\n

Diffbot is a powerful data extractor for the web – whether you need to consolidate many sites into a single search index without combining their back-ends, want to build a news aggregator, have an idea for a URL preview web component, or want to regularly harvest the contents of competitors’ public pricing lists, Diffbot can help. With dead simple API calls and highly structured responses, you’ll be up and running in next to no time. In a later article, we’ll build a brand new API for using Diffbot with PHP, and redo the calls above with it. We’ll also host the library on Packagist, so you can easily install it with Composer. Stay tuned!<\/p>"}]} \ No newline at end of file diff --git a/tests/Mocks/Analyze/15-04-19/Images/500px_zola_basic-old.json b/tests/Mocks/Analyze/15-04-19/Images/500px_zola_basic-old.json new file mode 100644 index 0000000..f6aefd0 --- /dev/null +++ b/tests/Mocks/Analyze/15-04-19/Images/500px_zola_basic-old.json @@ -0,0 +1,9 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Sun, 19 Apr 2015 20:17:58 GMT +Content-Type: application/json;charset=utf-8 +Content-Length: 738 +Connection: keep-alive +Access-Control-Allow-Origin: * + +{"title":"Zola by Bruno Skvorc / 500px","request":{"pageUrl":"https://500px.com/photo/78703451/zola-by-bruno-skvorc?from=user_library","api":"analyze","version":3},"humanLanguage":"en","type":"image","objects":[{"title":"Photograph Zola by Bruno Skvorc on 500px","pageUrl":"https://500px.com/photo/78703451/zola-by-bruno-skvorc?from=user_library","diffbotUri":"image|3|-1494041765","naturalHeight":1365,"humanLanguage":"en","type":"image","date":"Aug 1, 2014","naturalWidth":2048,"url":"https://drscdn.500px.org/photo/78703451/m%3D2048/956d2879591f57e2352d2064e98f461b","xpath":"/HTML/BODY/DIV[@class='photo_show minimal has_next_photo has_previous_photo']/DIV[@class='photo segment']/DIV[@id='photo_78703451']/IMG[@class='the_photo']"}]} \ No newline at end of file diff --git 
a/tests/Mocks/Analyze/15-04-19/Images/500px_zola_basic.json b/tests/Mocks/Analyze/15-04-19/Images/500px_zola_basic.json index f6aefd0..4d650f4 100644 --- a/tests/Mocks/Analyze/15-04-19/Images/500px_zola_basic.json +++ b/tests/Mocks/Analyze/15-04-19/Images/500px_zola_basic.json @@ -1,9 +1 @@ -HTTP/1.1 200 OK -Server: nginx/1.6.3 -Date: Sun, 19 Apr 2015 20:17:58 GMT -Content-Type: application/json;charset=utf-8 -Content-Length: 738 -Connection: keep-alive -Access-Control-Allow-Origin: * - {"title":"Zola by Bruno Skvorc / 500px","request":{"pageUrl":"https://500px.com/photo/78703451/zola-by-bruno-skvorc?from=user_library","api":"analyze","version":3},"humanLanguage":"en","type":"image","objects":[{"title":"Photograph Zola by Bruno Skvorc on 500px","pageUrl":"https://500px.com/photo/78703451/zola-by-bruno-skvorc?from=user_library","diffbotUri":"image|3|-1494041765","naturalHeight":1365,"humanLanguage":"en","type":"image","date":"Aug 1, 2014","naturalWidth":2048,"url":"https://drscdn.500px.org/photo/78703451/m%3D2048/956d2879591f57e2352d2064e98f461b","xpath":"/HTML/BODY/DIV[@class='photo_show minimal has_next_photo has_previous_photo']/DIV[@class='photo segment']/DIV[@id='photo_78703451']/IMG[@class='the_photo']"}]} \ No newline at end of file diff --git a/tests/Mocks/Analyze/15-04-19/Products/hobbit_amazon_basic-old.json b/tests/Mocks/Analyze/15-04-19/Products/hobbit_amazon_basic-old.json new file mode 100644 index 0000000..4e11d3c --- /dev/null +++ b/tests/Mocks/Analyze/15-04-19/Products/hobbit_amazon_basic-old.json @@ -0,0 +1,10 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Sun, 19 Apr 2015 20:26:16 GMT +Content-Type: application/json;charset=utf-8 +Transfer-Encoding: chunked +Connection: keep-alive +Vary: Accept-Encoding +Access-Control-Allow-Origin: * + +{"title":"The Hobbit Trilogy Limited Edition with Bilbo's Journal Blu-ray 3D + Blu-ray 2012 Region Free: Amazon.co.uk: Martin Freeman, Richard Armitage, Ian McKellen, Peter Jackson: DVD & 
Blu-ray","request":{"pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","api":"analyze","version":3},"humanLanguage":"en","type":"product","objects":[{"text":"Reviews\nProduct Description\nHobbit Trilogy - Journal Exclusive (3D/S/AZ)\nSynopsis\nTHE HOBBIT: AN UNEXPECTED JOURNEYTM Follow Bilbo Baggins, who \u2013 along with the Wizard Gandalf and 13 Dwarves, led by Thorin Oakenshield \u2013 is swept into an epic and treacherous quest to reclaim the lost Dwarf Kingdom of Erebor. THE HOBBIT: THE DESOLATION OF SMAUGTM Our heroes escape the giant Spiders and Wood-elves of Mirkwood before encountering the mysterious Bard, who smuggles them into Lake-town. Finally reaching the Lonely Mountain, they confront the Dragon Smaug. THE HOBBIT: THE BATTLE OF THE FIVE ARMIESTM The Dwarves of Erebor have reclaimed their homeland, but face the consequences of unleashing Smaug. 
As five great armies go to war, Bilbo fights for his life, and the races of Dwarves, Elves and Men must unite or be destroyed.","discussion":{"tags":[{"id":463187,"count":2,"prevalence":0.16666666666666666,"label":"The Hobbit","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/The_Hobbit"},{"id":3018688,"count":2,"prevalence":0.16666666666666666,"label":"Bilbo Baggins","uri":"http://dbpedia.org/resource/Bilbo_Baggins"},{"id":4838482,"count":1,"prevalence":0.08333333333333333,"label":"Martin Freeman","type":"person","uri":"http://dbpedia.org/resource/Martin_Freeman"},{"id":1160426,"count":1,"prevalence":0.08333333333333333,"label":"Ian McKellen","type":"person","uri":"http://dbpedia.org/resource/Ian_McKellen"},{"id":4563319,"count":1,"prevalence":0.08333333333333333,"label":"Botfa","uri":"http://dbpedia.org/resource/Botfa"}],"title":"The Hobbit Trilogy (Limited Edition with Bilbo's Journal) [Blu-ray 3D + Blu-ray] [2012] [Region Free]","nextPage":"http://www.amazon.co.uk/gp/bestsellers/dvd/535556031/ref=pd_zg_hrsr_d_h__1_2_last/275-0224565-8961175","numPosts":8,"pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","diffbotUri":"discussion|3|-177083457","posts":[{"id":0,"author":"Nate","text":"Amazing trilogy, superb picture quality and loads of extras. Excellent value for money","diffbotUri":"post|3|1326325813","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A23M0BG1B3H05D/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Amazing trilogy, superb picture quality and loads of extras. Excellent value for money<\/p>","rating":{"minValue":1,"value":5,"maxValue":5},"type":"post","date":"Sun, 19 Apr 2015 00:00:00 GMT"},{"id":1,"author":"J Clark","text":"Disappointed as Bilbo did not appear to write anything as the journal is full of blank pages only bought this set for the journal should have waited for extended version","diffbotUri":"post|3|-685041964","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A2PZILNN3SUII6/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Disappointed as Bilbo did not appear to write anything as the journal is full of blank pages only bought this set for the journal should have waited for extended version<\/p>","rating":{"minValue":1,"value":3,"maxValue":5},"type":"post","date":"Sun, 19 Apr 2015 00:00:00 GMT"},{"id":2,"author":"Movieguy","text":"Just to say that this is an extremely good price for the whole trilogy of films. The fact that it is 3D just adds to the reliability of this price, if you haven't already got any of these films and are looking then buy this.\nOn the other hand, If you're patient you should wait for the Extended steel book edition which should be out later this year. The first two (An Unexpected Journey & The Desolation of Smaug) are already available but at quite a price. The final part ( The Battle of the five Armies), as I have said, should be out later this year and I would say is worth the wait for those extra few sequences and special features.\nPersonally, I prefer to get them immediately as they are released as I'm not that bothered about any extended parts. I now regret this as I find myself trying to find a good price for the previous extended editions!\nI am an avid steel book collector but I just wouldn't wait for extended editions of anything unless I was really into it.\nAt the end of the day they are all the same films and you will get an equally great experience from the normal edition as opposed to the extended edition. 
If you are a hardcore fan (which I am not but I still enjoy these films) then you will most likely want to purchase the extended editions and marvel in the creation of these extravagant sets and breathtaking action sequences.\nBut for me I just want to enjoy the acting and story lines!","diffbotUri":"post|3|-1605113004","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A3SWD24RO7DBVX/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Just to say that this is an extremely good price for the whole trilogy of films. The fact that it is 3D just adds to the reliability of this price, if you haven't already got any of these films and are looking then buy this.<\/p>\n

On the other hand, If you're patient you should wait for the Extended steel book edition which should be out later this year. The first two (An Unexpected Journey & The Desolation of Smaug) are already available but at quite a price. The final part ( The Battle of the five Armies), as I have said, should be out later this year and I would say is worth the wait for those extra few sequences and special features.<\/p>\n

Personally, I prefer to get them immediately as they are released as I'm not that bothered about any extended parts. I now regret this as I find myself trying to find a good price for the previous extended editions!
\nI am an avid steel book collector but I just wouldn't wait for extended editions of anything unless I was really into it.<\/p>\n

At the end of the day they are all the same films and you will get an equally great experience from the normal edition as opposed to the extended edition. If you are a hardcore fan (which I am not but I still enjoy these films) then you will most likely want to purchase the extended editions and marvel in the creation of these extravagant sets and breathtaking action sequences.<\/p>\n

But for me I just want to enjoy the acting and story lines!<\/p>","rating":{"minValue":1,"value":4,"maxValue":5},"type":"post","date":"Sun, 22 Feb 2015 00:00:00 GMT"},{"id":3,"tags":[{"id":1876312,"count":1,"label":"3D film","uri":"http://dbpedia.org/resource/3D_film"}],"author":"STEVENSEAGALFAN+THE WIFE","text":"Bargain or what £33 pounds for three 3D films, I can't wait to watch them again, plus you get a limited edition Bilbo's journal this is a must for all collectors like me. I know some people are not happy because they want to have a choice of buying the extended versions, but to tell you the truth the cinema release will be good enough for now, plus one day there might be a ultimate limited edition Box-set with the Hobbit/Lord of the Rings all extended all in 3D, plus around 48 discs, you never Know it might be shaped as a \"GOLD RING\", now that will be worth buying.\nOne thing I don't agree with is how amazon allow reviews on a box-set that's not even out yet? I have done this review because of people giving one star because they don't agree with the release, they want the extended versions, I personally think that amazon should put a stop until the box-set has been released, even mine.","diffbotUri":"post|3|-1035465232","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A19IDOPXFNS85X/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Bargain or what £33 pounds for three 3D films, I can't wait to watch them again, plus you get a limited edition Bilbo's journal this is a must for all collectors like me. I know some people are not happy because they want to have a choice of buying the extended versions, but to tell you the truth the cinema release will be good enough for now, plus one day there might be a ultimate limited edition Box-set with the Hobbit/Lord of the Rings all extended all in 3D, plus around 48 discs, you never Know it might be shaped as a "GOLD RING", now that will be worth buying.<\/p>\n

One thing I don't agree with is how amazon allow reviews on a box-set that's not even out yet? I have done this review because of people giving one star because they don't agree with the release, they want the extended versions, I personally think that amazon should put a stop until the box-set has been released, even mine.<\/p>","rating":{"minValue":1,"value":5,"maxValue":5},"type":"post","date":"Mon, 23 Feb 2015 00:00:00 GMT"},{"id":4,"author":"Lesley Aspey","text":"While the films themselves are good, the packaging unfortunately lets this one down for me. The reason we bought the Trilogy is because we needed the product in compact packaging but this one wastefully sends them in individual packages in a sleeve so takes up just as much room as buying them individually.\nI do like having a single key though to add the trilogy to the Ultraviolet library.","diffbotUri":"post|3|-1340814396","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A1KNK3NICMXD06/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

While the films themselves are good, the packaging unfortunately lets this one down for me. The reason we bought the Trilogy is because we needed the product in compact packaging but this one wastefully sends them in individual packages in a sleeve so takes up just as much room as buying them individually.<\/p>\n

I do like having a single key though to add the trilogy to the Ultraviolet library.<\/p>","rating":{"minValue":1,"value":3,"maxValue":5},"type":"post","date":"Sun, 19 Apr 2015 00:00:00 GMT"},{"id":5,"tags":[{"id":2546017,"count":1,"label":"Richard Armitage (actor)","type":"person","uri":"http://dbpedia.org/resource/Richard_Armitage_(actor)"},{"id":1351067,"count":1,"label":"Computer-generated imagery","uri":"http://dbpedia.org/resource/Computer-generated_imagery"},{"id":2472653,"count":1,"label":"Thorin Oakenshield","uri":"http://dbpedia.org/resource/Thorin_Oakenshield"},{"id":4563319,"count":1,"label":"Botfa","uri":"http://dbpedia.org/resource/Botfa"}],"author":"Helena","text":"I love these films with a passion. Ok sure the CGI is a bit overdone but hey hum I can see past that unlike some people! lol :P\nThe actors are fantastic too just like they were in LOTR. I'm a big Richard Armitage fan and he excels as Thorin Oakenshield. Cannot wait for BOTFA To be released to complete the trilogy and then watch it all again right up to Return of the King!!!","diffbotUri":"post|3|678339081","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/ABFI1MVAG7TIF/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

I love these films with a passion. Ok sure the CGI is a bit overdone but hey hum I can see past that unlike some people! lol :P
\nThe actors are fantastic too just like they were in LOTR. I'm a big Richard Armitage fan and he excels as Thorin Oakenshield. Cannot wait for BOTFA To be released to complete the trilogy and then watch it all again right up to Return of the King!!!<\/p>","rating":{"minValue":1,"value":5,"maxValue":5},"type":"post","date":"Thu, 16 Apr 2015 00:00:00 GMT"},{"id":6,"author":"M. D. Fleming","text":"Can't wait for this! Got to be purchased with this though to watch the whole series start to finish!.. The Lord of the Rings Trilogy... http://www.amazon.co.uk/gp/product/B0002VJT2C/ref=s9_simh_gw_p74_d0_i3?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=desktop-1&pf_rd_r=1CZRRY7MP2CZ15GA8TW5&pf_rd_t=36701&pf_rd_p=577048427&pf_rd_i=desktop","diffbotUri":"post|3|767709913","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/AWZRXJA6XRSRV/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Can't wait for this! Got to be purchased with this though to watch the whole series start to finish!.. The Lord of the Rings Trilogy... http://www.amazon.co.uk/gp/product/B0002VJT2C/ref=s9_simh_gw_p74_d0_i3?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=desktop-1&pf_rd_r=1CZRRY7MP2CZ15GA8TW5&pf_rd_t=36701&pf_rd_p=577048427&pf_rd_i=desktop<\/p>","rating":{"minValue":1,"value":5,"maxValue":5},"type":"post","date":"Fri, 17 Apr 2015 00:00:00 GMT"},{"id":7,"tags":[{"id":3018688,"count":2,"label":"Bilbo Baggins","uri":"http://dbpedia.org/resource/Bilbo_Baggins"},{"id":463187,"count":2,"label":"The Hobbit","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/The_Hobbit"},{"id":1160426,"count":1,"label":"Ian McKellen","type":"person","uri":"http://dbpedia.org/resource/Ian_McKellen"},{"id":2242160,"count":1,"label":"Lonely Mountain","uri":"http://dbpedia.org/resource/Lonely_Mountain"},{"id":4838482,"count":1,"label":"Martin Freeman","type":"person","uri":"http://dbpedia.org/resource/Martin_Freeman"}],"author":"E. A Solinas","text":"Ever since the classic \"Lord of the Rings\" trilogy ended, fans were clamoring for JRR Tolkien's \"The Hobbit\" to be adapted for film as well. After all, \"The Hobbit\" contains the seeds of the sequel trilogy's plot, so it made sense.\nBut instead of a straightforward adaptation of Bilbo Baggins' linear adventures, director Peter Jackson sets the stage for everything to come in his earlier movies. In addition to Bilbo finding the Ring, it is about the corruption of Middle-Earth as the Dark Lord returns to conquer everything. It has some notable flaws (primarily the contrived love story) and isn't quite as brilliant as the \"Rings\" trilogy, but the overall effect is a strong, epic story with a sublimely talented cast.\nBilbo Baggins (Martin Freeman) is a nice boring gentlehobbit who has no interest in adventures. Then the wizard Gandalf (Ian McKellen) descends on Bag End with thirteen dwarves. 
They are setting out for the lost city of Erebor, which the dragon Smaug stole many years ago, and now Thorin Oakenshield (Richard Armitage) wants to reclaim it. He especially wants the Arkenstone, a jewel that symbolizes kingship of the dwarves. Bilgo soon gets into the swing of the journey, despite vicious trolls, goblins, giants and an albino orc who literally wants Thorin's head (preferably separated from the rest of him).\nBut unknown to the dwarves, Bilbo has encountered a grotesque creature known as Gollum (Andy Serkis), and found a golden ring that gives invisibility. He uses this Ring -- and his newly acquired courage -- to survive the attacks by giant spiders of Mirkwood, and later avoid imprisonment by the deadly wood-elf King Thranduil (Lee Pace) and his son Legolas (Orlando Bloom).\nWhen they reach Erebor, Bilbo is sent in alone to find the Arkenstone...","diffbotUri":"post|3|-48445325","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A1D2C0WDCSHUWZ/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Ever since the classic "Lord of the Rings" trilogy ended, fans were clamoring for JRR Tolkien's "The Hobbit" to be adapted for film as well. After all, "The Hobbit" contains the seeds of the sequel trilogy's plot, so it made sense.<\/p>\n

But instead of a straightforward adaptation of Bilbo Baggins' linear adventures, director Peter Jackson sets the stage for everything to come in his earlier movies. In addition to Bilbo finding the Ring, it is about the corruption of Middle-Earth as the Dark Lord returns to conquer everything. It has some notable flaws (primarily the contrived love story) and isn't quite as brilliant as the "Rings" trilogy, but the overall effect is a strong, epic story with a sublimely talented cast.<\/p>\n

Bilbo Baggins (Martin Freeman) is a nice boring gentlehobbit who has no interest in adventures. Then the wizard Gandalf (Ian McKellen) descends on Bag End with thirteen dwarves. They are setting out for the lost city of Erebor, which the dragon Smaug stole many years ago, and now Thorin Oakenshield (Richard Armitage) wants to reclaim it. He especially wants the Arkenstone, a jewel that symbolizes kingship of the dwarves. Bilgo soon gets into the swing of the journey, despite vicious trolls, goblins, giants and an albino orc who literally wants Thorin's head (preferably separated from the rest of him).<\/p>\n

But unknown to the dwarves, Bilbo has encountered a grotesque creature known as Gollum (Andy Serkis), and found a golden ring that gives invisibility. He uses this Ring -- and his newly acquired courage -- to survive the attacks by giant spiders of Mirkwood, and later avoid imprisonment by the deadly wood-elf King Thranduil (Lee Pace) and his son Legolas (Orlando Bloom).<\/p>\n

When they reach Erebor, Bilbo is sent in alone to find the Arkenstone...<\/p>","rating":{"ratingCount":500,"minValue":1,"value":4,"maxValue":5},"type":"post","date":"Thu, 19 Feb 2015 00:00:00 GMT"}],"humanLanguage":"en","confidence":0.415,"type":"discussion","numPages":1,"participants":8},"pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","humanLanguage":"en","offerPriceDetails":{"amount":33,"text":"£33.00","symbol":"£"},"type":"product","sku":"B00TE2AQXG","breadcrumb":[{"link":"http://www.amazon.co.uk/b/ref=dp_bc_1/275-0224565-8961175?ie=UTF8&node=235597011","name":"Featured Categories"},{"link":"http://www.amazon.co.uk/b/ref=dp_bc_2/275-0224565-8961175?ie=UTF8&node=6265430031","name":"Blu-ray"}],"productId":"B00TE2AQXG","title":"The Hobbit Trilogy (Limited Edition with Bilbo's Journal) [Blu-ray 3D + Blu-ray] [2012] [Region Free]","diffbotUri":"product|3|238470115","offerPrice":"£33.00","brand":"The Hobbit","images":[{"title":"\nRoll over image to zoom in ","height":476,"naturalHeight":674,"diffbotUri":"image|3|858047555","primary":true,"width":369,"naturalWidth":522,"url":"http://ecx.images-amazon.com/images/I/A1wFaPGyN6L._SX522_.jpg","xpath":"/html[1]/body[1]/div[3]/div[5]/div[4]/div[1]/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/ul[1]/li[1]/span[1]/span[1]/div[1]/img[1]"}],"specs":{"yes":"No","comment":"Yes \n No"},"availability":true}]} \ No newline at end of file diff --git a/tests/Mocks/Analyze/15-04-19/Products/hobbit_amazon_basic.json b/tests/Mocks/Analyze/15-04-19/Products/hobbit_amazon_basic.json index 4e11d3c..2d8e653 100644 --- a/tests/Mocks/Analyze/15-04-19/Products/hobbit_amazon_basic.json +++ b/tests/Mocks/Analyze/15-04-19/Products/hobbit_amazon_basic.json @@ -1,10 +1 @@ -HTTP/1.1 200 OK -Server: nginx/1.6.3 -Date: Sun, 19 Apr 2015 20:26:16 GMT -Content-Type: application/json;charset=utf-8 -Transfer-Encoding: 
chunked -Connection: keep-alive -Vary: Accept-Encoding -Access-Control-Allow-Origin: * - {"title":"The Hobbit Trilogy Limited Edition with Bilbo's Journal Blu-ray 3D + Blu-ray 2012 Region Free: Amazon.co.uk: Martin Freeman, Richard Armitage, Ian McKellen, Peter Jackson: DVD & Blu-ray","request":{"pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","api":"analyze","version":3},"humanLanguage":"en","type":"product","objects":[{"text":"Reviews\nProduct Description\nHobbit Trilogy - Journal Exclusive (3D/S/AZ)\nSynopsis\nTHE HOBBIT: AN UNEXPECTED JOURNEYTM Follow Bilbo Baggins, who \u2013 along with the Wizard Gandalf and 13 Dwarves, led by Thorin Oakenshield \u2013 is swept into an epic and treacherous quest to reclaim the lost Dwarf Kingdom of Erebor. THE HOBBIT: THE DESOLATION OF SMAUGTM Our heroes escape the giant Spiders and Wood-elves of Mirkwood before encountering the mysterious Bard, who smuggles them into Lake-town. Finally reaching the Lonely Mountain, they confront the Dragon Smaug. THE HOBBIT: THE BATTLE OF THE FIVE ARMIESTM The Dwarves of Erebor have reclaimed their homeland, but face the consequences of unleashing Smaug. 
As five great armies go to war, Bilbo fights for his life, and the races of Dwarves, Elves and Men must unite or be destroyed.","discussion":{"tags":[{"id":463187,"count":2,"prevalence":0.16666666666666666,"label":"The Hobbit","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/The_Hobbit"},{"id":3018688,"count":2,"prevalence":0.16666666666666666,"label":"Bilbo Baggins","uri":"http://dbpedia.org/resource/Bilbo_Baggins"},{"id":4838482,"count":1,"prevalence":0.08333333333333333,"label":"Martin Freeman","type":"person","uri":"http://dbpedia.org/resource/Martin_Freeman"},{"id":1160426,"count":1,"prevalence":0.08333333333333333,"label":"Ian McKellen","type":"person","uri":"http://dbpedia.org/resource/Ian_McKellen"},{"id":4563319,"count":1,"prevalence":0.08333333333333333,"label":"Botfa","uri":"http://dbpedia.org/resource/Botfa"}],"title":"The Hobbit Trilogy (Limited Edition with Bilbo's Journal) [Blu-ray 3D + Blu-ray] [2012] [Region Free]","nextPage":"http://www.amazon.co.uk/gp/bestsellers/dvd/535556031/ref=pd_zg_hrsr_d_h__1_2_last/275-0224565-8961175","numPosts":8,"pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","diffbotUri":"discussion|3|-177083457","posts":[{"id":0,"author":"Nate","text":"Amazing trilogy, superb picture quality and loads of extras. Excellent value for money","diffbotUri":"post|3|1326325813","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A23M0BG1B3H05D/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Amazing trilogy, superb picture quality and loads of extras. Excellent value for money<\/p>","rating":{"minValue":1,"value":5,"maxValue":5},"type":"post","date":"Sun, 19 Apr 2015 00:00:00 GMT"},{"id":1,"author":"J Clark","text":"Disappointed as Bilbo did not appear to write anything as the journal is full of blank pages only bought this set for the journal should have waited for extended version","diffbotUri":"post|3|-685041964","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A2PZILNN3SUII6/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Disappointed as Bilbo did not appear to write anything as the journal is full of blank pages only bought this set for the journal should have waited for extended version<\/p>","rating":{"minValue":1,"value":3,"maxValue":5},"type":"post","date":"Sun, 19 Apr 2015 00:00:00 GMT"},{"id":2,"author":"Movieguy","text":"Just to say that this is an extremely good price for the whole trilogy of films. The fact that it is 3D just adds to the reliability of this price, if you haven't already got any of these films and are looking then buy this.\nOn the other hand, If you're patient you should wait for the Extended steel book edition which should be out later this year. The first two (An Unexpected Journey & The Desolation of Smaug) are already available but at quite a price. The final part ( The Battle of the five Armies), as I have said, should be out later this year and I would say is worth the wait for those extra few sequences and special features.\nPersonally, I prefer to get them immediately as they are released as I'm not that bothered about any extended parts. I now regret this as I find myself trying to find a good price for the previous extended editions!\nI am an avid steel book collector but I just wouldn't wait for extended editions of anything unless I was really into it.\nAt the end of the day they are all the same films and you will get an equally great experience from the normal edition as opposed to the extended edition. 
If you are a hardcore fan (which I am not but I still enjoy these films) then you will most likely want to purchase the extended editions and marvel in the creation of these extravagant sets and breathtaking action sequences.\nBut for me I just want to enjoy the acting and story lines!","diffbotUri":"post|3|-1605113004","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A3SWD24RO7DBVX/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Just to say that this is an extremely good price for the whole trilogy of films. The fact that it is 3D just adds to the reliability of this price, if you haven't already got any of these films and are looking then buy this.<\/p>\n

On the other hand, If you're patient you should wait for the Extended steel book edition which should be out later this year. The first two (An Unexpected Journey & The Desolation of Smaug) are already available but at quite a price. The final part ( The Battle of the five Armies), as I have said, should be out later this year and I would say is worth the wait for those extra few sequences and special features.<\/p>\n

Personally, I prefer to get them immediately as they are released as I'm not that bothered about any extended parts. I now regret this as I find myself trying to find a good price for the previous extended editions!
\nI am an avid steel book collector but I just wouldn't wait for extended editions of anything unless I was really into it.<\/p>\n

At the end of the day they are all the same films and you will get an equally great experience from the normal edition as opposed to the extended edition. If you are a hardcore fan (which I am not but I still enjoy these films) then you will most likely want to purchase the extended editions and marvel in the creation of these extravagant sets and breathtaking action sequences.<\/p>\n

But for me I just want to enjoy the acting and story lines!<\/p>","rating":{"minValue":1,"value":4,"maxValue":5},"type":"post","date":"Sun, 22 Feb 2015 00:00:00 GMT"},{"id":3,"tags":[{"id":1876312,"count":1,"label":"3D film","uri":"http://dbpedia.org/resource/3D_film"}],"author":"STEVENSEAGALFAN+THE WIFE","text":"Bargain or what £33 pounds for three 3D films, I can't wait to watch them again, plus you get a limited edition Bilbo's journal this is a must for all collectors like me. I know some people are not happy because they want to have a choice of buying the extended versions, but to tell you the truth the cinema release will be good enough for now, plus one day there might be a ultimate limited edition Box-set with the Hobbit/Lord of the Rings all extended all in 3D, plus around 48 discs, you never Know it might be shaped as a \"GOLD RING\", now that will be worth buying.\nOne thing I don't agree with is how amazon allow reviews on a box-set that's not even out yet? I have done this review because of people giving one star because they don't agree with the release, they want the extended versions, I personally think that amazon should put a stop until the box-set has been released, even mine.","diffbotUri":"post|3|-1035465232","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A19IDOPXFNS85X/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Bargain or what £33 pounds for three 3D films, I can't wait to watch them again, plus you get a limited edition Bilbo's journal this is a must for all collectors like me. I know some people are not happy because they want to have a choice of buying the extended versions, but to tell you the truth the cinema release will be good enough for now, plus one day there might be a ultimate limited edition Box-set with the Hobbit/Lord of the Rings all extended all in 3D, plus around 48 discs, you never Know it might be shaped as a "GOLD RING", now that will be worth buying.<\/p>\n

One thing I don't agree with is how amazon allow reviews on a box-set that's not even out yet? I have done this review because of people giving one star because they don't agree with the release, they want the extended versions, I personally think that amazon should put a stop until the box-set has been released, even mine.<\/p>","rating":{"minValue":1,"value":5,"maxValue":5},"type":"post","date":"Mon, 23 Feb 2015 00:00:00 GMT"},{"id":4,"author":"Lesley Aspey","text":"While the films themselves are good, the packaging unfortunately lets this one down for me. The reason we bought the Trilogy is because we needed the product in compact packaging but this one wastefully sends them in individual packages in a sleeve so takes up just as much room as buying them individually.\nI do like having a single key though to add the trilogy to the Ultraviolet library.","diffbotUri":"post|3|-1340814396","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A1KNK3NICMXD06/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

While the films themselves are good, the packaging unfortunately lets this one down for me. The reason we bought the Trilogy is because we needed the product in compact packaging but this one wastefully sends them in individual packages in a sleeve so takes up just as much room as buying them individually.<\/p>\n

I do like having a single key though to add the trilogy to the Ultraviolet library.<\/p>","rating":{"minValue":1,"value":3,"maxValue":5},"type":"post","date":"Sun, 19 Apr 2015 00:00:00 GMT"},{"id":5,"tags":[{"id":2546017,"count":1,"label":"Richard Armitage (actor)","type":"person","uri":"http://dbpedia.org/resource/Richard_Armitage_(actor)"},{"id":1351067,"count":1,"label":"Computer-generated imagery","uri":"http://dbpedia.org/resource/Computer-generated_imagery"},{"id":2472653,"count":1,"label":"Thorin Oakenshield","uri":"http://dbpedia.org/resource/Thorin_Oakenshield"},{"id":4563319,"count":1,"label":"Botfa","uri":"http://dbpedia.org/resource/Botfa"}],"author":"Helena","text":"I love these films with a passion. Ok sure the CGI is a bit overdone but hey hum I can see past that unlike some people! lol :P\nThe actors are fantastic too just like they were in LOTR. I'm a big Richard Armitage fan and he excels as Thorin Oakenshield. Cannot wait for BOTFA To be released to complete the trilogy and then watch it all again right up to Return of the King!!!","diffbotUri":"post|3|678339081","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/ABFI1MVAG7TIF/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

I love these films with a passion. Ok sure the CGI is a bit overdone but hey hum I can see past that unlike some people! lol :P
\nThe actors are fantastic too just like they were in LOTR. I'm a big Richard Armitage fan and he excels as Thorin Oakenshield. Cannot wait for BOTFA To be released to complete the trilogy and then watch it all again right up to Return of the King!!!<\/p>","rating":{"minValue":1,"value":5,"maxValue":5},"type":"post","date":"Thu, 16 Apr 2015 00:00:00 GMT"},{"id":6,"author":"M. D. Fleming","text":"Can't wait for this! Got to be purchased with this though to watch the whole series start to finish!.. The Lord of the Rings Trilogy... http://www.amazon.co.uk/gp/product/B0002VJT2C/ref=s9_simh_gw_p74_d0_i3?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=desktop-1&pf_rd_r=1CZRRY7MP2CZ15GA8TW5&pf_rd_t=36701&pf_rd_p=577048427&pf_rd_i=desktop","diffbotUri":"post|3|767709913","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/AWZRXJA6XRSRV/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Can't wait for this! Got to be purchased with this though to watch the whole series start to finish!.. The Lord of the Rings Trilogy... http://www.amazon.co.uk/gp/product/B0002VJT2C/ref=s9_simh_gw_p74_d0_i3?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=desktop-1&pf_rd_r=1CZRRY7MP2CZ15GA8TW5&pf_rd_t=36701&pf_rd_p=577048427&pf_rd_i=desktop<\/p>","rating":{"minValue":1,"value":5,"maxValue":5},"type":"post","date":"Fri, 17 Apr 2015 00:00:00 GMT"},{"id":7,"tags":[{"id":3018688,"count":2,"label":"Bilbo Baggins","uri":"http://dbpedia.org/resource/Bilbo_Baggins"},{"id":463187,"count":2,"label":"The Hobbit","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/The_Hobbit"},{"id":1160426,"count":1,"label":"Ian McKellen","type":"person","uri":"http://dbpedia.org/resource/Ian_McKellen"},{"id":2242160,"count":1,"label":"Lonely Mountain","uri":"http://dbpedia.org/resource/Lonely_Mountain"},{"id":4838482,"count":1,"label":"Martin Freeman","type":"person","uri":"http://dbpedia.org/resource/Martin_Freeman"}],"author":"E. A Solinas","text":"Ever since the classic \"Lord of the Rings\" trilogy ended, fans were clamoring for JRR Tolkien's \"The Hobbit\" to be adapted for film as well. After all, \"The Hobbit\" contains the seeds of the sequel trilogy's plot, so it made sense.\nBut instead of a straightforward adaptation of Bilbo Baggins' linear adventures, director Peter Jackson sets the stage for everything to come in his earlier movies. In addition to Bilbo finding the Ring, it is about the corruption of Middle-Earth as the Dark Lord returns to conquer everything. It has some notable flaws (primarily the contrived love story) and isn't quite as brilliant as the \"Rings\" trilogy, but the overall effect is a strong, epic story with a sublimely talented cast.\nBilbo Baggins (Martin Freeman) is a nice boring gentlehobbit who has no interest in adventures. Then the wizard Gandalf (Ian McKellen) descends on Bag End with thirteen dwarves. 
They are setting out for the lost city of Erebor, which the dragon Smaug stole many years ago, and now Thorin Oakenshield (Richard Armitage) wants to reclaim it. He especially wants the Arkenstone, a jewel that symbolizes kingship of the dwarves. Bilgo soon gets into the swing of the journey, despite vicious trolls, goblins, giants and an albino orc who literally wants Thorin's head (preferably separated from the rest of him).\nBut unknown to the dwarves, Bilbo has encountered a grotesque creature known as Gollum (Andy Serkis), and found a golden ring that gives invisibility. He uses this Ring -- and his newly acquired courage -- to survive the attacks by giant spiders of Mirkwood, and later avoid imprisonment by the deadly wood-elf King Thranduil (Lee Pace) and his son Legolas (Orlando Bloom).\nWhen they reach Erebor, Bilbo is sent in alone to find the Arkenstone...","diffbotUri":"post|3|-48445325","pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","authorUrl":"http://www.amazon.co.uk/gp/pdp/profile/A1D2C0WDCSHUWZ/ref=cm_cr_dp_pdp/275-0224565-8961175","humanLanguage":"en","html":"

Ever since the classic "Lord of the Rings" trilogy ended, fans were clamoring for JRR Tolkien's "The Hobbit" to be adapted for film as well. After all, "The Hobbit" contains the seeds of the sequel trilogy's plot, so it made sense.<\/p>\n

But instead of a straightforward adaptation of Bilbo Baggins' linear adventures, director Peter Jackson sets the stage for everything to come in his earlier movies. In addition to Bilbo finding the Ring, it is about the corruption of Middle-Earth as the Dark Lord returns to conquer everything. It has some notable flaws (primarily the contrived love story) and isn't quite as brilliant as the "Rings" trilogy, but the overall effect is a strong, epic story with a sublimely talented cast.<\/p>\n

Bilbo Baggins (Martin Freeman) is a nice boring gentlehobbit who has no interest in adventures. Then the wizard Gandalf (Ian McKellen) descends on Bag End with thirteen dwarves. They are setting out for the lost city of Erebor, which the dragon Smaug stole many years ago, and now Thorin Oakenshield (Richard Armitage) wants to reclaim it. He especially wants the Arkenstone, a jewel that symbolizes kingship of the dwarves. Bilgo soon gets into the swing of the journey, despite vicious trolls, goblins, giants and an albino orc who literally wants Thorin's head (preferably separated from the rest of him).<\/p>\n

But unknown to the dwarves, Bilbo has encountered a grotesque creature known as Gollum (Andy Serkis), and found a golden ring that gives invisibility. He uses this Ring -- and his newly acquired courage -- to survive the attacks by giant spiders of Mirkwood, and later avoid imprisonment by the deadly wood-elf King Thranduil (Lee Pace) and his son Legolas (Orlando Bloom).<\/p>\n

When they reach Erebor, Bilbo is sent in alone to find the Arkenstone...<\/p>","rating":{"ratingCount":500,"minValue":1,"value":4,"maxValue":5},"type":"post","date":"Thu, 19 Feb 2015 00:00:00 GMT"}],"humanLanguage":"en","confidence":0.415,"type":"discussion","numPages":1,"participants":8},"pageUrl":"http://www.amazon.co.uk/gp/product/B00TE2AQXG/ref=s9_hps_gw_g74_i1?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-3&pf_rd_r=15F2QFNCSMQMRR9E78MM&pf_rd_t=101&pf_rd_p=560281487&pf_rd_i=468294","humanLanguage":"en","offerPriceDetails":{"amount":33,"text":"£33.00","symbol":"£"},"type":"product","sku":"B00TE2AQXG","breadcrumb":[{"link":"http://www.amazon.co.uk/b/ref=dp_bc_1/275-0224565-8961175?ie=UTF8&node=235597011","name":"Featured Categories"},{"link":"http://www.amazon.co.uk/b/ref=dp_bc_2/275-0224565-8961175?ie=UTF8&node=6265430031","name":"Blu-ray"}],"productId":"B00TE2AQXG","title":"The Hobbit Trilogy (Limited Edition with Bilbo's Journal) [Blu-ray 3D + Blu-ray] [2012] [Region Free]","diffbotUri":"product|3|238470115","offerPrice":"£33.00","brand":"The Hobbit","images":[{"title":"\nRoll over image to zoom in ","height":476,"naturalHeight":674,"diffbotUri":"image|3|858047555","primary":true,"width":369,"naturalWidth":522,"url":"http://ecx.images-amazon.com/images/I/A1wFaPGyN6L._SX522_.jpg","xpath":"/html[1]/body[1]/div[3]/div[5]/div[4]/div[1]/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/ul[1]/li[1]/span[1]/span[1]/div[1]/img[1]"}],"specs":{"yes":"No","comment":"Yes \n No"},"availability":true}]} \ No newline at end of file diff --git a/tests/Mocks/Articles/apple-watch-verge-basic-old.json b/tests/Mocks/Articles/apple-watch-verge-basic-old.json new file mode 100644 index 0000000..0026f00 --- /dev/null +++ b/tests/Mocks/Articles/apple-watch-verge-basic-old.json @@ -0,0 +1,10 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.0 +Date: Sun, 12 Apr 2015 13:06:42 GMT +Content-Type: application/json;charset=utf-8 +Transfer-Encoding: chunked +Connection: keep-alive +Vary: Accept-Encoding 
+Access-Control-Allow-Origin: * + +{"request":{"pageUrl":"http://www.theverge.com/a/apple-watch-review","api":"article","version":3,"options":["mentos"]},"objects":[{"tags":[{"id":1101322,"count":9,"prevalence":0.8999999999999999,"label":"Apple Watch","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#DesignedArtifact","uri":"http://dbpedia.org/resource/Apple_Watch"},{"id":628498,"count":3,"prevalence":0.30000000000000004,"label":"Smartwatch","uri":"http://dbpedia.org/resource/Smartwatch"},{"id":3108485,"count":2,"prevalence":0.2,"label":"The White Game","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/The_White_Game"},{"id":827308,"count":2,"prevalence":0.2,"label":"IPhone","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#DesignedArtifact","uri":"http://dbpedia.org/resource/IPhone"},{"id":3751011,"count":2,"prevalence":0.2,"label":"Apple Pay","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/Apple_Pay"}],"icon":"http://www.theverge.com/images/verge/apple-touch-icon.png","text":"The Apple Watch is an extraordinarily small and personal device. It is designed to participate in nearly every moment of your day, but almost never directly interact with anyone else. It knows when you\u2019re wearing it. You can talk to it. You can poke it \u2014 and it can poke back.\nEvery so often, the Apple Watch thinks about your heartbeat.\nBut the Apple Watch is also an enormous device. It\u2019s the first entirely new Apple product in five years, and the first Apple product developed after the death of Steve Jobs. It\u2019s full of new hardware, new software, and entirely new ideas about how the worlds of fashion and technology should intersect.\nIt\u2019s also the first smartwatch that might legitimately become a mainstream product, even as competitors flood the market. 
Apple has the marketing prowess, the retail store network, and the sheer determination to actually make this thing happen.\nIt just has to answer one question: would you actually use the Apple Watch instead of your phone?\nScroll down to start the day ↓\n7:36AM 100%\nGood Morning Beautiful\nHardware and Software\nLet\u2019s just get this out of the way: the Apple Watch, as I reviewed it for the past week and a half, is kind of slow. There\u2019s no getting around it, no way to talk about all of its interface ideas and obvious potential and hints of genius without noting that sometimes it stutters loading notifications. Sometimes pulling location information and data from your iPhone over Bluetooth and Wi-Fi takes a long time. Sometimes apps take forever to load, and sometimes third-party apps never really load at all. Sometimes it\u2019s just unresponsive for a few seconds while it thinks and then it comes back.\nApple tells me that upcoming software updates will address these performance issues, but for right now, they\u2019re there, and they\u2019re what I\u2019ve been thinking about every morning as I get ready for work. Wearing a smartwatch like the Apple Watch is a far deeper commitment than carrying a smartphone in your pocket; you are literally putting the technology on your body and allowing it to touch and measure you while you display it to the rest of the world. Committing to technology that\u2019s a little slow to respond to you is dicey at best, especially when it\u2019s supposed to step in for your phone. If the Watch is slow, I\u2019m going to pull out my phone. But if I keep pulling out my phone, I\u2019ll never use the Watch. 
So I have resolved to wait it out.\nI\u2019m putting my phone in my pocket and this Watch on my wrist, and we\u2019re taking this trip together.\nThese mornings have been full of self-reflection, moody contemplation as I gather my screens of all sizes and pack them in a bag, work alerts flashing across an array of devices that are all less important than my phone. I love my phone. Everyone loves their phone. The only real question a smartwatch like the Apple Watch needs to answer is \u201cwhy would I use this instead of my phone?\u201d The answers so far haven\u2019t been apparent; the Watch seems like it can do a little bit of everything instead of one thing really well. So I\u2019m putting my phone in my pocket and this Watch on my wrist, and we\u2019re taking this trip together.\nWe are going to need more coffee.\nAs an object, it makes sense that the Watch is not nearly as cold and minimal as Apple\u2019s recent phones and tablets and laptops. It has to be warmer, cozier. It has to invite you to touch it and take it with you all the time. Take the bands off and it\u2019s a little miracle of technology and engineering and manufacturing, a dense package containing more sensors and processing power than anyone could have even dreamed a few decades ago. It\u2019s a supercomputer on your wrist, but it\u2019s also a bulbous, friendly little thing, far more round than I expected, recalling nothing quite so much as the first-generation iPhone. It is unbelievably high tech and a little bit silly, a masterpiece of engineering with a Mickey Mouse face. It is quintessentially Apple.\nIt\u2019s also surprisingly heavy. I noticed when I was wearing it, and everyone who held it commented on the weight. That might simply be a function of how unfamiliar watches have become; my stainless steel Apple Watch with leather loop band weighs 2.9 ounces, which is more than my plastic Nixon\u2019s 1.7 ounces or the 1.8-ounce Moto 360, but much less than my 5-ounce Baume and Mercier. 
All in all, the Apple Watch isn\u2019t light enough to fade away, but it\u2019s also not so heavy that it\u2019s a distraction.\nOn the right side of the Watch you\u2019ll find the Digital Crown scroll wheel and a dedicated button (the official name is just \u201cside button\u201d) that opens your favorites list with one tap and activates Apple Pay with two taps. This side button is extraordinarily confusing \u2014 it looks and feels so much like an iPhone sleep / wake button that I still hit it to turn the screen on and off, even though I know I\u2019m doing the wrong thing.\nOn the back of the Watch, there\u2019s a slight dome that holds the optical heart rate sensor and the inductive charging system. You\u2019ll also find a pair of buttons that release the watchbands. They\u2019re flush with the case but relatively easy to depress, and the bands slide right out. You can make the Watch work in basically any orientation you\u2019d like by flipping the screen with a setting in the iPhone app \u2014 a boon for the left-handed. It\u2019s a fairly simple system, so expect to see tons of third-party Watch bands; Apple says it has no problem with that.\nApple Pay is my favorite feature on the Watch.\nApple gave me three bands to play with: the leather loop, the Milanese loop, and the white sport band. I mostly stuck with the leather loop, which feels more like plastic than leather but which I found super comfortable because it was so easy to readjust throughout the day. The white sport band basically felt like any other plastic band I\u2019ve worn. I felt ridiculous wearing the Milanese Loop, so I didn\u2019t.\nThe face of the Watch curves up off the sides, leaving a noticeable air gap above the display underneath. But besides that small complaint, the display is simply terrific. It carries the same Retina branding as the iPhone display and it delivers, with imperceptible pixels and inky blacks that allow the screen to blend right into the curved sides of the glass. 
It\u2019s easily the best smartwatch display on the market, and it would be unassailable if not for the air gap. It\u2019s light-years beyond everything else.\nThe back of the Watch is arguably more beautiful than the front.\nOn Your Wrist\nOnce you actually start living with the Watch, it quickly becomes clear that there are three main ways to actually use the thing: the watch face, the app launcher, and the communications app.\nApple is insistent that one of the main functions of the Watch is simply to be a great watch, so when you raise your wrist, you\u2019ll see the time by default, just like a regular watch. The lone exception out of the box is the workout app, which Apple says is \u201csticky\u201d so people can check their exercise stats quickly at the gym.\nIn the first of many moments where the Watch felt underpowered, I found that the screen lit up a couple of ticks too slowly: I\u2019d raise my wrist, wait a beat, and then the screen would turn on. This sounds like a minor quibble, but in the context of a watch you\u2019re glancing at dozens of times a day, it\u2019s quickly distracting. Other smartwatches like the Pebble and the LG G Watch R simply leave their screens on all the time; having a screen that constantly flips on and off is definitely behind the curve.\nTelling Time\nThe main watch face really is a complete self-contained experience: if the Apple Watch had no other functionality except for what you can do from the watch face, it would still be competitive. Customizing the watch Face is the first time you\u2019ll use Force Touch: you push a little bit harder on the screen, and you can swipe between Apple\u2019s selection of watch face templates, each of which can be customized and saved as individual variations. Most of the templates are minor riffs on the same basic analog watch, but others are very strange indeed, like the animated butterfly and jellyfish. 
There\u2019s no particularly great digital face, and there\u2019s no ability to load up your own watch faces or buy new ones from the store, which is a clearly missed opportunity.\nIf the Apple Watch had no other functionality except for what you can do from the watch face, it would still be competitive.\nThe Watch app is literally the most central experience on the Watch \u2014 you can rearrange every app icon on the homescreen except the Watch icon, which is always in the middle. What\u2019s fascinating and somewhat confusing is that so many of the Watch\u2019s core abilities are only in the Watch app, so interface ideas you learn there don\u2019t work anywhere else.\nFor example, the Watch app is the only place to access notifications after they appear. Notifications are the most important part of any smartwatch experience, but on the Apple Watch you can only swipe down to see your notifications when you\u2019re on the watch face. Once you click the Digital Crown and open the app launcher, the notification drawer goes away entirely and swiping down does nothing. Same with Glances, which are essentially single-screen status updates from various apps you access by swiping up from the Watch app. They\u2019re a major piece of the Watch experience, but they disappear everywhere else in the operating system. These are radically different interface patterns than iOS, where you can access the notification center and control center from virtually everywhere, and it makes navigating the Watch interface more confusing until you get it.\nThe Law of Wearable Success\nIn order to be successful, any given piece of wearable technology has to be useful the entire time it\u2019s on your body. Prescription glasses sit on your face, but improve your vision all the time, so they\u2019re successful. Sunglasses sit on your face and make you look cooler all the time, so they\u2019re successful. Google Glass sits on your face, but mostly does nothing, so it\u2019s a failure. 
It\u2019s a simple formula.\nUnderstanding that the Watch app is an entire primary experience unto itself is the key to understanding what happens when you press either of the buttons on the side of the Watch \u2014 they launch the other two main Watch experiences. Pressing the side button takes you to a totally unique contacts screen, which is where you send the ephemeral Digital Touch messages. Clicking the Digital Crown on the watch face opens the honeycomb app launcher, which is where you can open the various other apps on the Watch.\nAll of this sounds complex, but you\u2019re not really supposed to use it all at once \u2014 the aim is for the Watch to shine in 10- 15-second burst throughout the day, not in extended usage sessions. And that was borne out every morning, because I didn\u2019t have any reason to wear the Watch until I left the house.\nI was half-hoping to put on the Watch in the morning and use it instead of my phone, but that didn\u2019t happen. I grab my phone first thing in the morning and use it nonstop to prepare for the day: I organize my calendar, catch up on The Verge, check Twitter, and bang out replies on Slack and email. None of this is even possible to do on the Watch. Apple spent tons of effort and millions of dollars promoting the iPad as a business and creation platform instead of just a consumption machine, but there\u2019s no fighting the tiny display and limited input options of the Watch \u2014 this thing is all about quickly glancing at information, not really doing anything with it.\nIt becomes far more valuable once you\u2019re on the move.\nAll the Ways to Use an Apple Watch\nA cheat sheet to the all the major new ways to use Apple\u2019s smallest screen.\nDigital Crown: The scroll wheel on the right side. 
Push in to access the homescreen.\nDigital Touch: Click the side button once to send ephemeral taps, heartbeats, and drawings to other Watch owners.\nForce Touch: You can push down a bit harder on the screen to bring up additional controls. It\u2019s like right click, for your Watch.\nHey, Siri: You can raise the Watch up and then say \u201cHey, Siri\u201d to activate Siri once the screen is turned on.\nApple Pay: Click the side button twice to activate Apple Pay.\n9:32AM 95%\nWalk It Off\nNotifications, Music, Apple Pay\nIt turns out that I\u2019ve gotten really good at using my phone with one hand while I walk to the train. I\u2019m really good at looking at notifications come in on my phone screen and dismissing them with my thumb, or pressing the volume buttons to turn up the music, or even sending a quick text message with one thumb. I can even do some of that without looking very carefully at what I\u2019m doing, since there\u2019s muscle memory involved.\nBut you simply can\u2019t one-hand the Apple Watch. It\u2019s the simplest thing, but it\u2019s true: because it\u2019s a tiny screen with a tiny control wheel strapped to your wrist, you have to use both hands to use it, and you have to actually look at it to make sure you\u2019re hitting the right parts of the screen. You have to carry your coffee cup in your other hand if you\u2019re not interested in spilling on yourself. If you\u2019re like me and you refuse to use both backpack straps so you can be a One Strap Cool Guy, this means your bag will sometimes fall off your shoulder while you screw with your smartwatch, and you will be a No Straps Smartwatch Guy Murdered By NYC Traffic.\nPlease do not die this way.\nThe Watch made it a lot easier to keep my phone in my pocket on the walk to the train.\nOf course, you can\u2019t one-hand any smartwatch; that\u2019s just part of the deal. 
But no other smartwatch has this much going on \u2014 the Apple Watch literally has buttons and knobs \u2014 and no other smartwatch has so many lightly concealed designs on one day becoming a platform as powerful as your phone. If the existential question for the Apple Watch is \u201cwhy would I use this instead of my phone?\u201d then the answer almost always has to involve \u201cbecause it\u2019s more convenient.\u201d That\u2019s sometimes true of the Apple Watch, and sometimes not.\nThe white sport band is pretty comfortable.\nBut when it\u2019s more convenient, it\u2019s far more convenient.\nI usually spend most of my commute to work with my phone in my hand \u2014 listening to music and checking messages as I walk to the train, and reading saved articles on the subway. The Watch made it a lot easier to keep my phone in my pocket on the walk to the train \u2014 I saw notifications coming in on my wrist, and I could control the music apps on my iPhone from the Now Playing Glance on the Watch. The Watch also started tracking my steps and logging my movement into the Activity app, for a pleasant morning jolt of gamified living. So far, so good. But there\u2019s more work to be done here.\nNotifications\nNotifications on the Apple Watch work pretty much just like notifications on any other smartwatch: you feel a buzz, you look at your wrist, and it shows you some information. Apple\u2019s big trick with the Watch is dramatically improved buzzing with what it calls the \u201cTaptic Engine.\u201d It\u2019s a haptic feedback system that feels wildly different from the fuzzy, cumbersome vibrations of other devices. Apple\u2019s Taptics are more like the Watch tapping your wrist. The taps can come in different patterns and strengths; Apple says the Taptic Engine plays a vibration waveform related to the audio waveform of associated notification sound. 
Imagine a set of stereo speakers, but the right channel is insistently poking you along with the music.\nI muted the sounds. Is there any way to be a worse person than having high-pitched dings alert everyone that you\u2019re about to look at your watch?\nIf anything, Apple has been underselling the Taptic Engine, and I sort of understand why \u2014 you have to feel it to get just how different and powerful of an idea it is. But it\u2019s also pretty clear that taptics on the Watch are only the first half of a brilliant idea. There are a ton of missing pieces that need to get filled in before the Taptic Engine lives up to its potential.\nIt\u2019s also pretty clear that taptics on the Watch are only the first half of a brilliant idea.\nFirst, the Taptic notifications are fairly weak and fairly short \u2014 if the audio alert is a beep, you\u2019ll get one insistent poke and that\u2019s it. They\u2019re easy to miss. To counter this, Apple\u2019s built a setting called \u201cprominent haptics,\u201d which basically revs the engine at full speed like a more traditional vibration to get your attention before playing the far more subtle Taptic notification. It\u2019s the haptic equivalent of having an assistant blow a reggaeton horn before discreetly handing you a note in a meeting.\nBut the biggest missed opportunity is that there\u2019s no way to customize the notification sounds and Taptics on the Watch. I couldn\u2019t set a different alert for messages than for mail or calendar invites; they all just sort of felt the same. Without this ability, the Taptic Engine is just a small improvement over existing smartwatches. Let me create and set my own notifications, and it\u2019s a revolution.\nGetting notifications on the way to work also highlighted a key issue that the Apple Watch shares with Google\u2019s Android Wear: you have to be really bought into a single ecosystem for everything to work well out of the box. 
If you\u2019re not a believer in all of Apple\u2019s apps and services, the Apple Watch is going to be a little frustrating until developers build more support for it. For example, it\u2019s easy to send iMessages from the Watch, but there\u2019s no way to use WhatsApp or Hangouts. I spend a huge part of my day in Slack; it\u2019s somewhat useful to know people are mentioning you in a chat room because of taps on your wrist, but it would be much better if you could actually do something about it. There\u2019s a lot of work left to be done here.\nYou customize which notifications you receive in the Apple Watch app on your phone, which is a complicated affair. There\u2019s not a lot of intelligence or customization: apps that have been updated to support the Watch will let you either mirror your iPhone or set up Watch-specific settings, while older apps just let you turn notifications on and off. There\u2019s no master switch to turn all notifications on and off, which is a huge pain. Like every smartwatch vendor, Apple needs to put a lot more thought into which notifications it\u2019s showing you and why.\nMusic\nI\u2019ll just be super blunt about the music app on the Apple Watch: it\u2019s not as good as wearing an old iPod nano on your wrist. Remember when turning sixth-generation iPods into watches was a thing? That nano did a great job of displaying a lot of music information on a tiny screen, and the Apple Watch does not. Song and album titles get cut off in lists and on the Now Playing Screen, album art isn\u2019t as big, there\u2019s no ability to sync podcasts, and on and on. It does a fine job of controlling an iPhone, but as a dedicated music player it leaves a lot to be desired.\nGlances\nGlances also feel like they have enormous untapped potential. A Glance is just a status screen for an app on your phone, much like the app widgets on the Today screen of an iPhone. 
You swipe up from the bottom of the watch face to access Glances, and then swipe horizontally through the Glances you have installed. Apple says Glances are \u201creal time,\u201d but they\u2019re not \u2014 opening a Glance kicks off an update cycle, which usually means it\u2019s pulling data from your phone. The updates don\u2019t take long \u2014 unless the Watch is trying to grab your location, which always takes forever \u2014 but the delay means you can\u2019t just bang through Glances to see everything that\u2019s going on. The Twitter Glance is set to display top trends, but by the time it loads I could have pulled out my phone. Transit is set to show me the nearest mass transit options, but it takes so long to find my location I\u2026 could have just pulled out my phone. This is a theme.\nIf you don\u2019t have Bluetooth headphones connected, picking a song stored locally on the Apple Watch kicks off playback on your iPhone. Clever!\nAll of this will presumably get solved, of course \u2014 third parties just have to build in support for the Watch and figure out how to best use these features. But that will take time, and the Watch needs to sell in numbers that will justify that investment for the long tail of apps. And there\u2019s a real chance the solution is just a faster processor that uses less power in next year\u2019s Watch. Moore\u2019s Law tends to solve a lot of problems like that.\nApple Pay\nBut when all those pieces fall into place, it\u2019s incredible. Apple Pay is my favorite part of the entire Watch, a little blast from the future. Paying for coffee at The Café Grind in Manhattan involved nothing more than double-clicking the communications button on the Watch and holding my wrist over the terminal; it beeped and the payment processed instantly. 
Paying with the Watch is even faster than paying with an iPhone, since it doesn\u2019t have to read your fingerprint: it\u2019s ready to go anytime after you put it on your wrist and unlock your phone with your fingerprint. I love using Apple Pay with my phone, but it\u2019s even better with the Watch, some mild contortions to line it up with payment terminals aside. Apple Pay remains a shining example of what Apple is able to do when it has complete control over hardware, software, and services.\n10:30AM 85%\nFashion Technology\nFeaturing Racked\nI\u2019m really eager for The Verge to collaborate more with Racked, our sister site that covers fashion and shopping. Ultimately both of our sites are about trends and consumerism, and the crossover from fashion into tech and back again is definitely real \u2014 that\u2019s what the Apple Watch is all about. So I hijacked a meeting to talk about potential crossover ideas and talked about the Watch with Izzy Grinspan, Nicola Fumo, Julia Rubin, and Callia Hargrove instead.\nWhat\u2019s most interesting to me about their reaction to the Watch as a hardware object is how much it still comes off as a gadget, despite Apple\u2019s best efforts to make it a luxury item. It\u2019s still a screen; it\u2019s still a bunch of radios; it\u2019s still technology. They were hyper-critical of the materials and finishes, particularly the leather loop, and it was incredibly obvious that while a little bit of design goes a long way in the tech world, it\u2019s going to take a lot more time and a lot more work to play in the fashion game.\n12:42PM 70%\nLost in the Meeting Zone\nMessaging, Siri, Digital Touch\nAround lunchtime, I\u2019m usually running around the office at full speed: quick story updates, watching videos we have in the works, calls with our editors in other locations, talking to other teams around the company on projects we\u2019re doing together. 
I generally leave my laptop at my desk and try not to look at my phone while I do this so I can focus on the people I\u2019m talking to, but that also means I\u2019m ignoring a bunch of other people who are sending me notes.\nThe Watch helps with this \u2014 as long as you\u2019re using Apple\u2019s messaging apps, it lets you send quick messages and replies right from your wrist. Texting and iMessage are the easiest to use, since that\u2019s the most universal network the Watch is connected to: you reply to texts using canned replies, dicate a message with Siri, or send emojis. The canned choices are supposed to be smart: the Watch reads your texts and tries to figure out appropriate replies automatically. Unfortunately, this only seems to work well if the people texting you write complete questions with the answers embedded, like they\u2019re defense attorneys leading an aggressive cross-examination of a hostile witness. \u201cDo you want Mexican or Chinese for dinner?\u201d will trigger useful smart replies, but if you mostly text with vague lolspeakers like me, you\u2019re going to get a bunch of suggestions that make it seem like you\u2019re pushing off real answers because you\u2019re busy cheating on your wife.\nHappily, you can change the defaults.\nSiri\nYou can also dictate a message with Siri, but Siri on the Watch suffers from the same performance-related issues as everything else that requires a data connection to your phone and can be a little slow to respond. It\u2019s also extremely susceptible to background noise: I tried to text a friend in the office, and Siri picked up Sam Sheffer\u2019s voice from across the room. In a coffee shop, it was thwarted by the background music. I also never really got the raise-your-wrist-and-say-Hey-Siri move to work, mostly because it only really works after the screen flips on, and the screen delay wrecked my timing. 
When Siri did work, it was for the small stuff Siri is generally good at, like converting units in the kitchen or setting a timer. Anything more complicated generally resulted in Siri prompting me to use my iPhone.\nEmojis\nYou can also send emojis to people using the Watch, which is a decidedly mixed affair. Picking the emoji selector opens a four-panel interface, with a long list of the standard emoji on the fourth screen. The first three screens are Apple\u2019s own custom emoji, and they are\u2026 well, they\u2019re super creepy. You\u2019ve got a smiley face, a heart that explodes into other hearts, and what appears to be the disembodied hand of a mime, and you use the Digital Crown to smoothly transition these figures between their various states of emotional distress. These are the thirstiest emoji in history. I keep sending people a crying smiley face with its tongue hanging out just to see who my real friends are and who will call the police.\nA selection of passive-aggressive smart replies:\nI\u2019m on my way\nSorry, I can\u2019t talk right now\nCan I call you later?\nTalk later?\nCan\u2019t talk now\u2026\nHold on a sec\nCall you soon\nText you in a bit\nI don\u2019t know why Apple picked just these three emoji things, or if there will be more, but I do know they are super weird, and render as animated GIFs when you send them. Super weird animated GIFs that look like Facebook stickers.\nDigital Touch\nLastly, there\u2019s Digital Touch, which Apple has been promoting as a key communication feature of the Watch. There\u2019s no icon for Digital Touch on the homescreen, though. The only way to access it is to click the side button to open the favorites screen, then pick a friend who has an Apple Watch. Digital Touch will show up under their name as a small finger icon. You can send taps, draw small pictures, and the thumps of your heartbeat by holding two fingers on the screen for a few seconds. 
There\u2019s no send button \u2014 you just do whatever you\u2019re going to do, and the messages fly off into the ether.\nDigital Touch is remarkably small-time.\nBut here\u2019s the thing \u2014 it doesn\u2019t happen in real time. I had assumed that sending a heartbeat meant that my recipient would just start feeling my heart on their wrist like some sort of cosmic love connection, but that\u2019s not how it works. Instead, you get a regular notification which sends you into the Digital Touch canvas, where the message plays back: the taps come through, the drawings draw themselves, the heartbeats beat. A small button in the upper right fast-forwards to the end if you\u2019re impatient, and when the message is done playing, it\u2019s gone forever, Snapchat-style. Poof.\nIt\u2019s all remarkably small-time. It\u2019s cute, but it\u2019s a weird thing to hype as much as it\u2019s been hyped, especially because it has such a deep network effect problem \u2014 it\u2019s only useful if you know other people with Apple Watches. An extension of Digital Touch into iOS proper seems inevitable, especially if the next iPhone picks up the Taptic Engine. But for now, it\u2019s a cool demo and not much more.\nThere\u2019s no doubt that being able to send quick replies from your wrist is a powerful idea; it\u2019s the stuff of science fiction legend, and every smartwatch has to be able to do it. But the Apple Watch is just the first step towards making that reality. It\u2019s not anywhere close to being an actually-powerful communications tool, especially not when it\u2019s competing with the phone in your pocket. Mobile phones are among the most revolutionary communications tools in modern history, and it\u2019s going to take a lot more than a lonely mime flashing a peace sign and a few heartbeats to meaningfully extend their capabilities.\nI\u2019ve named this little guy Thirsty.\n3:10PM 50%\nBusiness Time\nApps and Performance\nIt\u2019s well after lunch. 
I\u2019ve had this thing on my wrist for something like six hours now, and the truth is that I\u2019ve barely used it. That\u2019s by design: again, you\u2019re only supposed to interact with the Apple Watch for 10 to 15 seconds at a time and then get back to your life. On one level, that all makes perfect sense: my regular watch has had a dead battery for over a year. I don\u2019t exactly use it for anything except looking cool. How much am I really supposed to use the Apple Watch to make it worth whatever price I\u2019ve paid for it?\nOn another level, everything about the Watch is designed to reinforce the idea that you have some sort of real life to return to once you\u2019re done using technology \u2014 that you\u2019re not just sitting at a desk in your office with your laptop and your phone, getting work done.\nThat\u2019s the situation I\u2019m in most afternoons \u2014 meetings have wrapped up, decisions have been made, and I\u2019m catching up on email, editing, reading the site, and generally setting up the next set of things I have to do. I\u2019m as plugged into the internet as I can possibly be, using my phone and my laptop for slightly different variations of the same task: communicating with people.\nThis is where the Watch\u2019s lack of speed comes to the forefront \u2014 there\u2019s virtually nothing I can\u2019t do faster or better with access to a laptop or a phone except perhaps check the time. It\u2019s not just the small screen or the quick in-and-out interaction design, it\u2019s actual slowness, particularly when it comes to loading data off the phone.\nThird-party apps are the main issue: Apple says it\u2019s still working on making them faster ahead of the April 24th launch, but it\u2019s clear that loading an app requires the Watch to pull a tremendous amount of data from the phone, and there\u2019s nothing fast about it. I sat through a number of interminable loading screens for apps like CNN, Twitter, The New York Times, and others. 
Apps that need to pull location data fare even worse: the Uber app takes so long to figure out where you are that you\u2019re better off walking home before someone notices you staring at your $700 Watch and makes a move.\nWhat good is a Watch that makes you wait?\nThis first set of Watch apps is really just loading additional screens from the apps on your phone; you might think of all of them as remote controls for your phone apps. True native apps are coming to the Watch later on, and I assume they\u2019ll be faster. That\u2019s a big deal: without a rich set of apps that extend the phone, it really is just another smartwatch.\nBut right now, it\u2019s disappointing to see the Watch struggle with performance. What good is a watch that makes you wait? Rendering notifications can slow everything down to a crawl. Buttons can take a couple taps to register. It feels like the Apple Watch has been deliberately pulled back in order to guarantee a full day of battery life. Improving performance is Apple\u2019s biggest challenge with the Watch, and it\u2019s clear that the company knows it.\nYou\u2019ll see this screen a lot.\n5:08PM 35%\nWork It Out\nHealth, Fitness, Activity Tracker\nApple\u2019s done an awful lot of work to position the Watch as a fitness device \u2014 in many ways, it\u2019s the only thing it can do that an iPhone can\u2019t do. With a built-in heart rate monitor, an accelerometer, and the advantage of always being on your wrist, the Watch feels like it should be the ultimate fitness wearable, a tiny supercomputer to put all those Fitbits and Ups to shame. But like so much else with the Watch, while the fitness capabilities are the first steps towards what eventually might become a juggernaut, they\u2019re nowhere near a complete solution.\nThe Watch\u2019s health and fitness features are broken up across two apps: Activity and Workout. The Activity app is beautiful, but extremely basic \u2014 it\u2019s what monitors your movement. 
You can set goals for your calories burned, exercise, and standing, which are displayed as three concentric rings. Red is calories, green is exercise, and blue is standing. I\u2019m not sure why standing is measured in \u201chours\u201d \u2014 the Watch just bugs you to stand up for a couple minutes every hour, and that\u2019s good enough. It\u2019ll also show you your steps and total distance, which is nice.\nThe Watch and phone work together to make it even more accurate.\nAll of this tracking worked fine while I was wearing the Watch, but there just wasn\u2019t much else going on. Unlike the Fitbit and other popular activity trackers, there\u2019s no social component here to let you compete with your friends, and there\u2019s no tracking of your calories burned against your weight or what you\u2019re eating. The data feeds into the iPhone\u2019s Health database, so other apps could pull from there and give you these other features, but out of the box it\u2019s just a very basic activity tracker.\nThe other health and fitness app is Workout, which offers you a series of presets geared towards various cardio workouts. It\u2019s not a huge list of choices: you\u2019ve got indoor and outdoor walking and running, elliptical, cycling, stair steppers, rowing, and the catchall \u201cother.\u201d Apple says these presets all trigger specialized algorithms that use the accelerometer and heart rate sensor in slightly different ways to capture extremely accurate data. If you\u2019ve got your iPhone in your pocket, the Watch and phone will work together to calibrate accelerometer data against the phone GPS to make it even more accurate. Neat.\nIt\u2019s definitely nice to have these presets built in, but again, it\u2019s all pretty much table stakes. There\u2019s nothing that captures lifting weights, yoga, or other exercises that don\u2019t either crank up your heart rate or trip the accelerometer with movement. 
You can use the \u201cother\u201d preset, which will always give you credit for a brisk walk even if the other sensors aren\u2019t returning a ton of data, but it\u2019s definitely not perfect. And I found that the heart rate sensor struggled during my workouts, especially when I was really sweaty; it consistently measured about half my correct heart rate instead of my full 148bpm.\nAgain, Apple will surely improve all of this with software updates; it\u2019s hard not to see them adding more workout types over time. But out of the box right now, the Apple Watch is a very expensive, barebones fitness tracker. It\u2019s much nicer than its competitors \u2014 I used it with the white sport band and thought it was really quite striking \u2014 but it\u2019s certainly not more full-featured.\n6:24PM 25%\nTwilight of Attention\nFeaturing Eater\nAfter the gym, I head to Betony for drinks with Eater managing editor Sonia Chopra so we can talk about a future of food series for later in the year. So far I\u2019ve mostly used the Watch either alone or in an office environment, but it\u2019s really different to have a smartwatch in a bar: here, even small distractions make you seem like a jerk. Sonia\u2019s trying to describe the project to me and find ways to work together, but I keep glancing at my wrist to see extremely unimportant emails fly by.\nIt turns out that checking your Watch over and over again is a gesture that carries a lot of cultural weight. Eventually, Sonia asks me if I need to be somewhere else. We\u2019re both embarrassed, and I\u2019ve mostly just ignored everyone. This is a little too much future all at once.\n11:00PM 7%\nBack to Base\nBattery Life, Thoughts, Feelings\nBy the end of each day, I was hyper-aware of how low the Apple Watch battery had gotten. After one particularly heavy day of use, I hit 10 percent battery at 7pm, triggering a wave of anxiety. But most days were actually fine. 
Apple had a big challenge getting a tiny computer like this to last a day, and it succeeded \u2014 even if that success seemingly comes at the expense of performance.\nYou only get a charging cable, which is lame. For $700, you should get a nice charging stand, like you get with the $249 Moto 360. Apple makes a stand, but it only comes with the $10,000-and-up Apple Watch Edition models. Crazy.\nBut do you want another tiny computer in your life that you have to worry about and charge every day? That\u2019s the real question of the Apple Watch. Does it offer so much to you that you\u2019re willing to deal with the hassles and idiosyncrasies of a new platform that is clearly still finding a true purpose?\nThe Apple Watch is one of the most ambitious products I\u2019ve ever seen; it wants to do and change so much about how we interact with technology. But that ambition robs it of focus.\nThere\u2019s no question that the Apple Watch is the most capable smartwatch available today. It is one of the most ambitious products I\u2019ve ever seen; it wants to do and change so much about how we interact with technology. But that ambition robs it of focus: it can do tiny bits of everything, instead of a few things extraordinarily well. For all of its technological marvel, the Apple Watch is still a smartwatch, and it\u2019s not clear that anyone\u2019s yet figured out what smartwatches are actually for.\nIf you are willing to go along on that journey, then you\u2019ll enjoy the Apple Watch. It is a bauble, after all, and baubles delight simply by their presence. Apple will update the software, and developers will make apps, and Google and Samsung and Microsoft will release competitors, and the people who love technology will have something to buy and argue about, talismans that display tribal affiliations.\nBut that\u2019s technology as fashion; it\u2019s not quite yet fashion itself. 
If you\u2019re going to buy an Apple Watch, I\u2019d recommend buying a Sport model; I wouldn\u2019t spend money on how it looks until Apple completes the task of figuring out what it does.\nGood Stuff\nEasily the nicest smartwatch available\nPlatform has endless room to grow, especially with native apps\nTaptic Engine is really cool\nBad Stuff\nPerformance issues, especially with apps and location services\nNotifications need way more granular settings\nMuch more expensive than other smartwatches\nAnimated emojis are nightmare fuel\nVerge Score\n7","title":"Apple Watch Review","diffbotUri":"article|3|-264792394","pageUrl":"http://www.theverge.com/a/apple-watch-review","videos":[{"diffbotUri":"video|3|-1261237288","primary":true,"url":"https://www.youtube.com/embed/noZAqbn92gM?autoplay=0&controls=1&autohide=1&wmode=opaque&showinfo=0&loop=0&mute=0&enablejsapi=1&origin=http%3A%2F%2Fwww.theverge.com"},{"diffbotUri":"video|3|-2039553703","primary":true,"url":"https://www.youtube.com/embed/oxOtIE2lEmA?autoplay=0&controls=1&autohide=1&wmode=opaque&showinfo=0&loop=0&mute=0&enablejsapi=1&origin=http%3A%2F%2Fwww.theverge.com"},{"diffbotUri":"video|3|879525727","primary":true,"url":"https://www.youtube.com/embed/UCPuhg-ceGE?autoplay=0&controls=1&autohide=1&wmode=opaque&showinfo=0&loop=0&mute=0&enablejsapi=1&origin=http%3A%2F%2Fwww.theverge.com"}],"images":[{"title":"The white sport band is pretty comfortable.","height":306,"diffbotUri":"image|3|-1873943448","naturalHeight":613,"width":291,"primary":true,"naturalWidth":584,"url":"http://ea-cdn.voxmedia.com/production/theverge-verge-apple-watch/images/workout-68b7d954.jpg"}],"html":"

The Apple Watch is an extraordinarily small and personal device. It is designed to participate in nearly every moment of your day, but almost never directly interact with anyone else. It knows when you’re wearing it. You can talk to it. You can poke it — and it can poke back.<\/p>\n

Every so often, the Apple Watch thinks about your heartbeat.<\/p>\n

But the Apple Watch is also an enormous device. It’s the first entirely new Apple product in five years, and the first Apple product developed after the death of Steve Jobs. It’s full of new hardware, new software, and entirely new ideas about how the worlds of fashion and technology should intersect.<\/p>\n

It’s also the first smartwatch that might legitimately become a mainstream product, even as competitors flood the market. Apple has the marketing prowess, the retail store network, and the sheer determination to actually make this thing happen.<\/p>\n

It just has to answer one question: would you actually use the Apple Watch instead of your phone?<\/p>\n