diff --git a/.travis.yml b/.travis.yml index 735d3b4..872d63d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,10 @@ php: - 7.0 - hhvm +# This triggers builds to run on the new TravisCI infrastructure. +# See: http://docs.travis-ci.com/user/workers/container-based-infrastructure/ +sudo: false + matrix: allow_failures: - php: 7.0 @@ -17,5 +21,4 @@ script: - phpunit --coverage-text --coverage-clover=coverage.clover after_script: - - wget https://scrutinizer-ci.com/ocular.phar - - php ocular.phar code-coverage:upload --format=php-clover coverage.clover + - if [[ $TRAVIS_PHP_VERSION != 'hhvm' && $TRAVIS_PHP_VERSION != '7.0' ]]; then php vendor/bin/ocular code-coverage:upload --format=php-clover coverage.clover; fi diff --git a/composer.json b/composer.json index 9e49f6a..d3ee030 100644 --- a/composer.json +++ b/composer.json @@ -16,11 +16,15 @@ ], "require": { "php" : ">=5.4.0", - "guzzlehttp/guzzle": "~5.0" + "php-http/client-implementation": "^1.0", + "php-http/utils": "^0.1.0@dev", + "php-http/discovery": "^0.2.0@dev" }, "require-dev": { "symfony/var-dumper": "~2", - "phpunit/phpunit": "^5.0" + "phpunit/phpunit": "^5.0", + "php-http/guzzle6-adapter": "~0.2@dev", + "scrutinizer/ocular": "^1.1" }, "autoload": { "psr-4": { @@ -36,5 +40,7 @@ "branch-alias": { "dev-master": "0.5-dev" } - } + }, + "prefer-stable": true, + "minimum-stability": "dev" } diff --git a/src/Api/Crawl.php b/src/Api/Crawl.php index a6d47e1..92d9f1f 100644 --- a/src/Api/Crawl.php +++ b/src/Api/Crawl.php @@ -442,7 +442,7 @@ public function call() { $response = $this->diffbot->getHttpClient()->get($this->buildUrl()); - $array = $response->json(); + $array = json_decode($response->getBody(), true); if (isset($array['jobs'])) { $jobs = []; diff --git a/src/Api/Search.php b/src/Api/Search.php index fccdad7..08c9980 100644 --- a/src/Api/Search.php +++ b/src/Api/Search.php @@ -138,7 +138,7 @@ public function call($info = false) $ei = parent::call(); set_error_handler(function() { /* ignore errors */ }); - $arr = $ei->getResponse()->json(['big_int_strings' => true]); + $arr = json_decode((string)$ei->getResponse()->getBody(), true, 512, 1); restore_error_handler(); unset($arr['request']); diff --git a/src/Diffbot.php b/src/Diffbot.php index 79dfe11..3d001d8 100644 --- a/src/Diffbot.php +++ b/src/Diffbot.php @@ -2,6 +2,8 @@ namespace Swader\Diffbot; +use Http\Discovery\HttpClientDiscovery; +use Http\Discovery\MessageFactoryDiscovery; use Swader\Diffbot\Api\Crawl; use Swader\Diffbot\Api\Custom; use Swader\Diffbot\Api\Search; @@ -11,7 +13,7 @@ use Swader\Diffbot\Api\Analyze; use Swader\Diffbot\Api\Article; use Swader\Diffbot\Api\Discussion; -use GuzzleHttp\Client; +use Http\Client\Utils\HttpMethodsClient as Client; use Swader\Diffbot\Factory\Entity; use Swader\Diffbot\Interfaces\Api; use Swader\Diffbot\Interfaces\EntityFactory; @@ -90,12 +92,16 @@ public function getToken() * Sets the client to be used for querying the API endpoints * * @param Client $client + * @see http://php-http.readthedocs.org/en/latest/utils/#httpmethodsclient * @return $this */ public function setHttpClient(Client $client = null) { if ($client === null) { - $client = new Client(); + $client = new Client( + HttpClientDiscovery::find(), + MessageFactoryDiscovery::find() + ); } $this->client = $client; return $this; diff --git a/src/Entity/EntityIterator.php b/src/Entity/EntityIterator.php index 05d0de9..a15820f 100644 --- a/src/Entity/EntityIterator.php +++ b/src/Entity/EntityIterator.php @@ -2,7 +2,7 @@ namespace Swader\Diffbot\Entity; -use GuzzleHttp\Message\ResponseInterface as Response; +use Psr\Http\Message\ResponseInterface as Response; use Swader\Diffbot\Abstracts\Entity; class EntityIterator implements \Countable, \Iterator, \ArrayAccess @@ -134,6 +134,7 @@ public function offsetGet($offset) if ($this->offsetExists($offset)) { return $this->data[$offset]; } + throw new \OutOfBoundsException("Offset '$offset' not present"); } /** diff --git a/src/Factory/Entity.php b/src/Factory/Entity.php index 65b2252..e4ce237 100644 --- a/src/Factory/Entity.php +++ b/src/Factory/Entity.php @@ -2,7 +2,7 @@ namespace Swader\Diffbot\Factory; -use GuzzleHttp\Message\ResponseInterface as Response; +use Psr\Http\Message\ResponseInterface as Response; use Swader\Diffbot\Entity\EntityIterator; use Swader\Diffbot\Exceptions\DiffbotException; use Swader\Diffbot\Interfaces\EntityFactory; @@ -30,9 +30,8 @@ public function createAppropriateIterator(Response $response) { $this->checkResponseFormat($response); - set_error_handler(function() { /* ignore errors */ }); - $arr = $response->json(['big_int_strings' => true]); + $arr = json_decode((string)$response->getBody(), true, 512, 1); restore_error_handler(); $objects = []; @@ -58,7 +57,7 @@ public function createAppropriateIterator(Response $response) protected function checkResponseFormat(Response $response) { set_error_handler(function() { /* ignore errors */ }); - $arr = $response->json(['big_int_strings' => true]); + $arr = json_decode((string)$response->getBody(), true, 512, 1); restore_error_handler(); if (isset($arr['error'])) { diff --git a/src/Interfaces/EntityFactory.php b/src/Interfaces/EntityFactory.php index 7e1b8fb..21c4d85 100644 --- a/src/Interfaces/EntityFactory.php +++ b/src/Interfaces/EntityFactory.php @@ -2,7 +2,7 @@ namespace Swader\Diffbot\Interfaces; -use GuzzleHttp\Message\ResponseInterface as Response; +use Psr\Http\Message\ResponseInterface as Response; use Swader\Diffbot\Entity\EntityIterator; interface EntityFactory diff --git a/tests/Abstracts/EntityIteratorTest.php b/tests/Abstracts/EntityIteratorTest.php index 3545e90..a3e2e3a 100644 --- a/tests/Abstracts/EntityIteratorTest.php +++ b/tests/Abstracts/EntityIteratorTest.php @@ -2,13 +2,9 @@ namespace Swader\Diffbot\Test; -use GuzzleHttp\Client; -use GuzzleHttp\Message\Response; -use GuzzleHttp\Stream\Stream; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Abstracts\Entity; +use Swader\Diffbot\Factory\Entity; -class EntityIteratorTest extends \PHPUnit_Framework_TestCase +class EntityIteratorTest extends ResponseProvider { /** @var array */ @@ -19,28 +15,11 @@ class EntityIteratorTest extends \PHPUnit_Framework_TestCase 'Images/multi_images_smittenkitchen.json' ]; - protected function prepareResponses() - { - if (empty($this->responses)) { - $mockInput = []; - foreach ($this->files as $file) { - $mockInput[] = file_get_contents(__DIR__ . '/../Mocks/' . $file); - } - unset($file); - $mock = new Mock($mockInput); - $client = new Client(); - $client->getEmitter()->attach($mock); - foreach ($this->files as $file) { - $this->responses[$file] = $client->get('sampleurl.com'); - } - unset($file); - } - return $this->responses; - } + protected $folder = '/../Mocks/'; public function testBadMethodCall() { - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); $ei = $ef->createAppropriateIterator($this->prepareResponses()['Images/one_image_zola.json']); $this->setExpectedException('BadMethodCallException'); @@ -49,7 +28,7 @@ public function testBadMethodCall() public function testMagic() { - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); $ei = $ef->createAppropriateIterator($this->prepareResponses()['Images/one_image_zola.json']); $this->assertEquals('image', $ei->type); @@ -63,7 +42,7 @@ public function testCount() 'Images/multi_images_smittenkitchen.json' => 9 ]; - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); foreach ($fileExpectations as $fileName => $expectation) { $ei = $ef->createAppropriateIterator($this->prepareResponses()[$fileName]); @@ -73,17 +52,17 @@ public function testCount() public function testGetResponse() { - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); foreach ($this->files as $fileName) { $ei = $ef->createAppropriateIterator($this->prepareResponses()[$fileName]); - $this->assertInstanceOf('GuzzleHttp\Message\Response', $ei->getResponse()); + $this->assertInstanceOf('Psr\Http\Message\ResponseInterface', $ei->getResponse()); } } public function testIteration() { - $ef = new \Swader\Diffbot\Factory\Entity(); + $ef = new Entity(); foreach ($this->files as $fileName) { $ei = $ef->createAppropriateIterator($this->prepareResponses()[$fileName]); diff --git a/tests/Api/AnalyzeApiTest.php b/tests/Api/AnalyzeApiTest.php index abecf8e..78778bc 100644 --- a/tests/Api/AnalyzeApiTest.php +++ b/tests/Api/AnalyzeApiTest.php @@ -2,13 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; -use Swader\Diffbot\Entity\Article; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; class AnalyzeApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -19,27 +18,18 @@ class AnalyzeApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createAnalyzeAPI('https://article-mock.com'); } - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); - } - protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')) + ]); } return $this->validMock; diff --git a/tests/Api/ArticleApiTest.php b/tests/Api/ArticleApiTest.php index 0d02351..6f042d0 100644 --- a/tests/Api/ArticleApiTest.php +++ b/tests/Api/ArticleApiTest.php @@ -2,13 +2,13 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Entity\Article; class ArticleApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -19,27 +19,17 @@ class ArticleApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); - + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createArticleAPI('https://article-mock.com'); } - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); - } - protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')) + ]); } return $this->validMock; diff --git a/tests/Api/CrawlCustomMocksTest.php b/tests/Api/CrawlCustomMocksTest.php index e880c03..b517ab2 100644 --- a/tests/Api/CrawlCustomMocksTest.php +++ b/tests/Api/CrawlCustomMocksTest.php @@ -2,13 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; use Swader\Diffbot\Diffbot; use Swader\Diffbot\Entity\JobCrawl; class CrawlCustomMocksTest extends \PHPUnit_Framework_TestCase { + use setterUpper; /** @var Diffbot */ protected $diffbot; @@ -17,16 +16,13 @@ public function setUp() { $diffbot = new Diffbot('demo'); $diffbot->setEntityFactory(); - $fakeClient = new Client(); - $diffbot->setHttpClient($fakeClient); $this->diffbot = $diffbot; } public function testRoundStart() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -39,9 +35,8 @@ public function testRoundStart() public function testRestart() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -60,9 +55,8 @@ public function testRestart() public function testPauseOn() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -75,9 +69,8 @@ public function testPauseOn() public function testPauseOff() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -90,9 +83,8 @@ public function testPauseOff() public function testDelete() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/deletedSuccess.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/deletedSuccess.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); @@ -101,21 +93,19 @@ public function testDelete() public function test500() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_name.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_name.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath, 500)); $c = $this->diffbot->crawl('sitepoint_01'); - $this->setExpectedException('GuzzleHttp\Exception\ServerException'); + $this->setExpectedException('Http\Client\Exception\HttpException'); $c->call(); } public function testOtherError() { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_response.json')] - )); + $filepath = __DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_response.json'; + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($filepath)); $c = $this->diffbot->crawl('sitepoint_01'); diff --git a/tests/Api/CrawlTest.php b/tests/Api/CrawlTest.php index d0af2e1..da44460 100644 --- a/tests/Api/CrawlTest.php +++ b/tests/Api/CrawlTest.php @@ -2,12 +2,13 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Diffbot; class CrawlTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -16,22 +17,16 @@ class CrawlTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = new Diffbot('demo'); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); - - $this->diffbot = $diffbot; + $this->diffbot = $this->preSetUp(); } protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json')) + ]); } return $this->validMock; diff --git a/tests/Api/CustomApiTest.php b/tests/Api/CustomApiTest.php index 3c36e97..8814a36 100644 --- a/tests/Api/CustomApiTest.php +++ b/tests/Api/CustomApiTest.php @@ -2,12 +2,13 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Diffbot; class CustomApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -16,27 +17,16 @@ class CustomApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); - - $this->diffbot = $diffbot; - } - - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); + $this->diffbot = $this->preSetUp(); } protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/../Mocks/Articles/hi_quicktip_basic.json')) + ]); } return $this->validMock; diff --git a/tests/Api/DiscussionApiTest.php b/tests/Api/DiscussionApiTest.php index 1fc467b..3b84371 100644 --- a/tests/Api/DiscussionApiTest.php +++ b/tests/Api/DiscussionApiTest.php @@ -2,12 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; class DiscussionApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -18,27 +18,18 @@ class DiscussionApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createDiscussionAPI('https://discussion-mock.com'); } - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); - } - protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Discussions/15-05-01/sp_discourse_php7_recap.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/../Mocks/Discussions/15-05-01/sp_discourse_php7_recap.json')) + ]); } return $this->validMock; diff --git a/tests/Api/ImageApiTest.php b/tests/Api/ImageApiTest.php index 5242c8a..90c8bbc 100644 --- a/tests/Api/ImageApiTest.php +++ b/tests/Api/ImageApiTest.php @@ -2,13 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; -use Swader\Diffbot\Entity\Image; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; class ImageApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -19,27 +18,18 @@ class ImageApiTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createImageAPI('https://article-mock.com'); } - protected function getValidDiffbotInstance() - { - return new Diffbot('demo'); - } - protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Images/one_image_zola.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/../Mocks/Images/one_image_zola.json')) + ]); } return $this->validMock; @@ -47,8 +37,7 @@ protected function getValidMock() public function testCall() { - /** @var Image $image */ - $image = $this->apiWithMock->call(); + $this->apiWithMock->call(); } public function testBuildUrlNoCustomFields() diff --git a/tests/Api/ProductApiTest.php b/tests/Api/ProductApiTest.php index 9d84ace..8f857ca 100644 --- a/tests/Api/ProductApiTest.php +++ b/tests/Api/ProductApiTest.php @@ -2,13 +2,12 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Diffbot; -use Swader\Diffbot\Entity\Product; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; class ProductApiTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -17,32 +16,27 @@ class ProductApiTest extends \PHPUnit_Framework_TestCase */ protected $apiWithMock; - protected function setUp() { - $diffbot = $this->getValidDiffbotInstance(); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); + protected function setUp() + { + $diffbot = $this->preSetUp(); $this->apiWithMock = $diffbot->createProductAPI('https://dogbrush-mock.com'); } - protected function getValidDiffbotInstance() + protected function getValidMock() { - return new Diffbot('demo'); - } - - protected function getValidMock(){ if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__.'/../Mocks/Products/dogbrush.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/../Mocks/Products/dogbrush.json')) + ]); } + return $this->validMock; } - public function testCall() { + public function testCall() + { $products = $this->apiWithMock->call(); foreach ($products as $product) { @@ -50,7 +44,8 @@ public function testCall() { } } - public function testBuildUrlNoCustomFields() { + public function testBuildUrlNoCustomFields() + { $url = $this ->apiWithMock ->buildUrl(); @@ -58,7 +53,8 @@ public function testBuildUrlNoCustomFields() { $this->assertEquals($expectedUrl, $url); } - public function testBuildUrlMultipleCustomFields() { + public function testBuildUrlMultipleCustomFields() + { $url = $this ->apiWithMock ->setColors(true) @@ -69,7 +65,8 @@ public function testBuildUrlMultipleCustomFields() { $this->assertEquals($expectedUrl, $url); } - public function testBuildUrlMultipleCustomFieldsAndOtherOptions() { + public function testBuildUrlMultipleCustomFieldsAndOtherOptions() + { $url = $this ->apiWithMock ->setColors(true) diff --git a/tests/Api/SearchCustomMocksTest.php b/tests/Api/SearchCustomMocksTest.php index 4a805aa..810b185 100644 --- a/tests/Api/SearchCustomMocksTest.php +++ b/tests/Api/SearchCustomMocksTest.php @@ -2,13 +2,11 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; use Swader\Diffbot\Diffbot; -use Swader\Diffbot\Entity\JobCrawl; class SearchCustomMocksTest extends \PHPUnit_Framework_TestCase { + use setterUpper; /** @var Diffbot */ protected $diffbot; @@ -20,8 +18,6 @@ public function setUp() { $diffbot = new Diffbot('demo'); $diffbot->setEntityFactory(); - $fakeClient = new Client(); - $diffbot->setHttpClient($fakeClient); $this->diffbot = $diffbot; } @@ -45,9 +41,7 @@ public function resultCountProvider() */ public function testResultCount($case, $expectations) { - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents($this->mockPrefix . $case['file'])] - )); + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($this->mockPrefix . $case['file'])); $search = $this->diffbot->search($case['q'])->call(); @@ -115,9 +109,7 @@ public function searchInfoProvider() public function testSearchInfo($case, $expectations) { $this->markTestSkipped('Bugged due to JSONC: https://github.com/Swader/diffbot-php-client/issues/12'); - $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( - [file_get_contents($this->mockPrefix . $case['file'])] - )); + $this->diffbot->setHttpClient($this->getCustomMockFakeClient($this->mockPrefix . $case['file'])); $searchInfo = $this->diffbot->search($case['q'])->call(true); diff --git a/tests/Api/SearchTest.php b/tests/Api/SearchTest.php index 8fe7ff9..0e044d8 100644 --- a/tests/Api/SearchTest.php +++ b/tests/Api/SearchTest.php @@ -2,13 +2,14 @@ namespace Swader\Diffbot\Test\Api; -use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; -use Swader\Diffbot\Api\Search; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Diffbot; +use Swader\Diffbot\Api\Search; class SearchTest extends \PHPUnit_Framework_TestCase { + use setterUpper; protected $validMock; @@ -17,22 +18,16 @@ class SearchTest extends \PHPUnit_Framework_TestCase protected function setUp() { - $diffbot = new Diffbot('demo'); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($this->getValidMock()); - - $diffbot->setHttpClient($fakeClient); - $diffbot->setEntityFactory(); - - $this->diffbot = $diffbot; + $this->diffbot = $this->preSetUp(); } protected function getValidMock() { if (!$this->validMock) { - $this->validMock = new Mock( - [file_get_contents(__DIR__ . '/../Mocks/Search/15-05-24/test.json')] - ); + $this->validMock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/../Mocks/Search/15-05-24/test.json')) + ]); } return $this->validMock; diff --git a/tests/Api/setterUpper.php b/tests/Api/setterUpper.php new file mode 100644 index 0000000..00cf453 --- /dev/null +++ b/tests/Api/setterUpper.php @@ -0,0 +1,54 @@ +getValidDiffbotInstance(); + + $handler = HandlerStack::create($this->getValidMock()); + $guzzleClient = new Client(['handler' => $handler]); + + $methodsClient = new HttpMethodsClient( + new Guzzle6HttpAdapter($guzzleClient), + new GuzzleFactory()); + + $diffbot->setHttpClient($methodsClient); + $diffbot->setEntityFactory(); + + return $diffbot; + } + + public function getCustomMockFakeClient($filepath, $code = 200) + { + $handler = HandlerStack::create(new MockHandler([ + new Response($code, [], + file_get_contents($filepath)) + ])); + + $guzzleClient = new Client(['handler' => $handler]); + + return new HttpMethodsClient( + new Guzzle6HttpAdapter($guzzleClient), + new GuzzleFactory()); + + } +} \ No newline at end of file diff --git a/tests/DiffbotTest.php b/tests/DiffbotTest.php index 0533566..dda75d3 100644 --- a/tests/DiffbotTest.php +++ b/tests/DiffbotTest.php @@ -3,7 +3,12 @@ namespace Swader\Diffbot\Test; use GuzzleHttp\Client; -use GuzzleHttp\Subscriber\Mock; +use GuzzleHttp\Handler\MockHandler; +use GuzzleHttp\HandlerStack; +use GuzzleHttp\Psr7\Response; +use Http\Adapter\Guzzle6HttpAdapter; +use Http\Client\Utils\HttpMethodsClient; +use Http\Discovery\MessageFactory\GuzzleFactory; use Swader\Diffbot\Diffbot; class DiffbotTest extends \PHPUnit_Framework_TestCase @@ -93,14 +98,19 @@ public function testGetToken() public function testSetHttpClient() { $bot = new Diffbot('token'); - $validMock = new Mock( - [file_get_contents(__DIR__ . '/Mocks/Products/dogbrush.json')] - ); - $fakeClient = new Client(); - $fakeClient->getEmitter()->attach($validMock); + $mock = new MockHandler([ + new Response(200, [], + file_get_contents(__DIR__ . '/Mocks/Products/dogbrush.json')) + ]); + $handler = HandlerStack::create($mock); + $guzzleClient = new Client(['handler' => $handler]); + + $methodsClient = new HttpMethodsClient( + new Guzzle6HttpAdapter($guzzleClient), + new GuzzleFactory()); try { - $bot->setHttpClient($fakeClient); + $bot->setHttpClient($methodsClient); } catch (\Exception $e) { $this->fail("Could not set fake client: " . $e->getMessage()); } @@ -108,7 +118,13 @@ public function testSetHttpClient() public function methodnameProvider() { - return [['product'], ['image'], ['analyze'], ['article'], ['discussion']]; + return [ + ['product'], + ['image'], + ['analyze'], + ['article'], + ['discussion'] + ]; } /** diff --git a/tests/Entity/CrawlJobTest.php b/tests/Entity/CrawlJobTest.php index 3faa36a..37262e3 100644 --- a/tests/Entity/CrawlJobTest.php +++ b/tests/Entity/CrawlJobTest.php @@ -2,7 +2,7 @@ namespace Swader\Diffbot\Test\Entity; -use GuzzleHttp\Message\ResponseInterface; +use Psr\Http\Message\ResponseInterface; use Swader\Diffbot\Entity\EntityIterator; use Swader\Diffbot\Entity\Image; use Swader\Diffbot\Entity\JobCrawl as Job; @@ -24,7 +24,7 @@ protected function ei($file) /** @var ResponseInterface $response */ $response = $this->responses[$file]; $jobs = []; - foreach ($response->json()['jobs'] as $data) { + foreach (json_decode($response->getBody(), true)['jobs'] as $data) { $jobs[] = new Job($data); } diff --git a/tests/Factory/EntityTest.php b/tests/Factory/EntityTest.php index d925727..f5911a5 100644 --- a/tests/Factory/EntityTest.php +++ b/tests/Factory/EntityTest.php @@ -2,8 +2,7 @@ namespace Swader\Diffbot\Test\Factory; -use GuzzleHttp\Message\Response; -use GuzzleHttp\Stream\Stream; +use GuzzleHttp\Psr7\Response; use Swader\Diffbot\Diffbot; use Swader\Diffbot\Factory\Entity; @@ -21,23 +20,16 @@ public function setUp() $this->ef = new Entity(); } - public function testInvalidResponseBodyFail() - { - $this->responseOk->setBody(Stream::factory('Pure text content')); - $this->setExpectedException('GuzzleHttp\Exception\ParseException'); - $this->ef->createAppropriateIterator($this->responseOk); - } - public function testMissingObjectsFail() { - $this->responseOk->setBody(Stream::factory(json_encode(['foo' => 'bar']))); + $this->responseOk = $this->responseOk->withBody(\GuzzleHttp\Psr7\stream_for(json_encode(['foo' => 'bar']))); $this->setExpectedException('Swader\Diffbot\Exceptions\DiffbotException'); $this->ef->createAppropriateIterator($this->responseOk); } public function testMissingRequestFail() { - $this->responseOk->setBody(Stream::factory(json_encode([ + $this->responseOk = $this->responseOk->withBody(\GuzzleHttp\Psr7\stream_for(json_encode([ 'objects' => 'foo', 'req' => 'bar' ]))); @@ -47,7 +39,7 @@ public function testMissingRequestFail() public function testProductEntityPass() { - $this->responseOk->setBody(Stream::factory(json_encode([ + $this->responseOk = $this->responseOk->withBody(\GuzzleHttp\Psr7\stream_for(json_encode([ 'objects' => [['type' => 'product']], 'request' => ['api' => 'product', 'foo' => 2] ]))); @@ -56,7 +48,7 @@ public function testProductEntityPass() public function testWildCardEntityPass() { - $this->responseOk->setBody(Stream::factory(json_encode([ + $this->responseOk = $this->responseOk->withBody(\GuzzleHttp\Psr7\stream_for(json_encode([ 'objects' => [['type' => 'mysterious_api']], 'request' => ['api' => 'mysterious_api', 'foo' => 2] ]))); diff --git a/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic-old.json b/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic-old.json new file mode 100644 index 0000000..e21fac8 --- /dev/null +++ b/tests/Mocks/Analyze/15-04-19/Articles/sitepoint_diffbot_basic-old.json @@ -0,0 +1,10 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Sun, 19 Apr 2015 20:09:22 GMT +Content-Type: application/json;charset=utf-8 +Transfer-Encoding: chunked +Connection: keep-alive +Vary: Accept-Encoding +Access-Control-Allow-Origin: * + +{"title":"Diffbot: Crawling with Visual Machine Learning","request":{"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning","api":"analyze","resolvedPageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","options":["mentos"],"version":3},"humanLanguage":"en","type":"article","objects":[{"tags":[{"id":4585348,"count":9,"prevalence":0.6428571428571428,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"},{"id":175464,"count":13,"prevalence":0.9285714285714286,"label":"Application programming interface","uri":"http://dbpedia.org/resource/Application_programming_interface"},{"id":1936869,"count":3,"prevalence":0.21428571428571427,"label":"Laravel","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/Laravel"},{"id":208652,"count":3,"prevalence":0.21428571428571427,"label":"PHP","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/PHP"},{"id":91320,"count":2,"prevalence":0.14285714285714285,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"}],"icon":"http://www.sitepoint.com/wp-content/themes/sitepoint/assets/images/apple-touch-icon-144x144-precomposed.png","text":"Have you ever wondered how social networks do URL previews so well when you share links? How do they know which images to grab, whom to cite as an author, or which tags to attach to the preview? Is it all crawling with complex regexes over source code? Actually, more often than not, it isn\u2019t. Meta information defined in the source can be unreliable, and sites with less than stellar reputation often use them as keyword carriers, attempting to get search engines to rank them higher. Isn\u2019t what we, the humans, see in front of us what matters anyway?\nIf you want to build a URL preview snippet or a news aggregator, there are many automatic crawlers available online, both proprietary and open source, but you seldom find something as niche as visual machine learning. This is exactly what Diffbot is \u2013 a \u201cvisual learning robot\u201d which renders a URL you request in full and then visually extracts data, helping itself with some metadata from the page source as needed.\nAfter covering some theory, in this post we\u2019ll do a demo API call at one of SitePoint\u2019s posts.\nPHP Library\nThe PHP library for Diffbot is somewhat out of date, and as such we won\u2019t be using it in this demo. We\u2019ll be performing raw API calls, and in some future posts we\u2019ll build our own library for API interaction.\nIf you\u2019d like to take a look at the PHP library nonetheless, see here, and if you\u2019re interested in libraries for other languages, Diffbot has a directory.\nJavaScript Content\nWe said in the introductory section that Diffbot renders the request in full and then analyzes it. But, what about JavaScript content? Nowadays, websites often render some HTML above the fold, and then finish the CSS, JS, and dynamic content loading afterwards. Can the Diffbot API see that?\nAs a matter of fact, yes. Diffbot literally renders the page in full, and then inspects it visually, as explained in my StackOverflow Q&A here. There are some caveats, though, so make sure you read the answer carefully.\nPricing and API Health\nDiffbot has several usage tiers. There\u2019s a free trial tier which kills your API token after 7 days or 10000 calls, whichever comes first. The commercial tokens can be purchased at various prices, and never expire, but do have limitations. A special case by case approach is afforded to open source and/or educational projects which provides an older model of the free token \u2013 10k calls per month, once per second max, but never expires. You need to contact them directly if you think you qualify.\nDiffbot guarantees a high uptime, but failures sometimes do happen \u2013 especially in the most resource intensive API of the bunch: Crawlbot. Crawlbot is used to crawl entire domains, not just individual pages, and as such has a lower reliability rate than other APIs. Not by a lot, but enough to be noticeable in the API Health screen \u2013 the screen you can check to see if an API is up and running or currently unavailable if your calls run into issues or return error 500.\nDemo\nTo prepare your environment, please boot up a Homestead Improved instance.\nCreate Project\nCreate a starter Laravel project by SSHing into the VM with vagrant ssh, going into the Code folder, and executing composer create-project laravel/laravel Laravel --prefer-dist. This will let you access the Laravel greeting page via http://homestead.app:8000 from the host\u2019s browser.\nAdd a Route and Action\nIn app/routes.php add the following route:\n1\nRoute::get('/diffbot', 'HomeController@diffbotDemo');\nIn app/controllers/HomeController add the following action:\n1\n2\n3\npublic function diffbotDemo() {\ndie(\"hi\");\n}\nIf http://homestead.app:8000/diffbot now outputs \u201chi\u201d on the screen, we\u2019re ready to start playing with the API.\nGet a Token\nTo interact with the Diffbot API, you need a token. Sign up for one on their pricing page. For the sake of this demo, let\u2019s call our token $TOKEN, and we\u2019ll refer to it as such in URLs. Replace $TOKEN with your own value where appropriate.\nInstall Guzzle\nWe\u2019ll be using Guzzle as our HTTP client. It\u2019s not required, but I do recommend you get familiar with it through a past article of ours.\nAdd the \"guzzlehttp/guzzle\": \"4.1.*@dev\" to your composer.json so the require block looks like this:\n1\n2\n3\n4\n\"require\": {\n\"laravel/framework\": \"4.2.*\",\n\"guzzlehttp/guzzle\": \"4.1.*@dev\"\n},\nIn the project root, run composer update.\nFetch Article Data\nIn the first example, we\u2019ll crawl a SitePoint post with the default Article API from Diffbot. To do this, we refer to the docs which do an excellent job at explaining the workflow. Change the body of the diffbotDemo action to the following code:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\npublic function diffbotDemo() {\n$token = \"$TOKEN\";\n$version = 'v3';\n$response = $client->get($version.'/article', ['query' => [\n'token' => $token,\n]]);\ndie(var_dump($response->json()));\n}\nFirst, we set our token. Then, we define a variable that\u2019ll hold the API version. Next, it\u2019s up to us to create a new Guzzle client, and we also give it a base URL so we don\u2019t have to type it in every time we make another request.\nNext up, we create a response object by sending a GET request to the API\u2019s URL, and we add in an array of query parameters in key => value format. In this case, we only pass in the token and the URL, the most basic of parameters.\nFinally, since the Diffbot API returns JSON data, we use Guzzle\u2019s json() method to automatically decode it into an array. We then pretty-print this data:\nAs you can see, we got some information back rather quickly. There\u2019s the icon that was used, a preview of the text, the title, even the language, date and HTML have been returned. You\u2019ll notice there\u2019s no author, however. Let\u2019s change this and request some more values.\nIf we add the \u201cfields\u201d parameter to the query params list and give it a value of \u201ctags\u201d, Diffbot will attempt to extract tags/categories from the URL provided. Add this line to the query array:\n1\n'fields' => 'tags'\nand then change the die part to this:\n1\n2\n$data = $response->json();\ndie(var_dump($data['objects'][0]['tags']));\nRefreshing the screen now gives us this:\nBut, the source code of the article notes several other tags:\nWhy is the result so very different? It\u2019s precisely due to the reason we mentioned at the end of the very first paragraph of this post: what we humans see takes precedence. Diffbot is a visual learning robot, and as such its AI deducts the tags from the actual rendered content \u2013 what it can see \u2013 rather than from looking at the source code which is far too easily spiced up for SEO purposes.\nIs there a way to get the tags from the source code, though, if one really needs them? Furthermore, can we make Diffbot recognize the author on SitePoint articles? Yes. With the Custom API.\nMeta Tags and Author with Custom API\nThe Custom API is a feature which allows you to not only tweak existing Diffbot API to your liking by adding new fields and rules for content extraction, but also allows you to create completely new APIs (accessed via a dedicated URL, too) for custom content processing.\nGo to the dev dashboard and log in with your token. Then, go into \u201cCustom API\u201d. Activate the \u201cCreate a Rule\u201d tab at the bottom, and input the URL of the article we\u2019re crawling into the URL box, then click Test. Your screen should look something like this:\nYou\u2019ll immediately notice the Author field is empty. You can tweak the author-searching rule by clicking Edit next to it, and finding the Author element in the live preview window that opens, then click on it to get the desired result. However, due to some, well, less than perfect CSS on SitePoint\u2019s end, it\u2019s very difficult to provide Diffbot\u2019s API with a consistent path to the author name, especially by clicking on elements. Instead, add the following rule manually: .contributor--large .contributor_name a and click Save.\nYou\u2019ll notice the Preview window now correctly populates the Author field:\nIn fact, this new rule is automatically applied to all SitePoint links for your token. If you try to preview another SitePoint article, like this one, you\u2019ll notice Peter Nijssen is successfully extracted:\nOk, let\u2019s modify the API further. We need the article:tag values that are visible in source code. Doing this requires a two-step process.\nStep 1: Define a Collection\nA collection is exactly what it sounds like \u2013 a collection of values grabbed via a specific ruleset. We\u2019ll call our collection \u201cMetaTags\u201d, and give it the following selector: meta[property=article:tag]. This means \u201cfind all meta elements in the HTML that have the property attribute with the value article:tag\u201c.\nStep 2: Define Collection Fields\nCollection fields are individual entries in a collection \u2013 in our case, the various tags. Click on \u201cAdd a custom field to this collection\u201d, and add the following values:\nClick Save. You\u2019ll immediately have access to the list of Tags in the result window:\n1\ndie(var_dump($data['objects'][0]['metaTags']));\nIf you now refresh the URL we tested with (http://homestead.app:8000/diffbot), you\u2019ll notice the author and meta tags values are there. Here\u2019s the output the above line of code produces:\nWe have our tags!\nConclusion\nDiffbot is a powerful data extractor for the web \u2013 whether you need to consolidate many sites into a single search index without combining their back-ends, want to build a news aggregator, have an idea for a URL preview web component, or want to regularly harvest the contents of competitors\u2019 public pricing lists, Diffbot can help. With dead simple API calls and highly structured responses, you\u2019ll be up and running in next to no time. In a later article, we\u2019ll build a brand new API for using Diffbot with PHP, and redo the calls above with it. We\u2019ll also host the library on Packagist, so you can easily install it with Composer. Stay tuned!","discussion":{"tags":[{"id":78932,"count":2,"prevalence":0.2857142857142857,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"},{"id":91320,"count":1,"prevalence":0.14285714285714285,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"},{"id":5062,"count":1,"prevalence":0.14285714285714285,"label":"Nice","type":"place","uri":"http://dbpedia.org/resource/Nice"},{"id":491384,"count":1,"prevalence":0.14285714285714285,"label":"IOS","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/IOS"},{"id":4585348,"count":1,"prevalence":0.14285714285714285,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"}],"numPosts":7,"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","posts":[{"id":0,"tags":[{"id":5062,"count":1,"label":"Nice","type":"place","uri":"http://dbpedia.org/resource/Nice"},{"id":4585348,"count":1,"label":"Diffbot","type":"organization","uri":"http://dbpedia.org/resource/Diffbot"},{"id":78932,"count":1,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"}],"author":"Peter Nijssen","text":"Nice article! Just wondering; since diffbot is unable to grab the author, can you conclude that it is not actually represented correctly within the website? I mean, you would think that should be an easy field to grab if HTML has been formatted correctly.","diffbotUri":"post|3|-459867678","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":1,"authorUrl":"https://disqus.com/by/peter_nijssen/","humanLanguage":"en","html":"
Nice article! Just wondering; since diffbot is unable to grab the author, can you conclude that it is not actually represented correctly within the website? I mean, you would think that should be an easy field to grab if HTML has been formatted correctly.<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"tags":[{"id":91320,"count":1,"label":"SitePoint","type":"organization","uri":"http://dbpedia.org/resource/SitePoint"}],"text":"Correct - there's definitely more that could be done in terms of element declaration in SitePoint's design. Also, notice this: http://www.quora.com/APIs/Are-...","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT","id":1,"parentId":0,"author":"Bruno Skvorc","diffbotUri":"post|3|1753418734","authorUrl":"https://disqus.com/by/brunoskvorc/","html":"
Correct - there's definitely more that could be done in terms of element declaration in SitePoint's design. Also, notice this: http://www.quora.com/APIs/Are-...<\/a><\/p>"},{"id":2,"tags":[{"id":491384,"count":1,"label":"IOS","type":"Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity","uri":"http://dbpedia.org/resource/IOS"},{"id":78932,"count":1,"label":"HTML","uri":"http://dbpedia.org/resource/HTML"}],"author":"Stefan Sturm","text":"Great article, but after scraping the article we need to display it somewhere...\nFor me I want to display it on iOS devices.\nDo you know any good libs or HTML templates to use the diffbot text in?\nThanks for your help:)","diffbotUri":"post|3|959497238","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"authorUrl":"https://disqus.com/by/stefansturm/","humanLanguage":"en","html":" Great article, but after scraping the article we need to display it somewhere... Thanks for your help:)<\/p>","type":"post","date":"Sun, 19 Oct 2014 00:00:00 GMT"},{"id":3,"author":"Taher","text":"Is there any open source projects as good as diffbots?","diffbotUri":"post|3|-197836079","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"authorUrl":"https://disqus.com/by/disqus_CWGq6zNflN/","humanLanguage":"en","html":" Is there any open source projects as good as diffbots?<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"id":4,"parentId":3,"author":"Bruno Skvorc","text":"The answers here might help you out: https://www.quora.com/Web-Scra...","diffbotUri":"post|3|-1403587915","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":1,"authorUrl":"https://disqus.com/by/brunoskvorc/","humanLanguage":"en","html":" The answers here might help you out: https://www.quora.com/Web-Scra...<\/a><\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"id":5,"author":"anonymous","text":"i just tried out the demo on diffbots website, and out of 14 pages i feeded it, it could only properly process 3. with most of 'em it was just showing me the copyright notice / legal bla foo embeded in those pages","diffbotUri":"post|3|-2120915058","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","html":" i just tried out the demo on diffbots website, and out of 14 pages i feeded it, it could only properly process 3. with most of 'em it was just showing me the copyright notice / legal bla foo embeded in those pages<\/p>","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT"},{"tags":[{"id":175464,"count":1,"label":"Application programming interface","uri":"http://dbpedia.org/resource/Application_programming_interface"}],"text":"That's where custom API comes in to save the day. Out of curiosity, though, which URLs did you try, and which information was missing?","pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","votes":0,"humanLanguage":"en","type":"post","date":"Sat, 19 Jul 2014 00:00:00 GMT","id":6,"parentId":5,"author":"Bruno Skvorc","diffbotUri":"post|3|1316965997","authorUrl":"https://disqus.com/by/brunoskvorc/","html":" That's where custom API comes in to save the day. Out of curiosity, though, which URLs did you try, and which information was missing?<\/p>"}],"provider":"Disqus","humanLanguage":"en","confidence":1,"type":"discussion","participants":5,"rssUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/feed/","title":"Diffbot: Crawling with Visual Machine Learning","diffbotUri":"discussion|3|-1039854465","numPages":1},"pageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning","metaTags":[{"name":"ai"},{"name":"Artificial Intelligence"},{"name":"crawling"},{"name":"Diffbot"},{"name":"framework"},{"name":"laravel"},{"name":"machine learning"},{"name":"OOPHP"},{"name":"PHP"},{"name":"scraping"},{"name":"visual learning"}],"humanLanguage":"en","type":"article","date":"Sun, 27 Jul 2014 00:00:00 GMT","resolvedPageUrl":"http://www.sitepoint.com/diffbot-crawling-visual-machine-learning/","author":"Bruno Skvorc","title":"Diffbot: Crawling with Visual Machine Learning","diffbotUri":"article|3|-938093421","images":[{"height":533,"naturalHeight":727,"diffbotUri":"image|3|-851701004","primary":true,"width":780,"naturalWidth":1063,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624455201.png"},{"height":216,"naturalHeight":216,"diffbotUri":"image|3|762494522","width":523,"naturalWidth":523,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624487602.png"},{"height":184,"naturalHeight":184,"diffbotUri":"image|3|302236938","width":664,"naturalWidth":664,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624509003.png"},{"height":784,"naturalHeight":972,"diffbotUri":"image|3|-1836356546","width":780,"naturalWidth":966,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624552704.png"},{"height":156,"naturalHeight":184,"diffbotUri":"image|3|1297360030","width":780,"naturalWidth":918,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624650505.png"},{"height":157,"naturalHeight":188,"diffbotUri":"image|3|502449852","width":780,"naturalWidth":929,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624665606.png"},{"height":175,"naturalHeight":237,"diffbotUri":"image|3|-2007985802","width":780,"naturalWidth":1053,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624803507.png"},{"title":"Change the final output of the diffbotDemo() action to this:","height":520,"naturalHeight":604,"diffbotUri":"image|3|-140134863","width":780,"naturalWidth":906,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624809508.png"},{"height":533,"naturalHeight":727,"diffbotUri":"image|3|1129235416","width":780,"naturalWidth":1063,"url":"http://dab1nmslvvntp.cloudfront.net/wp-content/uploads/2014/07/140624834309.png"}],"html":" Have you ever wondered how social networks do URL previews so well when you share links? How do they know which images to grab, whom to cite as an author, or which tags to attach to the preview? Is it all crawling with complex regexes over source code? Actually, more often than not, it isn’t. Meta information defined in the source can be unreliable, and sites with less than stellar reputation often use them as keyword carriers, attempting to get search engines to rank them higher. Isn’t what we, the humans, see in front of us what matters anyway?<\/p>\n If you want to build a URL preview snippet or a news aggregator, there are many automatic crawlers available online, both proprietary and open source, but you seldom find something as niche as visual machine learning. This is exactly what Diffbot<\/a> is – a “visual learning robot” which renders a URL you request in full and then visually extracts data, helping itself with some metadata from the page source as needed.<\/p>\n After covering some theory, in this post we’ll do a demo API call at one of SitePoint’s posts.<\/p>\n The PHP library for Diffbot is somewhat out of date, and as such we won’t be using it in this demo. We’ll be performing raw API calls, and in some future posts we’ll build our own library for API interaction.<\/p>\n If you’d like to take a look at the PHP library nonetheless, see here<\/a>, and if you’re interested in libraries for other languages, Diffbot has a directory<\/a>.<\/p>\n We said in the introductory section that Diffbot renders the request in full and then analyzes it. But, what about JavaScript content? Nowadays, websites often render some HTML above the fold, and then finish the CSS, JS, and dynamic content loading afterwards. Can the Diffbot API see that?<\/p>\n As a matter of fact, yes. Diffbot literally renders the page in full, and then inspects it visually, as explained in my StackOverflow Q&A here<\/a>. There are some caveats, though, so make sure you read the answer carefully.<\/p>\n Diffbot has several usage tiers. There’s a free trial tier which kills your API token after 7 days or 10000 calls, whichever comes first. The commercial tokens can be purchased at various prices<\/a>, and never expire, but do have limitations. A special case by case approach is afforded to open source and/or educational projects which provides an older model of the free token – 10k calls per month, once per second max, but never expires. You need to contact them directly if you think you qualify.<\/p>\n Diffbot guarantees a high uptime, but failures sometimes do happen – especially in the most resource intensive API of the bunch: Crawlbot. Crawlbot is used to crawl entire domains, not just individual pages, and as such has a lower reliability rate than other APIs. Not by a lot, but enough to be noticeable in the API Health<\/a> screen – the screen you can check to see if an API is up and running or currently unavailable if your calls run into issues or return error 500.<\/p>\n To prepare your environment, please boot up a Homestead Improved<\/a> instance.<\/p>\n Create a starter Laravel project by SSHing into the VM with In
\nFor me I want to display it on iOS devices.
\nDo you know any good libs or HTML templates to use the diffbot text in?<\/p>\nPHP Library<\/h2>\n
JavaScript Content<\/h2>\n
Pricing and API Health<\/h2>\n
Demo<\/h2>\n
Create Project<\/h3>\n
vagrant ssh<\/code>, going into the
Code<\/code> folder, and executing
composer create-project laravel/laravel Laravel --prefer-dist<\/code>. This will let you access the Laravel greeting page via
http://homestead.app:8000<\/code> from the host’s browser.<\/p>\n
Add a Route and Action<\/h3>\n
app/routes.php<\/code> add the following route:<\/p>\n