diff --git a/src/Client/Cache/MySQL/Manager.php b/src/Client/Cache/MySQL/Manager.php index 5edba77..f9ef864 100644 --- a/src/Client/Cache/MySQL/Manager.php +++ b/src/Client/Cache/MySQL/Manager.php @@ -290,6 +290,7 @@ public function cron($timeLimit, $workerID) SELECT base FROM robotstxt__cache1 WHERE worker = :workerID +ORDER BY nextUpdate DESC LIMIT 10; SQL ); @@ -297,10 +298,9 @@ public function cron($timeLimit, $workerID) $query->execute(); if ($query->rowCount() > 0) { while ($row = $query->fetch(PDO::FETCH_ASSOC)) { - if (!$this->push(new UriClient($row['base'], $this->curlOptions, $this->byteLimit))) { - throw new ClientException('Unable to update `' . $row['base'] . '`'); + if ($this->push(new UriClient($row['base'], $this->curlOptions, $this->byteLimit))) { + $log[(string)microtime(true)] = $row['base']; } - $log[(string)microtime(true)] = $row['base']; } } } @@ -338,7 +338,7 @@ public function clean($delay) $delay = self::CACHE_TIME + $delay; $query = $this->pdo->prepare(<<bindParam(':delay', $delay, PDO::PARAM_INT); diff --git a/src/Client/Directives/DelayCore.php b/src/Client/Directives/DelayCore.php index f604ce8..e55ff3e 100644 --- a/src/Client/Directives/DelayCore.php +++ b/src/Client/Directives/DelayCore.php @@ -77,10 +77,10 @@ public function getUserAgent() */ public function handle(PDO $pdo) { - if ($this->handler === null) { - $handler = new DatabaseHandler($pdo); - $this->handler = $handler->delayClient($this->base, $this->userAgent, $this->getValue()); + if (isset($this->handler)) { + return $this->handler; } - return $this->handler; + $handler = new DatabaseHandler($pdo); + return $this->handler = $handler->delayClient($this->base, $this->userAgent, $this->getValue()); } } diff --git a/src/Client/Directives/UserAgentTools.php b/src/Client/Directives/UserAgentTools.php index 2f97ddb..7ceb967 100644 --- a/src/Client/Directives/UserAgentTools.php +++ b/src/Client/Directives/UserAgentTools.php @@ -133,7 +133,7 @@ private function checkPath($directive, $uri) self::DIRECTIVE_NO_INDEX => $this->handler->noIndex(), self::DIRECTIVE_DISALLOW => $this->handler->disallow(), self::DIRECTIVE_ALLOW => $this->handler->allow(), - ] as $currentDirective => $handler) { + ] as $currentDirective => &$handler) { if ($handler->client()->isListed($uri)) { if ($currentDirective === self::DIRECTIVE_NO_INDEX) { return $directive === self::DIRECTIVE_DISALLOW; diff --git a/src/Parser/Directives/AllowParser.php b/src/Parser/Directives/AllowParser.php index ca2ccd3..69ee991 100644 --- a/src/Parser/Directives/AllowParser.php +++ b/src/Parser/Directives/AllowParser.php @@ -54,6 +54,18 @@ class AllowParser implements ParserInterface, RobotsTxtInterface */ private $host; + /** + * Optimized for performance + * @var bool + */ + private $optimized = false; + + /** + * Client cache + * @var AllowClient + */ + private $client; + /** * AllowParser constructor * @@ -94,39 +106,37 @@ public function add($line) */ private function addPath($path) { - foreach ([ - $path, - '/', - '*', - ] as $testPath) { - if (in_array($testPath, $this->path)) { - return false; - } - } - if ($this->isPath($path)) { + $path = rtrim($path, '*'); + if (in_array(mb_substr($path, 0, 1), [ + '/', + '*', + ])) { $this->path[] = $path; - $this->removeOverlapping(); + $this->optimized = false; } return in_array($path, $this->path); } /** - * Check if path is valid + * Render * - * @param string $path + * @param RenderHandler $handler * @return bool */ - private function isPath($path) + public function render(RenderHandler $handler) { - if (mb_strpos($path, '/') !== 0) { - foreach ([ - '*', - '?', - ] as $char) { - $path = str_replace($char, '/', $path); - } + if (!$this->optimized) { + $this->removeOverlapping(); + } + sort($this->path); + $inline = new RenderHandler($handler->getLevel()); + $this->host->render($inline); + $this->cleanParam->render($inline); + $handler->addInline($this->directive, $inline); + foreach ($this->path as $path) { + $handler->add($this->directive, $path); } - return mb_strpos($path, '/') === 0; + return true; } /** @@ -136,35 +146,18 @@ private function isPath($path) */ private function removeOverlapping() { - foreach ($this->path as $key1 => $path1) { - foreach ($this->path as $key2 => $path2) { + foreach ($this->path as $key1 => &$path1) { + foreach ($this->path as $key2 => &$path2) { if ($key1 !== $key2 && - mb_strpos($path1, $path2) === 0 + (mb_strpos($path1, $path2) === 0 || + mb_strpos(str_replace('*', '/', $path1), $path2) === 0 + ) ) { unset($this->path[$key1]); - return $this->removeOverlapping(); } } } - return true; - } - - /** - * Render - * - * @param RenderHandler $handler - * @return bool - */ - public function render(RenderHandler $handler) - { - sort($this->path); - $inline = new RenderHandler($handler->getLevel()); - $this->host->render($inline); - $this->cleanParam->render($inline); - $handler->addInline($this->directive, $inline); - foreach ($this->path as $path) { - $handler->add($this->directive, $path); - } + $this->optimized = true; return true; } @@ -175,6 +168,11 @@ public function render(RenderHandler $handler) */ public function client() { - return new AllowClient($this->path, $this->host->client(), $this->cleanParam->client()); + if (isset($this->client)) { + return $this->client; + } elseif (!$this->optimized) { + $this->removeOverlapping(); + } + return $this->client = new AllowClient($this->path, $this->host->client(), $this->cleanParam->client()); } } diff --git a/src/Parser/Directives/CleanParamParser.php b/src/Parser/Directives/CleanParamParser.php index e416760..4b8be2c 100644 --- a/src/Parser/Directives/CleanParamParser.php +++ b/src/Parser/Directives/CleanParamParser.php @@ -17,6 +17,12 @@ */ class CleanParamParser extends CleanParamParserCore { + /** + * Client cache + * @var CleanParamClient + */ + private $client; + /** * CleanParamParser constructor. */ @@ -32,6 +38,9 @@ public function __construct() */ public function client() { - return new CleanParamClient($this->cleanParam); + if (isset($this->client)) { + return $this->client; + } + return $this->client = new CleanParamClient($this->cleanParam); } } diff --git a/src/Parser/Directives/CleanParamParserCore.php b/src/Parser/Directives/CleanParamParserCore.php index 26848c5..2a8801f 100644 --- a/src/Parser/Directives/CleanParamParserCore.php +++ b/src/Parser/Directives/CleanParamParserCore.php @@ -42,12 +42,12 @@ public function add($line) { // split into parameter and path $array = array_map('trim', mb_split('\s+', $line, 2)); - // strip any invalid characters from path prefix - $path = '/'; if (isset($array[1])) { + // strip any invalid characters from path prefix $uriParser = new UriParser(preg_replace('/[^A-Za-z0-9\.-\/\*\_]/', '', $array[1])); - $path = $uriParser->encode(); + $path = rtrim($uriParser->encode(), '*'); } + $path = empty($path) ? '/' : $path; $param = array_map('trim', explode('&', $array[0])); foreach ($param as $key) { $this->cleanParam[$key][] = $path; diff --git a/src/Parser/Directives/DelayParser.php b/src/Parser/Directives/DelayParser.php index d095907..caba26e 100644 --- a/src/Parser/Directives/DelayParser.php +++ b/src/Parser/Directives/DelayParser.php @@ -37,6 +37,12 @@ class DelayParser implements ParserInterface, RobotsTxtInterface */ private $delay; + /** + * Client cache + * @var DelayClient + */ + private $client; + /** * DelayParser constructor. * @@ -80,7 +86,10 @@ public function add($line) */ public function client($userAgent = self::USER_AGENT, $fallbackValue = 0) { - return new DelayClient($this->base, $userAgent, $this->delay, $fallbackValue); + if (isset($this->client)) { + return $this->client; + } + return $this->client = new DelayClient($this->base, $userAgent, $this->delay, $fallbackValue); } /** diff --git a/src/Parser/Directives/HostParser.php b/src/Parser/Directives/HostParser.php index cfd4bc5..418e9ea 100644 --- a/src/Parser/Directives/HostParser.php +++ b/src/Parser/Directives/HostParser.php @@ -18,6 +18,12 @@ */ class HostParser extends HostParserCore { + /** + * Client cache + * @var HostClient + */ + private $client; + /** * HostParser constructor. * @@ -36,7 +42,10 @@ public function __construct($base, $effective) */ public function client() { - return new HostClient($this->base, $this->effective, isset($this->host[0]) ? [$this->host[0]] : []); + if (isset($this->client)) { + return $this->client; + } + return $this->client = new HostClient($this->base, $this->effective, isset($this->host[0]) ? [$this->host[0]] : []); } /** diff --git a/src/Parser/Directives/InlineCleanParamParser.php b/src/Parser/Directives/InlineCleanParamParser.php index b1ea204..f87abd3 100644 --- a/src/Parser/Directives/InlineCleanParamParser.php +++ b/src/Parser/Directives/InlineCleanParamParser.php @@ -18,6 +18,12 @@ */ class InlineCleanParamParser extends CleanParamParserCore { + /** + * Client cache + * @var InlineCleanParamClient + */ + private $client; + /** * InlineCleanParamParser constructor. */ @@ -33,6 +39,9 @@ public function __construct() */ public function client() { - return new InlineCleanParamClient($this->cleanParam); + if (isset($this->client)) { + return $this->client; + } + return $this->client = new InlineCleanParamClient($this->cleanParam); } } diff --git a/src/Parser/Directives/InlineHostParser.php b/src/Parser/Directives/InlineHostParser.php index f217f2c..dbda713 100644 --- a/src/Parser/Directives/InlineHostParser.php +++ b/src/Parser/Directives/InlineHostParser.php @@ -18,6 +18,12 @@ */ class InlineHostParser extends HostParserCore { + /** + * Client cache + * @var InlineHostClient + */ + private $client; + /** * InlineHostParser constructor. * @@ -36,7 +42,10 @@ public function __construct($base, $effective) */ public function client() { - return new InlineHostClient($this->base, $this->effective, $this->host); + if (isset($this->client)) { + return $this->client; + } + return $this->client = new InlineHostClient($this->base, $this->effective, $this->host); } /** diff --git a/src/Parser/Directives/RequestRateParser.php b/src/Parser/Directives/RequestRateParser.php index 88438db..03c57f0 100644 --- a/src/Parser/Directives/RequestRateParser.php +++ b/src/Parser/Directives/RequestRateParser.php @@ -33,6 +33,12 @@ class RequestRateParser implements ParserInterface, RobotsTxtInterface */ private $requestRates = []; + /** + * Client cache + * @var RequestRateClient + */ + private $client; + /** * RequestRate constructor. * @@ -107,8 +113,11 @@ private function draftParseRate($string) */ public function client($userAgent = self::USER_AGENT, $fallbackValue = 0) { + if (isset($this->client)) { + return $this->client; + } $this->sort(); - return new RequestRateClient($this->base, $userAgent, $this->requestRates, $fallbackValue); + return $this->client = new RequestRateClient($this->base, $userAgent, $this->requestRates, $fallbackValue); } /** diff --git a/src/Parser/Directives/RobotVersionParser.php b/src/Parser/Directives/RobotVersionParser.php index cb81681..58ce0bb 100644 --- a/src/Parser/Directives/RobotVersionParser.php +++ b/src/Parser/Directives/RobotVersionParser.php @@ -25,6 +25,12 @@ class RobotVersionParser implements ParserInterface, RobotsTxtInterface */ private $version; + /** + * Client cache + * @var RobotVersionClient + */ + private $client; + /** * RobotVersionParser constructor. */ @@ -54,7 +60,10 @@ public function add($line) */ public function client() { - return new RobotVersionClient($this->version); + if (isset($this->client)) { + return $this->client; + } + return $this->client = new RobotVersionClient($this->version); } /** diff --git a/src/Parser/Directives/SitemapParser.php b/src/Parser/Directives/SitemapParser.php index 4506625..8df5b21 100644 --- a/src/Parser/Directives/SitemapParser.php +++ b/src/Parser/Directives/SitemapParser.php @@ -26,6 +26,12 @@ class SitemapParser implements ParserInterface, RobotsTxtInterface */ private $sitemaps = []; + /** + * Client cache + * @var SitemapClient + */ + private $client; + /** * Sitemap constructor. */ @@ -59,7 +65,10 @@ public function add($line) */ public function client() { - return new SitemapClient($this->sitemaps); + if (isset($this->client)) { + return $this->client; + } + return $this->client = new SitemapClient($this->sitemaps); } /** diff --git a/src/Parser/Directives/UserAgentParser.php b/src/Parser/Directives/UserAgentParser.php index 6815453..6fa1469 100644 --- a/src/Parser/Directives/UserAgentParser.php +++ b/src/Parser/Directives/UserAgentParser.php @@ -237,28 +237,24 @@ public function export() * Client * * @param string $product - * @param int|string|null $version + * @param float|int|string|null $version * @param int|null $statusCode * @return UserAgentClient */ public function client($product = self::USER_AGENT, $version = null, $statusCode = null) { - $infix = $product . $version . $statusCode; - if (isset($this->client[$infix])) { - return $this->client[$infix]; + $userAgentString = $version === null ? $product : $product . '/' . $version; + if (isset($this->client[$userAgentString])) { + // Already cached + return $this->client[$userAgentString]; + } elseif (isset($this->handler[$userAgentString])) { + // 100% match + return $this->client[$userAgentString] = new UserAgentClient($this->handler[$userAgentString], $this->base, $statusCode, $product); } - $userAgentProduct = rtrim($product . '/' . $version, '/'); - $userAgentMatch = $userAgentProduct; - if (!isset($this->handler[$userAgentMatch])) { - $userAgentParser = new UserAgentStringParser($product, $version); - $userAgentProduct = $userAgentParser->getProduct(); - if (($userAgentMatch = $userAgentParser->getMostSpecific($this->getUserAgents())) === false) { - $userAgentMatch = self::USER_AGENT; - } + $userAgentParser = new UserAgentStringParser($product, $version); + if (($match = $userAgentParser->getMostSpecific($this->getUserAgents())) === false) { + $match = self::USER_AGENT; } - // Clear cache - $this->client = []; - // Cache and return - return $this->client[$infix] = new UserAgentClient($this->handler[$userAgentMatch], $this->base, $statusCode, $userAgentProduct); + return $this->client[$userAgentString] = new UserAgentClient($this->handler[$match], $this->base, $statusCode, $product); } } diff --git a/src/Parser/Directives/VisitTimeParser.php b/src/Parser/Directives/VisitTimeParser.php index 98d346f..ee36848 100644 --- a/src/Parser/Directives/VisitTimeParser.php +++ b/src/Parser/Directives/VisitTimeParser.php @@ -27,6 +27,12 @@ class VisitTimeParser implements ParserInterface, RobotsTxtInterface */ private $visitTimes = []; + /** + * Client cache + * @var VisitTimeClient + */ + private $client; + /** * VisitTime constructor. */ @@ -57,7 +63,10 @@ public function add($line) */ public function client() { - return new VisitTimeClient($this->visitTimes); + if (isset($this->client)) { + return $this->client; + } + return $this->client = new VisitTimeClient($this->visitTimes); } /** diff --git a/tests/AllowTest.php b/tests/AllowTest.php index ab4ed2d..e0b6914 100644 --- a/tests/AllowTest.php +++ b/tests/AllowTest.php @@ -125,9 +125,9 @@ public function generateDataForTest() User-agent: * Disallow: /admin Disallow: /admin -Disallow: /Admin +Disallow: /Admin* Disallow: /temp#comment -Disallow: /forum +Disallow: /forum** Disallow: /admin/cp/test/ User-agent: agentV diff --git a/tests/CacheSQLTest.php b/tests/CacheSQLTest.php index f3173bd..535964b 100644 --- a/tests/CacheSQLTest.php +++ b/tests/CacheSQLTest.php @@ -38,23 +38,14 @@ public function testCacheSQL($uri, $base) $client = $parser->client($uri); $this->assertInstanceOf('vipnytt\RobotsTxtParser\TxtClient', $client); - $query = $pdo->prepare(<<bindParam(':base', $base, PDO::PARAM_STR); - $query->execute(); - $row = $query->fetch(); - $this->assertEquals($client->render()->compressed(PHP_EOL), $row['content']); + $debug = $parser->debug($uri); + $this->assertTrue(count($debug, COUNT_NORMAL) >= 5); + $this->assertEquals($debug['content'], $client->render()->compressed(PHP_EOL)); for ($i = 1; $i <= 2; $i++) { $parser->client($uri); } - $this->assertTrue(count($parser->debug($uri), COUNT_NORMAL) >= 5); - $parser->cron(); $parser->clean(); @@ -87,6 +78,10 @@ public function generateDataForTest() 'ftp://mirror.ox.ac.uk/', 'ftp://mirror.ox.ac.uk:21', ], + [ + 'http://www.goldmansachs.com/robots.txt', + 'http://www.goldmansachs.com:80', + ], ]; } diff --git a/tests/CleanParamTest.php b/tests/CleanParamTest.php index 4578516..b901388 100644 --- a/tests/CleanParamTest.php +++ b/tests/CleanParamTest.php @@ -59,7 +59,7 @@ function generateDataForTest() [ <<assertInstanceOf('vipnytt\RobotsTxtParser\UriClient', $parser); + + $render1 = $parser->render()->compressed(); + $export1 = $parser->export(); + + $import = new RobotsTxtParser\Import($export1, $base); + $this->assertInstanceOf('vipnytt\RobotsTxtParser\Import', $import); + + $render2 = $import->render()->compressed(); + $export2 = $import->export(); + + $this->assertSame($render1, $render2); + $this->assertSame($export1, $export2); + } + + /** + * Generate test data + * + * @return array + */ + public function generateDataForTest() + { + return [ + [ + 'http://www.goldmansachs.com/robots.txt', + ], + ]; + } +}