diff --git a/.gitignore b/.gitignore index faa026d..3ebef09 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,8 @@ # WiP /packages/perso -/packages/seo-pocket-crawler # ... +/packages/crawler/data build tmp.csv var diff --git a/composer.json b/composer.json index d0b1b39..6d15795 100644 --- a/composer.json +++ b/composer.json @@ -9,7 +9,7 @@ "php": ">=8.1", "ext-ctype": "*", "ext-iconv": "*", - "danielstjules/stringy": "^3.1", + "voku/stringy": "^6.5", "jeremykendall/php-domain-parser": "^6.1", "league/csv": "^9.8", "league/uri": "^6.5", @@ -44,7 +44,7 @@ "PiedWeb\\Google\\": "packages/google/src", "PiedWeb\\GoogleSpreadsheetSeoScraper\\": "packages/google-spreadsheet-seo-scraper/src", "PiedWeb\\Extractor\\": "packages/extractor/src", - "PiedWeb\\SeoPocketCrawler\\": "packages/seo-pocket-crawler/src", + "PiedWeb\\Crawler\\": "packages/crawler/src", "PiedWeb\\TextAnalyzer\\": "packages/text-analyzer/src" } }, @@ -53,7 +53,7 @@ "PiedWeb\\Curl\\Test\\": "packages/curl/tests", "PiedWeb\\Google\\Test\\": "packages/google/tests", "PiedWeb\\Extractor\\Test\\": "packages/extractor/tests", - "PiedWeb\\SeoPocketCrawler\\Test\\": "packages/seo-pocket-crawler/tests", + "PiedWeb\\Crawler\\Test\\": "packages/crawler/tests", "PiedWeb\\TextAnalyzer\\Test\\": "packages/text-analyzer/tests" } }, diff --git a/packages/crawler/LICENSE b/packages/crawler/LICENSE new file mode 100644 index 0000000..74d85b6 --- /dev/null +++ b/packages/crawler/LICENSE @@ -0,0 +1,22 @@ +MIT License +=========== + +Copyright (c) Robin Delattre https://piedweb.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The 
above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/crawler/README.md b/packages/crawler/README.md new file mode 100644 index 0000000..2f88f86 --- /dev/null +++ b/packages/crawler/README.md @@ -0,0 +1,127 @@ +

+Open Source Package +

+ +# CLI Seo Pocket Crawler + +[![Latest Version](https://img.shields.io/github/tag/PiedWeb/Crawler.svg?style=flat&label=release)](https://github.com/PiedWeb/Crawler/tags) +[![Software License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](LICENSE) +[![GitHub Tests Action Status](https://img.shields.io/github/workflow/status/PiedWeb/Crawler/Tests?label=tests)](https://github.com/PiedWeb/PiedWeb/actions) +[![Quality Score](https://img.shields.io/scrutinizer/g/PiedWeb/PiedWeb.svg?style=flat)](https://scrutinizer-ci.com/g/PiedWeb/PiedWeb) +[![Code Coverage](https://codecov.io/gh/PiedWeb/PiedWeb/branch/main/graph/badge.svg)](https://codecov.io/gh/PiedWeb/PiedWeb/branch/main) +[![Type Coverage](https://shepherd.dev/github/PiedWeb/PiedWeb/coverage.svg)](https://shepherd.dev/github/PiedWeb/PiedWeb) +[![Total Downloads](https://img.shields.io/packagist/dt/piedweb/crawler.svg?style=flat)](https://packagist.org/packages/piedweb/crawler) + +Web Crawler to check a few SEO basics. + +Use the collected data in your favorite spreadsheet software or retrieve it with your favorite language. + +French documentation is available at: +https://piedweb.com/seo/crawler + +## Install + +Via [Packagist](https://packagist.org/packages/piedweb/crawler) + +```bash +$ composer create-project piedweb/crawler +``` + +## Usage + +### Crawler CLI + +```bash +$ bin/console crawler:go $start +``` + +#### Arguments: + +``` + start Define where the crawl start. Eg: https://piedweb.com + You can specify an id from a previous crawl. Other options will not be listen. + You can use `last` to continue the last crawl (just stopped) +``` + +#### Options: + +``` + -l, --limit=LIMIT Define where a depth limit [default: 5] + -i, --ignore=IGNORE Virtual Robots.txt to respect (could be a file path or a URL). + -u, --user-agent=USER-AGENT Define the user-agent used during the crawl. 
[default: "SEO Pocket Crawler - PiedWeb.com/seo/crawler"] + -w, --wait=WAIT In Microseconds, the time to wait between 2 requests. Default 0,1s. [default: 100000] + -c, --cache-method=CACHE-METHOD Cache method used to store crawled responses (see the Recorder::CACHE_* constants). [default: 2] + -r, --restart=RESTART Permit to restart a previous crawl. Values 1 = fresh restart, 2 = restart from cache + -h, --help Display this help message + -q, --quiet Do not output any message + -V, --version Display this application version + --ansi Force ANSI output + --no-ansi Disable ANSI output + -n, --no-interaction Do not ask any interactive question + -v|vv|vvv, --verbose Increase the verbosity of messages: 1 for normal output, 2 for more verbose output and 3 for debug + + + +``` + +### Extract All External Links in 1s from a previous crawl + +```bash +$ bin/console crawler:external $id [--host] +``` + +``` + --id + id from a previous crawl + You can use `last` to show external links from the last crawl. + + --host -ho + flag permitting to get only host +``` + +### Calculate Page Rank + +Updates the previously generated `data.csv`. Then you can explore your website with the PoC `pagerank.html` +(from a local server, e.g. `npx http-server -c-1 --port 3000`). + +```bash +$ bin/console crawler:pagerank $id +``` + +``` + --id + id from a previous crawl + You can use `last` to calculate the page rank from the last crawl. +``` + +## Testing + +```bash +$ composer test +``` + +## Todo + +- [ ] Better Links Harvesting and Recording (record context (list, nav, sentence...)) +- [ ] Transform the PoC (Page Rank Visualizer) +- [ ] Complex Page Rank Calculator (with 301, canonical, nofollow, etc.) + +## Contributing + +Please see [contributing](https://dev.piedweb.com/contributing) + +## Credits + +- [PiedWeb](https://piedweb.com) aka [Robind4](https://twitter.com/Robind4) +- [All Contributors](https://github.com/PiedWeb/PiedWeb/graphs/contributors) + +## License + +The MIT License (MIT). 
Please see [License File](LICENSE) for more information. + +[![Latest Version](https://img.shields.io/github/tag/PiedWeb/PiedWeb.svg?style=flat&label=release)](https://github.com/PiedWeb/PiedWeb/tags) +[![Software License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/PiedWeb/PiedWeb/blob/master/LICENSE) +[![Build Status](https://img.shields.io/travis/PiedWeb/PiedWeb/master.svg?style=flat)](https://travis-ci.org/PiedWeb/PiedWeb) +[![Quality Score](https://img.shields.io/scrutinizer/g/PiedWeb/PiedWeb.svg?style=flat)](https://scrutinizer-ci.com/g/PiedWeb/PiedWeb) +[![Code Coverage](https://img.shields.io/scrutinizer/coverage/g/PiedWeb/PiedWeb.svg?style=flat)](https://scrutinizer-ci.com/g/PiedWeb/PiedWeb/code-structure) +[![Total Downloads](https://img.shields.io/packagist/dt/piedweb/crawler.svg?style=flat)](https://packagist.org/packages/piedweb/crawler) diff --git a/packages/crawler/bin/console b/packages/crawler/bin/console new file mode 100755 index 0000000..9cbe261 --- /dev/null +++ b/packages/crawler/bin/console @@ -0,0 +1,25 @@ +#!/usr/bin/env php +add(new \PiedWeb\Crawler\Command\CrawlerCommand()); +$application->add(new \PiedWeb\Crawler\Command\ShowExternalLinksCommand()); +$application->add(new \PiedWeb\Crawler\Command\PageRankCommand()); + +$application->run($input); diff --git a/packages/crawler/composer.json b/packages/crawler/composer.json new file mode 100644 index 0000000..d6ff258 --- /dev/null +++ b/packages/crawler/composer.json @@ -0,0 +1,35 @@ +{ + "name": "piedweb/crawler", + "type": "library", + "description": "Web Crawler to check few SEO basics.", + "keywords": [ + "Pied Web", + "Crawler" + ], + "homepage": "https://dev.piedweb.com", + "license": "MIT", + "authors": [ + { + "name": "Robin D. 
(ak Pied Web)", + "email": "contact@robin-d.fr", + "homepage": "https://piedweb.com" + } + ], + "require": { + "php": ">=8.1", + "piedweb/url-harvester": "*", + "league/csv": "^9.8", + "piedweb/curl": "*", + "symfony/console": "^6.1" + }, + "autoload": { + "psr-4": { + "PiedWeb\\Crawler\\": "src" + } + }, + "autoload-dev": { + "psr-4": { + "PiedWeb\\Crawler\\Test\\": "tests" + } + } +} diff --git a/packages/crawler/src/Command/CrawlerCommand.php b/packages/crawler/src/Command/CrawlerCommand.php new file mode 100644 index 0000000..c29338c --- /dev/null +++ b/packages/crawler/src/Command/CrawlerCommand.php @@ -0,0 +1,159 @@ +setDescription('Crawl a website.'); + + $this + ->addArgument( + 'start', + InputArgument::REQUIRED, + 'Define where the crawl start. Eg: https://piedweb.com' + .\PHP_EOL.'You can specify an id from a previous crawl. Other options will not be listen.' + .\PHP_EOL.'You can use `last` to continue the last crawl (just stopped).' + ) + ->addOption('limit', 'l', InputOption::VALUE_REQUIRED, 'Define where a depth limit', 5) + ->addOption( + 'ignore', + 'i', + InputOption::VALUE_REQUIRED, + 'Virtual Robots.txt to respect (could be a file path or a URL).' // loadVirtualRobotsTxt() only accepts a URL or an existing file + ) + ->addOption( + 'user-agent', + 'u', + InputOption::VALUE_REQUIRED, + 'Define the user-agent used during the crawl.', + 'SEO Pocket Crawler - PiedWeb.com/seo/crawler' + ) + ->addOption( + 'wait', + 'w', + InputOption::VALUE_REQUIRED, + 'In Microseconds, the time to wait between 2 requests. Default 0,1s.', + 100000 + ) + ->addOption( + 'cache-method', + 'c', + InputOption::VALUE_REQUIRED, + 'Cache method used to store crawled responses (see the Recorder::CACHE_* constants).', + \PiedWeb\Crawler\Recorder::CACHE_ID + ) + ->addOption( + 'restart', + 'r', + InputOption::VALUE_REQUIRED, + 'Permit to restart a previous crawl. 
Values 1 = fresh restart, 2 = restart from cache' + ) + ; + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $this->checkArguments($input); + + $start = microtime(true); + + $crawler = $this->initCrawler($input); + + $output->writeln(['', '', 'Crawl starting !', '============', '', 'ID: '.$crawler->config->getId()]); + $output->writeln([ + null !== $this->id ? ($input->getOption('restart') ? 'Restart' : 'Continue') : '', + '', + 'Details : ', + '- Crawl starting at '.$crawler->config->getStartUrl(), + '- User-Agent used `'.$crawler->config->userAgent.'`', // close the backtick opened around the user-agent + '- '.$crawler->config->sleepBetweenReqInMs.' microseconds between two requests', // value is passed to usleep(), hence microseconds + ]); + + $crawler->crawl(); + + $end = microtime(true); + + $output->writeln(['', '---------------', 'Crawl succeed', 'You can find your data in ']); + + echo realpath($crawler->config->getDataFolder()).'/data.csv'.\PHP_EOL; + + $output->writeln(['', '', '----Chrono----', round($end - $start, 2).'s', '', '']); + + return 0; + } + + public function checkArguments(InputInterface $input): void + { + $start = $input->getArgument('start'); + if (! filter_var($start, \FILTER_VALIDATE_URL)) { + if (! \is_string($start)) { + throw new \LogicException(); + } + + $this->id = $start; + } + } + + public function initCrawler(InputInterface $input): Crawler + { + if (null === $this->id) { + return new Crawler( + (new CrawlerConfig( + \intval($input->getOption('limit')), + \strval($input->getOption('user-agent')), + \intval($input->getOption('cache-method')), + \intval($input->getOption('wait')), + $this->loadVirtualRobotsTxt($input) + ))->setStartUrl(\strval($input->getArgument('start'))), + ! $input->getOption('quiet') + ); + } + + if ($input->getOption('restart')) { + return Crawler::restart( + $this->id, + 2 == $input->getOption('restart') ? true : false, // $fromCache + ! 
$input->getOption('quiet') + ); + } + + return Crawler::continue($this->id, ! 
$input->getOption('quiet')); + } + + public function loadVirtualRobotsTxt(InputInterface $input): string + { + if (null === $input->getOption('ignore')) { + return ''; + } + + $ignore = \strval($input->getOption('ignore')); + + if (filter_var($ignore, \FILTER_VALIDATE_URL)) { + return StaticClient::request($ignore); + } + + if (file_exists($ignore)) { + return \Safe\file_get_contents($ignore); + } + + throw new \Exception('An error occured with your --ignore option'); + } +} diff --git a/packages/crawler/src/Command/PageRankCommand.php b/packages/crawler/src/Command/PageRankCommand.php new file mode 100644 index 0000000..d158e94 --- /dev/null +++ b/packages/crawler/src/Command/PageRankCommand.php @@ -0,0 +1,45 @@ +setDescription('Add internal page rank to index.csv'); + + $this + ->addArgument( + 'id', + InputArgument::REQUIRED, + 'id from a previous crawl' + .\PHP_EOL.'You can use `last` to calcul page rank from the last crawl.' + ) + ; + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $pr = new SimplePageRankCalculator(\strval($input->getArgument('id'))); + + echo $pr->record().\PHP_EOL; + + new LinksVisualizer(\strval($input->getArgument('id'))); + + return 0; + } +} diff --git a/packages/crawler/src/Command/ShowExternalLinksCommand.php b/packages/crawler/src/Command/ShowExternalLinksCommand.php new file mode 100644 index 0000000..c4d8991 --- /dev/null +++ b/packages/crawler/src/Command/ShowExternalLinksCommand.php @@ -0,0 +1,66 @@ +setDescription('List external domain linked.'); + + $this + ->addArgument( + 'id', + InputArgument::REQUIRED, + 'id from a previous crawl' + .\PHP_EOL.'You can use `last` to get show external links from the last crawl.' + ) + ->addOption('host', 'ho', InputOption::VALUE_NONE, 'get only host') + ; + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $table = new Table($output); + + $table->setHeaders($input->getOption('host') ? 
['Host'] : ['url', 'from']); + + $links = (new ExtractExternalLinks(\strval($input->getArgument('id'))))->get(); + arsort($links); + $ever = []; + foreach ($links as $link => $from) { + if ($input->getOption('host')) { + $host = parse_url($link, \PHP_URL_HOST); + if ($host && ! isset($ever[$host])) { + $ever[$host] = 1; + $table->addRow([$host]); + } + } else { + $first = true; + foreach ($from as $url) { + $table->addRow([$first ? $link : '', $url]); + $first = false; + } + } + } + + $table->render(); + + return 0; + } +} diff --git a/packages/crawler/src/Crawler.php b/packages/crawler/src/Crawler.php new file mode 100644 index 0000000..50d51fe --- /dev/null +++ b/packages/crawler/src/Crawler.php @@ -0,0 +1,210 @@ + */ + private string $harvester = \PiedWeb\Crawler\CrawlerUrl::class; + + private int $currentClick = 0; + + private int $counter = 0; + + /** + * @var array + */ + private array $urls = []; + + /** @var string[] */ + private array $everCrawled = []; + + private bool $nothingUpdated = true; + + public readonly CrawlerConfig $config; + + public function __construct( + CrawlerConfig|string $config, + public readonly bool $debug = false + ) { + $this->config = \is_string($config) ? 
(new CrawlerConfig())->setStartUrl($config) : $config; + + $this->urls[$this->config->getStartUrl()->getAbsoluteUri()] = null; + } + + public static function continue( + string $id, + bool $debug = true, + ?string $dataDirectory = null + ): self { + $config = CrawlerConfig::loadFrom($id, $dataDirectory); + $current = new self($config, $debug); + + $dataFromPreviousCrawl = $current->config->getRecordPlayer()->getDataFromPreviousCrawl(); + $current->counter = $dataFromPreviousCrawl['counter']; + $current->currentClick = $dataFromPreviousCrawl['currentClick']; + $current->urls = $dataFromPreviousCrawl['urls']; + + return $current; + } + + public static function restart( + string $id, + bool $fromCache = false, + bool $debug = true, + ?string $dataDirectory = null + ): self { + $config = CrawlerConfig::loadFrom($id, $dataDirectory); + $current = new self($config, $debug); + if ($fromCache) { + $current->harvester = \PiedWeb\Crawler\CrawlerUrlFromCache::class; + } + + exec('rm -rf '.escapeshellarg($current->config->getDataFolder().Recorder::LINKS_DIR)); // reset Links; path is escaped so spaces or shell metacharacters cannot alter the command + $current->urls[$current->config->getStartUrl()->getAbsoluteUri()] = null; + + return $current; + } + + public function crawl(): bool + { + $this->debugInitCrawlLoop(); + + $absoluteUriList = array_keys($this->urls); + foreach ($absoluteUriList as $i => $absoluteUri) { + if (\in_array($absoluteUri, $this->everCrawled, true)) { + continue; + } + + if (0 !== $i) { + usleep($this->config->sleepBetweenReqInMs); + } + + $this->everCrawled[] = $absoluteUri; + $this->crawlUrl($absoluteUri); + } + + ++$this->currentClick; + + // Record after each Level: + $this->config->getRecorder()->record($this->getUrls()); + + $record = $this->nothingUpdated || $this->currentClick >= $this->config->depthLimit; + + return $record ? 
true : $this->crawl(); + } + + private function getUrl(string $absoluteUri): Url + { + return $this->urls[$absoluteUri] ?? 
$this->urls[$absoluteUri] = new Url($this->config->getBase().$absoluteUri, $this->currentClick); + } + + /** + * @return Url[] + */ + private function getUrls(): array + { + return array_filter($this->urls, fn ($url): bool => null !== $url); + } + + private function crawlUrl(string $absoluteUri): void + { + if (null !== $this->urls[$absoluteUri] && null !== $this->urls[$absoluteUri]->getCanBeCrawled()) { + $this->debug('déjà crawlée'); + + return; + } + + $this->debugCrawlUrl($absoluteUri); + $this->nothingUpdated = false; + ++$this->counter; + + $url = $this->getUrl($absoluteUri); + if (! $this->canBeCrawled($url)) { + $this->debug('can`t be crawled'); + + return; + } + + /** @var CrawlerUrl */ + $crawlerUrl = new $this->harvester($url, $this->config); + + $this->updateInboundLinksAndUrlsToParse($url, $url->getLinks()); + $url->setDiscovered(\count($this->urls)); + + $this->config->getRecorder()->recordLinksIndex($this->config->getBase(), $url, $this->urls, $url->getLinks()); + + $this->autosave(); + } + + private function autosave(): void + { + if ($this->config->autosave > 0 && 0 !== $this->counter && 0 === $this->counter % $this->config->autosave) { // autosave = 0 disables autosaving; the guard also avoids a division by zero + $this->debug(' --- auto-save'); + $this->config->getRecorder()->record($this->getUrls()); + } + } + + private function canBeCrawled(Url $url): bool + { + return $url->getCanBeCrawled() ?? 
+ $url->setCanBeCrawled($this->config->getVirtualRobots() + ->allows($this->config->getBase().$url->getUri(), $this->config->userAgent)); + } + + /** + * @param Link[] $links + */ + public function updateInboundLinksAndUrlsToParse(Url $url, array $links): void + { + $everAdd = []; + foreach ($links as $link) { + if (Link::LINK_INTERNAL !== $link->getType()) { + continue; + } + + $newUri = $link->getUrl()->getAbsoluteUri(); + $this->urls[$newUri] ??= new Url( + $link->getPageUrl()->__toString(), + $this->currentClick + 1 + ); + if (isset($everAdd[$newUri])) { + continue; + } + + $everAdd[$newUri] = 1; + if (! 
$link->mayFollow) { + $this->urls[$newUri]->incrementInboundLinksNofollow(); + } else { + $this->urls[$newUri]->incrementInboundLinks(); + } + } + } + + private function debug(string $text): void + { + if ($this->debug) { + echo $text.\PHP_EOL; + } + } + + private function debugCrawlUrl(string $url): void + { + if ($this->debug) { + echo $this->counter.'/'.\count($this->urls).' '.$this->config->getBase().$url.\PHP_EOL; + } + } + + private function debugInitCrawlLoop(): void + { + if ($this->debug) { + echo \PHP_EOL.\PHP_EOL.'// -----'.\PHP_EOL.'// '.$this->counter.' crawled / ' + .\count($this->urls).' found '.\PHP_EOL.'// -----'.\PHP_EOL; + } + } +} diff --git a/packages/crawler/src/CrawlerConfig.php b/packages/crawler/src/CrawlerConfig.php new file mode 100644 index 0000000..0701d93 --- /dev/null +++ b/packages/crawler/src/CrawlerConfig.php @@ -0,0 +1,208 @@ + $params + */ + public function __construct( + public readonly int $depthLimit = 0, + public readonly string $userAgent = 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)', + public readonly int $cacheMethod = Recorder::CACHE_URI, + public readonly int $sleepBetweenReqInMs = 10000, // microseconds + public readonly string $virtualRobotsTxtRules = '', + public readonly bool $executeJs = false, + public readonly array $toHarvest = [ + 'indexable', + 'links', + 'textData', + 'title', + 'h1', + 'canonical', + ], + string $dataDirectory = '', + public readonly int $autosave = 500 // number of Urls we can crawled before saving (0 = autosaving disabled) + ) { + $this->dataDirectory = self::dataDirectory($dataDirectory); + } + + public function setStartUrl(string $startUrl): self + { + $this->startUrl = new UrlManipuler($startUrl); + $this->id ??= date('ymdHi').'-'.$this->startUrl->getHost(); + + return $this; + } + + public function getStartUrl(): UrlManipuler + { + return $this->startUrl; + } + + public static function dataDirectory(?string $dataDirectory = null): string + { + 
$dataDirectory = (string) $dataDirectory; + + return rtrim('' !== $dataDirectory ? $dataDirectory : __DIR__.'/../data', '/'); + } + + /** + * @return string id + */ + public static function getLastCrawl(string $dataDirectory): string + { + $dir = \Safe\scandir($dataDirectory); + $lastCrawl = null; + $lastRunAt = null; + + foreach ($dir as $file) { + if ('.' != $file && '..' != $file + && is_dir($dataDirectory.'/'.$file) + && filemtime($dataDirectory.'/'.$file) > $lastRunAt) { + $lastCrawl = $file; + $lastRunAt = filemtime($dataDirectory.'/'.$file); + } + } + + if (null === $lastCrawl) { + throw new \Exception('No crawl previously runned'); + } + + return $lastCrawl; + } + + public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self + { + $dataDirectory = self::dataDirectory($dataDirectory); + + if ('last' === $crawlId) { + $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/')); + } + + $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json'; + if (! 
file_exists($configFilePath)) { + throw new \Exception('Crawl `'.$crawlId.'` not found ('.$configFilePath.').'); + } + + $config = \Safe\json_decode(file_get_contents($configFilePath), true); // @phpstan-ignore-line + + return (new self( + $config[2], // @phpstan-ignore-line + $config[3], // @phpstan-ignore-line + $config[4], // @phpstan-ignore-line + $config[5], // @phpstan-ignore-line + $config[6], // @phpstan-ignore-line + $config[7], // @phpstan-ignore-line + $config[8], // @phpstan-ignore-line + $dataDirectory + ))->setStartUrl(\strval($config[1]))// @phpstan-ignore-line + ->setId($config[0]); // @phpstan-ignore-line + } + + public function recordConfig(): void + { + $this->getRecorder(); // permit to create folder + file_put_contents($this->getDataFolder().'/config.json', \Safe\json_encode([ + $this->id, + $this->getStartUrl()->get(), + $this->depthLimit, + $this->userAgent, + $this->cacheMethod, + $this->sleepBetweenReqInMs, + $this->virtualRobotsTxtRules, + $this->executeJs, + $this->toHarvest, + ])); + } + + private static function getDataFolderFrom(string $id, ?string $path): string + { + return ($path ?? __DIR__.'/../data').'/'.$id; + } + + public function getDataFolder(): string + { + return $this->dataDirectory.'/'.$this->id; + } + + public function getVirtualRobots(): RobotsTxt + { + if (null === $this->virtualRobots) { + $this->virtualRobots = new RobotsTxt($this->virtualRobotsTxtRules); + } + + return $this->virtualRobots; + } + + private function setId(string $id): self + { + $this->id = $id; + + return $this; + } + + public function getId(): string + { + return $this->id ?? throw new \Exception('id is not setted'); + } + + public function getBase(): string + { + return $this->base ??= preg_match('@^(http://|https://)?[^/\?#]+@', $url = $this->startUrl->get(), $match) ? 
$match[0] : $url; + } + + public function getUrl(Url $url): UrlManipuler + { + return new UrlManipuler($this->getBase().$url->getUri()); + } + + public function getRobotsTxt(): RobotsTxt + { + if (null === $this->robotsTxt) { + $this->robotsTxt = (new RobotsTxtExtractor())->get($this->startUrl); + } + + return $this->robotsTxt; + } + + public function getRecorder(): Recorder + { + if (null === $this->recorder) { + $this->recorder = new Recorder($this->getDataFolder(), $this->cacheMethod); + } + + return $this->recorder; + } + + public function getRecordPlayer(): RecordPlayer + { + if (null === $this->recordPlayer) { + $this->recordPlayer = new RecordPlayer($this); + } + + return $this->recordPlayer; + } +} diff --git a/packages/crawler/src/CrawlerConfig.php~ b/packages/crawler/src/CrawlerConfig.php~ new file mode 100644 index 0000000..ee78367 --- /dev/null +++ b/packages/crawler/src/CrawlerConfig.php~ @@ -0,0 +1,41 @@ + + + // could be add in an other class.. + /** + * @var array + */ + private array $index = []; + + /** + * @return array + */ + private function getIndexFromPreviousCrawl(): array + { + if ([] !== $this->index) { + return $this->index; + } + + $indexFilePath = $this->getDataFolder().'/index.csv'; + if (! file_exists($indexFilePath)) { + throw new \Exception('Previous crawl\'s data not found (index.csv)'); + } + + $csv = Reader::createFromPath($indexFilePath, 'r'); + $csv->setHeaderOffset(0); + + $records = $csv->getRecords(); + foreach ($records as $r) { + if (!is_array($r) || !isset($r['id'])||!isset($r['uri'])) throw new LogicException(); + $this->index[intval($r['id'])] = new Url($this->getBase().$r['uri'], 0); + $this->index[intval($r['id'])]->setId($r['id']); + } + + return $this->index; + } + + public function getUrlFromId(int $id, bool $addBase = true): ?string + { + $index = $this->getIndexFromPreviousCrawl(); + + return isset($index[$id]) ? ($addBase ? 
$this->getBase() : '').$index[$id]->getUri() : null; + } \ No newline at end of file diff --git a/packages/crawler/src/CrawlerUrl.php b/packages/crawler/src/CrawlerUrl.php new file mode 100644 index 0000000..8c8ace3 --- /dev/null +++ b/packages/crawler/src/CrawlerUrl.php @@ -0,0 +1,217 @@ +harvest(); + } + + protected function harvest(): void + { + $this->request(); + if (0 !== $this->url->getNetworkStatus()) { + return; + } + + if ($this->isRedirection()) { + return; + } + + $this->defaultHarvesting(); + } + + protected function getCurlClient(): ExtendedClient + { + return self::$curlClient ??= (new ExtendedClient()) + ->setDefaultGetOptions() + ->setDefaultSpeedOptions() + ->setMaximumResponseSize(1_000_000) // 1Mo + ->fakeBrowserHeader() + ->setUserAgent($this->config->userAgent) + ->setOpt(\CURLOPT_MAXREDIRS, 0) + ->setOpt(\CURLOPT_FOLLOWLOCATION, false) + // ->setOpt(\CURLOPT_COOKIE, false) + ->setOpt(\CURLOPT_CONNECTTIMEOUT, 20) + ->setOpt(\CURLOPT_TIMEOUT, 80); + } + + protected function request(): void + { + if ($this->config->executeJs) { + throw new \Exception('Not yet implemented'); + } + + $request = $this->getCurlClient() + ->request($this->config->getBase().$this->url->getUri()); + + if (! $request) { + $this->url->setNetworkStatus( + 42 != $this->getCurlClient()->getError() ? NetworkStatus::NETWORK_ERROR : NetworkStatus::TOO_BIG + ); + $responseToCache = 'curl_error_code:'.$this->getCurlClient()->getError(); + } + + $this->setUrlDataFromResponse($this->getCurlClient()->getResponse()); + + $this->config->getRecorder()->cache( + $responseToCache ?? 
$this->getCurlClient()->getResponse(), + $this->url + ); + $this->url->setSource($this->config->getRecorder()->getCacheFilePath($this->url)); + } + + protected function setUrlDataFromResponse(Response $response): void + { + $this->url->setHeaders($response->getRawHeaders()); + $this->url->setStatusCode($response->getStatusCode()); + $this->url->setMimeType($response->getMimeType()); + $this->url->setResponseTime((int) $response->getInfo('total_time')); + $this->url->setSize((int) $response->getInfo('size_download')); + + if ('text/html' !== $response->getMimeType()) { + $this->url->setNetworkStatus(NetworkStatus::NOT_HTML); + } elseif (200 === $response->getStatusCode()) { + $this->url->setHtml($response->getBody()); + } + } + + /** + * permit to easily extend and change what is harvested, for example adding : + * $this->harvestBreadcrumb(); + * $this->url->setKws(','.implode(',', array_keys($this->getHarvester()->getKws())).','); // Slow ~20% + * $this->url->setRatioTextCode($this->getHarvester()->getRatioTxtCode()); // Slow ~30% + * $this->url->setH1($this->getHarvester()->getUniqueTag('h1') ?? '');. + */ + protected function defaultHarvesting(): void + { + foreach ($this->config->toHarvest as $toHarvest) { + $toHarvest = ucfirst($toHarvest); + if (! method_exists($this, $harvestMethod = 'harvest'.$toHarvest)) { + throw new \LogicException($harvestMethod.' 
doesn`t exist.'); + } + + $this->$harvestMethod(); + } + } + + protected function isRedirection(): bool + { + $redirLink = (new RedirectionExtractor($this->url->getUrl(), $this->url->getParsedHeaders())) + ->getRedirectionLink(); + + if (null === $redirLink) { + return false; + } + + $this->links[] = $redirLink; + + $this->url->setIndexable(false); + $this->url->setIndexableStatus(Indexable::NOT_INDEXABLE['redir']); + + return true; + } + + protected function harvestIndexable(): void + { + $indexable = new Indexable( + $this->url->getUrl(), + (new RobotsTxtExtractor())->get($this->url->getUrl()), + $this->url->getDomCrawler(), + $this->url->getStatusCode(), + $this->url->getHeaders() + ); + $this->url->setIndexable($indexable->isIndexable()); + $this->url->setIndexableStatus($indexable->getIndexableStatus()); + } + + protected function harvestLinks(): void + { + $linksExtractor = new LinksExtractor( + $this->url->getUrl(), + $this->url->getDomCrawler(), + $this->url->getHeaders(), + LinksExtractor::SELECT_ALL + ); + $links = $linksExtractor->get(); + foreach ($links as $link) { + $this->links[] = $link; + } + + $this->url->setLinks($links); + $this->url->setLinksTotal(\count($links)); + $this->url->setLinksSelf(\count($linksExtractor->get(Link::LINK_SELF))); + $this->url->setLinksInternal(\count($linksExtractor->get(Link::LINK_INTERNAL))); + $this->url->setLinksSub(\count($linksExtractor->get(Link::LINK_SUB))); + $this->url->setLinksExternal(\count($linksExtractor->get(Link::LINK_EXTERNAL))); + $this->url->setLinksDuplicate($linksExtractor->getNbrDuplicateLinks()); + } + + protected function harvestTextData(): void + { + $textData = new TextData($this->url->getHtml(), $this->url->getDomCrawler()); + $this->url->setWordCount($textData->getWordCount()); + $this->url->setTextRatio($textData->getRatioTxtCode()); + $this->url->setExpressions($textData->getTextAnalysis()->getExpressions(2)); + $this->url->setFlatContent($textData->getFlatContent()); + } + + protected 
/**
 * Store the first `h1` of the page on the Url (empty string when there is none).
 */
protected function harvestH1(): void
{
    $tagExtractor = new TagExtractor($this->url->getDomCrawler());
    $firstH1 = $tagExtractor->getFirst('h1');

    $this->url->setH1($firstH1 ?? '');
}
/**
 * Register, for one `From_<id>` file, every line that does not start with the
 * crawled base.
 *
 * NOTE(review): Recorder::recordOutboundLink() writes each line as
 * `url;anchor;mayFollow;type`, so the keys stored here carry those extra
 * fields and not only the URL — confirm whether that is intended.
 *
 * @param string $strUrlsLinked trimmed content of the file, one link per line
 * @param string $from          value identifying the page the links come from
 */
private function harvestExternalLinks(string $strUrlsLinked, string $from): void
{
    if ('' === $strUrlsLinked) {
        return;
    }

    $base = $this->config->getBase();

    foreach (explode(\chr(10), $strUrlsLinked) as $line) {
        // The file is written with PHP_EOL: drop the "\r" left behind when
        // that separator is "\r\n", otherwise it ends up inside the key.
        $line = rtrim($line, \chr(13));

        // Blank lines are not links; internal links are not our concern.
        if ('' === $line || str_starts_with($line, $base)) {
            continue;
        }

        $this->external[$line][] = $from;
    }
}
/**
 * Numeric identifier attributed to each known MIME type.
 *
 * Identifiers must stay unique and keys must be exact MIME type strings
 * (no surrounding whitespace), otherwise a lookup silently fails.
 *
 * @var array<string, int>
 */
public const TYPES = [
    'audio/aac' => 1,
    'application/x-abiword' => 2,
    'application/octet-stream' => 3,
    'video/x-msvideo' => 4,
    'application/vnd.amazon.ebook' => 5,
    'image/bmp' => 6,
    'application/x-bzip' => 7,
    'application/x-bzip2' => 8,
    'application/x-csh' => 9,
    'text/css' => 10,
    'text/csv' => 11,
    'application/msword' => 12,
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 13,
    'application/vnd.ms-fontobject' => 14,
    'application/epub+zip' => 15,
    'image/gif' => 16,
    'text/html' => 17,
    'image/x-icon' => 18,
    'text/calendar' => 19,
    'application/java-archive' => 20,
    'image/jpeg' => 21,
    'application/javascript' => 22, // was 24, which collided with audio/midi
    'application/json' => 23,
    'audio/midi' => 24,
    'video/mpeg' => 25,
    'application/vnd.apple.installer+xml' => 26,
    'application/vnd.oasis.opendocument.presentation' => 27,
    'application/vnd.oasis.opendocument.spreadsheet' => 28,
    'application/vnd.oasis.opendocument.text' => 29,
    'audio/ogg' => 30,
    'video/ogg' => 31,
    'application/ogg' => 32,
    'font/otf' => 33,
    'image/png' => 34,
    'application/pdf' => 35,
    'application/vnd.ms-powerpoint' => 36,
    'application/x-rar-compressed' => 37,
    'application/rtf' => 38,
    'application/x-sh' => 39,
    'image/svg+xml' => 40,
    'application/x-shockwave-flash' => 41,
    'application/x-tar' => 42,
    'image/tiff' => 43,
    'application/typescript' => 44,
    'font/ttf' => 45,
    'application/vnd.visio' => 46,
    'audio/x-wav' => 47,
    'audio/webm' => 48,
    'video/webm' => 49,
    'image/webp' => 50,
    'font/woff' => 51,
    'font/woff2' => 52,
    'application/xhtml+xml' => 53,
    'application/vnd.ms-excel' => 54,
    'application/xml' => 55,
    'application/vnd.mozilla.xul+xml' => 56,
    'application/zip' => 57,
];
/**
 * Rebuild the Url objects recorded in `data.csv` by a previous crawl.
 *
 * Every row yields a Url keyed by its uri. A row is only hydrated (and
 * counted) when `can_be_crawled` is not empty and its network status is not
 * NetworkStatus::NETWORK_ERROR, so that failed urls get requested again.
 * `currentClick` is read from the last row of the file.
 *
 * @return array{'urls': Url[], 'counter': int, 'currentClick': int}
 *
 * @throws \Exception      when data.csv does not exist
 * @throws \LogicException when a row has no `uri` column
 */
public function getDataFromPreviousCrawl(): array
{
    $dataFilePath = $this->config->getDataFolder().'/data.csv';
    if (! file_exists($dataFilePath)) {
        throw new \Exception('Previous crawl\'s data not found (data.csv)');
    }

    $urls = [];
    $counter = 0;
    // Declared before the loop: after it, $r holds the last row read.
    $r = [];

    $csv = Reader::createFromPath($dataFilePath, 'r');
    $csv->setHeaderOffset(0);

    foreach ($csv->getRecords() as $r) {
        if (! \is_array($r) || ! isset($r['uri'])) {
            throw new \LogicException();
        }

        $url = new Url($this->config->getBase().$r['uri']);
        $urls[$r['uri']] = $url;

        if (! empty($r['can_be_crawled'] ?? '')
            // we will retry network error
            && NetworkStatus::NETWORK_ERROR != ($r['network_status'] ?? true)
        ) {
            foreach ($r as $k => $v) {
                // ARRAY_EXPORTED is a list of names: membership must be
                // tested on its values, not on its (integer) keys.
                $kFunction = 'set'.Stringy::create($k)->camelize()
                    .(\in_array($k, Url::ARRAY_EXPORTED, true) ? 'FromString' : '');
                if (! method_exists($url, $kFunction)) {
                    continue;
                }

                $url->$kFunction($v);
            }

            ++$counter;
        }
    }

    $currentClick = \intval($r['click'] ?? 0);

    return [
        'urls' => $urls,
        'counter' => $counter,
        'currentClick' => $currentClick,
    ];
}
/**
 * Build the cache file path mirroring the uri of the given Url and create
 * the intermediate folders on the way.
 *
 * The uri is trimmed of its slashes then suffixed with one, so the last
 * exploded part is the file name and falls back to `index.html` when empty.
 */
private function getCacheFilePathWithUrlAsFilename(Url $url): string
{
    $urlPart = explode('/', trim($url->getUri(), '/').'/');
    // explode() always returns at least one element: the last one is the file.
    $fileName = array_pop($urlPart);

    $folder = $this->folder.self::CACHE_DIR;
    foreach ($urlPart as $part) {
        // Each uri segment becomes exactly one folder level.
        $folder .= '/'.$part;
        if (! file_exists($folder) || ! is_dir($folder)) {
            mkdir($folder);
        }
    }

    return $folder.'/'.('' === $fileName ? 'index.html' : $fileName);
}
/**
 * Append the links found on `$from` to the links index (`From,To` rows) and
 * record the matching inbound / outbound link files.
 *
 * The `To` column holds the id of the linked Url, `0` for a link pointing to
 * a page which is not in `$urls` (external) and `-1` for a link url already
 * seen on this page.
 *
 * @param array<string, Url> $urls  known urls indexed by uri
 * @param Link[]             $links links found on `$from`
 */
public function recordLinksIndex(string $base, Url $from, array $urls, array $links): void
{
    if (false === $this->recordLinks) {
        return;
    }

    $fromId = $from->getId();
    $everAdded = []; // set of link urls already written for this page
    $content = '';

    foreach ($links as $link) {
        // removeBase() returns null when the link is outside $base: never
        // use that null as an array offset (it would alias the '' uri).
        $uri = self::removeBase($base, $link->getPageUrl());
        $to = null !== $uri ? ($urls[$uri] ?? null) : null;

        $linkUrl = (string) $link->getUrl();
        if (isset($everAdded[$linkUrl])) {
            // like Google, we should not add duplicate link,
            // so we say the juice is lost -1
            $toId = -1;
        } else {
            $everAdded[$linkUrl] = true;
            $toId = null !== $to ? $to->getId() : 0; // 0 = external
        }

        $content .= $fromId.','.$toId.\PHP_EOL;

        if (null !== $to) {
            $this->recordInboundLink($link, $from, $to);
        }
    }

    \Safe\file_put_contents($this->folder.self::LINKS_DIR.'/Index.csv', $content, \FILE_APPEND);

    $this->recordOutboundLink($from, $links);
}
+

Crawler : Page Rank Bubble Chart

/**
 * Iteratively compute the page rank of every id present in the links index.
 *
 * For each page: PR = d * sum(PR(linking page) / outbound links of that page)
 *                     + (1 - d) / number of pages
 * where d is the damping factor. A linking page without a rank yet counts for 0.
 */
private function calcul(): void
{
    // Neither the id list nor the page count change between iterations.
    $ids = array_keys($this->linksTo);
    $pagesNbr = $this->getPagesNbr();

    for ($iteration = 0; $iteration < $this->maxIteration; ++$iteration) {
        foreach ($ids as $id) {
            $sumPR = 0.0;
            foreach ($this->getLinksTo($id) as $link) {
                // Parentheses are required: `/` binds tighter than `??`,
                // without them the rank was never divided.
                $sumPR += ($this->results[$link] ?? 0) / $this->getNbrLinksFrom($link);
            }

            $this->results[$id] = $this->dampingFactor * $sumPR + (1 - $this->dampingFactor) / $pagesNbr;
        }
    }
}
0) + 1; + } + } +} diff --git a/packages/crawler/src/Url.php b/packages/crawler/src/Url.php new file mode 100644 index 0000000..626facb --- /dev/null +++ b/packages/crawler/src/Url.php @@ -0,0 +1,750 @@ + */ + private array $flatContent = []; + + private int $textRatio = 0; + + private int $responseTime = 0; + + private int $size = 0; + + private string $title = ''; + + private int $titlePixelWidth = 0; + + private string $metaDescription = ''; + + private string $h1 = ''; + + /** + * @var array + */ + private array $expressions = []; + + private string $expressionsHash = ''; + + /** + * @var array + */ + private array $hrefLangList = []; + + private ?string $canonical = null; + + private \DateTimeInterface $updatedAt; + + /** + * @var Link[] + */ + private array $breadcrumb = []; + + private ?DomCrawler $domCrawler = null; + + /** + * @var string[] + */ + public const ARRAY_EXPORTED = [ + 'expressions', 'breadcrumb', + ]; + + /** @var string[] */ + public const SERIALIZABLE = [ + 'stringUrl', + 'updatedAt', + 'statusCode', + 'mimeType', + 'links', + 'duplicateLinks', + 'externalLinks', + 'indexable', + 'indexableStatus', + 'canonical', + 'h1', + 'flatContent', + 'expressions', + 'wordCount', + 'textRatio', + 'size', + 'responseTime', + 'metaDescription', + 'hrefLangList', + ]; + + /** + * @var string[] + */ + public const EXPORTABLE = [ + 'id', + 'discovered', + 'uri', + 'networkStatus', + 'source', + 'headers', + 'statusCode', + 'click', + 'pagerank', + 'inboundlinks', + 'inboundlinksNofollow', + 'linksTotal', + 'linksInternal', + 'linksSelf', + 'linksSub', + 'linksExternal', + 'canBeCrawled', + 'indexable', + 'indexableStatus', + 'mimeType', + 'wordCount', + 'textRatio', + 'responseTime', + 'size', + 'title', + 'titlePixelWidth', + 'metaDescription', + 'h1', + 'canonical', + 'expressions', + 'expressionsHash', + 'updatedAt', + ]; + + public function __toJson(): string + { + $return = []; + foreach (self::SERIALIZABLE as $name) { + $getter = 'get'.ucfirst($name); 
/**
 * Export every property listed in self::EXPORTABLE as a string.
 *
 * Dates are formatted with `y-m-d H:i:s`; a value which is neither scalar
 * nor null is exported through its `get<Property>String()` getter.
 *
 * @return array<string, string>
 */
public function toArray(): array
{
    $row = [];

    foreach (self::EXPORTABLE as $property) {
        $accessor = 'get'.ucfirst($property);
        $exported = $this->$accessor();

        if ($exported instanceof \DateTimeInterface) {
            $exported = $exported->format('y-m-d H:i:s');
        }

        // Arrays and objects need their dedicated string getter.
        $castable = null === $exported || \is_scalar($exported);
        if (! $castable) {
            $stringAccessor = $accessor.'String';
            $exported = $this->$stringAccessor();
        }

        $row[$property] = (string) $exported;
    }

    return $row;
}
setPagerank(float|string|int $pagerank): void + { + $this->pagerank = (float) $pagerank; + } + + public function getInboundlinks(): int + { + return $this->inboundlinks; + } + + public function incrementInboundLinks(): void + { + ++$this->inboundlinks; + } + + public function setInboundlinks(string|int $inboundlinks): void + { + $this->inboundlinks = (int) $inboundlinks; + } + + public function getInboundlinksNofollow(): int + { + return $this->inboundlinksNofollow; + } + + public function incrementInboundLinksNofollow(): void + { + ++$this->inboundlinksNofollow; + } + + public function setInboundlinksNofollow(string|int $inboundlinksNofollow): void + { + $this->inboundlinksNofollow = (int) $inboundlinksNofollow; + } + + public function getCanBeCrawled(): ?bool + { + return $this->canBeCrawled; + } + + public function setCanBeCrawled(string|int|bool $canBeCrawled): bool + { + return $this->canBeCrawled = (bool) $canBeCrawled; + } + + public function getIndexable(): bool + { + return $this->indexable; + } + + public function setIndexable(string|int|bool $indexable): void + { + $this->indexable = (bool) $indexable; + } + + public function getMimeType(): string + { + return $this->mimeType; + } + + /** @return Link[] */ + public function getLinks(): array + { + return $this->links; + } + + /** @param Link[] $links */ + public function setLinks(array $links): void + { + $this->links = $links; + } + + public function getDuplicateLinks(): int + { + return $this->linksDuplicate; + } + + public function getLinksDuplicate(): int + { + return $this->linksDuplicate; + } + + public function setLinksDuplicate(string|int $linksDuplicate): void + { + $this->linksDuplicate = (int) $linksDuplicate; + } + + public function getLinksSelf(): int + { + return $this->linksSelf; + } + + public function setLinksSelf(int $linksSelf): void + { + $this->linksSelf = $linksSelf; + } + + public function getLinksInternal(): int + { + return $this->linksInternal; + } + + public function 
setLinksInternal(int|string $linksInternal): void + { + $this->linksInternal = (int) $linksInternal; + } + + public function getLinksSub(): int + { + return $this->linksSub; + } + + public function setLinksSub(int|string $linksSub): void + { + $this->linksSub = (int) $linksSub; + } + + public function getExternalLinks(): int + { + return $this->linksExternal; + } + + public function getLinksExternal(): int + { + return $this->linksExternal; + } + + public function setLinksExternal(int $linksExternal): void + { + $this->linksExternal = $linksExternal; + } + + public function getWordCount(): int + { + return $this->wordCount; + } + + public function setWordCount(int $wordCount): void + { + $this->wordCount = $wordCount; + } + + public function getResponseTime(): int + { + return $this->responseTime; + } + + public function setResponseTime(int $responseTime): void + { + $this->responseTime = $responseTime; + } + + public function getSize(): int + { + return $this->size; + } + + public function setSize(int $size): void + { + $this->size = $size; + } + + public function getTitle(): string + { + return $this->title; + } + + public function setTitle(string $title): void + { + $this->title = $title; + } + + public function getUpdatedAt(): \DateTimeInterface + { + return $this->updatedAt; + } + + public function setUpdatedAt(string|\DateTimeInterface $updatedAt): void + { + $this->updatedAt = \is_string($updatedAt) ? 
\Safe\DateTime::createFromFormat('y-m-d H:i:s', $updatedAt) : $updatedAt; + } + + public function getNetworkStatus(): int + { + return $this->networkStatus; + } + + public function setNetworkStatus(int $networkStatus): void + { + $this->networkStatus = $networkStatus; + } + + public function getHeaders(): string + { + return $this->headers; + } + + public function setHeaders(string $headers): void + { + $this->headers = $headers; + } + + /** + * @return array + */ + public function getParsedHeaders(): array + { + return Helper::httpParseHeaders($this->headers); + } + + public function getSource(): string + { + return $this->source; + } + + public function setSource(string $source): void + { + $this->source = realpath($source) ?: ''; + } + + public function getHtml(): string + { + return $this->html; + } + + public function setHtml(string $html): void + { + $this->html = $html; + } + + public function getDomCrawler(): DomCrawler + { + return $this->domCrawler ??= new DomCrawler($this->html); + } + + public function getIndexableStatus(): int + { + return $this->indexableStatus; + } + + public function setIndexableStatus(int $indexableStatus): void + { + $this->indexableStatus = $indexableStatus; + } + + public function getStatusCode(): int + { + return $this->statusCode; + } + + public function setStatusCode(int|string $statusCode): void + { + $this->statusCode = (int) $statusCode; + } + + public function getUrl(): UrlManipuler + { + return $this->url; + } + + public function getStringUrl(): string + { + return (string) $this->getUrl(); + } + + /** + * @return Link[] + */ + public function getInboundLinksList(): array + { + return $this->inboundLinksList; + } + + /** + * @param Link[] $inboundLinksList + */ + public function setInboundLinksList(array $inboundLinksList): void + { + $this->inboundLinksList = $inboundLinksList; + } + + public function getLinksTotal(): int + { + return $this->linksTotal; + } + + public function setLinksTotal(int $linksTotal): void + { + 
$this->linksTotal = $linksTotal; + } + + public function getTitlePixelWidth(): int + { + return $this->titlePixelWidth; + } + + public function setTitlePixelWidth(int $titlePixelWidth): void + { + $this->titlePixelWidth = $titlePixelWidth; + } + + /** + * Get the value of textRatio. + */ + public function getTextRatio(): int + { + return $this->textRatio; + } + + public function setTextRatio(int $textRatio): void + { + $this->textRatio = $textRatio; + } + + public function getMetaDescription(): string + { + return $this->metaDescription; + } + + public function setMetaDescription(string $metaDescription): void + { + $this->metaDescription = $metaDescription; + } + + public function getH1(): string + { + return $this->h1; + } + + public function setH1(string $h1): void + { + $this->h1 = $h1; + } + + /** + * @return array + */ + public function getExpressions(): array + { + return $this->expressions; + } + + public function getExpressionsString(): string + { + $return = ''; + foreach ($this->expressions as $kw => $v) { + $return .= $kw.' 
/**
 * Parse the text produced by getExpressionsString() (one `expression :: count`
 * per line) and store the result through setExpressions().
 *
 * The split is done on the LAST `::` of the line, so an expression which
 * itself contains `::` survives the round trip. A line without separator is
 * kept with a count of 0.
 */
public function setExpressionsFromString(string $kws): void
{
    $expressions = [];

    foreach (explode(\chr(10), trim($kws)) as $line) {
        if ('' === $line) {
            continue;
        }

        $separatorPos = strrpos($line, '::');
        if (false === $separatorPos) {
            $expressions[trim($line)] = 0;

            continue;
        }

        $expression = trim(substr($line, 0, $separatorPos));
        $expressions[$expression] = (int) trim(substr($line, $separatorPos + 2));
    }

    $this->setExpressions($expressions);
}
/**
 * Crawl, restart then continue a crawl configured with Recorder::CACHE_ID
 * and check the index file is there after the first and the last run.
 */
public function testWitchCachId(): void
{
    $crawler = new Crawler(
        (new CrawlerConfig(
            0,
            'HelloMe',
            Recorder::CACHE_ID
        ))->setStartUrl(
            'https://dev.piedweb.com/'
        )
    );
    $crawler->config->recordConfig();
    $crawler->crawl();

    // assertFileExists reports the missing path on failure, assertTrue does not.
    $this->assertFileExists($crawler->config->getDataFolder().'/index.csv');

    $restart = Crawler::restart($crawler->config->getId());
    $restart->crawl();

    $continue = Crawler::continue($crawler->config->getId());
    $continue->crawl();

    $this->assertFileExists($crawler->config->getDataFolder().'/index.csv');
}
parent::request($target); + // if ($this->getError() && !in_array($this->getError(), [18], true)) { return false; } + // Permits to transform HEAD request in GET request if (1 === $this->optChangeDuringRequest) { return $this->request(); @@ -274,10 +283,16 @@ public function request(?string $target = null, bool $updateRefererAndCookies = $this->setReferer($effectiveUrl); } - if ($updateRefererAndCookies && ($cookies = $this->getResponse()->getCookies()) !== null) { - $this->setCookie($cookies); + if (! $updateRefererAndCookies) { + return $request; } + if (($cookies = $this->getResponse()->getCookies()) === null) { + return $request; + } + + $this->setCookie($cookies); + return $request; } diff --git a/packages/curl/src/Helper.php b/packages/curl/src/Helper.php index 0930af5..b3f5c53 100644 --- a/packages/curl/src/Helper.php +++ b/packages/curl/src/Helper.php @@ -2,6 +2,9 @@ namespace PiedWeb\Curl; +/** + * @see \PiedWeb\Curl\Test\HelperTest + */ class Helper { /** diff --git a/packages/curl/src/Response.php b/packages/curl/src/Response.php index bef369b..8007d88 100644 --- a/packages/curl/src/Response.php +++ b/packages/curl/src/Response.php @@ -110,7 +110,7 @@ public function getRawHeaders(): string } /** - * @return string requested url + * @return string|null requested url */ public function getUrl(): ?string { @@ -142,7 +142,7 @@ public function getCookies(): ?string /** * Get information (curl info). 
* - * @param string $key to get + * @param string|null $key to get * * @return int|string|array|null */ diff --git a/packages/curl/tests/HelperTest.php b/packages/curl/tests/HelperTest.php index 71a05a5..fa26260 100644 --- a/packages/curl/tests/HelperTest.php +++ b/packages/curl/tests/HelperTest.php @@ -8,7 +8,7 @@ class HelperTest extends \PHPUnit\Framework\TestCase { - public function testSchemeFromProxy() + public function testSchemeFromProxy(): void { $proxy = '75.157.242.104:59190'; $this->assertSame('http://', Helper::getSchemeFrom($proxy)); @@ -18,14 +18,14 @@ public function testSchemeFromProxy() $this->assertSame('75.157.242.104:59190', $proxy); } - public function testCheckContentType() + public function testCheckContentType(): void { $line = 'Content-Type: text/html; charset=utf-8'; $expected = 'text/html'; $this->assertTrue(Helper::checkContentType($line, $expected)); } - public function testCheckStatusCode() + public function testCheckStatusCode(): void { $line = 'HTTP/1.1 200 OK'; $expected = 200; diff --git a/packages/curl/tests/RequestTest.php b/packages/curl/tests/RequestTest.php index d84e268..0cbfb94 100644 --- a/packages/curl/tests/RequestTest.php +++ b/packages/curl/tests/RequestTest.php @@ -11,15 +11,13 @@ class RequestTest extends \PHPUnit\Framework\TestCase { - public function testDownloadIfHtml() + public function testDownloadIfHtml(): void { $url = 'https://piedweb.com/'; $request = new Client($url); $request ->setDefaultGetOptions() - ->setDownloadOnlyIf(function ($line) { - return 0 === stripos(trim($line), 'content-type') && false !== stripos($line, 'text/html'); - }) + ->setDownloadOnlyIf(fn ($line): bool => 0 === stripos(trim((string) $line), 'content-type') && false !== stripos((string) $line, 'text/html')) ->setDesktopUserAgent() ->setEncodingGzip() ; @@ -31,14 +29,14 @@ public function testDownloadIfHtml() $this->assertTrue(\is_array($headers)); $this->assertSame('text/html; charset=UTF-8', 
$request->getResponse()->getContentType()); - $this->assertTrue(\strlen($request->getResponse()->getContent()) > 10); - $this->assertStringContainsString('200', $request->getResponse()->getHeaders()[0]); - $this->assertStringContainsString('200', $request->getResponse()->getHeaderLine('0')); - $this->assertStringContainsString('200', $request->getResponse()->getHeader('0')); + $this->assertGreaterThan(10, \strlen($request->getResponse()->getContent())); + $this->assertStringContainsString('200', \strval($request->getResponse()->getHeaders()[0] ?? '')); + $this->assertStringContainsString('200', (string) $request->getResponse()->getHeaderLine('0')); + $this->assertStringContainsString('200', \strval($request->getResponse()->getHeader('0'))); $this->assertNull($request->getResponse()->getCookies()); } - public function testNotDownload() + public function testNotDownload(): void { $url = 'https://piedweb.com/assets/img/xl/bg.jpg'; $request = new Client($url); @@ -54,7 +52,7 @@ public function testNotDownload() $this->assertSame('', $request->getResponse()->getContent()); } - public function testEffectiveUrl() + public function testEffectiveUrl(): void { $url = 'http://www.piedweb.com/'; $request = new Client($url); @@ -65,15 +63,15 @@ public function testEffectiveUrl() ->setEncodingGzip() ; $request->request(); - + // dump($request->getCurlInfos()); $this->assertSame('https://piedweb.com/', $request->getResponse()->getUrl()); $this->assertSame($url, $request->getTarget()); - $this->assertTrue(\strlen($request->getResponse()->getContent()) > 10); + $this->assertGreaterThan(10, \strlen($request->getResponse()->getContent())); } - public function testCurlError() + public function testCurlError(): void { - $url = 'http://www.readze'.rand(100000, 99999999).'.com/'; + $url = 'http://www.readze'.random_int(100000, 99_999_999).'.com/'; $request = new Client($url); $request ->setDefaultGetOptions() @@ -85,7 +83,7 @@ public function testCurlError() $this->assertSame(6, 
$request->getResponse()->getError()); } - public function test404() + public function test404(): void { $url = 'https://piedweb.com/404-error'; $request = new Client($url); @@ -100,7 +98,7 @@ public function test404() $this->assertSame(404, $request->getResponse()->getStatusCode()); } - public function testAllMethods() + public function testAllMethods(): void { $checkHeaders = new MultipleCheckInHeaders(); @@ -117,7 +115,7 @@ public function testAllMethods() ->setMobileUserAgent() ->setLessJsUserAgent() ->setTarget($url) - ->setDownloadOnlyIf([$checkHeaders, 'check']) + ->setDownloadOnlyIf($checkHeaders->check(...)) ->setLanguage('en-US,en;q=0.9') ; @@ -134,7 +132,7 @@ public function testAllMethods() $this->assertSame('text/html; charset=UTF-8', $request->getResponse()->getContentType()); - $this->assertTrue(\strlen($request->getResponse()->getContent()) > 100); + $this->assertGreaterThan(100, \strlen($request->getResponse()->getContent())); $this->assertSame('Upgrade-Insecure-Requests: 1 User-Agent: '.$request->lessJsUserAgent.' 
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 @@ -150,7 +148,7 @@ public function testAllMethods() Content-Length: 0', trim(strip_tags($request->getResponse()->getBody()))); } - public function testMultipleCheckInHeaders() + public function testMultipleCheckInHeaders(): void { $checkHeaders = new MultipleCheckInHeaders(); @@ -160,7 +158,7 @@ public function testMultipleCheckInHeaders() ->setDefaultGetOptions() ->setDefaultSpeedOptions() ->setUserAgent('Hello :)') - ->setDownloadOnlyIf([$checkHeaders, 'check']) + ->setDownloadOnlyIf($checkHeaders->check(...)) ->setPost('testpost') ; @@ -170,7 +168,7 @@ public function testMultipleCheckInHeaders() $this->assertSame(404, $request->getResponse()->getInfo('http_code')); } - public function testProxy() + public function testProxy(): void { $url = 'https://piedweb.com/404-error'; $request = new Client($url); @@ -183,11 +181,11 @@ public function testProxy() $request->request(); - $this->assertTrue($request->getResponse()->getError() > 0); + $this->assertGreaterThan(0, $request->getResponse()->getError()); $this->assertStringContainsString('timed out', $request->getResponse()->getErrorMessage()); } - public function testAbortIfTooBig() + public function testAbortIfTooBig(): void { $url = 'https://piedweb.com'; $request = new Client($url); @@ -196,17 +194,17 @@ public function testAbortIfTooBig() $this->assertSame($request->getResponse()->getError(), 42); } - public function testDownloadOnlyFirstBytes() + public function testDownloadOnlyFirstBytes(): void { $url = 'https://piedweb.com'; $request = new Client($url); $request->setDownloadOnly('0-199'); $request->request(); - $this->assertTrue(\strlen($request->getResponse()->getContent()) < 300); + $this->assertLessThan(300, \strlen($request->getResponse()->getContent())); } - public function testResponseFromCache() + public function testResponseFromCache(): void { $response = 
new ResponseFromCache( 'HTTP/1.1 200 OK'.\PHP_EOL.\PHP_EOL.'

Tests

', diff --git a/packages/curl/tests/StaticWrapperTest.php b/packages/curl/tests/StaticWrapperTest.php index 1fa2168..8aa67f2 100644 --- a/packages/curl/tests/StaticWrapperTest.php +++ b/packages/curl/tests/StaticWrapperTest.php @@ -8,10 +8,10 @@ class StaticWrapperTest extends \PHPUnit\Framework\TestCase { - public function testStaticGet() + public function testStaticGet(): void { $url = 'https://dev.piedweb.com/robots.txt'; $result = Client::request($url); - $this->assertTrue(\strlen($result) > 10); + $this->assertGreaterThan(10, \strlen($result)); } } diff --git a/packages/extractor/src/BaseExtractor.php b/packages/extractor/src/BaseExtractor.php index b4031fd..9cda76b 100644 --- a/packages/extractor/src/BaseExtractor.php +++ b/packages/extractor/src/BaseExtractor.php @@ -18,10 +18,14 @@ public function get(): ?Url } $baseHref = (new Crawler($base))->attr('href'); - if ($baseHref && filter_var($baseHref, \FILTER_VALIDATE_URL)) { - return new Url($baseHref); + if (! $baseHref) { + return null; + } + + if (! filter_var($baseHref, \FILTER_VALIDATE_URL)) { + return null; } - return null; + return new Url($baseHref); } } diff --git a/packages/extractor/src/BreadcrumbExtractor.php b/packages/extractor/src/BreadcrumbExtractor.php index b47dd14..c48b5d9 100644 --- a/packages/extractor/src/BreadcrumbExtractor.php +++ b/packages/extractor/src/BreadcrumbExtractor.php @@ -69,7 +69,7 @@ private function findBreadcrumb(): ?string } /** - * @return array + * @return string[]|null */ private function divideBreadcrumb(string $breadcrumb, string $divider): ?array { @@ -107,9 +107,15 @@ private function extractHref(string $str): ?string { $regex = ['href="([^"]*)"', 'href=\'([^\']*)\'', 'href=(\S+) ']; foreach ($regex as $r) { - if (preg_match('/'.$r.'/siU', $str, $match) && Helper::isWebLink($match[1])) { - return $this->parentUrl->resolve($match[1]); + if (! preg_match('/'.$r.'/siU', $str, $match)) { + continue; } + + if (! 
Helper::isWebLink($match[1])) { + continue; + } + + return $this->parentUrl->resolve($match[1]); } return null; diff --git a/packages/extractor/src/HrefLangExtractor.php b/packages/extractor/src/HrefLangExtractor.php new file mode 100644 index 0000000..d4afbf4 --- /dev/null +++ b/packages/extractor/src/HrefLangExtractor.php @@ -0,0 +1,35 @@ + + */ + public function getHrefLangList(): array + { + $toReturn = []; + $links = $this->crawler->filterXPath('//link[@hreflang]')->extract(['hreflang', 'href']); + foreach ($links as $link) { + if ('x-default' === $link[0]) { + continue; + } + + if (isset($toReturn[$link[0]])) { + continue; + } + + $toReturn[(string) $link[0]] = (string) $link[1]; + } + + return $toReturn; + } +} diff --git a/packages/extractor/src/Indexable.php b/packages/extractor/src/Indexable.php index fa0d32c..ef87b20 100644 --- a/packages/extractor/src/Indexable.php +++ b/packages/extractor/src/Indexable.php @@ -6,49 +6,24 @@ use Spatie\Robots\RobotsTxt; use Symfony\Component\DomCrawler\Crawler; -class Indexable +final class Indexable { private readonly int $indexable; - /** - * @var int - */ - private const INDEXABLE = 0; - - /** - * @var int - */ - private const NOT_INDEXABLE_ROBOTS = 1; - - /** - * @var int - */ - private const NOT_INDEXABLE_HEADER = 2; - - /** - * @var int - */ - private const NOT_INDEXABLE_META = 3; + public const NOT_INDEXABLE = [ + 'robots' => 1, + 'header' => 2, + 'meta' => 3, + 'canonical' => 4, + '4XX' => 5, + '5XX' => 6, + 'redir' => 7, + ]; /** * @var int */ - private const NOT_INDEXABLE_CANONICAL = 4; - - /** - * @var int - */ - private const NOT_INDEXABLE_4XX = 5; - - /** - * @var int - */ - private const NOT_INDEXABLE_5XX = 6; - - /** - * @var int - */ - final public const NOT_INDEXABLE_REDIR = 9; + public const INDEXABLE = 0; public function __construct( private readonly Url $url, @@ -100,38 +75,43 @@ public function getErrorMessage(): string private function analyze(): int { - if ($this->robotsTxtAllows()) { - return 
self::NOT_INDEXABLE_ROBOTS; + if (! $this->robotsTxtAllows()) { + return self::NOT_INDEXABLE['robots']; } - if ($this->headersAllow()) { - return self::NOT_INDEXABLE_HEADER; + if (! $this->headersAllow()) { + return self::NOT_INDEXABLE['header']; } if (! $this->metaAllows()) { - return self::NOT_INDEXABLE_META; + return self::NOT_INDEXABLE['meta']; } // canonical if (! (new CanonicalExtractor($this->url, $this->crawler))->isCanonicalCorrect()) { - return self::NOT_INDEXABLE_CANONICAL; + return self::NOT_INDEXABLE['canonical']; } // status 4XX if ($this->statusCode < 500 && $this->statusCode > 399) { - return self::NOT_INDEXABLE_4XX; + return self::NOT_INDEXABLE['4XX']; } // status 5XX if ($this->statusCode < 600 && $this->statusCode > 499) { - return self::NOT_INDEXABLE_5XX; + return self::NOT_INDEXABLE['5XX']; } // status 3XX - if ($this->statusCode < 400 && $this->statusCode > 299) { - return self::NOT_INDEXABLE_REDIR; + if ($this->statusCode >= 400) { + // weird + return self::INDEXABLE; + } + + if ($this->statusCode <= 299) { + return self::INDEXABLE; } - return self::INDEXABLE; + return self::NOT_INDEXABLE['redir']; } } diff --git a/packages/extractor/src/Link.php b/packages/extractor/src/Link.php index fdf8bea..3ca1a1d 100644 --- a/packages/extractor/src/Link.php +++ b/packages/extractor/src/Link.php @@ -6,54 +6,58 @@ final class Link implements \Stringable { - private readonly Url $url; + public readonly Url $url; - private ?string $anchor = null; + public readonly bool $mayFollow; - // wrapper related - /** - * @var int - */ + public ?string $anchor; + + /** @var int */ public const LINK_A = 1; - /** - * @var int - */ + /** @var int */ public const LINK_SRC = 4; - /** - * @var int - */ + /** @var int */ public const LINK_3XX = 2; - /** - * @var int - */ + /** @var int */ public const LINK_301 = 3; - // type related - /** - * @var int - */ + // --- + /** @var int */ public const LINK_SELF = 1; - /** - * @var int - */ + /** @var int */ public const 
LINK_INTERNAL = 2; - /** - * @var int - */ + /** @var int */ public const LINK_SUB = 3; + /** @var int */ + public const LINK_EXTERNAL = 4; + /** - * @var int + * Always submit absoute Url ! */ - public const LINK_EXTERNAL = 4; + public function __construct( + string $url, + public readonly Url $parentUrl, + bool $parentMayFollow = true, + public readonly ?\DOMElement $element = null, + private ?int $wrapper = null + ) { + $this->mayFollow = $this->mayFollow($parentMayFollow); + $this->url = new Url(self::normalizeUrl($url)); + $this->setAnchor(); + if (null !== $this->element) { + $this->setWrapper($this->element); + } + } public function __toString(): string { + // return $link->getParentUrl().';'.$link->getAnchor().';'.((int) $link->mayFollow()).';'.$link->getType(); return '['.$this->anchor.']('.$this->url->get().')'; } @@ -86,23 +90,6 @@ private function setWrapper(\DOMElement $element): void } } - /** - * Always submit absoute Url ! - */ - public function __construct( - string $url, - private readonly Url $parentUrl, - private readonly bool $parentMayFollow = true, - private readonly ?\DOMElement $element = null, - private ?int $wrapper = null - ) { - $this->url = new Url(self::normalizeUrl($url)); - $this->setAnchor(); - if (null !== $this->element) { - $this->setWrapper($this->element); - } - } - public static function createRedirection(string $url, Url $parentUrl, int $redirType = self::LINK_3XX): self { return new self($url, $parentUrl, true, null, $redirType); @@ -160,26 +147,25 @@ public function getElement(): ?\DOMElement return $this->element; } - public function mayFollow(): bool + private function mayFollow(bool $parentMayFollow): bool { // check meta robots and headers - if (! $this->parentMayFollow) { + if (! 
$parentMayFollow) { return false; } // check "type" rel - if (null !== $this->element && $this->element->getAttribute('rel')) { - if (preg_match('(nofollow|sponsored|ugc)', $this->element->getAttribute('rel'))) { - return false; - } + if (null === $this->element) { + return true; } - return true; + if (! $this->element->getAttribute('rel')) { + return true; + } + + return ! preg_match('(nofollow|sponsored|ugc)', $this->element->getAttribute('rel')); } - /** - * @return string - */ public function getRelAttribute(): ?string { return null !== $this->element ? $this->element->getAttribute('rel') : null; @@ -192,15 +178,21 @@ public function isInternalLink(): bool public function isSubLink(): bool { - return ! $this->isInternalLink() - && $this->url->getRegistrableDomain() == $this->parentUrl->getRegistrableDomain(); + if ($this->isInternalLink()) { + return false; + } + + return $this->url->getRegistrableDomain() == $this->parentUrl->getRegistrableDomain(); // && strtolower(substr($this->getHost(), -strlen($this->parentDomain))) === $this->parentDomain; } public function isSelfLink(): bool { - return $this->isInternalLink() - && $this->url->getDocumentUrl() == $this->parentUrl->getDocumentUrl(); + if (! $this->isInternalLink()) { + return false; + } + + return $this->url->getDocumentUrl() == $this->parentUrl->getDocumentUrl(); } public function getType(): int diff --git a/packages/extractor/src/LinksExtractor.php b/packages/extractor/src/LinksExtractor.php index 4a85edf..5fa9921 100644 --- a/packages/extractor/src/LinksExtractor.php +++ b/packages/extractor/src/LinksExtractor.php @@ -112,7 +112,11 @@ private function extractUrl(\DOMElement $element): ?string } } - if (! $url || ! Helper::isWebLink($url)) { + if (! $url) { + return null; + } + + if (! 
Helper::isWebLink($url)) { return null; } diff --git a/packages/extractor/src/RedirectionExtractor.php b/packages/extractor/src/RedirectionExtractor.php index 8c8d077..36f6b0f 100644 --- a/packages/extractor/src/RedirectionExtractor.php +++ b/packages/extractor/src/RedirectionExtractor.php @@ -19,11 +19,19 @@ public function __construct( public function getRedirection(): ?string { $headers = array_change_key_case([] !== $this->headers ? $this->headers : []); - if (isset($headers['location']) && \is_string($headers['location']) && Helper::isWebLink($headers['location'])) { - return $this->url->resolve($headers['location']); + if (! isset($headers['location'])) { + return null; } - return null; + if (! \is_string($headers['location'])) { + return null; + } + + if (! Helper::isWebLink($headers['location'])) { + return null; + } + + return $this->url->resolve($headers['location']); } public function getRedirectionLink(): ?Link diff --git a/packages/extractor/src/RobotsTxtExtractor.php b/packages/extractor/src/RobotsTxtExtractor.php index 28c2213..7de46b1 100644 --- a/packages/extractor/src/RobotsTxtExtractor.php +++ b/packages/extractor/src/RobotsTxtExtractor.php @@ -4,6 +4,8 @@ use PiedWeb\Curl\ExtendedClient; use Spatie\Robots\RobotsTxt; +use Symfony\Component\Cache\Adapter\FilesystemAdapter; +use Symfony\Contracts\Cache\ItemInterface; final class RobotsTxtExtractor { @@ -14,10 +16,24 @@ final class RobotsTxtExtractor public function get(Url $url): RobotsTxt { - return self::$cache[$url->getOrigin()] ??= $this->directGet($url); + return self::$cache[$url->getOrigin()] ??= new RobotsTxt($this->getBodyFromCache($url)); } - public function directGet(Url $url): RobotsTxt + private function getBodyFromCache(Url $url): string + { + $cache = new FilesystemAdapter(); + + /** @var string */ + $body = $cache->get('robotstxt_'.$url->getOrigin(), function (ItemInterface $item) use ($url): string { + $item->expiresAfter(172800); + + return $this->getBody($url); + }); + + return 
$body; + } + + private function getBody(Url $url): string { $url = $url->getOrigin().'/robots.txt'; @@ -28,16 +44,15 @@ public function directGet(Url $url): RobotsTxt ->fakeBrowserHeader() ->setDesktopUserAgent(); if (! $request->request()) { - // todo log - return new RobotsTxt(''); + return ''; } $response = $request->getResponse(); if (false === stripos($response->getContentType(), 'text/plain')) { - return new RobotsTxt(''); + return ''; } - return new RobotsTxt($response->getBody()); + return $response->getBody(); } } diff --git a/packages/extractor/src/TagExtractor.php b/packages/extractor/src/TagExtractor.php index 540b731..dd96444 100644 --- a/packages/extractor/src/TagExtractor.php +++ b/packages/extractor/src/TagExtractor.php @@ -18,6 +18,24 @@ public function get(string $selector): ?string return $found->count() > 0 ? Helper::clean($found->text()) : null; } + public function getFirst(string $selector): ?string + { + $found = $this->crawler->filter($selector); + + if (0 === $found->count()) { + return null; + } + + return Helper::clean($found->eq(0)->text()); + } + + public function getCount(string $selector): int + { + $found = $this->crawler->filter($selector); + + return $found->count(); + } + public function getUnique(string $selector = 'title'): ?string { $found = $this->crawler->filter($selector); diff --git a/packages/extractor/src/TextData.php b/packages/extractor/src/TextData.php index 1617162..9d4fbf0 100644 --- a/packages/extractor/src/TextData.php +++ b/packages/extractor/src/TextData.php @@ -9,25 +9,80 @@ final class TextData { public function __construct( - private readonly Crawler $crawler, - private readonly string $html + private readonly string $html, + private readonly ?Crawler $crawler = null, ) { } /** @psalm-suppress RedundantPropertyInitializationCheck */ - public function getTextAnalysis(): ?Analysis + public function getTextAnalysis(): Analysis { - return $this->crawler->count() > 0 ? 
(new TextAnalyzer($this->crawler->text(), true, 1))->exec() : null; + return (new TextAnalyzer($this->getText(), false, 4))->exec(); + } + + private ?string $text = null; + + private function getText(): string + { + if (null !== $this->text) { + return $this->text; + } + + if (null === $this->crawler) { + return $this->html; + } + + $this->text = ''; + $elements = $this->crawler->filterXPath(self::getXPathToSelectNodeContent()); + foreach ($elements as $element) { + $this->text .= ' '.Helper::clean($element->textContent); + } + + return $this->text; + } + + public static function getXPathToSelectNodeContent(string $tag = 'p,h1,h2,h3,h4,h5,h6,li,div'): string + { + $tagsToGet = explode(',', $tag); + $xpath = '//head/title'; + $not = '[not(self::node()[count(.//'.implode('|.//', $tagsToGet).') > 0])]'; + foreach ($tagsToGet as $tag) { + $xpath .= ' | //'.$tag.$not; + } + + return $xpath; } public function getWordCount(): int { - return str_word_count($this->crawler->text('')); + return str_word_count($this->getText()); + } + + /** @return array */ + public function getFlatContent(): array + { + if (null === $this->crawler) { + throw new \Exception(); + } + + $flatContent = []; + $elements = $this->crawler->filterXPath(self::getXPathToSelectNodeContent('p,h1,h2,h3,h4,h5,h6,li')); + + foreach ($elements as $node) { + $text = Helper::clean($node->textContent); + if (isset($flatContent[$text])) { + continue; + } + + $flatContent[$text] = $node->nodeName; + } + + return $flatContent; } public function getRatioTxtCode(): int { - $textLenght = \strlen($this->crawler->text('')); + $textLenght = \strlen($this->getText()); $htmlLenght = \strlen(Helper::clean($this->html)); return (int) ($htmlLenght > 0 ? 
round($textLenght / $htmlLenght * 100) : 0); diff --git a/packages/extractor/tests/GlobalTest.php b/packages/extractor/tests/GlobalTest.php index 052010f..9c42421 100644 --- a/packages/extractor/tests/GlobalTest.php +++ b/packages/extractor/tests/GlobalTest.php @@ -7,6 +7,8 @@ use PiedWeb\Curl\Helper; use PiedWeb\Curl\Response; use PiedWeb\Extractor\CanonicalExtractor; +use PiedWeb\Extractor\HrefLangExtractor; +use PiedWeb\Extractor\TextData; use PiedWeb\Extractor\Url; use Symfony\Component\DomCrawler\Crawler; @@ -27,7 +29,7 @@ public function getPage(string $url = 'https://piedweb.com'): ?string ->fakeBrowserHeader() ->setNoFollowRedirection() ->setMaximumResponseSize() - ->setDownloadOnlyIf([Helper::class, 'checkStatusCode']) + ->setDownloadOnlyIf(Helper::checkStatusCode(...)) ->setMobileUserAgent(); // if ($this->proxy) { $client->setProxy($this->proxy); } $client->request(); @@ -39,7 +41,7 @@ public function getPage(string $url = 'https://piedweb.com'): ?string return $this->response[$url] = $client->getResponse()->getBody(); } - public function testCanonical() + public function testCanonical(): void { $url = new Url('https://piedweb.com'); $canonical = new CanonicalExtractor($url, new Crawler($this->getPage())); @@ -53,4 +55,28 @@ public function testCanonical() $this->assertFalse($canonical->isCanonicalCorrect()); $this->assertTrue($canonical->isCanonicalPartiallyCorrect()); } + + public function testTextDataExtractor(): void + { + $rawHtml = $this->getPage('https://piedweb.com/'); + $crawler = new Crawler($rawHtml); + + $textData = new TextData($rawHtml, $crawler); + + $this->assertSame('title', array_values($textData->getFlatContent())[0]); + $this->assertGreaterThan(10, $textData->getWordCount()); + $this->assertGreaterThan(10, $textData->getRatioTxtCode()); + // dump($textData->getTextAnalysis()->getExpressions(2)); + $this->assertArrayHasKey('web', $textData->getTextAnalysis()->getExpressions()); + } + + public function testHrefLangExtractor(): void + { 
+ $rawHtml = $this->getPage('https://altimood.com/'); + + $extractor = new HrefLangExtractor(new Crawler($rawHtml)); + $list = $extractor->getHrefLangList(); + + $this->assertContains('https://altimood.com/en', $list); + } } diff --git a/packages/google-spreadsheet-seo-scraper/scrap.php b/packages/google-spreadsheet-seo-scraper/scrap.php index c5bbba5..3eccf9b 100755 --- a/packages/google-spreadsheet-seo-scraper/scrap.php +++ b/packages/google-spreadsheet-seo-scraper/scrap.php @@ -17,6 +17,7 @@ function executeScrap($i = 1) { global $argv; + try { $GoogleSpreadsheetSeoScraper = new GoogleSpreadsheetSeoScraper($argv); $GoogleSpreadsheetSeoScraper->exec(); @@ -27,12 +28,12 @@ function executeScrap($i = 1) echo 'Rebooting box and continue ('.$i.')...'.chr(10).chr(10); ++$i; exec('php "'.__DIR__.'/../perso/rebooBox.php"'); - sleep(60*5); - if (!in_array('--retry', $argv)) { + sleep(60 * 5); + if (! in_array('--retry', $argv)) { $argv[] = '--retry'; $argv[] = 'last'; } executeScrap($i); } } -} \ No newline at end of file +} diff --git a/packages/google/src/ErrorDetector.php b/packages/google/src/ErrorDetector.php index c00f91c..0534851 100644 --- a/packages/google/src/ErrorDetector.php +++ b/packages/google/src/ErrorDetector.php @@ -15,11 +15,7 @@ public static function isDetectedAsBot(string $html): bool } /* Captcha Google */ - elseif (str_contains($html, 'document.getElementById(\'captcha')) { - return true; - } - /* RAS */ - return false; + return str_contains($html, 'document.getElementById(\'captcha'); } } diff --git a/packages/google/src/Extractor/SERPExtractor.php b/packages/google/src/Extractor/SERPExtractor.php index a82b8f9..d2f9e76 100644 --- a/packages/google/src/Extractor/SERPExtractor.php +++ b/packages/google/src/Extractor/SERPExtractor.php @@ -9,7 +9,7 @@ class SERPExtractor { - public const SERP_FEATURE_SELECTORS = [ + final public const SERP_FEATURE_SELECTORS = [ 'Ads' => ['.//*[@id="tads"]|.//*[@id="bottomads"]'], 'ImagePack' => 
["//span[text()='Images']", "//h3[starts-with(text(), 'Images correspondant')]"], 'Local Pack' => ["//div[text()='Adresses']"], @@ -24,29 +24,29 @@ class SERPExtractor /** * @var string[] */ - public const RELATED = ["//a[@data-xbu][starts-with(@href, '/search')]/div/div/span"]; + final public const RELATED = ["//a[@data-xbu][starts-with(@href, '/search')]/div/div/span"]; /** * @var string[] */ - public const RELATED_DESKTOP = ["//a[@data-xbu][starts-with(@href, '/search')]/div"]; + final public const RELATED_DESKTOP = ["//a[@data-xbu][starts-with(@href, '/search')]/div"]; /** @var string */ // public const RESULT_SELECTOR = '//a[@role="presentation"]/parent::div/parent::div/parent::div'; - public const RESULT_SELECTOR = '(//h2[text()=\'Extrait optimisé sur le Web\']/ancestor::block-component//a[@class])[1]|//a[@role="presentation"] '; + final public const RESULT_SELECTOR = '(//h2[text()=\'Extrait optimisé sur le Web\']/ancestor::block-component//a[@class])[1]|//a[@role="presentation"] '; // (//h2[text()='Extrait optimisé sur le Web']/ancestor::block-component//a[@class])[1]|//a[@role="presentation"] /** * @var string */ - public const RESULT_SELECTOR_DESKTOP = + final public const RESULT_SELECTOR_DESKTOP = '//a[not(starts-with(@href, "/search"))]/parent::div/parent::div/parent::div[@data-hveid] |//a[not(starts-with(@href, "/search"))]/parent::div/parent::div/parent::div[@data-sokoban-container]'; private readonly Crawler $domCrawler; /** - * @var SearchResult[] + * @var \PiedWeb\Google\Result\SearchResult[]|null */ private ?array $results = null; @@ -139,8 +139,11 @@ private function extractResultFrom(\DOMNode $linkNode, bool $ads = false): ?Sear } // skip shopping Results - if (str_starts_with($linkNode->getAttribute('href'), 'https://www.google.') - || str_starts_with($linkNode->getAttribute('href'), '/aclk?')) { + if (str_starts_with($linkNode->getAttribute('href'), 'https://www.google.')) { + return null; + } + + if 
(str_starts_with($linkNode->getAttribute('href'), '/aclk?')) { return null; } @@ -248,9 +251,15 @@ public function getNode(array $xpaths): \DOMNode { foreach ($xpaths as $xpath) { $node = $this->domCrawler->filterXPath($xpath)->getNode(0); - if (null !== $node && '' !== $node->nodeValue) { - return $node; + if (null === $node) { + continue; } + + if ('' === $node->nodeValue) { + continue; + } + + return $node; } throw new \LogicException('`'.implode('`, ', $xpaths).'` not found'); diff --git a/packages/google/src/Extractor/SERPExtractorJsExtended.php b/packages/google/src/Extractor/SERPExtractorJsExtended.php index 554d973..faa6164 100644 --- a/packages/google/src/Extractor/SERPExtractorJsExtended.php +++ b/packages/google/src/Extractor/SERPExtractorJsExtended.php @@ -58,7 +58,15 @@ private function getPixelPosForWithoutCache(string|\DOMNode $element): int $element = $this->getBrowserPage()->querySelectorXPath($element); if (isset($element[0]) && null !== $element[0]->boundingBox()) { $boundingBox = $element[0]->boundingBox(); - if (! \is_array($boundingBox) || ! isset($boundingBox['y']) || ! \is_int($boundingBox['y'])) { + if (! \is_array($boundingBox)) { + return 0; + } + + if (! isset($boundingBox['y'])) { + return 0; + } + + if (! 
\is_int($boundingBox['y'])) { return 0; } diff --git a/packages/google/src/Extractor/TrendsExtractor.php b/packages/google/src/Extractor/TrendsExtractor.php index 60b98ad..92b2b8b 100644 --- a/packages/google/src/Extractor/TrendsExtractor.php +++ b/packages/google/src/Extractor/TrendsExtractor.php @@ -28,6 +28,29 @@ class TrendsExtractor */ public array $relatedQueries = []; + public function __toJson(): string + { + return \Safe\json_encode([ + 'interest' => $this->interest, + 'interestOverTime' => $this->interestOverTime, + 'relatedTopics' => $this->relatedTopics, + 'relatedQueries' => $this->relatedQueries, + 'v' => 1, + ]); + } + + public static function loadFromJson(string $json): self + { + $json = \Safe\json_decode($json, true); + $current = new self(); + $current->interest = $json['interest']; // @phpstan-ignore-line + $current->interestOverTime = $json['interestOverTime']; // @phpstan-ignore-line + $current->relatedTopics = $json['relatedTopics']; // @phpstan-ignore-line + $current->relatedQueries = $json['relatedQueries']; // @phpstan-ignore-line + + return $current; + } + public function setRelatedTopics(string $relatedTopics): void { $this->relatedTopics = '' === $relatedTopics ? 
[] // @phpstan-ignore-line @@ -49,7 +72,11 @@ public function setRelatedQueries(string $relatedQueries): void /** @return array */ public function getInterest(): array { - if ([] !== $this->interest || [] === $this->interestOverTime) { + if ([] !== $this->interest) { + return $this->interest; + } + + if ([] === $this->interestOverTime) { return $this->interest; } diff --git a/packages/google/src/GoogleRequester.php b/packages/google/src/GoogleRequester.php index e2564f5..2b2cb81 100644 --- a/packages/google/src/GoogleRequester.php +++ b/packages/google/src/GoogleRequester.php @@ -20,7 +20,7 @@ public function getCurlClient(): ExtendedClient $this->client = new ExtendedClient(); $this->client ->setMobileUserAgent() - ->setDefaultSpeedOptions() + ->setDefaultSpeedOptions(20, 30, 2000) ->setCookie('CONSENT=YES+') ->fakeBrowserHeader(); } @@ -46,6 +46,7 @@ public function requestGoogleWithCurl(GoogleSERPManager $Google, ?callable $mana } $this->getCurlClient()->request($Google->generateGoogleSearchUrl()); + if (0 !== $this->getCurlClient()->getError()) { throw new \Exception($this->getCurlClient()->getErrorMessage()); } diff --git a/packages/google/src/GoogleSuggester.php b/packages/google/src/GoogleSuggester.php index 36318df..3b36e56 100644 --- a/packages/google/src/GoogleSuggester.php +++ b/packages/google/src/GoogleSuggester.php @@ -39,7 +39,11 @@ private function extractSuggests(string $url): void $content = $this->client->getResponse()->getContent(); $data = json_decode($content, true, 512, \JSON_THROW_ON_ERROR); - if (! \is_array($data) || ! isset($data[1])) { + if (! \is_array($data)) { + return; + } + + if (! 
isset($data[1])) { return; } diff --git a/packages/google/tests/GoogleSerpTest.php b/packages/google/tests/GoogleSerpTest.php index 0e28745..8d1afa7 100644 --- a/packages/google/tests/GoogleSerpTest.php +++ b/packages/google/tests/GoogleSerpTest.php @@ -21,7 +21,7 @@ private function getSerpManager(string $kw = 'pied web'): GoogleSERPManager return $manager; } - private function extractSERP(string $rawHtml) + private function extractSERP(string $rawHtml): void { $extractor = new SERPExtractor($rawHtml); // $this->assertNotSame(0, $extractor->getNbrResults()); @@ -60,7 +60,7 @@ private function getExtractor(string $query): SERPExtractorJsExtended return new SERPExtractorJsExtended($rawHtml); } - public function testExtractionPositionZero() + public function testExtractionPositionZero(): void { $extractor = $this->getExtractor('marmotte vercors'); // position Zero PiedVert.com, if test failed, check position Zero on SERP exists @@ -71,6 +71,7 @@ public function testExtractionPositionZero() return; } + $this->assertTrue($extractor->containsSerpFeature('PositionZero')); $this->assertStringContainsString('piedvert.com', $extractor->getPositionsZero()->url); } diff --git a/packages/google/tests/GoogleSuggesterTest.php b/packages/google/tests/GoogleSuggesterTest.php index d80bd72..7b8a11b 100644 --- a/packages/google/tests/GoogleSuggesterTest.php +++ b/packages/google/tests/GoogleSuggesterTest.php @@ -10,6 +10,7 @@ final class GoogleSuggesterTest extends TestCase public function testGoogleSuggester(): void { $suggester = new GoogleSuggester('pizza'); - $this->assertContains('pizza fromage', $suggester->extract()); + // dump($suggester->extract()); + $this->assertGreaterThan(10, count($suggester->extract())); } } diff --git a/packages/text-analyzer/src/Analysis.php b/packages/text-analyzer/src/Analysis.php index 356c430..a46782b 100644 --- a/packages/text-analyzer/src/Analysis.php +++ b/packages/text-analyzer/src/Analysis.php @@ -34,8 +34,11 @@ public function 
getWordNumber(): int /** * @return array */ - public function getExpressions(?int $number = null): array + public function getExpressions(?int $minFound = null): array { - return ! $number ? $this->expressions : \array_slice($this->getExpressions(), 0, $number); + return ! $minFound ? $this->expressions : array_filter( + $this->getExpressions(), + fn ($value): bool => $value >= $minFound + ); } } diff --git a/packages/text-analyzer/src/Analyzer.php b/packages/text-analyzer/src/Analyzer.php index e9bc01e..7341213 100644 --- a/packages/text-analyzer/src/Analyzer.php +++ b/packages/text-analyzer/src/Analyzer.php @@ -4,6 +4,9 @@ use PiedWeb\Extractor\Helper; +/** + * @see \PiedWeb\TextAnalyzer\Test\AnalyzerTest + */ class Analyzer { /** @@ -20,7 +23,7 @@ public function __construct( ) { $this->text = CleanText::stripHtmlTags($this->text); $this->text = CleanText::fixEncoding($this->text); - $this->text = CleanText::removeDate($this->text); + // $this->text = CleanText::removeDate($this->text); if ($this->onlyInSentence) { $this->text = CleanText::keepOnlySentence($this->text); @@ -57,11 +60,50 @@ public function exec(): Analysis $this->extract($sentence); } - arsort($this->expressions); + $this->cleanExpressions(); return new Analysis($this->expressions, $this->wordNumber); } + private function cleanExpressions(): void + { + arsort($this->expressions); + + foreach ($this->expressions as $expression => $int) { + $this->cleanSimilar($expression); + } + } + + private function cleanSimilar(string $expression): void + { + $similar = $this->findSimilar($expression); + if ('' === $similar) { + return; + } + + if ($this->expressions[$similar] === $this->expressions[$expression]) { + unset($this->expressions[$expression]); + } + + // return $this->cleanSimilar($expression); + } + + private function findSimilar(string $expressionToCompare): string + { + foreach ($this->expressions as $expression => $int) { + if ($expression === $expressionToCompare) { + continue; + } + if (! 
str_contains($expression, $expressionToCompare)) { + continue; + } + + return $expression; + } + + return ''; + } + private function extract(string $sentence): void { $sentence = CleanText::removePunctuation($sentence); @@ -88,7 +130,7 @@ private function extract(string $sentence): void $this->incrementWordNumber(-1); } } else { - $plus = 1 + substr_count(CleanText::removeStopWords($expression), ' '); + $plus = 1; // 1 + substr_count(CleanText::removeStopWords($expression), ' '); $this->expressions[$expression] = ($this->expressions[$expression] ?? 0) + $plus; } } @@ -112,7 +154,7 @@ private function cleanExpr(string $expression, int $wordNumber): string // Last Clean $expression = trim(Helper::preg_replace_str('/\s+/', ' ', $expression)); if ('' == htmlentities($expression)) { // Avoid � - $expression = ''; + return ''; } return $expression; diff --git a/packages/text-analyzer/src/CleanText.php b/packages/text-analyzer/src/CleanText.php index 793e555..4196ad3 100644 --- a/packages/text-analyzer/src/CleanText.php +++ b/packages/text-analyzer/src/CleanText.php @@ -154,7 +154,7 @@ public static function stripHtmlTags(string $html): string $dom = new Crawler($html); if ('' === ($text = $dom->text(''))) { // If we failed to load the html in dom - $text = self::stripHtmlTagsOldWay($html); + return self::stripHtmlTagsOldWay($html); } return $text; diff --git a/packages/text-analyzer/tests/AnalyzerTest.php b/packages/text-analyzer/tests/AnalyzerTest.php index 9adbcca..ea9043e 100644 --- a/packages/text-analyzer/tests/AnalyzerTest.php +++ b/packages/text-analyzer/tests/AnalyzerTest.php @@ -7,19 +7,19 @@ class AnalyzerTest extends \PHPUnit\Framework\TestCase { - public function testMultiAnalyzer() + public function testMultiAnalyzer(): void { $test = new MultiAnalyzer(true); $result = $test->addContent('Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed. 
Lorem ipsum 2 dolor sit amet, consectetur adipiscing elit, sed...'); - $this->assertSame($result->getExpressions()['dolor sit amet'], 6); + $this->assertSame($result->getExpressions()['dolor sit amet'], 2); $this->assertSame($result->getWordNumber(), 18); $result = $test->addContent('Text Analyser : Expression in a text per Usage.'); $result = $test->addContent('Please check if test are still running without error (phpunit)'); $results = $test->exec(); - $this->assertSame($results->getExpressions()['dolor sit amet'], 6); + $this->assertSame($results->getExpressions()['dolor sit amet'], 2); $this->assertSame($results->getWordNumber(), 24); $content = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit' @@ -27,6 +27,6 @@ public function testMultiAnalyzer() $kws = new Analyzer($content, false, 1); $kws = $kws->exec(); - $this->assertSame(\count($kws->getExpressions(10)), 10); + $this->assertCount(1, $kws->getExpressions(2)); } } diff --git a/packages/text-analyzer/tests/CleanTextTest.php b/packages/text-analyzer/tests/CleanTextTest.php index 8488537..ddc39ea 100644 --- a/packages/text-analyzer/tests/CleanTextTest.php +++ b/packages/text-analyzer/tests/CleanTextTest.php @@ -6,31 +6,31 @@ class CleanTextTest extends \PHPUnit\Framework\TestCase { - public function testSimpleSentences() + public function testSimpleSentences(): void { $loremIpsum = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'; - $this->assertSame(4, \count(CleanText::getSentences($loremIpsum))); + $this->assertCount(4, CleanText::getSentences($loremIpsum)); } - public function testRemoveExtremityStopWords() + public function testRemoveExtremityStopWords(): void { $this->assertSame('ferme', CleanText::removeStopWordsAtExtremity('la ferme')); $this->assertSame('ferme', CleanText::removeStopWordsAtExtremity('la ferme de')); $this->assertSame('ferme', CleanText::removeStopWordsAtExtremity('ferme de')); } - public function testRemoveExtremityStopWords2() + public function testRemoveExtremityStopWords2(): void { $this->assertSame('savoir', CleanText::removeStopWordsAtExtremity('savoir plus ')); } - public function testremoveStopWords() + public function testremoveStopWords(): void { $this->assertSame('', CleanText::removeStopWords(' http//www ')); } - public function testStripTags() + public function testStripTags(): void { $text = ''; diff --git a/phpstan.neon.dist b/phpstan.neon.dist index db31266..6158194 100644 --- a/phpstan.neon.dist +++ b/phpstan.neon.dist @@ -3,6 +3,10 @@ parameters: paths: - packages/text-analyzer/src - packages/curl/src + - packages/curl/tests - packages/google/src + - packages/google/tests - packages/google-spreadsheet-seo-scraper/src - - packages/extractor/src \ No newline at end of file + - packages/extractor/src + - packages/crawler/src + - packages/crawler/tests \ No newline at end of file diff --git a/rector.php b/rector.php index 33fcca0..07ffe67 100644 --- a/rector.php +++ b/rector.php @@ -58,9 +58,14 @@ return static function (RectorConfig $rectorConfig): void { $rectorConfig->parallel(); + $composerConfig = json_decode(file_get_contents('composer.json'), true); + $paths = array_merge( + array_values($composerConfig['autoload']['psr-4']), + array_values($composerConfig['autoload-dev']['psr-4']) + ); $rectorConfig->paths(array_map( function ($path) 
{ return __DIR__.'/'.$path; }, - array_values(json_decode(file_get_contents('composer.json'), true)['autoload']['psr-4']) + $paths )); $rectorConfig->skip([ @@ -72,16 +77,17 @@ function ($path) { return __DIR__.'/'.$path; }, $rectorConfig->sets([ LevelSetList::UP_TO_PHP_80, LevelSetList::UP_TO_PHP_81, + SetList::EARLY_RETURN, + SetList::TYPE_DECLARATION, + SetList::TYPE_DECLARATION_STRICT, + PHPUnitSetList::PHPUNIT_CODE_QUALITY // SetList::CODE_QUALITY, // SetList::DEAD_CODE, // SetList::CODING_STYLE, - // SetList::TYPE_DECLARATION, - // SetList::TYPE_DECLARATION_STRICT, - // SetList::NAMING, // SetList::PRIVATIZATION, - // SetList::EARLY_RETURN, // PHPUnitSetList::PHPUNIT_CODE_QUALITY, ]); + $rectorConfig->rule(CombinedAssignRector::class); $rectorConfig->rule(SimplifyConditionsRector::class); $rectorConfig->rule(SimplifyDeMorganBinaryRector::class);