Skip to content

Commit

Permalink
create AbstractPageRankCalculator
Browse files Browse the repository at this point in the history
  • Loading branch information
RobinDev committed Nov 2, 2023
1 parent b3542f0 commit 751c3c1
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .phpunit.cache/test-results
@@ -1 +1 @@
{"version":1,"defects":{"GoogleSerpTest::testPuphpeteerMobile":8,"GoogleSerpTest::testCurlMobile":8,"GoogleSerpTest::testExtractionPositionZero":8,"GoogleSerpTest::testExtractMaps":7,"GlobalTest::testTextDataExtractor":7,"GoogleSerpTest::testExtractMapsB":8,"GoogleTrendsTest::testTrendsPuppet":7,"GoogleSuggesterTest::testGoogleSuggester":8,"GoogleTrendsTest::testTrendsCurl":8,"PiedWeb\\MethodDocBlockGenerator\\Test\\MethodDocBlockGeneratorTest::testIt":7},"times":{"PiedWeb\\Crawler\\Test\\CrawlerTest::testIt":0.977,"PiedWeb\\Crawler\\Test\\CrawlerTest::testCommand":0.654,"PiedWeb\\Crawler\\Test\\CrawlerTest::testWitchCachId":0.823,"PiedWeb\\Curl\\Test\\HelperTest::testSchemeFromProxy":0.001,"PiedWeb\\Curl\\Test\\HelperTest::testCheckContentType":0,"PiedWeb\\Curl\\Test\\HelperTest::testCheckStatusCode":0,"PiedWeb\\Curl\\Test\\RequestTest::testDownloadIfHtml":0.245,"PiedWeb\\Curl\\Test\\RequestTest::testNotDownload":0.205,"PiedWeb\\Curl\\Test\\RequestTest::testEffectiveUrl":20.508,"PiedWeb\\Curl\\Test\\RequestTest::testCurlError":0.029,"PiedWeb\\Curl\\Test\\RequestTest::test404":0.187,"PiedWeb\\Curl\\Test\\RequestTest::testAllMethods":0.173,"PiedWeb\\Curl\\Test\\RequestTest::testMultipleCheckInHeaders":0.128,"PiedWeb\\Curl\\Test\\RequestTest::testProxy":1.002,"PiedWeb\\Curl\\Test\\RequestTest::testAbortIfTooBig":0.219,"PiedWeb\\Curl\\Test\\RequestTest::testDownloadOnlyFirstBytes":0.224,"PiedWeb\\Curl\\Test\\RequestTest::testResponseFromCache":0.001,"PiedWeb\\Curl\\Test\\StaticWrapperTest::testStaticGet":0.055,"GlobalTest::testEncoding":0.001,"GlobalTest::testCanonical":0.727,"GlobalTest::testTextDataExtractor":0.403,"GlobalTest::testHrefLangExtractor":0.349,"GoogleSerpTest::testPuphpeteerMobile":8.977,"GoogleSerpTest::testCurlMobile":0.849,"GoogleSerpTest::testExtractionPositionZero":0.732,"GoogleSerpTest::testExtractMaps":2.364,"GoogleSuggesterTest::testGoogleSuggester":1.171,"GoogleTrendsTest::testTrendsCurl":1.101,"GoogleTrendsTest::testTrendsPuppet":33.783,"PiedWe
b\\TextAnalyzer\\Test\\AnalyzerTest::testMultiAnalyzer":0.009,"PiedWeb\\TextAnalyzer\\Test\\AnalyzerTest::testTextAnalyzer":0.003,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testSimpleSentences":0,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testRemoveExtremityStopWords":0,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testRemoveExtremityStopWords2":0,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testRemoveStopWords":0,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testStripTags":0.001,"GoogleSerpTest::testRelatedSearches":0.628,"GoogleSerpTest::testExtractMapsB":0.704,"PiedWeb\\MethodDocBlockGenerator\\Test\\MethodDocBlockGeneratorTest::testIt":0.003}}
{"version":1,"defects":{"GoogleSerpTest::testPuphpeteerMobile":8,"GoogleSerpTest::testCurlMobile":8,"GoogleSerpTest::testExtractionPositionZero":8,"GoogleSerpTest::testExtractMaps":7,"GlobalTest::testTextDataExtractor":7,"GoogleSerpTest::testExtractMapsB":8,"GoogleTrendsTest::testTrendsPuppet":7,"GoogleSuggesterTest::testGoogleSuggester":8,"GoogleTrendsTest::testTrendsCurl":8,"PiedWeb\\MethodDocBlockGenerator\\Test\\MethodDocBlockGeneratorTest::testIt":7},"times":{"PiedWeb\\Crawler\\Test\\CrawlerTest::testIt":1.496,"PiedWeb\\Crawler\\Test\\CrawlerTest::testCommand":0.668,"PiedWeb\\Crawler\\Test\\CrawlerTest::testWitchCachId":0.762,"PiedWeb\\Curl\\Test\\HelperTest::testSchemeFromProxy":0.001,"PiedWeb\\Curl\\Test\\HelperTest::testCheckContentType":0,"PiedWeb\\Curl\\Test\\HelperTest::testCheckStatusCode":0,"PiedWeb\\Curl\\Test\\RequestTest::testDownloadIfHtml":0.304,"PiedWeb\\Curl\\Test\\RequestTest::testNotDownload":0.204,"PiedWeb\\Curl\\Test\\RequestTest::testEffectiveUrl":20.478,"PiedWeb\\Curl\\Test\\RequestTest::testCurlError":0.042,"PiedWeb\\Curl\\Test\\RequestTest::test404":0.229,"PiedWeb\\Curl\\Test\\RequestTest::testAllMethods":1.499,"PiedWeb\\Curl\\Test\\RequestTest::testMultipleCheckInHeaders":0.103,"PiedWeb\\Curl\\Test\\RequestTest::testProxy":1.002,"PiedWeb\\Curl\\Test\\RequestTest::testAbortIfTooBig":0.223,"PiedWeb\\Curl\\Test\\RequestTest::testDownloadOnlyFirstBytes":0.233,"PiedWeb\\Curl\\Test\\RequestTest::testResponseFromCache":0.002,"PiedWeb\\Curl\\Test\\StaticWrapperTest::testStaticGet":0.059,"GlobalTest::testEncoding":0.004,"GlobalTest::testCanonical":0.669,"GlobalTest::testTextDataExtractor":0.405,"GlobalTest::testHrefLangExtractor":0.367,"GoogleSerpTest::testPuphpeteerMobile":9.054,"GoogleSerpTest::testCurlMobile":0.873,"GoogleSerpTest::testExtractionPositionZero":0.784,"GoogleSerpTest::testExtractMaps":1.427,"GoogleSuggesterTest::testGoogleSuggester":1.373,"GoogleTrendsTest::testTrendsCurl":2.809,"GoogleTrendsTest::testTrendsPuppet":34.619,"PiedWe
b\\TextAnalyzer\\Test\\AnalyzerTest::testMultiAnalyzer":0.009,"PiedWeb\\TextAnalyzer\\Test\\AnalyzerTest::testTextAnalyzer":0.003,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testSimpleSentences":0,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testRemoveExtremityStopWords":0,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testRemoveExtremityStopWords2":0,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testRemoveStopWords":0,"PiedWeb\\ExpressionHarvester\\Test\\CleanTextTest::testStripTags":0.001,"GoogleSerpTest::testRelatedSearches":0.658,"GoogleSerpTest::testExtractMapsB":0.704,"PiedWeb\\MethodDocBlockGenerator\\Test\\MethodDocBlockGeneratorTest::testIt":0.001}}
63 changes: 63 additions & 0 deletions packages/crawler/src/AbstractPageRankCalculator.php
@@ -0,0 +1,63 @@
<?php

namespace PiedWeb\Crawler;

/**
 * Base class for PageRank computation over an in-memory link graph.
 *
 * Subclasses are expected to populate {@see $linksTo} and {@see $nbrLinksFrom}
 * (and optionally {@see $pagesNbr}) before calling {@see calcul()}, which
 * fills {@see $results} with one score per page id.
 */
abstract class AbstractPageRankCalculator
{
    /** Damping factor `d` of the PageRank formula. */
    protected float $dampingFactor = 0.85;

    /** Upper bound on the number of passes over the graph. */
    protected int $maxIteration = 10000;

    /** Total number of pages; lazily derived from {@see $linksTo} when left null. */
    protected ?int $pagesNbr = null;

    /**
     * @var array<int, float> where key is url (id) and value page rank
     */
    protected array $results = [];

    /**
     * @var array<int, array<int>> where key is destination (id) and value fromIdList
     */
    protected array $linksTo = [];

    /**
     * @var array<int, int> where key is from (id) and value count
     */
    protected array $nbrLinksFrom = [];

    /**
     * Iteratively computes the page rank of every id present in {@see $linksTo}.
     *
     * For each page: PR(p) = d * sum(PR(q) / outLinks(q)) + (1 - d) / N, where
     * q ranges over the pages linking to p. Scores are updated in place, so a
     * pass already uses the values refreshed earlier in that same pass.
     * A page without a score yet contributes 0.
     */
    protected function calcul(): void
    {
        $ids = array_keys($this->linksTo);
        if ([] === $ids) {
            // Nothing to rank; also avoids dividing by a page count of zero below.
            return;
        }

        $teleport = (1 - $this->dampingFactor) / $this->getPagesNbr();

        for ($iteration = 0; $iteration < $this->maxIteration; ++$iteration) {
            $changed = false;

            foreach ($ids as $id) {
                $sumPR = 0.0;
                foreach ($this->getLinksTo($id) as $link) {
                    // Parentheses are required: `/` binds tighter than `??`, so without
                    // them the rank would never be divided by the outbound link count.
                    $sumPR += ($this->results[$link] ?? 0) / $this->getNbrLinksFrom($link);
                }

                $rank = $this->dampingFactor * $sumPR + $teleport;
                if (($this->results[$id] ?? null) !== $rank) {
                    $changed = true;
                }

                $this->results[$id] = $rank;
            }

            if (! $changed) {
                // Exact fixed point reached: any further pass would reproduce
                // the very same values, so stopping here cannot alter the result.
                break;
            }
        }
    }

    /**
     * Returns the number of pages, memoising the count of {@see $linksTo} on first use.
     */
    protected function getPagesNbr(): int
    {
        return $this->pagesNbr ??= \count($this->linksTo);
    }

    /**
     * Returns the ids of the pages linking to the given page.
     *
     * @return int[]
     */
    protected function getLinksTo(int $id): array
    {
        return $this->linksTo[$id];
    }

    /**
     * Returns the number of outbound links recorded for the given page.
     */
    protected function getNbrLinksFrom(int $id): int
    {
        return $this->nbrLinksFrom[$id];
    }
}
56 changes: 1 addition & 55 deletions packages/crawler/src/SimplePageRankCalculator.php
Expand Up @@ -7,31 +7,10 @@
/**
* Page Rank Calculator.
*/
final class SimplePageRankCalculator
final class SimplePageRankCalculator extends AbstractPageRankCalculator
{
private readonly \PiedWeb\Crawler\CrawlerConfig $config;

private ?int $pagesNbr = null;

/**
* @var array<int, float>
*/
private array $results = [];

private int $maxIteration = 10000;

/**
* @var array<int, array<int>>
*/
private array $linksTo = [];

/**
* @var array<int, int>
*/
private array $nbrLinksFrom = [];

private float $dampingFactor = 0.85;

public function __construct(string $id, string $dataDirectory = null)
{
$this->config = CrawlerConfig::loadFrom($id, $dataDirectory);
Expand All @@ -58,39 +37,6 @@ public function record(): string
return realpath($this->config->getDataFolder()).'/data.csv';
}

private function calcul(): void
{
for ($iteration = 0; $iteration < $this->maxIteration; ++$iteration) {
$ids = array_keys($this->linksTo);
foreach ($ids as $id) {
$sumPR = 0;
foreach ($this->getLinksTo($id) as $link) {
$sumPR += $this->results[$link] ?? 0 / $this->getNbrLinksFrom($link);
}

$this->results[$id] = $this->dampingFactor * $sumPR + (1 - $this->dampingFactor) / $this->getPagesNbr();
}
}
}

private function getPagesNbr(): int
{
return $this->pagesNbr ??= \count($this->linksTo);
}

/**
* @return int[]
*/
private function getLinksTo(int $id): array
{
return $this->linksTo[$id];
}

private function getNbrLinksFrom(int $id): int
{
return $this->nbrLinksFrom[$id];
}

private function initLinksIndex(): void
{
$csv = Reader::createFromPath($this->config->getDataFolder().Recorder::LINKS_DIR.'/Index.csv', 'r');
Expand Down

0 comments on commit 751c3c1

Please sign in to comment.