Skip to content

Commit

Permalink
...
Browse files Browse the repository at this point in the history
  • Loading branch information
RobinDev committed Jan 3, 2023
1 parent c629b43 commit c89a621
Show file tree
Hide file tree
Showing 14 changed files with 83 additions and 32 deletions.
16 changes: 8 additions & 8 deletions .git-hooks/pre-commit
Expand Up @@ -3,13 +3,13 @@
# To skip it : git commit --no-verify -m "

composer format
composer rector
#composer rector
git add .

if composer test
then
exit 0
else
echo An error occur testing the code. Commit canceled
exit 1
fi
# if composer test
# then
# exit 0
# else
# echo An error occur testing the code. Commit canceled
# exit 1
# fi
6 changes: 1 addition & 5 deletions composer.json
Expand Up @@ -12,7 +12,7 @@
"jeremykendall/php-domain-parser": "^6.1",
"league/csv": "^9.8",
"league/uri": "^6.5",
"relisten/forceutf8": "dev-multibyte-convert",
"fylax/forceutf8": "^3.0",
"nesk/puphpeteer": "dev-zoon",
"spatie/robots-txt": "^2.0",
"symfony/cache": "^6.1",
Expand Down Expand Up @@ -84,10 +84,6 @@
{
"type": "vcs",
"url": "https://github.com/zoonru/rialto.git"
},
{
"type": "vcs",
"url": "https://github.com/Fylax/forceutf8"
}
]
}
1 change: 1 addition & 0 deletions packages/crawler/src/CrawlerUrl.php
Expand Up @@ -127,6 +127,7 @@ protected function isRedirection(): bool

$this->links[] = $redirLink;

$this->url->setLinks([$redirLink]);
$this->url->setIndexable(false);
$this->url->setIndexableStatus(Indexable::NOT_INDEXABLE['redir']);

Expand Down
19 changes: 18 additions & 1 deletion packages/crawler/src/Url.php
Expand Up @@ -434,8 +434,14 @@ public function getUpdatedAt(): \DateTimeInterface
return $this->updatedAt;
}

public function setUpdatedAt(\DateTimeInterface $updatedAt): void
public function setUpdatedAt(\DateTimeInterface|string $updatedAt): void
{
if (\is_string($updatedAt)) {
$this->setUpdatedAtFromString($updatedAt);

return;
}

$this->updatedAt = $updatedAt;
}

Expand Down Expand Up @@ -692,6 +698,17 @@ public function getFlatContent(): array
return $this->flatContent;
}

public function getFlatContentString(): string
{
$toReturn = '';

foreach (array_keys($this->flatContent) as $partContent) {
$toReturn .= $partContent.\chr(10).\chr(10);
}

return $toReturn;
}

/** @param array<string, string> $flatContent */
public function setFlatContent(array $flatContent): self
{
Expand Down
2 changes: 1 addition & 1 deletion packages/curl/tests/RequestTest.php
Expand Up @@ -38,7 +38,7 @@ public function testDownloadIfHtml(): void

public function testNotDownload(): void
{
$url = 'https://piedweb.com/assets/img/xl/bg.jpg';
$url = 'https://altimood.com/media/default/rando-alpine-coucher-de-soleil.jpg';
$request = new Client($url);
$request
->setDefaultGetOptions()
Expand Down
2 changes: 1 addition & 1 deletion packages/extractor/composer.json
Expand Up @@ -19,7 +19,7 @@
],
"require": {
"php": ">=8.1",
"neitanod/forceutf8": "^2.0",
"fylax/forceutf8": "^3.0",
"symfony/dom-crawler": "^6.1",
"symfony/css-selector": "^6.1",
"league/uri": "^6.5",
Expand Down
4 changes: 4 additions & 0 deletions packages/extractor/src/LinksExtractor.php
Expand Up @@ -124,6 +124,10 @@ private function extractUrl(\DOMElement $element): ?string
return null;
}

if (str_starts_with($url, '////')) {
return null;
}

return $this->requestedUrl->resolve($url);
}
}
2 changes: 1 addition & 1 deletion packages/extractor/src/Url.php
Expand Up @@ -31,7 +31,7 @@ public function __construct(string $url)

public function resolve(string $url): string
{
$resolved = UriResolver::resolve(Http::createFromString($url), $this->http);
$resolved = UriResolver::resolve(Http::createFromString(trim($url)), $this->http);

return $resolved->__toString();
}
Expand Down
11 changes: 6 additions & 5 deletions packages/extractor/tests/GlobalTest.php
Expand Up @@ -70,17 +70,18 @@ public function testCanonical(): void
$canonical = new CanonicalExtractor($url, new Crawler($this->getPage()));
$this->assertTrue($canonical->isCanonicalCorrect());
$this->assertTrue($canonical->ifCanonicalExistsIsItCorrectOrPartiallyCorrect());
$canonical = new CanonicalExtractor($url, new Crawler(str_replace('<link rel="canonical" href="https://piedweb.com/" />', '<link rel="canonical" href="/" />', $this->getPage('https://piedweb.com/'))));
$crawler = new Crawler(str_replace('href=https://piedweb.com/ rel=canonical', 'rel="canonical" href="/"', $this->getPage('https://piedweb.com/')));
$canonical = new CanonicalExtractor($url, $crawler);
$this->assertFalse($canonical->isCanonicalCorrect());
$this->assertTrue($canonical->isCanonicalPartiallyCorrect());
$this->assertTrue($canonical->ifCanonicalExistsIsItCorrectOrPartiallyCorrect());
$canonical = new CanonicalExtractor($url, new Crawler(str_replace('<link rel="canonical" href="https://piedweb.com/" />', ' ', $this->getPage('https://piedweb.com/'))));
$canonical = new CanonicalExtractor($url, new Crawler(str_replace('<link href=https://piedweb.com/ rel=canonical>', ' ', $this->getPage('https://piedweb.com/'))));
$this->assertTrue($canonical->ifCanonicalExistsIsItCorrectOrPartiallyCorrect());
$canonical = new CanonicalExtractor($url, new Crawler(str_replace('<link rel="canonical" href="https://piedweb.com/" />', '<link rel="canonical" href="/other-page" />', $this->getPage('https://piedweb.com/'))));
$canonical = new CanonicalExtractor($url, new Crawler(str_replace('<link href=https://piedweb.com/ rel=canonical>', '<link rel="canonical" href="/other-page" />', $this->getPage('https://piedweb.com/'))));
$this->assertFalse($canonical->ifCanonicalExistsIsItCorrectOrPartiallyCorrect());

$url = new Url('https://piedweb.com/seo');
$canonical = new CanonicalExtractor($url, new Crawler(str_replace('<link rel="canonical" href="https://piedweb.com/seo" />', '<link rel="canonical" href="/seo" />', $this->getPage('https://piedweb.com/seo'))));
$url = new Url('https://piedweb.com/clients');
$canonical = new CanonicalExtractor($url, new Crawler(str_replace('<link href=https://piedweb.com/clients rel=canonical>', '<link rel="canonical" href="/clients" />', $this->getPage('https://piedweb.com/clients'))));
$this->assertFalse($canonical->isCanonicalCorrect());
$this->assertTrue($canonical->isCanonicalPartiallyCorrect());
$this->assertTrue($canonical->ifCanonicalExistsIsItCorrectOrPartiallyCorrect());
Expand Down
2 changes: 1 addition & 1 deletion packages/text-analyzer/composer.json
Expand Up @@ -17,7 +17,7 @@
],
"require": {
"php": ">=8.1",
"neitanod/forceutf8": "^2.0"
"fylax/forceutf8": "^3.0"
},
"autoload": {
"psr-4": {
Expand Down
20 changes: 17 additions & 3 deletions packages/text-analyzer/src/Analyzer.php
Expand Up @@ -82,14 +82,24 @@ private function cleanExpressions(): void
private function cleanSimilar(string $expression): void
{
$similar = $this->findSimilar($expression);
// can return énergies for énergie or énergie bio for énergie
if ('' === $similar) {
return;
}

if ($this->expressions[$similar] === $this->expressions[$expression]) {
unset($this->expressions[$expression]);
// if we find énergie and énergie bio autant de fois
// on supprime le plus court
if ($this->expressions[$similar] === $this->expressions[$expression] && substr_count($expression, ' ') > 0) {
unset($this->expressions[\strlen($similar) < \strlen($expression) ? $similar : $expression]);

return;
}

// if (strlen($similar) < strlen($expression)
// && $this->expressions[$similar] <= $this->expressions[$expression]) {
// unset($this->expressions[$similar]);
// }

// return $this->cleanSimilar($expression);
}

Expand All @@ -112,7 +122,7 @@ private function findSimilar(string $expressionToCompare): string

private function extract(string $words): void
{
$words = explode(' ', trim(strtolower($words)));
$words = explode(' ', trim(mb_strtolower($words)));

$wordsKey = array_keys($words);
foreach ($wordsKey as $key) {
Expand Down Expand Up @@ -145,6 +155,10 @@ private function extract(string $words): void

private function cleanExpr(string $expression, int $wordNumber): string
{
if (1 === \strlen($expression)) {
return '';
}

if ($wordNumber <= 2) {
$expression = trim(CleanText::removeStopWords(' '.$expression.' '));
} else {
Expand Down
15 changes: 10 additions & 5 deletions packages/text-analyzer/src/CleanText.php
Expand Up @@ -29,16 +29,20 @@ class CleanText
'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who',
'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your',

'cookielawinfo', 'checkbox',
// French Stop words
'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la',
'le', 'leur', 'lui', 'plus', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous',
'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes',
'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi', 'comme',
'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'puis', 'aussi', 'comme', 'pourquoi', 'alors', 'si',
'chaque', 'mentions légales', 'entre', 'autre', 'comment', 'là', 'après', 'principalement',
'certains', 'parfois', 'ensuite', 'article', 'etc', 'où', 'également', 'site', 'mieux', 'ainsi', 'fois', 'encore',
'selon', 'afin', 'blog', 'user', 'certaines', 'avoir', 'autres', 'souvent', '★★★★★', '★', 'propose',

'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y',
'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'e',

'ceci', 'cela', 'celà', 'cet', 'cette', 'ici', 'ils', 'les', 'leurs', 'quel', 'quels', 'quelle', 'quelles',
'sans', 'soi', 'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car',
'sans', 'soi', 'très', 'tout', 'toutes', 'tous', 'bien', 'bonne', 'peu', 'ça', 'car', 'selon', 'lequel',

'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras',
'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était',
Expand Down Expand Up @@ -70,8 +74,9 @@ public static function fixEncoding(string $text): string
$text = str_replace(['’'], "'", $text);
$text = str_replace(['…'], '...', $text);
$text = str_replace(['–', '—', '\xC2\xAD'], ' - ', $text);
$text = preg_replace('#(,|\.|\(|\[|\]|\)|!|\?|;|\{|\}|"|:|\*|\/|\||>|<|-|\+)#', ' $0 ', $text) ?? throw new \Exception();
$text = preg_replace('#(\xE2\x80\xAF|\xC2\xAD|\xC2\xA0|\s)+#', ' ', $text) ?? throw new \Exception();
$text = preg_replace('#(,|\.|\(|\[|\]|\)|!|\?|;|\{|\}|"|:|\*|\/|\||>|<|-|\+)#', ' $0 ', $text) ?? throw new \Exception(preg_last_error_msg());
$text = preg_replace('#(\xE2\x80\xAF|\xC2\xAD|\xC2\xA0)+#', ' ', $text) ?? throw new \Exception(preg_last_error_msg());
$text = preg_replace('#\s+#', ' ', $text) ?? throw new \Exception(preg_last_error_msg());

$text = str_replace('’', "'", $text);

Expand Down
11 changes: 11 additions & 0 deletions packages/text-analyzer/tests/AnalyzerTest.php
Expand Up @@ -30,4 +30,15 @@ public function testMultiAnalyzer(): void
$kws = $kws->exec();
$this->assertCount(1, $kws->getExpressions(2));
}

public function testTextAnalyzer(): void
{
$text = 'chaque fois, c est la même histoire de chaque pluie, c est pas fini chaque matin';
$text = $text.' '.$text.' '.$text;

$tester = new Analyzer($text, false, 2);
dump($tester->exec()->getExpressions(2));

$this->assertTrue(true);
}
}
4 changes: 3 additions & 1 deletion packages/text-analyzer/tests/CleanTextTest.php
Expand Up @@ -25,9 +25,11 @@ public function testRemoveExtremityStopWords2(): void
$this->assertSame('savoir', CleanText::removeStopWordsAtExtremity('savoir plus '));
}

public function testremoveStopWords(): void
public function testRemoveStopWords(): void
{
$this->assertSame('', CleanText::removeStopWords(' http//www '));
$this->assertSame('', CleanText::removeStopWords(' ainsi '));
$this->assertSame('', CleanText::removeStopWords(' chaque '));
}

public function testStripTags(): void
Expand Down

0 comments on commit c89a621

Please sign in to comment.