Skip to content

Commit

Permalink
Merge pull request #36 from StenopePHP/html-processor
Browse files Browse the repository at this point in the history
Share the HTML Crawler between processors for better performance
  • Loading branch information
Tom32i authored Jun 7, 2021
2 parents 0a01b1b + 0dcfd72 commit 9b81e96
Show file tree
Hide file tree
Showing 27 changed files with 341 additions and 125 deletions.
41 changes: 33 additions & 8 deletions config/services.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
namespace Symfony\Component\DependencyInjection\Loader\Configurator;

use Psr\Log\LoggerInterface;
use Stenope\Bundle\Behaviour\HtmlCrawlerManagerInterface;
use Stenope\Bundle\Builder;
use Stenope\Bundle\Builder\PageList;
use Stenope\Bundle\Builder\Sitemap;
Expand Down Expand Up @@ -40,7 +41,9 @@
use Stenope\Bundle\Routing\UrlGenerator;
use Stenope\Bundle\Serializer\Normalizer\SkippingInstantiatedObjectDenormalizer;
use Stenope\Bundle\Service\AssetUtils;
use Stenope\Bundle\Service\NaiveHtmlCrawlerManager;
use Stenope\Bundle\Service\Parsedown;
use Stenope\Bundle\Service\SharedHtmlCrawlerManager;
use Stenope\Bundle\TableOfContent\CrawlerTableOfContentGenerator;
use Stenope\Bundle\Twig\ContentExtension;
use Stenope\Bundle\Twig\ContentRuntime;
Expand All @@ -60,6 +63,7 @@
'$decoder' => service('serializer'),
'$denormalizer' => service('serializer'),
'$propertyAccessor' => service('property_accessor'),
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
'$contentProviders' => tagged_iterator(tags\content_provider),
'$processors' => tagged_iterator(tags\content_processor),
'$stopwatch' => service('debug.stopwatch')->nullOnInvalid(),
Expand Down Expand Up @@ -169,29 +173,50 @@

// Table of content
->set(CrawlerTableOfContentGenerator::class)
;

// Tagged Property handlers:
// HTML Crawler Manager
->set(NaiveHtmlCrawlerManager::class)
->set(SharedHtmlCrawlerManager::class)
->alias(HtmlCrawlerManagerInterface::class, NaiveHtmlCrawlerManager::class);

// Tagged processors:
$container->services()->defaults()->tag(tags\content_processor)
->set(LastModifiedProcessor::class)
->set(SlugProcessor::class)
->set(HtmlIdProcessor::class)
->args([
'$property' => 'content',
'$slugger' => service(SluggerInterface::class),
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
])
->set(HtmlAnchorProcessor::class)
->set(HtmlExternalLinksProcessor::class)
->set(ExtractTitleFromHtmlContentProcessor::class)
->set(CodeHighlightProcessor::class)->args(['$highlighter' => service(Prism::class)])
->set(ResolveContentLinksProcessor::class)->args(['$resolver' => service(ContentUrlResolver::class)])
->set(AssetsProcessor::class)->args(['$assetUtils' => service(AssetUtils::class)])
->set(HtmlAnchorProcessor::class)->args([
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
])
->set(HtmlExternalLinksProcessor::class)->args([
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
])
->set(ExtractTitleFromHtmlContentProcessor::class)->args([
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
])
->set(CodeHighlightProcessor::class)->args([
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
'$highlighter' => service(Prism::class),
])
->set(ResolveContentLinksProcessor::class)->args([
'$resolver' => service(ContentUrlResolver::class),
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
])
->set(AssetsProcessor::class)->args([
'$assetUtils' => service(AssetUtils::class),
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
])
->set(TableOfContentProcessor::class)
->args([
'$generator' => service(CrawlerTableOfContentGenerator::class),
'$tableOfContentProperty' => 'tableOfContent',
'$contentProperty' => 'content',
'$minDepth' => 2,
'$crawlers' => service(HtmlCrawlerManagerInterface::class),
])
->tag(tags\content_processor, ['priority' => -100])
;
Expand Down
1 change: 1 addition & 0 deletions doc/app/config/packages/stenope.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
stenope:
shared_html_crawlers: true
resolve_links:
App\Model\Page: { route: page, slug: page }

Expand Down
4 changes: 2 additions & 2 deletions doc/app/src/Processor/DefaultTocProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ public function __construct(string $tableOfContentProperty = 'tableOfContent')
$this->tableOfContentProperty = $tableOfContentProperty;
}

public function __invoke(array &$data, string $type, Content $content): void
public function __invoke(array &$data, Content $content): void
{
if (!is_a($type, Page::class, true)) {
if (!is_a($content->getType(), Page::class, true)) {
return;
}

Expand Down
4 changes: 2 additions & 2 deletions doc/processors.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ class GravatarProcessor implements ProcessorInterface
* @param Content $content The source content (the FQN of the model being processed is available through $content->getType())
*/
public function __invoke(array &$data, string $type, Content $content): void
public function __invoke(array &$data, Content $content): void
{
// Only apply this processor on Users
if (!is_a($type, User::class, true)) {
if (!is_a($content->getType(), User::class, true)) {
return;
}

Expand Down
30 changes: 30 additions & 0 deletions src/Behaviour/HtmlCrawlerManagerInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?php

/*
* This file is part of the "StenopePHP/Stenope" bundle.
*
* @author Thomas Jarrand <thomas.jarrand@gmail.com>
*/

namespace Stenope\Bundle\Behaviour;

use Stenope\Bundle\Content;
use Symfony\Component\DomCrawler\Crawler;

/**
* Contract for services that provide DomCrawler instances for the HTML
* properties of a content and write their state back into the decoded data.
*/
interface HtmlCrawlerManagerInterface
{
/**
* Get the HTML Crawler for the given property (creates it if needed).
*
* @param Content $content  The source content
* @param array   $data     The decoded data (passed by reference)
* @param string  $property Key of the HTML property in $data
*
* @return Crawler|null The crawler for the property, or null when none is available.
*                      NOTE(review): presumably null means the property does not
*                      hold valid HTML; confirm against the implementations.
*/
public function get(Content $content, array &$data, string $property): ?Crawler;

/**
* Dump the current state of the HTML Crawler into data for the given property.
*
* @param Content $content  The source content
* @param array   $data     The decoded data (passed by reference) that receives the dump
* @param string  $property Key of the HTML property in $data
*/
public function save(Content $content, array &$data, string $property): void;

/**
* Dump the current state of all HTML Crawlers into data for their respective property.
*
* @param Content $content The source content
* @param array   $data    The decoded data (passed by reference) that receives the dumps
*/
public function saveAll(Content $content, array &$data): void;
}
3 changes: 1 addition & 2 deletions src/Behaviour/ProcessorInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ interface ProcessorInterface
* Apply modifications to decoded data before denormalization
*
* @param array $data The decoded data
* @param string $type The model being processed (FQN)
* @param Content $content The source content
*/
public function __invoke(array &$data, string $type, Content $content): void;
public function __invoke(array &$data, Content $content): void;
}
20 changes: 13 additions & 7 deletions src/ContentManager.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
namespace Stenope\Bundle;

use Stenope\Bundle\Behaviour\ContentManagerAwareInterface;
use Stenope\Bundle\Behaviour\HtmlCrawlerManagerInterface;
use Stenope\Bundle\Behaviour\ProcessorInterface;
use Stenope\Bundle\Exception\ContentNotFoundException;
use Stenope\Bundle\Exception\RuntimeException;
Expand All @@ -27,6 +28,7 @@ class ContentManager
private DecoderInterface $decoder;
private DenormalizerInterface $denormalizer;
private PropertyAccessorInterface $propertyAccessor;
private HtmlCrawlerManagerInterface $crawlers;

/** @var iterable<ContentProviderInterface>|ContentProviderInterface[] */
private iterable $providers;
Expand All @@ -47,13 +49,15 @@ class ContentManager
public function __construct(
DecoderInterface $decoder,
DenormalizerInterface $denormalizer,
HtmlCrawlerManagerInterface $crawlers,
iterable $contentProviders,
iterable $processors,
?PropertyAccessorInterface $propertyAccessor = null,
?Stopwatch $stopwatch = null
) {
$this->decoder = $decoder;
$this->denormalizer = $denormalizer;
$this->crawlers = $crawlers;
$this->propertyAccessor = $propertyAccessor ?? PropertyAccess::createPropertyAccessor();
$this->providers = $contentProviders;
$this->processors = $processors;
Expand All @@ -80,11 +84,11 @@ public function getContents(string $type, $sortBy = null, $filterBy = null): arr
if (isset($contents[$content->getSlug()])) {
throw new RuntimeException(sprintf(
'Found multiple contents of type "%s" with the same "%s" identifier.',
$type,
$content->getType(),
$content->getSlug()
));
}
$contents[$content->getSlug()] = $this->load($type, $content);
$contents[$content->getSlug()] = $this->load($content);
}
}

Expand Down Expand Up @@ -141,7 +145,7 @@ public function getContent(string $type, string $id): object

foreach ($this->getProviders($type) as $provider) {
if ($content = $provider->getContent($id)) {
$loaded = $this->load($type, $content);
$loaded = $this->load($content);

if (isset($event)) {
$event->stop();
Expand Down Expand Up @@ -206,9 +210,9 @@ private function getProviders(string $type): iterable
}
}

private function load(string $type, Content $content)
private function load(Content $content)
{
if ($data = $this->cache[$key = "$type:{$content->getSlug()}"] ?? false) {
if ($data = $this->cache[$key = "{$content->getType()}:{$content->getSlug()}"] ?? false) {
return $data;
}

Expand All @@ -218,10 +222,12 @@ private function load(string $type, Content $content)

// Apply processors to decoded data
foreach ($this->processors as $processor) {
$processor($data, $type, $content);
$processor($data, $content);
}

$data = $this->denormalizer->denormalize($data, $type, $content->getFormat(), [
$this->crawlers->saveAll($content, $data);

$data = $this->denormalizer->denormalize($data, $content->getType(), $content->getFormat(), [
SkippingInstantiatedObjectDenormalizer::SKIP => true,
]);

Expand Down
4 changes: 4 additions & 0 deletions src/DependencyInjection/Configuration.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ public function getConfigTreeBuilder()
->cannotBeEmpty()
->defaultValue('%kernel.project_dir%/build')
->end()
->booleanNode('shared_html_crawlers')
->info('Activate the sharing of HTML crawlers for better performances.')
->defaultFalse()
->end()
->arrayNode('copy')
->defaultValue([
[
Expand Down
6 changes: 6 additions & 0 deletions src/DependencyInjection/StenopeExtension.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@

namespace Stenope\Bundle\DependencyInjection;

use Stenope\Bundle\Behaviour\HtmlCrawlerManagerInterface;
use Stenope\Bundle\Behaviour\ProcessorInterface;
use Stenope\Bundle\Builder;
use Stenope\Bundle\Provider\ContentProviderInterface;
use Stenope\Bundle\Provider\Factory\ContentProviderFactory;
use Stenope\Bundle\Provider\Factory\ContentProviderFactoryInterface;
use Stenope\Bundle\Routing\ContentUrlResolver;
use Stenope\Bundle\Routing\ResolveContentRoute;
use Stenope\Bundle\Service\SharedHtmlCrawlerManager;
use Symfony\Component\Config\FileLocator;
use Symfony\Component\DependencyInjection\ContainerBuilder;
use Symfony\Component\DependencyInjection\Extension\Extension;
Expand All @@ -37,6 +39,10 @@ public function load(array $configs, ContainerBuilder $container): void
$container->getDefinition(Builder::class)->replaceArgument('$buildDir', $config['build_dir']);
$container->getDefinition(Builder::class)->replaceArgument('$filesToCopy', $config['copy']);

if ($config['shared_html_crawlers']) {
$container->setAlias(HtmlCrawlerManagerInterface::class, SharedHtmlCrawlerManager::class);
}

$this->processProviders($container, $config['providers']);
$this->processLinkResolvers($container, $config['resolve_links']);
}
Expand Down
19 changes: 8 additions & 11 deletions src/Processor/AssetsProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,39 @@

namespace Stenope\Bundle\Processor;

use Stenope\Bundle\Behaviour\HtmlCrawlerManagerInterface;
use Stenope\Bundle\Behaviour\ProcessorInterface;
use Stenope\Bundle\Content;
use Stenope\Bundle\Service\AssetUtils;
use Symfony\Component\DomCrawler\Crawler;

/**
* Attempt to resolve local assets URLs using the Asset component for images and links.
*/
class AssetsProcessor implements ProcessorInterface
{
private AssetUtils $assetUtils;
private HtmlCrawlerManagerInterface $crawlers;
private string $property;

public function __construct(AssetUtils $assetUtils, string $property = 'content')
public function __construct(AssetUtils $assetUtils, HtmlCrawlerManagerInterface $crawlers, string $property = 'content')
{
$this->assetUtils = $assetUtils;
$this->crawlers = $crawlers;
$this->property = $property;
}

public function __invoke(array &$data, string $type, Content $content): void
public function __invoke(array &$data, Content $content): void
{
if (!isset($data[$this->property])) {
return;
}

$crawler = new Crawler($data[$this->property]);
$crawler = $this->crawlers->get($content, $data, $this->property);

try {
$crawler->html();
} catch (\Exception $e) {
// Content is not valid HTML.
if (!$crawler) {
return;
}

$crawler = new Crawler($data[$this->property]);

foreach ($crawler->filter('img') as $element) {
$element->setAttribute('src', $this->assetUtils->getUrl($element->getAttribute('src')));
}
Expand All @@ -52,6 +49,6 @@ public function __invoke(array &$data, string $type, Content $content): void
$element->setAttribute('href', $this->assetUtils->getUrl($element->getAttribute('href')));
}

$data[$this->property] = $crawler->html();
$this->crawlers->save($content, $data, $this->property);
}
}
24 changes: 12 additions & 12 deletions src/Processor/CodeHighlightProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,47 +9,47 @@
namespace Stenope\Bundle\Processor;

use Stenope\Bundle\Behaviour\HighlighterInterface;
use Stenope\Bundle\Behaviour\HtmlCrawlerManagerInterface;
use Stenope\Bundle\Behaviour\ProcessorInterface;
use Stenope\Bundle\Content;
use Stenope\Bundle\Service\HtmlUtils;
use Symfony\Component\DomCrawler\Crawler;

/**
* Apply syntax highlighting to code blocks
*/
class CodeHighlightProcessor implements ProcessorInterface
{
private HighlighterInterface $highlighter;
private HtmlCrawlerManagerInterface $crawlers;
private string $property;

public function __construct(HighlighterInterface $highlighter, string $property = 'content')
{
public function __construct(
HighlighterInterface $highlighter,
HtmlCrawlerManagerInterface $crawlers,
string $property = 'content'
) {
$this->highlighter = $highlighter;
$this->crawlers = $crawlers;
$this->property = $property;
}

public function __invoke(array &$data, string $type, Content $content): void
public function __invoke(array &$data, Content $content): void
{
if (!isset($data[$this->property])) {
return;
}

$crawler = new Crawler($data[$this->property]);
$crawler = $this->crawlers->get($content, $data, $this->property);

try {
$crawler->html();
} catch (\Exception $e) {
// Content is not valid HTML.
if (!$crawler) {
return;
}

$crawler = new Crawler($data[$this->property]);

foreach ($crawler->filter('code') as $element) {
$this->highlight($element);
}

$data[$this->property] = $crawler->html();
$this->crawlers->save($content, $data, $this->property);
}

private function highlight(\DOMElement $element): void
Expand Down
Loading

0 comments on commit 9b81e96

Please sign in to comment.