Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed bridge #1195

Merged
merged 9 commits into from
Jun 28, 2019
64 changes: 36 additions & 28 deletions bridges/GQMagazineBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ class GQMagazineBridge extends BridgeAbstract
'data-original' => 'src'
);

/** Heading tags searched, in this order, when looking for a link's title. */
const POSSIBLE_TITLES = array('h2', 'h3');

private function getDomain() {
$domain = $this->getInput('domain');
if (empty($domain))
Expand All @@ -54,6 +59,17 @@ public function getURI()
return $this->getDomain() . '/' . $this->getInput('page');
}

/**
 * Returns the text of the first usable heading found inside a link element.
 *
 * The tags listed in self::POSSIBLE_TITLES are tried in order; the first one
 * that is present and has non-empty plain text is used.
 *
 * @param object $link Element exposing find($selector, $index) and whose
 *                     matches expose a plaintext property
 * @return string|null Heading text, or null when no candidate tag yields text
 */
private function findTitleOf($link) {
    foreach (self::POSSIBLE_TITLES as $tag) {
        $title = $link->find($tag, 0);
        if ($title === null) {
            // This heading tag is not present in the link; try the next one.
            continue;
        }

        $text = $title->plaintext;
        // Skip headings that exist but carry no text.
        if ($text !== null && $text !== '') {
            return $text;
        }
    }

    // No candidate heading produced a title; say so explicitly.
    return null;
}

public function collectData()
{
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
Expand All @@ -62,30 +78,31 @@ public function collectData()
$main = $html->find('main', 0);
foreach ($main->find('a') as $link) {
$uri = $link->href;
$title = $link->find('h2', 0);
$date = $link->find('time', 0);

$item = array();
$author = $link->find('span[itemprop=name]', 0);
$item['author'] = $author->plaintext;
$item['title'] = $title->plaintext;
if(substr($uri, 0, 1) === 'h') { // absolute uri
$item['uri'] = $uri;
} else if(substr($uri, 0, 1) === '/') { // domain relative url
$item['uri'] = $this->getDomain() . $uri;
} else {
$item['uri'] = $this->getDomain() . '/' . $uri;
}
if($author != null) {
Riduidel marked this conversation as resolved.
Show resolved Hide resolved
$item['author'] = $author->plaintext;
$item['title'] = $this->findTitleOf($link);
if(substr($uri, 0, 1) === 'h') { // absolute uri
Riduidel marked this conversation as resolved.
Show resolved Hide resolved
$item['uri'] = $uri;
} else if(substr($uri, 0, 1) === '/') { // domain relative url
Riduidel marked this conversation as resolved.
Show resolved Hide resolved
$item['uri'] = $this->getDomain() . $uri;
} else {
$item['uri'] = $this->getDomain() . '/' . $uri;
}

$article = $this->loadFullArticle($item['uri']);
if($article) {
$item['content'] = $this->replaceUriInHtmlElement($article);
} else {
$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
$article = $this->loadFullArticle($item['uri']);
if($article) {
$item['content'] = $this->replaceUriInHtmlElement($article);
} else {
$item['content'] = "<strong>Article body couldn't be loaded</strong>. It must be a bug!";
}
$short_date = $date->datetime;
$item['timestamp'] = strtotime($short_date);
$this->items[] = $item;
}
$short_date = $date->datetime;
$item['timestamp'] = strtotime($short_date);
$this->items[] = $item;
}
}

Expand All @@ -96,16 +113,7 @@ public function collectData()
*/
private function loadFullArticle($uri){
    $html = getSimpleHTMLDOMCached($uri);

    // Nothing to search if the page could not be loaded or parsed;
    // the caller treats a falsy result as "article body unavailable".
    if (!$html) {
        return null;
    }

    // GQ's CSS class names carry generated suffixes (e.g. "ArticleBodySection-fkggUW"),
    // so the article body is located through its data-test-id attribute instead.
    // find() with an index yields the first match, or null when there is none.
    return $html->find('section[data-test-id=ArticleBodyContent]', 0);
}

/**
Expand Down