Skip to content
Permalink
Browse files

Updated for Craft 3.

  • Loading branch information...
michaelrog committed Jun 1, 2019
1 parent a55353f commit d05b7fa947a6326607c2ff10dce45f4c1940535e
Showing with 2,194 additions and 15,169 deletions.
  1. +7 −0 CHANGELOG.md
  2. +17 −0 LICENSE.md
  3. +57 −23 README.md
  4. +57 −0 composer.json
  5. +0 −146 scraper/ScraperPlugin.php
  6. +0 −27 scraper/services/Scraper_BetaService.php
  7. +0 −42 scraper/services/Scraper_ScraperService.php
  8. +0 −36 scraper/variables/ScraperVariable.php
  9. +0 −891 scraper/vendor/simplehtmldom/app/google.htm
  10. +0 −144 scraper/vendor/simplehtmldom/app/index.php
  11. BIN scraper/vendor/simplehtmldom/app/js/images/treeview-default-line.gif
  12. BIN scraper/vendor/simplehtmldom/app/js/images/treeview-default.gif
  13. +0 −3,363 scraper/vendor/simplehtmldom/app/js/jquery.js
  14. +0 −68 scraper/vendor/simplehtmldom/app/js/jquery.treeview.css
  15. +0 −251 scraper/vendor/simplehtmldom/app/js/jquery.treeview.js
  16. +0 −24 scraper/vendor/simplehtmldom/app/js/screen.css
  17. +0 −109 scraper/vendor/simplehtmldom/change_log.txt
  18. +0 −54 scraper/vendor/simplehtmldom/example/example_advanced_selector.php
  19. +0 −37 scraper/vendor/simplehtmldom/example/example_basic_selector.php
  20. +0 −28 scraper/vendor/simplehtmldom/example/example_callback.php
  21. +0 −5 scraper/vendor/simplehtmldom/example/example_extract_html.php
  22. +0 −18 scraper/vendor/simplehtmldom/example/example_modify_contents.php
  23. +0 −44 scraper/vendor/simplehtmldom/example/scraping/example_scraping_digg.php
  24. +0 −59 scraper/vendor/simplehtmldom/example/scraping/example_scraping_general.php
  25. +0 −51 scraper/vendor/simplehtmldom/example/scraping/example_scraping_imdb.php
  26. +0 −35 scraper/vendor/simplehtmldom/example/scraping/example_scraping_slashdot.php
  27. +0 −35 scraper/vendor/simplehtmldom/example/simple_html_dom_utility.php
  28. +0 −113 scraper/vendor/simplehtmldom/manual/css/default.css
  29. +0 −116 scraper/vendor/simplehtmldom/manual/css/ui.tabs.css
  30. BIN scraper/vendor/simplehtmldom/manual/img/tab.png
  31. +0 −117 scraper/vendor/simplehtmldom/manual/index.htm
  32. +0 −11 scraper/vendor/simplehtmldom/manual/js/jquery-1.2.3.pack.js
  33. +0 −10 scraper/vendor/simplehtmldom/manual/js/ui.tabs.pack.js
  34. +0 −448 scraper/vendor/simplehtmldom/manual/manual.htm
  35. +0 −322 scraper/vendor/simplehtmldom/manual/manual_api.htm
  36. +0 −94 scraper/vendor/simplehtmldom/manual/manual_faq.htm
  37. +0 −24 scraper/vendor/simplehtmldom/testcase/all_test.php
  38. +0 −74 scraper/vendor/simplehtmldom/testcase/callback_testcase.php
  39. +0 −386 scraper/vendor/simplehtmldom/testcase/dom_testcase.php
  40. +0 −247 scraper/vendor/simplehtmldom/testcase/element_testcase.php
  41. +0 −658 scraper/vendor/simplehtmldom/testcase/invalid_testcase.php
  42. +0 −11 scraper/vendor/simplehtmldom/testcase/jquery-1.2.3.pack.js
  43. +0 −68 scraper/vendor/simplehtmldom/testcase/mass_test.php
  44. +0 −121 scraper/vendor/simplehtmldom/testcase/memory_test.php
  45. +0 −60 scraper/vendor/simplehtmldom/testcase/misc_testcase.php
  46. +0 −40 scraper/vendor/simplehtmldom/testcase/performance_test.php
  47. +0 −12 scraper/vendor/simplehtmldom/testcase/reader/all_test.php
  48. +0 −243 scraper/vendor/simplehtmldom/testcase/reader/element_testcase.php
  49. +0 −103 scraper/vendor/simplehtmldom/testcase/reader/memory_test.php
  50. +0 −38 scraper/vendor/simplehtmldom/testcase/reader/performance_test.php
  51. +0 −584 scraper/vendor/simplehtmldom/testcase/reader/selector_testcase.php
  52. +0 −742 scraper/vendor/simplehtmldom/testcase/selector_testcase.php
  53. +0 −27 scraper/vendor/simplehtmldom/testcase/slick_test.php
  54. +0 −2,888 scraper/vendor/simplehtmldom/testcase/slickspeed.htm
  55. +0 −243 scraper/vendor/simplehtmldom/testcase/std_testcase.php
  56. +0 −137 scraper/vendor/simplehtmldom/testcase/strip_testcase.php
  57. +53 −0 src/Scraper.php
  58. +81 −0 src/clients/BaseClient.php
  59. +27 −0 src/clients/GoutteClient.php
  60. +26 −0 src/clients/SimpleHtmlDomClient.php
  61. +25 −0 src/lib/SimpleHtmlDom.php
  62. +1,744 −1,742 {scraper/vendor/simplehtmldom → src/lib}/simple_html_dom.php
  63. +63 −0 src/services/Scraper.php
  64. +37 −0 src/twigextensions/ScraperTwigExtension.php
@@ -0,0 +1,7 @@
# Scraper Changelog

## 3.0.0 - 2019-06-01

### Added

- 3.x Beta release!
@@ -0,0 +1,17 @@
Copyright © Michael Rog

Permission is hereby granted to any person obtaining a copy of this software (the “Software”) to use, copy, modify, merge, publish and/or distribute copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

1. **Don’t plagiarize.** The above copyright notice and this license shall be included in all copies or substantial portions of the Software.

2. **Don’t use the same license on more than one project.** Each licensed copy of the Software shall be actively installed in no more than one production environment at a time.

3. **Don’t mess with the licensing features.** Software features related to licensing shall not be altered or circumvented in any way, including (but not limited to) license validation, payment prompts, feature restrictions, and update eligibility.

4. **Pay up.** Payment shall be made immediately upon receipt of any notice, prompt, reminder, or other message indicating that a payment is owed.

5. **Follow the law.** All use of the Software shall not violate any applicable law or regulation, nor infringe the rights of any other person or entity.

Failure to comply with the foregoing conditions will automatically and immediately result in termination of the permission granted hereby. This license does not include any right to receive updates to the Software or technical support. Licensees bear all risk related to the quality and performance of the Software and any modifications made or obtained to it, including liability for actual and consequential harm, such as loss or corruption of data, and any necessary service, repair, or correction.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, INCLUDING SPECIAL, INCIDENTAL AND CONSEQUENTIAL DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -1,55 +1,89 @@
# Scraper

_Easily fetch, slice, dice, and output HTML content from remote pages._
_Easily fetch, slice, dice, and output HTML (or XML) content from anywhere._

**Lovingly crafted by [Top Shelf Craft](https://topshelfcraft.com)**
**A [Top Shelf Craft](https://topshelfcraft.com) creation**
[Michael Rog](https://michaelrog.com), Proprietor


* * *


## tl;dr
## Installation

1. From your project directory, use Composer to require the plugin package:

```
composer require topshelfcraft/scraper
```

2. In the Control Panel, go to Settings → Plugins and click the “Install” button for Scraper.

**Scraper** allows you to easily fetch HTML content from any URL, create a DOM with it, select elements by CSS selector, find and manipulate DOM nodes, and save or output them using the power of Twig templates.
3. There is no Step 3.

_Scraper is also available for installation via the Craft CMS Plugin Store._

## Usage

Use **Scraper** to query content from remote URLs, select it by HTML and CSS selector, and output it in your Craft templates.
The Scraper plugin exposes a full-featured crawler object to your Twig template, allowing you to fetch, parse, and filter DOM elements from a remote source document.

For example:
### Instantiating a client

{% set acmeContent = craft.scraper.get("http://acmewidgets.com") %}
{% for widget in acmeContent.find(".widget") %}
<div>{{ widget.innerText }}</div>
{% endfor %}
When invoking the plugin, you can choose whether to use SimpleHtmlDom or Symfony components to instantiate your crawler:

or...
```twig
{% set crawler = craft.scraper.using('symfony').get('https://zombo.com') %}
```
```twig
{% set crawler = craft.scraper.using('simplehtmldom').get('https://zombo.com') %}
```

{% set google = craft.scraper.get("http://google.com") %}
{% for link in google.find("a") %}
<li>{{ link.attr.href }}</li>
{% endfor %}
I generally recommend using the Symfony components; they are more powerful and resilient to malformed source code. (The SimpleHtmlDom crawler is included to provide backwards compatibility with Craft 2 projects.)

### Using the Symfony client

### What are the system requirements?
When you opt for Symfony components, the `get` method instantiates a full [BrowserKit](https://symfony.com/components/BrowserKit) client, giving you access to all the [BrowserKit](https://symfony.com/components/BrowserKit) and [DomCrawler](https://symfony.com/doc/current/components/dom_crawler.html) methods.

Craft 2.5+ and PHP 5.4+
You can iterate over the DOM elements from your source document like this:

```twig
{% for node in crawler.filter('h2 > a') %}
{{ node.text() }}
{% endfor %}
```

### I found a bug.
### Using the SimpleHtmlDom client

When you opt for the SimpleHtmlDom crawler, the `get` method instantiates a [SimpleHtmlDom](https://simplehtmldom.sourceforge.io/) client, giving you access to all the [SimpleHtmlDom methods](https://simplehtmldom.sourceforge.io/manual.htm).

You can iterate over the DOM elements from your source document like this:

Nah...
```twig
{% for node in crawler.find('h1') %}
{{ node.innertext() }}
{% endfor %}
```

### This is great! I still have questions.

### I triple-checked. It's a bug.
Ask a question on [StackExchange](http://craftcms.stackexchange.com/), and ping me with a URL via email or Discord.

Well, alright. Please open a GitHub Issue, submit a PR to the `dev` branch, or just email me to let me know.

### What are the system requirements?

Craft 3.0+ and PHP 7.0+


### I found a bug.

Please open a GitHub Issue, submit a PR to the `3.x.dev` branch, or just email me.


* * *

#### Contributors:

- Plugin development: [Michael Rog](http://michaelrog.com) / @michaelrog
- [Simple HTML DOM](http://simplehtmldom.sourceforge.net/): created by S. C. Chen
- Plugin development: [Michael Rog](http://michaelrog.com) / @michaelrog
- Includes the ["Simple HTML DOM"](http://simplehtmldom.sourceforge.net/) library, created by S. C. Chen
- Includes the Symfony [DomCrawler](https://symfony.com/doc/current/components/dom_crawler.html) via [Goutte](https://github.com/FriendsOfPHP/Goutte), created by Fabien Potencier

@@ -0,0 +1,57 @@
{
"name": "topshelfcraft/scraper",
"type": "craft-plugin",
"description": "Easily fetch, parse, and rejigger HTML or XML from anywhere.",
"version": "3.0.0-beta.1",
"keywords": [
"craft",
"cms",
"craftcms",
"plugin",
"scraper",
"simplehtmldom",
"dom",
"fetch",
"html",
"remote",
"external",
"parse"
],
"license": "proprietary",
"homepage": "https://topshelfcraft.com",
"authors": [
{
"name": "Top Shelf Craft (Michael Rog)",
"homepage": "https://topshelfcraft.com"
}
],
"support": {
"email": "support@topshelfcraft.com",
"issues": "https://github.com/TopShelfCraft/Scraper/issues",
"source": "https://github.com/TopShelfCraft/Scraper",
"docs": "https://github.com/TopShelfCraft/Scraper"
},
"require": {
"php": ">=7",
"craftcms/cms": "^3.0",
"topshelfcraft/ranger": "^3.0",
"fabpot/goutte": "^3.2"
},
"autoload": {
"psr-4": {
"topshelfcraft\\scraper\\": "src/"
}
},
"extra": {
"name": "Scraper",
"handle": "scraper",
"schemaVersion": "0.0.0.0",
"hasSettings": false,
"hasCpSection": false,
"changelogUrl": "https://raw.githubusercontent.com/topshelfcraft/scraper/3.x/CHANGELOG.md",
"class": "topshelfcraft\\scraper\\Scraper",
"components": {
"scraper": "topshelfcraft\\scraper\\services\\Scraper"
}
}
}

This file was deleted.

This file was deleted.

0 comments on commit d05b7fa

Please sign in to comment.
You can’t perform that action at this time.