-
Notifications
You must be signed in to change notification settings - Fork 0
/
PdfWikipedia.php
166 lines (130 loc) · 4.45 KB
/
PdfWikipedia.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
<?php
require_once 'dompdf/autoload.inc.php';
require_once 'components/functions.php';
use Dompdf\Dompdf;
use Dompdf\Options;
/**
 * Collects article links from a Wikipedia page and saves articles as PDF files.
 *
 * Typical flow: getLinks() -> purifyLinks() -> savePdf() for each resulting link.
 */
class PdfLoaderWikipedia
{
    /** Maximum number of unique links returned by getLinks(). */
    const MAX_LINKS = 30;

    /** Host that is prepended to site-relative links in purifyLinks(). */
    const WIKI_BASE_URL = 'https://ru.wikipedia.org';

    /**
     * Substrings that mark a link as "garbage" (service pages, files, anchors, ...).
     * A link containing any of them is dropped by purifyLinks().
     *
     * @var string[]
     */
    private static $garbageMarkers = [
        'Категори',
        'Википедия',
        'index.php',
        'Файл',
        '#',
        'Английский',
        'значения',
        'Шаблон',
        'Служебная',
        'Портал:',
    ];

    /**
     * Address (URL or local path) of the page whose links are collected.
     *
     * @var string
     */
    public $url;

    /**
     * Stores the address of the page to scan.
     *
     * @param string $url Anything file_get_contents() can read.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Returns the href values of all <a> tags located inside
     * <div class="mw-body-content"> of the page at $this->url.
     *
     * Links are percent-decoded once, de-duplicated, kept in document order
     * and capped at self::MAX_LINKS entries.
     *
     * @return string[] Empty array when the page cannot be read or parsed.
     */
    public function getLinks()
    {
        $html = file_get_contents((string) $this->url);
        if ($html === false || trim($html) === '') {
            return [];
        }

        // Real-world HTML makes libxml emit warnings such as "Unexpected end tag";
        // collect them internally instead of printing them, then restore the
        // caller's setting so this method has no lasting global side effect.
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if (!$loaded) {
            return [];
        }

        $xPath = new DOMXPath($dom);
        $hrefNodes = $xPath->evaluate("//div[@class=\"mw-body-content\"]//a/@href");

        $links = [];
        $seen = [];
        foreach ($hrefNodes as $hrefNode) {
            // rawurldecode() keeps a literal '+' intact ("/wiki/C%2B%2B" -> "/wiki/C++");
            // urldecode() would turn it into a space on any repeated decode.
            $link = rawurldecode($hrefNode->nodeValue);

            // Prefix the key so purely numeric links are not cast to integer keys.
            $key = 'k' . $link;
            if (isset($seen[$key])) {
                continue;
            }
            $seen[$key] = true;
            $links[] = $link;

            if (count($links) >= self::MAX_LINKS) {
                break;
            }
        }

        return $links;
    }

    /**
     * Filters out unwanted links and turns site-relative paths into absolute URLs.
     *
     * A link is kept only when, after percent-decoding, it contains none of the
     * garbage markers and is a site-relative path (starts with a single '/').
     * Protocol-relative links ("//host/...") point to other hosts and are skipped.
     *
     * @param iterable $linksArray Links as returned by getLinks() (encoded or decoded).
     *
     * @return string[] Absolute URLs; an empty array when nothing passes the filter.
     */
    public function purifyLinks($linksArray)
    {
        $fullLinks = [];

        foreach ($linksArray as $link) {
            // Decode so the Cyrillic markers can be matched; '+' is left untouched.
            $linkDecoded = rawurldecode((string) $link);

            if ($this->isGarbageLink($linkDecoded)) {
                continue;
            }

            // strncmp() is safe on an empty string, unlike $linkDecoded[0].
            $isSiteRelative = strncmp($linkDecoded, '/', 1) === 0
                && strncmp($linkDecoded, '//', 2) !== 0;
            if (!$isSiteRelative) {
                continue;
            }

            $fullLinks[] = trim(self::WIKI_BASE_URL . $linkDecoded);
        }

        return $fullLinks;
    }

    /**
     * Passes every link to echoJS().
     *
     * echoJS() comes from components/functions.php, which is not visible here;
     * presumably it prints the link for the browser - confirm against that file.
     * The article download that used to follow each call was removed because
     * its result was discarded.
     *
     * @param iterable $links Absolute article URLs.
     *
     * @return void
     */
    public function renderLinks($links)
    {
        foreach ($links as $fullLink) {
            echoJS($fullLink);
        }
    }

    /**
     * Downloads one article and writes it as "<title>.pdf" into $outputDir.
     *
     * The title is the part of $link after "wiki/"; characters that are not
     * allowed in file names are replaced with "_".
     *
     * @param string $link      Absolute URL of the article.
     * @param string $outputDir Target directory; must already exist.
     *
     * @return bool True when the PDF was written, false on any failure.
     */
    public function savePdf($link, $outputDir = 'G:/Documents/WIKIPDF')
    {
        $html = file_get_contents($link);
        if ($html === false) {
            // file_get_contents() has already raised a warning with the reason.
            return false;
        }

        $target = rtrim($outputDir, '/\\') . '/' . $this->buildFileName($link) . '.pdf';

        try {
            $options = new Options();
            $options->set('defaultFont', 'DejaVu Sans');
            $dompdf = new Dompdf($options);

            // Stream context used by Dompdf when it fetches remote resources (images),
            // see https://github.com/dompdf/dompdf/wiki/Usage
            // NOTE(review): peer verification is switched off, which allows
            // man-in-the-middle tampering; enable it once a CA bundle is configured.
            $context = stream_context_create(
                [
                    'ssl' => [
                        'verify_peer' => false,
                        'verify_peer_name' => false,
                        'allow_self_signed' => true,
                    ],
                ]
            );
            $dompdf->setHttpContext($context);

            $dompdf->loadHtml($html);

            // Workaround for images missing in the PDF,
            // see https://github.com/dompdf/dompdf/issues/2075
            $dompdf->setProtocol('http://');
            $dompdf->setPaper('A4', 'portrait');
            $dompdf->render();

            return file_put_contents($target, $dompdf->output()) !== false;
        } catch (Exception $e) {
            echo 'Выброшено исключение: ' . $e . "\n";

            return false;
        }
    }

    /**
     * Tells whether a decoded link contains any of the garbage markers.
     *
     * @param string $linkDecoded Percent-decoded link.
     *
     * @return bool
     */
    private function isGarbageLink($linkDecoded)
    {
        foreach (self::$garbageMarkers as $marker) {
            if (strpos($linkDecoded, $marker) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Derives a file-system-safe PDF base name from an article URL.
     *
     * @param string $link Absolute URL of the article.
     *
     * @return string Non-empty file name without extension.
     */
    private function buildFileName($link)
    {
        $link = (string) $link;
        $marker = 'wiki/';

        $markerPos = strpos($link, $marker);
        if ($markerPos !== false) {
            $title = (string) substr($link, $markerPos + strlen($marker));
        } else {
            // No "wiki/" segment: fall back to whatever follows the last slash.
            $slashPos = strrpos($link, '/');
            $title = $slashPos === false ? $link : (string) substr($link, $slashPos + 1);
        }

        // All replaced characters are ASCII, so a byte-wise replace is UTF-8 safe.
        $title = str_replace(['\\', '/', ':', '*', '?', '"', '<', '>', '|'], '_', $title);

        return $title === '' ? 'untitled' : $title;
    }
}