Port to Python 3 (including poster and wikitools)
Signed-off-by: Elsie Hupp <9206310+elsiehupp@users.noreply.github.com>
elsiehupp committed Jun 9, 2021
1 parent 0cfde9e commit de976dd
Showing 49 changed files with 4,571 additions and 2,379 deletions.
10 changes: 10 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,10 @@
{
"python.analysis.extraPaths": [
".",
"./poster",
"./tests",
"./wikitools",
"/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages"
],
"python.pythonPath": "/usr/local/bin/python3.8"
}
458 changes: 226 additions & 232 deletions dumpgenerator.py

Large diffs are not rendered by default.

32 changes: 15 additions & 17 deletions gui.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2012 WikiTeam
@@ -19,10 +19,8 @@
import platform
import random
import re
from Tkinter import *
import ttk
import tkMessageBox
import thread
from tkinter import Tk, ttk, messagebox, Label, W, E, N, S, LabelFrame, Entry, StringVar, OptionMenu, Button, Text, Scrollbar, LEFT, SUNKEN, Menu
import threading
import time
import urllib.request
import webbrowser
@@ -102,7 +100,7 @@ def __init__(self, master):
self.optionmenu11var.set("api.php")
self.optionmenu11 = OptionMenu(self.labelframe11, self.optionmenu11var, self.optionmenu11var.get(), "index.php")
self.optionmenu11.grid(row=0, column=2)
self.button11 = Button(self.labelframe11, text="Check", command=lambda: thread.start_new_thread(self.checkURL, ()), width=5)
self.button11 = Button(self.labelframe11, text="Check", command=lambda: threading.Thread(target=self.checkURL).start(), width=5)
self.button11.grid(row=0, column=3)
#batch download labelframe
self.label12 = Label(self.labelframe12, text="Wiki URLs:")
@@ -174,12 +172,12 @@ def __init__(self, master):
self.tree.heading('status', text='Status')
self.tree.grid(row=2, column=0, columnspan=9, sticky=W+E+N+S)
[self.tree.heading(column, text=column, command=lambda: self.treeSortColumn(column=column, reverse=False)) for column in columns]
#self.tree.bind("<Double-1>", (lambda: thread.start_new_thread(self.downloadDump, ())))
#self.tree.bind("<Double-1>", (lambda: threading.Thread(target=self.downloadDump).start()))
self.tree.tag_configure('downloaded', background='lightgreen')
self.tree.tag_configure('nodownloaded', background='white')
self.button21 = Button(self.frame2, text="Load available dumps", command=lambda: thread.start_new_thread(self.loadAvailableDumps, ()), width=15)
self.button21 = Button(self.frame2, text="Load available dumps", command=lambda: threading.Thread(target=self.loadAvailableDumps).start(), width=15)
self.button21.grid(row=3, column=0)
self.button23 = Button(self.frame2, text="Download selection", command=lambda: thread.start_new_thread(self.downloadDump, ()), width=15)
self.button23 = Button(self.frame2, text="Download selection", command=lambda: threading.Thread(target=self.downloadDump).start(), width=15)
self.button23.grid(row=3, column=4)
self.button22 = Button(self.frame2, text="Clear list", command=self.deleteAvailableDumps, width=10)
self.button22.grid(row=3, column=8, columnspan=2)
@@ -213,7 +211,7 @@ def __init__(self, master):
#end menu

def blocked(self):
tkMessageBox.showerror("Error", "There is a task in progress. Please, wait.")
messagebox.showerror("Error", "There is a task in progress. Please, wait.")

def checkURL(self):
if re.search(r"(?im)^https?://[^/]+\.[^/]+/", self.entry11.get()): # well-formed URL? needs at least one dot: aaaaa.com, but bb.aaaaa.com is allowed too
@@ -234,7 +232,7 @@ def checkURL(self):
self.entry11.config(background='red')
self.msg('index.php is incorrect!', level='error')
else:
tkMessageBox.showerror("Error", "You have to write a correct api.php or index.php URL.")
messagebox.showerror("Error", "You have to write a correct api.php or index.php URL.")

def sumSizes(self, sizes):
total = 0
@@ -266,15 +264,15 @@ def run(self):
dumpgenerator.main(params=params)
#check dump
"""
"""

def msg(self, msg='', level=''):
levels = { 'ok': 'lightgreen', 'warning': 'yellow', 'error': 'red' }
if level.lower() in levels:
print '%s: %s' % (level.upper(), msg)
print ('%s: %s' % (level.upper(), msg))
self.status.config(text='%s: %s' % (level.upper(), msg), background=levels[level.lower()])
else:
print msg
print (msg)
self.status.config(text=msg, background='grey')

def treeSortColumn(self, column, reverse=False):
@@ -326,7 +324,7 @@ def downloadDump(self, event=None):
else:
self.msg('Problems in %d dumps. Downloaded %d of %d (and %d were previously downloaded).' % (len(items)-(c+d), c, len(items), d), level='error')
else:
tkMessageBox.showerror("Error", "You have to select some dumps to download.")
messagebox.showerror("Error", "You have to select some dumps to download.")
self.clearAvailableDumps()
self.showAvailableDumps()
self.filterAvailableDumps()
@@ -410,7 +408,7 @@ def loadAvailableDumps(self):
wikifarms_r = re.compile(r"(%s)" % ('|'.join(wikifarms.keys())))
c = 0
for mirror, url, regexp in self.urls:
print 'Loading data from', mirror, url
print ('Loading data from', mirror, url)
self.msg(msg='Please wait... Loading data from %s %s' % (mirror, url))
f = urllib.request.urlopen(url)
m = re.compile(regexp).finditer(f.read().decode('utf-8', errors='replace'))
@@ -452,7 +450,7 @@ def callback(self):
self.msg("Feature not implemented for the moment. Contributions are welcome.", level='warning')

def askclose():
if tkMessageBox.askokcancel("Quit", "Do you really wish to exit?"):
if messagebox.askokcancel("Quit", "Do you really wish to exit?"):
root.destroy()

if __name__ == "__main__":
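For reference: Python 3 removed the thread module, and threading has no start_new_thread equivalent under that name, which is why the button callbacks above now build a threading.Thread. A minimal sketch of the migration; the task function and its argument are illustrative, not part of gui.py:

import threading

def task(name):
    # Placeholder for work such as self.checkURL or self.downloadDump above.
    print('running', name)

# Python 2 idiom (removed above): thread.start_new_thread(task, ('demo',))
# Python 3 equivalent:
worker = threading.Thread(target=task, args=('demo',), daemon=True)
worker.start()
worker.join()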
34 changes: 17 additions & 17 deletions launcher.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2016 WikiTeam
@@ -28,17 +28,17 @@

def main():
if len(sys.argv) < 2:
print 'python script.py file-with-apis.txt'
print ('python script.py file-with-apis.txt')
sys.exit()

print 'Reading list of APIs from', sys.argv[1]
print ('Reading list of APIs from', sys.argv[1])
wikis = open(sys.argv[1], 'r').read().splitlines()
print '%d APIs found' % (len(wikis))
print ('%d APIs found' % (len(wikis)))

for wiki in wikis:
print "#"*73
print "# Downloading", wiki
print "#"*73
print ("#"*73)
print ("# Downloading", wiki)
print ("#"*73)
wiki = wiki.lower()
# Build the prefix in the standard way; api and index must both be defined, it doesn't matter which is which
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
@@ -52,17 +52,17 @@ def main():
break # stop searching, do not explore subdirectories

if compressed:
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
print ('Skipping... This wiki was downloaded and compressed before in', zipfilename)
# Get the archive's file list.
if ( ( ( sys.version_info[0] == 3 ) and ( sys.version_info[1] > 0 ) ) or ( ( sys.version_info[0] == 2 ) and ( sys.version_info[1] > 6 ) ) ):
archivecontent = subprocess.check_output(['7z', 'l', zipfilename]).decode('utf-8', errors='replace')
if re.search(r"%s.+-history\.xml" % (prefix), archivecontent) is None:
# We should perhaps not create an archive in this case, but we continue anyway.
print "ERROR: The archive contains no history!"
print ("ERROR: The archive contains no history!")
if re.search(r"Special:Version\.html", archivecontent) is None:
print "WARNING: The archive doesn't contain Special:Version.html, this may indicate that download didn't finish."
print ("WARNING: The archive doesn't contain Special:Version.html, this may indicate that download didn't finish.")
else:
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
print ("WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+.")
# TODO: Find a way like grep -q below without doing a 7z l multiple times?
continue

@@ -81,10 +81,10 @@ def main():
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
# typically they don't provide any crawl-delay value in their robots.txt).
if started and wikidir: #then resume
print 'Resuming download, using directory', wikidir
subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
print ('Resuming download, using directory', wikidir)
subprocess.call(['python3', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
else: #download from scratch
subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
subprocess.call(['python3', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
started = True
#save wikidir now
for f in os.listdir('.'):
@@ -98,7 +98,7 @@ def main():
finished = False
if started and wikidir and prefix:
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
print "No </mediawiki> tag found: dump failed, needs fixing; resume didn't work. Exiting."
print ("No </mediawiki> tag found: dump failed, needs fixing; resume didn't work. Exiting.")
else:
finished = True
# You can also issue this on your working directory to find all incomplete dumps:
@@ -108,7 +108,7 @@ def main():
if finished:
time.sleep(1)
os.chdir(wikidir)
print 'Changed directory to', os.getcwd()
print ('Changed directory to', os.getcwd())
# Basic integrity check for the XML. The script doesn't act on these counts, so inspect them yourself; nothing can be done about a broken dump anyway except redownloading.
subprocess.call('grep "<title>" *.xml -c;grep "<page>" *.xml -c;grep "</page>" *.xml -c;grep "<revision>" *.xml -c;grep "</revision>" *.xml -c', shell=True)
# Make a non-solid archive with all the text and metadata at default compression. You can also add config.txt if you don't care about your computer and user names being published or you don't use full paths so that they're not stored in it.
@@ -123,7 +123,7 @@ def main():
subprocess.call('7z' + ' a -ms=off -mx=1 ../%s-wikidump.7z.tmp %s-images.txt images/' % (prefix, prefix), shell=True)
subprocess.call('mv' + ' ../%s-wikidump.7z.tmp ../%s-wikidump.7z' % (prefix, prefix), shell=True)
os.chdir('..')
print 'Changed directory to', os.getcwd()
print ('Changed directory to', os.getcwd())
time.sleep(1)

if __name__ == "__main__":
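The completeness check above shells out to tail and grep. A pure-Python sketch of the same check, assuming the wikidir/prefix-history.xml layout launcher.py uses; the function name is illustrative:

import os

def dump_is_complete(wikidir, prefix):
    # Look for the closing </mediawiki> tag near the end of the dump,
    # mirroring the tail -n 1 | grep -q "</mediawiki>" shell pipeline above.
    path = os.path.join(wikidir, '%s-history.xml' % prefix)
    try:
        with open(path, 'rb') as f:
            f.seek(0, os.SEEK_END)
            f.seek(max(0, f.tell() - 1024))  # read only the last 1 KiB
            return b'</mediawiki>' in f.read()
    except OSError:
        return False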
32 changes: 20 additions & 12 deletions listsofwikis/mediawiki/checkalive.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2012 WikiTeam
@@ -17,27 +17,35 @@

# Script to check if a list of wikis are alive or dead

import thread
import threading
import time
import sys
import urllib2
import exceptions
import urllib.request
from urllib.error import URLError, HTTPError
from http.server import BaseHTTPRequestHandler
import re

# Configuration
delay = 30 # Seconds before timing out on request
limit = 100

def printapi(api):
print api, 'is alive'
print (api, 'is alive')
open('wikisalive.txt', 'a', encoding='utf-8').write('%s\n' % api.strip())

def checkcore(api):
req = urllib2.Request(api, None)
req = urllib.request.Request(api, None)
try:
raw = urllib2.urlopen(req, None, delay).read()
except IOError: # http://docs.python.org/2/howto/urllib2.html#handling-exceptions
print api, 'is dead or has errors'
raw = urllib.request.urlopen(req, None, delay).read().decode('utf-8', errors='replace')  # decode bytes so the str regexes below work
except URLError as err: # https://docs.python.org/3/library/urllib.error.html
    if isinstance(err, HTTPError):
        # responses maps a status code to a (short message, long explanation) tuple
        shortmsg, longmsg = BaseHTTPRequestHandler.responses.get(err.code, ('Unknown', ''))
        print(api, 'is dead or has errors:')
        print('Error code %d: %s' % (err.code, shortmsg))
        print(longmsg)
        print('HTTP headers:\n%s' % err.headers)
    else:
        print(api, 'is dead or has errors:', err.reason)
return
# RSD is available since 1.17, bug 25648
rsd = re.search(r'(?:link rel="EditURI".+href=")(?:https?:)?(.+api.php)\?action=rsd', raw)
@@ -60,11 +68,11 @@ def checkcore(api):
index = domain.group(1) + login.group(1)
printapi(index)
else:
print api, 'is not a MediaWiki wiki'
print (api, 'is not a MediaWiki wiki')

def check(apis):
for api in apis:
thread.start_new_thread(checkcore, (api,))
threading.start_new_threading(checkcore, (api,))
time.sleep(0.1)
time.sleep(delay+1)

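check() above starts one bare thread per API and paces them with sleeps. An alternative sketch using a bounded pool from the standard library, assuming checkcore as defined above; the worker count is illustrative:

from concurrent.futures import ThreadPoolExecutor

def check(apis, workers=10):
    # Run checkcore over all APIs with at most `workers` concurrent requests;
    # the executor waits for completion on exit, so no sleep-based pacing is needed.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        pool.map(checkcore, apis)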
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/miraheze-spider.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014-2017 WikiTeam developers
@@ -29,7 +29,7 @@ def main():
m = re.findall(r'<tr><td>(<del>)?<a href="https://([^>]+?)/">[^<]+</a>', raw)
m.sort()
for i in m:
print 'https://' + i[1] + '/w/api.php'
print ('https://' + i[1] + '/w/api.php')

if __name__ == '__main__':
main()
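A note on the ur'...' literals fixed throughout these spiders: Python 2 needed the u prefix to make a raw pattern Unicode, but in Python 3 ur'...' is a SyntaxError and str is already Unicode, so a plain raw string suffices. A small self-contained example; the input string is illustrative:

import re

raw = '<tr><td><a href="https://wiki.example.org/">Example</a>'  # illustrative input
# Python 2: re.findall(ur'<a href="https://([^>]+?)/">', raw)
# Python 3:
m = re.findall(r'<a href="https://([^>]+?)/">', raw)
print(m)  # ['wiki.example.org']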
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/neoseeker-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014-2017 WikiTeam developers
@@ -29,7 +29,7 @@ def main():
m = re.findall(r'<li><a href=\'([^>]+?)/wiki/\'>', raw)
m.sort()
for i in m:
print i + '/w/api.php'
print (i + '/w/api.php')

if __name__ == '__main__':
main()
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/orain-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014 WikiTeam developers
@@ -28,7 +28,7 @@ def main():
raw = r.text
m = re.findall(r'<tr><td><a href="//([^>]+?)/">[^<]+</a></td></tr>', raw)
for i in m:
print 'http://' + i + '/w/api.php'
print ('http://' + i + '/w/api.php')

if __name__ == '__main__':
main()
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/referata-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014 WikiTeam developers
@@ -33,7 +33,7 @@ def main():

m = re.findall(r'(?im)<h3 class="r"><a href=\"([^ ]+?)" onmouse', raw)
for i in m:
print i
print (i)

if re.search(r'id="ofr"', raw): # omitted results, end of list
break
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/shoutwiki-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014 WikiTeam developers
@@ -41,7 +41,7 @@ def main():
for site in jsonsites['query']['listwikis']:
siteid = int(site['id'])
siteurl = site['url']
print siteurl
print (siteurl)

if len(jsonsites['query']['listwikis']) == int(swlimit):
#there are more
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/wiki-site-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014 WikiTeam developers
@@ -32,7 +32,7 @@ def main():
raw = r.text
m = re.findall(r'<td><a href="([^>]+?)"', raw)
for i in m:
print i
print (i)

if __name__ == '__main__':
main()
