Port to Python 3 (including poster and wikitools)
Signed-off-by: Elsie Hupp <9206310+elsiehupp@users.noreply.github.com>
elsiehupp committed Jun 9, 2021
1 parent 0cfde9e commit de976dd
Showing 49 changed files with 4,571 additions and 2,379 deletions.
10 changes: 10 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,10 @@
{
"python.analysis.extraPaths": [
".",
"./poster",
"./tests",
"./wikitools",
"/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages"
],
"python.pythonPath": "/usr/local/bin/python3.8"
}
458 changes: 226 additions & 232 deletions dumpgenerator.py

Large diffs are not rendered by default.

32 changes: 15 additions & 17 deletions gui.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2012 WikiTeam
@@ -19,10 +19,8 @@
import platform
import random
import re
from Tkinter import *
import ttk
import tkMessageBox
import thread
from tkinter import Tk, ttk, messagebox, Label, W, E, N, S, LabelFrame, Entry, StringVar, OptionMenu, Button, Text, Scrollbar, LEFT, SUNKEN, Menu
import threading
import time
import urllib.request
import webbrowser
@@ -102,7 +100,7 @@ def __init__(self, master):
self.optionmenu11var.set("api.php")
self.optionmenu11 = OptionMenu(self.labelframe11, self.optionmenu11var, self.optionmenu11var.get(), "index.php")
self.optionmenu11.grid(row=0, column=2)
self.button11 = Button(self.labelframe11, text="Check", command=lambda: thread.start_new_thread(self.checkURL, ()), width=5)
self.button11 = Button(self.labelframe11, text="Check", command=lambda: threading.Thread(target=self.checkURL).start(), width=5)
self.button11.grid(row=0, column=3)
#batch download labelframe
self.label12 = Label(self.labelframe12, text="Wiki URLs:")
@@ -174,12 +172,12 @@ def __init__(self, master):
self.tree.heading('status', text='Status')
self.tree.grid(row=2, column=0, columnspan=9, sticky=W+E+N+S)
[self.tree.heading(column, text=column, command=lambda: self.treeSortColumn(column=column, reverse=False)) for column in columns]
#self.tree.bind("<Double-1>", (lambda: thread.start_new_thread(self.downloadDump, ())))
#self.tree.bind("<Double-1>", (lambda: threading.Thread(target=self.downloadDump).start()))
self.tree.tag_configure('downloaded', background='lightgreen')
self.tree.tag_configure('nodownloaded', background='white')
self.button21 = Button(self.frame2, text="Load available dumps", command=lambda: thread.start_new_thread(self.loadAvailableDumps, ()), width=15)
self.button21 = Button(self.frame2, text="Load available dumps", command=lambda: threading.Thread(target=self.loadAvailableDumps).start(), width=15)
self.button21.grid(row=3, column=0)
self.button23 = Button(self.frame2, text="Download selection", command=lambda: thread.start_new_thread(self.downloadDump, ()), width=15)
self.button23 = Button(self.frame2, text="Download selection", command=lambda: threading.Thread(target=self.downloadDump).start(), width=15)
self.button23.grid(row=3, column=4)
self.button22 = Button(self.frame2, text="Clear list", command=self.deleteAvailableDumps, width=10)
self.button22.grid(row=3, column=8, columnspan=2)
@@ -213,7 +211,7 @@ def __init__(self, master):
#end menu

def blocked(self):
tkMessageBox.showerror("Error", "There is a task in progress. Please, wait.")
messagebox.showerror("Error", "There is a task in progress. Please, wait.")

def checkURL(self):
if re.search(r"(?im)^https?://[^/]+\.[^/]+/", self.entry11.get()): # well-formed URL? needs at least one dot: aaaaa.com, but bb.aaaaa.com is allowed too
@@ -234,7 +232,7 @@ def checkURL(self):
self.entry11.config(background='red')
self.msg('index.php is incorrect!', level='error')
else:
tkMessageBox.showerror("Error", "You have to write a correct api.php or index.php URL.")
messagebox.showerror("Error", "You have to write a correct api.php or index.php URL.")

def sumSizes(self, sizes):
total = 0
@@ -266,15 +264,15 @@ def run(self):
dumpgenerator.main(params=params)
#check dump
"""
"""

def msg(self, msg='', level=''):
levels = { 'ok': 'lightgreen', 'warning': 'yellow', 'error': 'red' }
if level.lower() in levels:
print '%s: %s' % (level.upper(), msg)
print ('%s: %s' % (level.upper(), msg))
self.status.config(text='%s: %s' % (level.upper(), msg), background=levels[level.lower()])
else:
print msg
print (msg)
self.status.config(text=msg, background='grey')

def treeSortColumn(self, column, reverse=False):
@@ -326,7 +324,7 @@ def downloadDump(self, event=None):
else:
self.msg('Problems in %d dumps. Downloaded %d of %d (and %d were previously downloaded).' % (len(items)-(c+d), c, len(items), d), level='error')
else:
tkMessageBox.showerror("Error", "You have to select some dumps to download.")
messagebox.showerror("Error", "You have to select some dumps to download.")
self.clearAvailableDumps()
self.showAvailableDumps()
self.filterAvailableDumps()
@@ -410,7 +408,7 @@ def loadAvailableDumps(self):
wikifarms_r = re.compile(r"(%s)" % ('|'.join(wikifarms.keys())))
c = 0
for mirror, url, regexp in self.urls:
print 'Loading data from', mirror, url
print ('Loading data from', mirror, url)
self.msg(msg='Please wait... Loading data from %s %s' % (mirror, url))
f = urllib.request.urlopen(url)
m = re.compile(regexp).finditer(f.read().decode('utf-8', errors='replace'))
@@ -452,7 +450,7 @@ def callback(self):
self.msg("Feature not implemented for the moment. Contributions are welcome.", level='warning')

def askclose():
if tkMessageBox.askokcancel("Quit", "Do you really wish to exit?"):
if messagebox.askokcancel("Quit", "Do you really wish to exit?"):
root.destroy()

if __name__ == "__main__":
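For reference: Python 3 removed the thread module, and threading has no start_new_thread equivalent under that name, which is why the button callbacks above now build a threading.Thread. A minimal sketch of the migration; the task function and its argument are illustrative, not part of gui.py:

import threading

def task(name):
    # Placeholder for work such as self.checkURL or self.downloadDump above.
    print('running', name)

# Python 2 idiom (removed above): thread.start_new_thread(task, ('demo',))
# Python 3 equivalent:
worker = threading.Thread(target=task, args=('demo',), daemon=True)
worker.start()
worker.join()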
34 changes: 17 additions & 17 deletions launcher.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2016 WikiTeam
@@ -28,17 +28,17 @@

def main():
if len(sys.argv) < 2:
print 'python script.py file-with-apis.txt'
print ('python script.py file-with-apis.txt')
sys.exit()

print 'Reading list of APIs from', sys.argv[1]
print ('Reading list of APIs from', sys.argv[1])
wikis = open(sys.argv[1], 'r').read().splitlines()
print '%d APIs found' % (len(wikis))
print ('%d APIs found' % (len(wikis)))

for wiki in wikis:
print "#"*73
print "# Downloading", wiki
print "#"*73
print ("#"*73)
print ("# Downloading", wiki)
print ("#"*73)
wiki = wiki.lower()
# Build the prefix in the standard way; api and index must both be defined, it doesn't matter which is which
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
@@ -52,17 +52,17 @@ def main():
break # stop searching, do not explore subdirectories

if compressed:
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
print ('Skipping... This wiki was downloaded and compressed before in', zipfilename)
# Get the archive's file list.
if ( ( ( sys.version_info[0] == 3 ) and ( sys.version_info[1] > 0 ) ) or ( ( sys.version_info[0] == 2 ) and ( sys.version_info[1] > 6 ) ) ):
archivecontent = subprocess.check_output(['7z', 'l', zipfilename]).decode('utf-8', errors='replace')
if re.search(r"%s.+-history\.xml" % (prefix), archivecontent) is None:
# We should perhaps not create an archive in this case, but we continue anyway.
print "ERROR: The archive contains no history!"
print ("ERROR: The archive contains no history!")
if re.search(r"Special:Version\.html", archivecontent) is None:
print "WARNING: The archive doesn't contain Special:Version.html, this may indicate that download didn't finish."
print ("WARNING: The archive doesn't contain Special:Version.html, this may indicate that download didn't finish.")
else:
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
print ("WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+.")
# TODO: Find a way like grep -q below without doing a 7z l multiple times?
continue

@@ -81,10 +81,10 @@ def main():
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
# typically they don't provide any crawl-delay value in their robots.txt).
if started and wikidir: #then resume
print 'Resuming download, using directory', wikidir
subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
print ('Resuming download, using directory', wikidir)
subprocess.call(['python3', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
else: #download from scratch
subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
subprocess.call(['python3', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
started = True
#save wikidir now
for f in os.listdir('.'):
@@ -98,7 +98,7 @@ def main():
finished = False
if started and wikidir and prefix:
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
print "No </mediawiki> tag found: dump failed, needs fixing; resume didn't work. Exiting."
print ("No </mediawiki> tag found: dump failed, needs fixing; resume didn't work. Exiting.")
else:
finished = True
# You can also issue this on your working directory to find all incomplete dumps:
@@ -108,7 +108,7 @@ def main():
if finished:
time.sleep(1)
os.chdir(wikidir)
print 'Changed directory to', os.getcwd()
print ('Changed directory to', os.getcwd())
# Basic integrity check for the XML. The script doesn't act on these counts, so inspect them yourself; nothing can be done about a broken dump anyway except redownloading.
subprocess.call('grep "<title>" *.xml -c;grep "<page>" *.xml -c;grep "</page>" *.xml -c;grep "<revision>" *.xml -c;grep "</revision>" *.xml -c', shell=True)
# Make a non-solid archive with all the text and metadata at default compression. You can also add config.txt if you don't care about your computer and user names being published or you don't use full paths so that they're not stored in it.
@@ -123,7 +123,7 @@ def main():
subprocess.call('7z' + ' a -ms=off -mx=1 ../%s-wikidump.7z.tmp %s-images.txt images/' % (prefix, prefix), shell=True)
subprocess.call('mv' + ' ../%s-wikidump.7z.tmp ../%s-wikidump.7z' % (prefix, prefix), shell=True)
os.chdir('..')
print 'Changed directory to', os.getcwd()
print ('Changed directory to', os.getcwd())
time.sleep(1)

if __name__ == "__main__":
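The completeness check above shells out to tail and grep. A pure-Python sketch of the same check, assuming the wikidir/prefix-history.xml layout launcher.py uses; the function name is illustrative:

import os

def dump_is_complete(wikidir, prefix):
    # Look for the closing </mediawiki> tag near the end of the dump,
    # mirroring the tail -n 1 | grep -q "</mediawiki>" shell pipeline above.
    path = os.path.join(wikidir, '%s-history.xml' % prefix)
    try:
        with open(path, 'rb') as f:
            f.seek(0, os.SEEK_END)
            f.seek(max(0, f.tell() - 1024))  # read only the last 1 KiB
            return b'</mediawiki>' in f.read()
    except OSError:
        return False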
32 changes: 20 additions & 12 deletions listsofwikis/mediawiki/checkalive.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2012 WikiTeam
@@ -17,27 +17,35 @@

# Script to check if a list of wikis are alive or dead

import thread
import threading
import time
import sys
import urllib2
import exceptions
import urllib.request
from urllib.error import URLError, HTTPError
from http.server import BaseHTTPRequestHandler
import re

# Configuration
delay = 30 # Seconds before timing out on request
limit = 100

def printapi(api):
print api, 'is alive'
print (api, 'is alive')
open('wikisalive.txt', 'a', encoding='utf-8').write('%s\n' % api.strip())

def checkcore(api):
req = urllib2.Request(api, None)
req = urllib.request.Request(api, None)
try:
raw = urllib2.urlopen(req, None, delay).read()
except IOError: # http://docs.python.org/2/howto/urllib2.html#handling-exceptions
print api, 'is dead or has errors'
raw = urllib.request.urlopen(req, None, delay).read().decode('utf-8', errors='replace')  # decode bytes so the str regexes below work
except URLError as err: # https://docs.python.org/3/library/urllib.error.html
    if isinstance(err, HTTPError):
        # responses maps a status code to a (short message, long explanation) tuple
        shortmsg, longmsg = BaseHTTPRequestHandler.responses.get(err.code, ('Unknown', ''))
        print(api, 'is dead or has errors:')
        print('Error code %d: %s' % (err.code, shortmsg))
        print(longmsg)
        print('HTTP headers:\n%s' % err.headers)
    else:
        print(api, 'is dead or has errors:', err.reason)
return
# RSD is available since 1.17, bug 25648
rsd = re.search(r'(?:link rel="EditURI".+href=")(?:https?:)?(.+api.php)\?action=rsd', raw)
@@ -60,11 +68,11 @@ def checkcore(api):
index = domain.group(1) + login.group(1)
printapi(index)
else:
print api, 'is not a MediaWiki wiki'
print (api, 'is not a MediaWiki wiki')

def check(apis):
for api in apis:
thread.start_new_thread(checkcore, (api,))
threading.start_new_threading(checkcore, (api,))
time.sleep(0.1)
time.sleep(delay+1)

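check() above starts one bare thread per API and paces them with sleeps. An alternative sketch using a bounded pool from the standard library, assuming checkcore as defined above; the worker count is illustrative:

from concurrent.futures import ThreadPoolExecutor

def check(apis, workers=10):
    # Run checkcore over all APIs with at most `workers` concurrent requests;
    # the executor waits for completion on exit, so no sleep-based pacing is needed.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        pool.map(checkcore, apis)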
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/miraheze-spider.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014-2017 WikiTeam developers
@@ -29,7 +29,7 @@ def main():
m = re.findall(r'<tr><td>(<del>)?<a href="https://([^>]+?)/">[^<]+</a>', raw)
m.sort()
for i in m:
print 'https://' + i[1] + '/w/api.php'
print ('https://' + i[1] + '/w/api.php')

if __name__ == '__main__':
main()
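A note on the ur'...' literals fixed throughout these spiders: Python 2 needed the u prefix to make a raw pattern Unicode, but in Python 3 ur'...' is a SyntaxError and str is already Unicode, so a plain raw string suffices. A small self-contained example; the input string is illustrative:

import re

raw = '<tr><td><a href="https://wiki.example.org/">Example</a>'  # illustrative input
# Python 2: re.findall(ur'<a href="https://([^>]+?)/">', raw)
# Python 3:
m = re.findall(r'<a href="https://([^>]+?)/">', raw)
print(m)  # ['wiki.example.org']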
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/neoseeker-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014-2017 WikiTeam developers
@@ -29,7 +29,7 @@ def main():
m = re.findall(r'<li><a href=\'([^>]+?)/wiki/\'>', raw)
m.sort()
for i in m:
print i + '/w/api.php'
print (i + '/w/api.php')

if __name__ == '__main__':
main()
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/orain-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014 WikiTeam developers
@@ -28,7 +28,7 @@ def main():
raw = r.text
m = re.findall(r'<tr><td><a href="//([^>]+?)/">[^<]+</a></td></tr>', raw)
for i in m:
print 'http://' + i + '/w/api.php'
print ('http://' + i + '/w/api.php')

if __name__ == '__main__':
main()
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/referata-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014 WikiTeam developers
@@ -33,7 +33,7 @@ def main():

m = re.findall(r'(?im)<h3 class="r"><a href=\"([^ ]+?)" onmouse', raw)
for i in m:
print i
print (i)

if re.search(r'id="ofr"', raw): # omitted results, end of list
break
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/shoutwiki-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014 WikiTeam developers
@@ -41,7 +41,7 @@ def main():
for site in jsonsites['query']['listwikis']:
siteid = int(site['id'])
siteurl = site['url']
print siteurl
print (siteurl)

if len(jsonsites['query']['listwikis']) == int(swlimit):
#there are more
4 changes: 2 additions & 2 deletions listsofwikis/mediawiki/wiki-site-spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2014 WikiTeam developers
@@ -32,7 +32,7 @@ def main():
raw = r.text
m = re.findall(r'<td><a href="([^>]+?)"', raw)
for i in m:
print i
print (i)

if __name__ == '__main__':
main()
