#!/usr/bin/env python3.6
import argparse
import configparser
import re
import sys
import textwrap
import time

import mwclient
import mwparserfromhell
import twitter
import wayback
from datetime import datetime
from dateutil.parser import parse
from itertools import tee, islice, zip_longest
from twitter.error import TwitterError
def call_home(site):
    """Check the on-wiki kill switch; return False if the bot has been disabled."""
    page = site.Pages['User:TweetCiteBot/status']
    text = page.text()
    if "false" in text.lower():
        return False
    return True
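# For example, a minimal run guard built on call_home() might look like this
# (a sketch only; the Site construction mirrors main() below):
#
#   site = mwclient.Site(('https', 'en.wikipedia.org'), '/w/')
#   if not call_home(site):
#       sys.exit("Kill switch engaged on-wiki; stopping.")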
def allow_bots(text, user):
    """Return True if the {{bots}}/{{nobots}} exclusion templates on a page permit `user` to edit."""
    user = user.lower().strip()
    text = mwparserfromhell.parse(text)
    for tl in text.filter_templates():
        if tl.name.matches(('bots', 'nobots')):
            break
    else:
        return True
    for param in tl.params:
        bots = [x.lower().strip() for x in param.value.split(",")]
        if param.name == 'allow':
            if ''.join(bots) == 'none':
                return False
            for bot in bots:
                if bot in (user, 'all'):
                    return True
        elif param.name == 'deny':
            if ''.join(bots) == 'none':
                return True
            for bot in bots:
                if bot in (user, 'all'):
                    return False
    return True
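# A quick illustration with hypothetical wikitext (not from any live page):
#
#   allow_bots("{{bots|deny=TweetCiteBot}}", "TweetCiteBot")   # -> False
#   allow_bots("{{bots|allow=none}}", "TweetCiteBot")          # -> False
#   allow_bots("no exclusion template here", "TweetCiteBot")   # -> True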
def get_next_iter_item(some_iterable, window=1):
    """
    Pair each item with the item that comes `window` positions later (None once exhausted).
    This is useful for finding {{dead link}} templates that follow a citation.
    Adapted from an answer to a StackOverflow question by user nosklo:
    https://stackoverflow.com/questions/4197805/python-for-loop-look-ahead/4197869#4197869
    @param some_iterable Thing to iterate over
    @param window How far to look ahead
    """
    items, nexts = tee(some_iterable, 2)
    nexts = islice(nexts, window, None)
    return zip_longest(items, nexts)
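# For example:
#
#   list(get_next_iter_item(['a', 'b', 'c']))
#   # -> [('a', 'b'), ('b', 'c'), ('c', None)]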
def save_edit(page, utils, text):
    """Convert tweet citations on `page` and save the result, retrying once on an edit error."""
    config, api, site, archive_urls, dry_run = utils
    original_text = text
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates():
        if template.name.matches("nobots") or template.name.matches("Wikipedia:Exclusion compliant"):
            if template.has("allow"):
                if "TweetCiteBot" in template.get("allow").value:
                    break  # explicitly allowed; can edit
            print("\n\nPage editing blocked as a template preventing the edit is present.\n\n")
            return
    if not call_home(site):
        raise ValueError("Kill switch on-wiki is false. Terminating program.")
    edit_summary = ("Converted Tweet URLs to [[Template:Cite tweet|{{cite tweet}}]] using "
                    "[[User:" + config.get('enwiki', 'username') + "|" + config.get('enwiki', 'username')
                    + "]]-PyEdition. Mistake? [[User talk:TheSandDoctor|msg TSD!]] "
                    "(please mention that this is the PyEdition!)")
    attempt = 0
    while True:
        if attempt == 0:
            text = page.text()
        if attempt == 1:
            # An edit error occurred on the previous attempt: purge and re-fetch the page.
            page.purge()
            original_text = site.Pages[page.page_title].text()
        content_changed, text = convert(original_text, dry_run, api, archive_urls)
        try:
            if dry_run:
                print("Dry run")
                # Write out the initial input
                text_file = open("Input02.txt", "w")
                text_file.write(original_text)
                text_file.close()
                # Write out the output
                if content_changed:
                    text_file = open("Output02.txt", "w")
                    text_file.write(text)
                    text_file.close()
                else:
                    print("Content not changed; no output to write")
                break
            else:
                if verbose:
                    print("LIVE run")
                if not content_changed:
                    if verbose:
                        print("Content not changed, don't bother pushing edit to server")
                    break
                page.save(text, summary=edit_summary, bot=True, minor=True)
                print("Saved page")
                if attempt == 1:
                    attempt = 0
                break
        except mwclient.errors.EditError:
            print("Error")
            attempt = 1
            time.sleep(5)  # sleep for 5 seconds, giving the server some time before querying again
            continue
        except mwclient.errors.ProtectedPageError as e:
            print('Could not edit ' + page.page_title + ' due to protection')
            print(e)
            break
def convert(text, dry_run, api, archive_urls):
    """
    Converts uses of {{cite web}} for tweets (if present) to {{cite tweet}}.
    @param text Page text to go over
    @param dry_run boolean Whether or not this is a dry run (dry run = no live edit)
    @param api Twitter API instance
    @param archive_urls boolean Whether to add Wayback archive URLs to live tweet links
    @returns [content_changed, content] Whether content was changed,
        and the (if the former is true, modified) content.
    """
    content_changed = False
    if dry_run:
        # Dry run: keep a copy of the untouched input for later comparison.
        text_file = open("Input.txt", "w")
        text_file.write(text)
        text_file.close()
    code = mwparserfromhell.parse(text)
    use_mdy = False
    # Known aliases/redirects of {{cite web}} on the English Wikipedia.
    cite_web_aliases = ("cite web", "citeweb", "c web", "cita web", "weblink", "ref web",
                        "citweb", "cw", "web cite", "web citation", "cite w", "cit web",
                        "cite url", "cite blog", "cite web.", "cite webpage",
                        "web reference", "web-reference", "cite wb", "cite we",
                        "citat web", "cite-web", "webbref", "cite website",
                        "cite website article", "chú thích web",
                        "citace elektronické monografie", "citeer web", "یادکرد وب",
                        "웹 인용", "cite web/lua")
    for template, next_template in get_next_iter_item(code.filter_templates()):
        if (template.name.matches("use mdy dates") or template.name.matches("mdy")
                or template.name.matches("use mdy") or template.name.matches("usemdy")
                or template.name.matches("usemdydates")):
            if verbose:
                print("Use MDY format")
            use_mdy = True
        # If no date-format template is present, DMY format is assumed by default.
if (template.name.matches("cite web") or template.name.matches("citeweb")
or template.name.matches("c web") or template.name.matches("cita web")
or template.name.matches("weblink") or template.name.matches("ref web")
or template.name.matches("citweb") or template.name.matches("cw")
or template.name.matches("web cite") or template.name.matches("web citation")
or template.name.matches("cite w") or template.name.matches("cit web")
or template.name.matches("cite url") or template.name.matches("cite blog")
or template.name.matches("cite web.") or template.name.matches("cite webpage")
or template.name.matches("web reference") or template.name.matches("web-reference")
or template.name.matches("cite wb") or template.name.matches("cite we")
or template.name.matches("citat web") or template.name.matches("cite-web")
or template.name.matches("webbref") or template.name.matches("cite website")
or template.name.matches("cite website article") or template.name.matches("chú thích web")
or template.name.matches("citace elektronické monografie") or template.name.matches("citeer web")
or template.name.matches("یادکرد وب") or template.name.matches("웹 인용")
or template.name.matches("cite web/lua")):
if template.has("url"):
url = template.get("url").value
match = re.match(r'(?:(?:\s)*https?:\/\/)?(?:www\.)?(?:\s)*?twitter\.com\/(?:#!\/)?@?([^\/\?\s]*)\/status\/([{\d+:\d+]+)',str(url))
                if match:  # it is a Twitter URL
                    if next_template:
                        if next_template.name.matches("dead link"):  # TODO: expand to cover variations/aliases of {{dead link}}
                            # Play it safe and leave this template alone: the next template
                            # tags the link as dead, so converting would be for nothing.
                            print("FOUND DEADLINK......SKIPPING!")
                            continue
                    try:
                        tweet = api.GetStatus(match.group(2))
                        if tweet:
                            has_archive_url = False
                            content_changed = True
                            if verbose:
                                print(match.group(0))
                            # Strip t.co wrapper URLs and newlines from the tweet text.
                            url_reg = r'(?:\s*https?:\/\/)?(?:www\.)?\s*t\.co\/[a-zA-Z0-9]*'
                            sec_pattern = r'[\r\n]'
                            cleaned = re.sub(url_reg, '', tweet.text)
                            cleaned = re.sub(sec_pattern, ' ', cleaned)
                            tweet_text = textwrap.shorten(cleaned, width=40, placeholder="...")
                            if verbose:
                                print(tweet_text)
                            tweet_obj = ("{{cite tweet|number=" + str(match.group(2))
                                         + "|user=" + tweet.user.screen_name + "|title=" + tweet_text)
if template.has("accessdate") or template.has("access-date"):
#tweet_accessdate = template.get("accessdate").value
tweet_obj += "|accessdate=" + str(template.get("accessdate").value)
if verbose:
print("Has accessdate")
if template.has("archivedate") or template.has("archive-date"):
if verbose:
print("Has archive date")
# tweet_archivedate = template.get("archivedate").value
tweet_obj += "|archivedate=" + str(template.get("archivedate").value)
if template.has("language"):
#tweet_language = template.get("language").value
tweet_obj += "|language=" + str(template.get("language").value)
if verbose:
print("Has language")
if template.has("archiveurl"):
has_archive_url = True
#tweet_archiveurl = template.get("archiveurl").value
tweet_obj += "|archiveurl=" + str(template.get("archiveurl").value)
if verbose:
print("Has archiveurl")
if template.has("date"):
#tweet_date = template.get("date").value
tweet_obj += "|date=" + str(template.get("date").value)
if verbose:
print("Has date")
else:
#For reference: http://strftime.org
date_format = '%-d %B %Y'
if use_mdy:
date_format = '%B %-d, %Y'
tweet_obj += "|date=" + parse(tweet.created_at).strftime(date_format)
                            if not has_archive_url and archive_urls:
                                # No archive link in the original citation: ask the
                                # Wayback Machine for the closest snapshot.
                                wb = wayback.Wayback()
                                archive_url = wb.closest(str(url))
                                print(archive_url)
                                if archive_url:
                                    tweet_obj += "|archive-url=" + archive_url
                                    tweet_obj += "|archivedate=" + datetime.now().strftime('%B %Y')
                            tweet_obj += "}}"
                            code.replace(template, tweet_obj)
                            content_changed = True
                    except TwitterError as err:
                        # The tweet could not be fetched (likely deleted): fall back to
                        # the Wayback Machine and tag the citation as a dead link.
                        print("Clearly something went wrong with tweet " + str(err))
                        wb = wayback.Wayback()
                        archive_url = wb.closest(str(url))
                        print(archive_url)
                        date_format = '%B %Y'
                        if archive_url:
                            code.replace(template, str(template) + "{{dead link|date=" + datetime.now().strftime(date_format)
                                         + "|url=" + archive_url + "|bot=TweetCiteBot}}")
                            content_changed = True
                        else:
                            code.replace(template, str(template) + "{{dead link|date=" + datetime.now().strftime(date_format)
                                         + "|fix-attempted=yes|bot=TweetCiteBot}}")
                            content_changed = True
    return [content_changed, str(code)]  # hand back the text to save
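# An illustration of the rewrite convert() performs (a hypothetical citation,
# not taken from a live article):
#
#   before: {{cite web|url=https://twitter.com/example/status/123456|accessdate=1 May 2017}}
#   after:  {{cite tweet|number=123456|user=example|title=Example tweet text...|accessdate=1 May 2017|date=30 April 2017}}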
def getList():
    """Read the worklist of article titles (one per line), skipping blank lines."""
    with open("list of all articles containing links to tweets (unmarked up).txt", 'r') as f:
        return [line for line in f.read().split('\n') if line != ""]
def main():
    global verbose  # convert() and save_edit() consult this flag at module scope
    limited_run = True
    pages_to_run = 4727
    offset = 4721
    archive_urls = False
    dry_run = False
    verbose = False
    parser = argparse.ArgumentParser(prog='TweetCiteBot Tweet URL conversion', description='''Reads {{cite web}} templates
on articles looking for url parameters containing Tweet URLs. If found, converts the template to {{cite tweet}} and retrieves
the relevant information (if possible). If the Tweet is a dead link, attempts recovery via the Wayback Machine and tags the
citation accordingly on-wiki. This task was approved by the English Wikipedia Bot Approvals Group at 17:59, 2 December 2017
(UTC) by BAG admin User:cyberpower678''')
parser.add_argument("-dr", "--dryrun", help="perform a dry run (don't actually edit)",
action="store_true")
parser.add_argument("-arch","--archive", help="actively archive Tweet links (even if still live links)",
action="store_true")
parser.add_argument("-v","--verbose", help="Display more information when running",
action="store_true")
args = parser.parse_args()
if args.dryrun:
dry_run = True
print("Dry run")
if args.archive:
print("Archive allow")
archive_urls = True
if args.verbose:
print("Verbose mode")
verbose = True
    site = mwclient.Site(('https', 'en.wikipedia.org'), '/w/')
    config = configparser.RawConfigParser()
    config.read('credentials.txt')
    try:
        site.login(config.get('enwiki', 'username'), config.get('enwiki', 'password'))
    except mwclient.errors.LoginError as e:
        print(e)
        raise ValueError("Login failed.")
    api = twitter.Api(consumer_key='CUST_KEY',
                      consumer_secret='CUST_SECRET',
                      access_token_key='TOK_KEY',
                      access_token_secret='TOK_SECRET')
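    # CUST_KEY/CUST_SECRET/TOK_KEY/TOK_SECRET above are placeholders. A minimal
    # sketch (assuming a [twitter] section were added to credentials.txt) would
    # load them the same way as the wiki login:
    #
    #   api = twitter.Api(consumer_key=config.get('twitter', 'consumer_key'),
    #                     consumer_secret=config.get('twitter', 'consumer_secret'),
    #                     access_token_key=config.get('twitter', 'access_token_key'),
    #                     access_token_secret=config.get('twitter', 'access_token_secret'))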
    counter = 0
    utils = [config, api, site, archive_urls, dry_run]
    articles = getList()
    if limited_run:
        while counter < pages_to_run:
            if offset > 0:
                offset -= 1
                if verbose:
                    print("Skipped due to offset config")
                counter += 1
                continue
            print("Working with: " + articles[counter])
            page = site.Pages[articles[counter]]
            print(counter)
            text = page.text()
            try:
                save_edit(page, utils, text)
            except ValueError as err:
                print(err)
            counter += 1
if __name__ == "__main__":
    try:
        verbose = False
        main()
    except KeyboardInterrupt:
        print('Interrupted')
        sys.exit(0)