diff --git a/scripts/generate_links_list.py b/scripts/generate_links_list.py
new file mode 100644
index 0000000..8170047
--- /dev/null
+++ b/scripts/generate_links_list.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+import re
+
+import requests
+from bs4 import BeautifulSoup
+
+headerRegex = r"(#+) (.*)"
+# Matches markdown links; the negative lookbehind skips image links ("![alt](url)").
+linksRegex = r"(?<!!)\[([^\[]+)\]\(([^\)]+)\)"
+endFile = "# Recursos\n\n"
+index = "# Índice\n\n"
+content = "# Contenido\n"
+
+# URLs that raised exceptions on previous runs; they are all skipped.
+ignoreUrls = [
+    "https://cylonjs.com/",
+    "http://addyosmani.com/resources/essentialjsdesignpatterns/book/",
+    "https://www.campus.co/madrid/es/events",
+    "https://www.genbeta.com/web/error-404-not-found-historia-y-hazanas-de-este-mitico-codigo",
+    "https://jsonformatter.curiousconcept.com/",
+    "https://github.com/technoboy10/crossorigin.me",
+    "https://ponyfoo.com/articles/es6-generators-in-depth",
+    "https://ponyfoo.com/articles/es6-promises-in-depth",
+    "https://ponyfoo.com/articles/understanding-javascript-async-await",
+    "https://www.genbetadev.com/paradigmas-de-programacion/trabajar-con-microservicios-evitando-la-frustracion-de-las-enormes-aplicaciones-monoliticas",
+    "http://ashleynolan.co.uk/blog/frontend-tooling-survey-2015-results",
+    "http://blog.codinghorror.com/the-magpie-developer/",
+    "http://www.xataka.com/servicios/y-si-el-software-open-source-desapareciera",
+    "https://www.meetup.com/es-ES/Open-Source-Weekends/",
+    "https://es.wikipedia.org/wiki/Hoja_de_estilos_en_cascada",
+    "https://git-scm.com/",
+    "https://www.polymer-project.org/1.0/",
+    "http://web.archive.org",
+    "https://www.ecured.cu/Sentencias_(Programaci%C3%B3n",
+    "https://travis-ci.org/"
+]
+
+with open("manuscript/Book.txt") as f:
+    chapters = (line.rstrip() for line in f)
+    chapters = (chapter for chapter in chapters if chapter)
+    for chapterName in chapters:
+        with open("manuscript/" + chapterName) as chapter:
+            for line in chapter:
+
+                # Headers
+                headerMatch = re.search(headerRegex, line)
+                if headerMatch and headerMatch.group(1) in ("#", "##", "###"):
+                    if headerMatch.group(1) == "#":
+                        anchor = headerMatch.group(2).strip().lower().replace(" ", "-")
+                        index += "- **[" + headerMatch.group(2) + "](#" + anchor + ")**\n"
+
+                    # When the header itself is a link, keep only its text.
+                    headerLink = re.search(linksRegex, headerMatch.group(2))
+                    header = headerLink.group(1) if headerLink else headerMatch.group(2)
+
+                    content += "\n" + headerMatch.group(1) + "# " + header + "\n"
+
+                # Links
+                for link in re.findall(linksRegex, line):
+                    finalTitle = link[0]
+                    # Scrape the page title, unless the URL is known to fail.
+                    if link[1] not in ignoreUrls and not link[1].startswith("http://web.archive.org"):
+                        try:
+                            request = requests.get(link[1], verify=False)
+                            if request.status_code == 200:
+                                print("Current URL:", link[1])
+                                soup = BeautifulSoup(request.text, "html5lib")
+                                if soup.title and soup.title.string:
+                                    finalTitle = soup.title.string.strip().replace("\n", " ")
+                        except requests.exceptions.RequestException:
+                            print("-- PLEASE REMOVE:", link[1])
+                    else:
+                        print("IGNORED URL:", link[1])
+
+                    content += "- *[" + finalTitle + "](" + link[1] + ")*\n"
+
+        # End of chapter file
+        content += "\n\n"
+
+endFile += index + "\n\n" + content
+
+# Save results
+with open("extras/recursos.md", "w") as text_file:
+    text_file.write(endFile)
+print("extras/recursos.md updated!")
diff --git a/scripts/regex_fixer.py b/scripts/regex_fixer.py
new file mode 100644
index 0000000..d66af0d
--- /dev/null
+++ b/scripts/regex_fixer.py
@@ -0,0 +1,19 @@
+import os
+import re
+
+# Link patterns kept for later fixes (currently unused in this script).
+linkRegex = r"(?!_|\*).\[([^\[]+)\]\(([^\)]+)\)"
+linkBoldRegex = r"\[([^\[]+)\]\(([^\)]+)\)\*\*"
+# Attribute text that is stripped from every manuscript line.
+fixBugs = r"line-numbers=off, "
+
+for file in os.listdir("manuscript"):
+    if file.endswith(".txt"):
+        with open("manuscript/" + file) as f:
+            fileData = ""
+            for line in f:
+                fileData += re.sub(fixBugs, "", line.rstrip()) + "\n"
+
+        with open("manuscript/" + file, "w") as text_file:
+            text_file.write(fileData)
+print("Book updated!")
diff --git a/scripts/twitter_developers.py b/scripts/twitter_developers.py
new file mode 100644
index 0000000..3d9bcde
--- /dev/null
+++ b/scripts/twitter_developers.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+import requests
+from bs4 import BeautifulSoup
+
+developers_md = ""
+
+developers = ["AriyaHidayat", "BrendanEich", "DotProto", "John_Papa", "MatiasArriola", "PascalPrecht", "SachaGreif", "SimoAhava", "addyosmani", "alexnavasardyan", "amasad", "brianleroux", "codepo8", "codylindley", "codingcarlos", "davidwalshblog", "dejan_dimic", "dshaw", "ducntq", "elijahmanor", "erikthedev_", "firede", "garu_rj", "gavrisimo", "gibson042", "greybax", "idangazit", "jamsyoung", "jdalton", "jeresig", "jfroffice", "kahliltweets", "kentcdodds", "kom_256", "l0ckys", "ladyleet", "leobetosouza", "marcotrulla", "marocchino", "mathias", "mihaipaun", "nataliemac", "nicksalloum_", "okuryu", "os_weekends", "ossreleasefeed", "paul_irish", "rauschma", "rem", "remotesynth", "rmurphey", "roebuk", "rwaldron", "stephanlindauer", "tomdale", "trevnorris", "umaar", "wecodesign", "yotamofek"]
+
+for developer in developers:
+    url = "https://twitter.com/" + developer
+    request = requests.get(url)
+
+    print("status code: " + str(request.status_code))
+    if request.status_code == 200:
+        soup = BeautifulSoup(request.text, "html5lib")
+        print("url: " + url)
+
+        # Profile name and bio, as rendered in Twitter's profile header.
+        name = soup.find_all("a", {"class": "ProfileHeaderCard-nameLink"})
+        bio = soup.find_all("p", {"class": "ProfileHeaderCard-bio"})
+
+        if name and bio:
+            name = name[0].text
+            bio = bio[0].text
+
+            print("current: " + name)
+            print("bio: " + bio)
+
+            developers_md += "- **[" + name + "](" + url + ")**\n\n"
+            developers_md += "\t@" + developer + ": *" + bio + "*\n\n"
+    print("-------------------------------------")
+
+with open("developers.md", "w") as text_file:
+    text_file.write(developers_md)
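
As a quick sanity check for the link pattern shared by these scripts, the snippet below runs it against a few sample lines. This is a minimal sketch: the regex is the one from generate_links_list.py, while the sample strings and the helper name check_links_regex.py are invented for illustration.

    # check_links_regex.py -- hypothetical helper, not part of the diff above
    import re

    # Same pattern as in generate_links_list.py: the negative lookbehind
    # (?<!!) rejects image links such as "![alt](url)".
    linksRegex = r"(?<!!)\[([^\[]+)\]\(([^\)]+)\)"

    samples = [
        "Read [MDN](https://developer.mozilla.org/) first.",  # plain link
        "![logo](images/logo.png)",                           # image link
        "# [Título](https://example.com/)",                   # link inside a header
    ]

    for sample in samples:
        print(re.findall(linksRegex, sample))
    # [('MDN', 'https://developer.mozilla.org/')]
    # []
    # [('Título', 'https://example.com/')]

Each findall result is a (link text, URL) tuple, and the lookbehind is what keeps images out of extras/recursos.md: an image link simply produces no tuple.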