Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Scripts para facilitar el desarrollo del libro
- Expresiones regulares por todo el libro
- Generador del Anexo 2 (Twitter Scraper)
- Generador del Anexo de Recursos (Regex + Scraper)
- Loading branch information
1 parent
bfd62c0
commit 2b944ff
Showing
3 changed files
with
155 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# -*- coding: utf-8 -*- | ||
import os | ||
import re | ||
from bs4 import BeautifulSoup | ||
import requests | ||
|
||
headerRegex = r"(#+) (.*)" | ||
linksRegex = r"(?!!).\[([^\[]+)\]\(([^\)]+)\)" | ||
endFile = "# Recursos\n\n" | ||
index = "# Índice\n\n" | ||
content = "# Contenido\n" | ||
|
||
# URLs that generated exceptions last time... now all are skipped. | ||
ignoreUrls = [ | ||
"https://cylonjs.com/", | ||
"http://addyosmani.com/resources/essentialjsdesignpatterns/book/", | ||
"https://www.campus.co/madrid/es/events", | ||
"https://www.genbeta.com/web/error-404-not-found-historia-y-hazanas-de-este-mitico-codigo", | ||
"https://jsonformatter.curiousconcept.com/", | ||
"https://github.com/technoboy10/crossorigin.me", | ||
"https://ponyfoo.com/articles/es6-generators-in-depth", | ||
"https://ponyfoo.com/articles/es6-promises-in-depth", | ||
"https://ponyfoo.com/articles/understanding-javascript-async-await", | ||
"https://www.genbetadev.com/paradigmas-de-programacion/trabajar-con-microservicios-evitando-la-frustracion-de-las-enormes-aplicaciones-monoliticas", | ||
"http://ashleynolan.co.uk/blog/frontend-tooling-survey-2015-results", | ||
"http://blog.codinghorror.com/the-magpie-developer/", | ||
"http://www.xataka.com/servicios/y-si-el-software-open-source-desapareciera", | ||
"https://www.meetup.com/es-ES/Open-Source-Weekends/", | ||
"https://es.wikipedia.org/wiki/Hoja_de_estilos_en_cascada", | ||
"https://git-scm.com/", | ||
"https://www.polymer-project.org/1.0/", | ||
"http://web.archive.org", | ||
"https://www.ecured.cu/Sentencias_(Programaci%C3%B3n", | ||
"https://travis-ci.org/" | ||
] | ||
|
||
with open("manuscript/Book.txt") as f: | ||
lines = (line.rstrip() for line in f) | ||
lines = (line for line in lines if line) | ||
for line in lines: | ||
with open("manuscript/" + line) as chapter: | ||
fileContent = chapter.readlines() | ||
for line in fileContent: | ||
|
||
# Headers | ||
headerSearch = re.search(headerRegex, line) | ||
if headerSearch: | ||
headerMatch = re.search(headerRegex, line) | ||
if headerMatch.group(1) == "#" or headerMatch.group(1) == "##" or headerMatch.group(1) == "###": | ||
if headerMatch.group(1) == "#": | ||
index += "- **["+headerMatch.group(2)+"](#"+headerMatch.group(2).strip().lower().replace(" ", "-")+")**\n" | ||
|
||
headerLink = re.search(linksRegex, headerMatch.group(2)) | ||
header = "" | ||
|
||
if headerLink: | ||
header = headerLink.group(1) | ||
else: | ||
header = headerMatch.group(2) | ||
|
||
currentHeader = "\n" + headerMatch.group(1) + "# " + header | ||
content += currentHeader + "\n" | ||
|
||
# Links | ||
linksSearch = re.findall(linksRegex, line, flags=re.LOCALE) | ||
if linksSearch: | ||
for link in linksSearch: | ||
finalTitle = link[0] | ||
# scraping | ||
if not link[1] in ignoreUrls and re.match(r"^http:\/\/web\.archive\.org", link[1]) is None: | ||
try: | ||
request = requests.get(link[1], verify=False) | ||
if request.status_code == 200: | ||
print "Current URL:", link[1] | ||
#request = request.read() | ||
soup = BeautifulSoup(request.text, "html5lib") | ||
if soup.title: | ||
finalTitle = soup.title.string | ||
finalTitle = str(finalTitle.encode('utf-8')).strip().replace("\n", " ") | ||
except (requests.exceptions.SSLError, requests.exceptions.ConnectionError, requests.exceptions.MissingSchema): | ||
print "-- PLEASE REMOVE:", link[1] | ||
pass | ||
else: | ||
print "IGNORED URL:", link[1] | ||
|
||
content += "- *["+finalTitle+"]("+link[1]+")*\n" | ||
# End File | ||
content += "\n\n" | ||
|
||
endFile += index + "\n\n" + content | ||
|
||
# Save results | ||
text_file = open("extras/recursos.md", "w") | ||
text_file.write(endFile) | ||
text_file.close() | ||
print "extras/recursos.md updated!" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import os | ||
import re | ||
|
||
linkRegex = r"(?!_|\*).\[([^\[]+)\]\(([^\)]+)\)" | ||
linkBoldRegex = r"\[([^\[]+)\]\(([^\)]+)\)\*\*" | ||
fixBugs = r"line-numbers=off, " | ||
|
||
for file in os.listdir("manuscript"): | ||
if file.endswith(".txt"): | ||
with open("manuscript/" + file) as f: | ||
fileData = "" | ||
fileContent = f.readlines() | ||
for line in fileContent: | ||
line = re.sub(fixBugs, r'', line.rstrip()) | ||
fileData += line + "\n" | ||
|
||
text_file = open("manuscript/" + file, "w") | ||
text_file.write(fileData) | ||
text_file.close() | ||
print "Book updated!" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# -*- coding: utf-8 -*- | ||
from bs4 import BeautifulSoup | ||
import urllib | ||
|
||
def change_text(text): | ||
return text.encode('utf-8', 'ignore') | ||
|
||
developers_md = "" | ||
|
||
developers = ["AriyaHidayat", "BrendanEich", "DotProto", "John_Papa", "MatiasArriola", "PascalPrecht", "SachaGreif", "SimoAhava", "addyosmani", "alexnavasardyan", "amasad", "brianleroux", "codepo8", "codylindley", "codingcarlos", "davidwalshblog", "dejan_dimic", "dshaw", "ducntq", "elijahmanor", "erikthedev_", "firede", "garu_rj", "gavrisimo", "gibson042", "greybax", "idangazit", "jamsyoung", "jdalton", "jeresig", "jfroffice", "kahliltweets", "kentcdodds", "kom_256", "l0ckys", "ladyleet", "leobetosouza", "marcotrulla", "marocchino", "mathias", "mihaipaun", "nataliemac", "nicksalloum_", "okuryu", "os_weekends", "ossreleasefeed", "paul_irish", "rauschma", "rem", "remotesynth", "rmurphey", "roebuk", "rwaldron", "stephanlindauer", "tomdale", "trevnorris", "umaar", "wecodesign", "yotamofek"] | ||
|
||
for developer in developers: | ||
|
||
url = "https://twitter.com/" + developer | ||
request = urllib.urlopen(url) | ||
|
||
print "status code: " + str(request.getcode()) | ||
if request.getcode() == 200: | ||
request = request.read() | ||
soup = BeautifulSoup(request, "html5lib") | ||
print "url: " + url | ||
|
||
name = soup.findAll("a", { "class" : "ProfileHeaderCard-nameLink"}) | ||
bio = soup.findAll("p", { "class" : "ProfileHeaderCard-bio"}) | ||
|
||
if name and bio: | ||
name = name[0].text.encode('utf-8') | ||
bio = bio[0].text.encode('utf-8') | ||
|
||
print "current: " + name | ||
print "bio: " + bio | ||
|
||
developers_md += "- **["+ name +"]("+url+")**\n\n" | ||
developers_md += "\t@"+ developer+ ": *"+ bio +"*\n\n" | ||
print "-------------------------------------" | ||
|
||
text_file = open("developers.md", "w") | ||
text_file.write(developers_md) | ||
text_file.close() |