Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Scripts para facilitar el desarrollo del libro
- Expresiones regulares por todo el libro
- Generador del Anexo 2 (Twitter Scraper)
- Generador del Anexo de Recursos (Regex + Scraper)
- Loading branch information
1 parent
bfd62c0
commit 2b944ff
Showing
3 changed files
with
155 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# -*- coding: utf-8 -*- | ||
import os | ||
import re | ||
from bs4 import BeautifulSoup | ||
import requests | ||
|
||
headerRegex = r"(#+) (.*)" | ||
linksRegex = r"(?!!).\[([^\[]+)\]\(([^\)]+)\)" | ||
endFile = "# Recursos\n\n" | ||
index = "# Índice\n\n" | ||
content = "# Contenido\n" | ||
|
||
# URLs that generated exceptions last time... now all are skipped. | ||
ignoreUrls = [ | ||
"https://cylonjs.com/", | ||
"http://addyosmani.com/resources/essentialjsdesignpatterns/book/", | ||
"https://www.campus.co/madrid/es/events", | ||
"https://www.genbeta.com/web/error-404-not-found-historia-y-hazanas-de-este-mitico-codigo", | ||
"https://jsonformatter.curiousconcept.com/", | ||
"https://github.com/technoboy10/crossorigin.me", | ||
"https://ponyfoo.com/articles/es6-generators-in-depth", | ||
"https://ponyfoo.com/articles/es6-promises-in-depth", | ||
"https://ponyfoo.com/articles/understanding-javascript-async-await", | ||
"https://www.genbetadev.com/paradigmas-de-programacion/trabajar-con-microservicios-evitando-la-frustracion-de-las-enormes-aplicaciones-monoliticas", | ||
"http://ashleynolan.co.uk/blog/frontend-tooling-survey-2015-results", | ||
"http://blog.codinghorror.com/the-magpie-developer/", | ||
"http://www.xataka.com/servicios/y-si-el-software-open-source-desapareciera", | ||
"https://www.meetup.com/es-ES/Open-Source-Weekends/", | ||
"https://es.wikipedia.org/wiki/Hoja_de_estilos_en_cascada", | ||
"https://git-scm.com/", | ||
"https://www.polymer-project.org/1.0/", | ||
"http://web.archive.org", | ||
"https://www.ecured.cu/Sentencias_(Programaci%C3%B3n", | ||
"https://travis-ci.org/" | ||
] | ||
|
||
with open("manuscript/Book.txt") as f: | ||
lines = (line.rstrip() for line in f) | ||
lines = (line for line in lines if line) | ||
for line in lines: | ||
with open("manuscript/" + line) as chapter: | ||
fileContent = chapter.readlines() | ||
for line in fileContent: | ||
|
||
# Headers | ||
headerSearch = re.search(headerRegex, line) | ||
if headerSearch: | ||
headerMatch = re.search(headerRegex, line) | ||
if headerMatch.group(1) == "#" or headerMatch.group(1) == "##" or headerMatch.group(1) == "###": | ||
if headerMatch.group(1) == "#": | ||
index += "- **["+headerMatch.group(2)+"](#"+headerMatch.group(2).strip().lower().replace(" ", "-")+")**\n" | ||
|
||
headerLink = re.search(linksRegex, headerMatch.group(2)) | ||
header = "" | ||
|
||
if headerLink: | ||
header = headerLink.group(1) | ||
else: | ||
header = headerMatch.group(2) | ||
|
||
currentHeader = "\n" + headerMatch.group(1) + "# " + header | ||
content += currentHeader + "\n" | ||
|
||
# Links | ||
linksSearch = re.findall(linksRegex, line, flags=re.LOCALE) | ||
if linksSearch: | ||
for link in linksSearch: | ||
finalTitle = link[0] | ||
# scraping | ||
if not link[1] in ignoreUrls and re.match(r"^http:\/\/web\.archive\.org", link[1]) is None: | ||
try: | ||
request = requests.get(link[1], verify=False) | ||
if request.status_code == 200: | ||
print "Current URL:", link[1] | ||
#request = request.read() | ||
soup = BeautifulSoup(request.text, "html5lib") | ||
if soup.title: | ||
finalTitle = soup.title.string | ||
finalTitle = str(finalTitle.encode('utf-8')).strip().replace("\n", " ") | ||
except (requests.exceptions.SSLError, requests.exceptions.ConnectionError, requests.exceptions.MissingSchema): | ||
print "-- PLEASE REMOVE:", link[1] | ||
pass | ||
else: | ||
print "IGNORED URL:", link[1] | ||
|
||
content += "- *["+finalTitle+"]("+link[1]+")*\n" | ||
# End File | ||
content += "\n\n" | ||
|
||
endFile += index + "\n\n" + content | ||
|
||
# Save results | ||
text_file = open("extras/recursos.md", "w") | ||
text_file.write(endFile) | ||
text_file.close() | ||
print "extras/recursos.md updated!" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import os | ||
import re | ||
|
||
linkRegex = r"(?!_|\*).\[([^\[]+)\]\(([^\)]+)\)" | ||
linkBoldRegex = r"\[([^\[]+)\]\(([^\)]+)\)\*\*" | ||
fixBugs = r"line-numbers=off, " | ||
|
||
for file in os.listdir("manuscript"): | ||
if file.endswith(".txt"): | ||
with open("manuscript/" + file) as f: | ||
fileData = "" | ||
fileContent = f.readlines() | ||
for line in fileContent: | ||
line = re.sub(fixBugs, r'', line.rstrip()) | ||
fileData += line + "\n" | ||
|
||
text_file = open("manuscript/" + file, "w") | ||
text_file.write(fileData) | ||
text_file.close() | ||
print "Book updated!" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# -*- coding: utf-8 -*- | ||
from bs4 import BeautifulSoup | ||
import urllib | ||
|
||
def change_text(text): | ||
return text.encode('utf-8', 'ignore') | ||
|
||
developers_md = "" | ||
|
||
developers = ["AriyaHidayat", "BrendanEich", "DotProto", "John_Papa", "MatiasArriola", "PascalPrecht", "SachaGreif", "SimoAhava", "addyosmani", "alexnavasardyan", "amasad", "brianleroux", "codepo8", "codylindley", "codingcarlos", "davidwalshblog", "dejan_dimic", "dshaw", "ducntq", "elijahmanor", "erikthedev_", "firede", "garu_rj", "gavrisimo", "gibson042", "greybax", "idangazit", "jamsyoung", "jdalton", "jeresig", "jfroffice", "kahliltweets", "kentcdodds", "kom_256", "l0ckys", "ladyleet", "leobetosouza", "marcotrulla", "marocchino", "mathias", "mihaipaun", "nataliemac", "nicksalloum_", "okuryu", "os_weekends", "ossreleasefeed", "paul_irish", "rauschma", "rem", "remotesynth", "rmurphey", "roebuk", "rwaldron", "stephanlindauer", "tomdale", "trevnorris", "umaar", "wecodesign", "yotamofek"] | ||
|
||
for developer in developers: | ||
|
||
url = "https://twitter.com/" + developer | ||
request = urllib.urlopen(url) | ||
|
||
print "status code: " + str(request.getcode()) | ||
if request.getcode() == 200: | ||
request = request.read() | ||
soup = BeautifulSoup(request, "html5lib") | ||
print "url: " + url | ||
|
||
name = soup.findAll("a", { "class" : "ProfileHeaderCard-nameLink"}) | ||
bio = soup.findAll("p", { "class" : "ProfileHeaderCard-bio"}) | ||
|
||
if name and bio: | ||
name = name[0].text.encode('utf-8') | ||
bio = bio[0].text.encode('utf-8') | ||
|
||
print "current: " + name | ||
print "bio: " + bio | ||
|
||
developers_md += "- **["+ name +"]("+url+")**\n\n" | ||
developers_md += "\t@"+ developer+ ": *"+ bio +"*\n\n" | ||
print "-------------------------------------" | ||
|
||
text_file = open("developers.md", "w") | ||
text_file.write(developers_md) | ||
text_file.close() |