Scripts to ease the development of the book
- Regular expressions across the whole book
- Generator for Anexo 2 (Twitter Scraper)
- Generator for the Resources annex (Regex + Scraper)
UlisesGascon committed Jan 8, 2017
1 parent bfd62c0 commit 2b944ff
Showing 3 changed files with 155 additions and 0 deletions.
96 changes: 96 additions & 0 deletions scripts/generate_links_list.py
@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
import os
import re
from bs4 import BeautifulSoup
import requests

# Markdown headers: captures the run of "#" characters and the title text
headerRegex = r"(#+) (.*)"
# Markdown links [text](url); requires one non-"!" character before the
# bracket, so image links (![alt](url)) never match
linksRegex = r"(?!!).\[([^\[]+)\]\(([^\)]+)\)"
endFile = "# Recursos\n\n"
index = "# Índice\n\n"
content = "# Contenido\n"

# URLs that raised exceptions on the last run; all of them are now skipped.
ignoreUrls = [
"https://cylonjs.com/",
"http://addyosmani.com/resources/essentialjsdesignpatterns/book/",
"https://www.campus.co/madrid/es/events",
"https://www.genbeta.com/web/error-404-not-found-historia-y-hazanas-de-este-mitico-codigo",
"https://jsonformatter.curiousconcept.com/",
"https://github.com/technoboy10/crossorigin.me",
"https://ponyfoo.com/articles/es6-generators-in-depth",
"https://ponyfoo.com/articles/es6-promises-in-depth",
"https://ponyfoo.com/articles/understanding-javascript-async-await",
"https://www.genbetadev.com/paradigmas-de-programacion/trabajar-con-microservicios-evitando-la-frustracion-de-las-enormes-aplicaciones-monoliticas",
"http://ashleynolan.co.uk/blog/frontend-tooling-survey-2015-results",
"http://blog.codinghorror.com/the-magpie-developer/",
"http://www.xataka.com/servicios/y-si-el-software-open-source-desapareciera",
"https://www.meetup.com/es-ES/Open-Source-Weekends/",
"https://es.wikipedia.org/wiki/Hoja_de_estilos_en_cascada",
"https://git-scm.com/",
"https://www.polymer-project.org/1.0/",
"http://web.archive.org",
"https://www.ecured.cu/Sentencias_(Programaci%C3%B3n",
"https://travis-ci.org/"
]
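# Note: "http://web.archive.org" is listed above, and archive.org URLs are
# also skipped by an explicit re.match() check in the scraping loop below,
# so snapshot links are never fetched.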

with open("manuscript/Book.txt") as f:
    # Each non-empty line of Book.txt names a chapter file
    lines = (line.rstrip() for line in f)
    lines = (line for line in lines if line)
    for chapterFile in lines:
        with open("manuscript/" + chapterFile) as chapter:
            fileContent = chapter.readlines()
            for line in fileContent:

                # Headers
                headerMatch = re.search(headerRegex, line)
                if headerMatch and headerMatch.group(1) in ("#", "##", "###"):
                    # Only top-level headers go into the index
                    if headerMatch.group(1) == "#":
                        index += "- **[" + headerMatch.group(2) + "](#" + headerMatch.group(2).strip().lower().replace(" ", "-") + ")**\n"

                    # If the header itself is a link, keep only its text
                    headerLink = re.search(linksRegex, headerMatch.group(2))
                    if headerLink:
                        header = headerLink.group(1)
                    else:
                        header = headerMatch.group(2)

                    # Demote each header one level in the output
                    currentHeader = "\n" + headerMatch.group(1) + "# " + header
                    content += currentHeader + "\n"

                # Links
                linksSearch = re.findall(linksRegex, line, flags=re.LOCALE)
                if linksSearch:
                    for link in linksSearch:
                        finalTitle = link[0]
                        # Scraping: fetch the page title, unless the URL is blacklisted
                        if not link[1] in ignoreUrls and re.match(r"^http:\/\/web\.archive\.org", link[1]) is None:
                            try:
                                request = requests.get(link[1], verify=False)
                                if request.status_code == 200:
                                    print "Current URL:", link[1]
                                    soup = BeautifulSoup(request.text, "html5lib")
                                    if soup.title:
                                        finalTitle = soup.title.string
                                    finalTitle = str(finalTitle.encode('utf-8')).strip().replace("\n", " ")
                            except (requests.exceptions.SSLError, requests.exceptions.ConnectionError, requests.exceptions.MissingSchema):
                                print "-- PLEASE REMOVE:", link[1]
                        else:
                            print "IGNORED URL:", link[1]

                        content += "- *[" + finalTitle + "](" + link[1] + ")*\n"

        # End File
        content += "\n\n"

endFile += index + "\n\n" + content

# Save results
text_file = open("extras/recursos.md", "w")
text_file.write(endFile)
text_file.close()
print "extras/recursos.md updated!"
20 changes: 20 additions & 0 deletions scripts/regex_fixer.py
@@ -0,0 +1,20 @@
import os
import re

# Link patterns kept for reference; only fixBugs is applied below
linkRegex = r"(?!_|\*).\[([^\[]+)\]\(([^\)]+)\)"
linkBoldRegex = r"\[([^\[]+)\]\(([^\)]+)\)\*\*"
# Fragment stripped from every line where it appears
fixBugs = r"line-numbers=off, "

for fileName in os.listdir("manuscript"):
    if fileName.endswith(".txt"):
        with open("manuscript/" + fileName) as f:
            fileData = ""
            fileContent = f.readlines()
            for line in fileContent:
                line = re.sub(fixBugs, r'', line.rstrip())
                fileData += line + "\n"

        text_file = open("manuscript/" + fileName, "w")
        text_file.write(fileData)
        text_file.close()

print "Book updated!"
39 changes: 39 additions & 0 deletions scripts/twitter_developers.py
@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib

# Helper kept for reference; not used below
def change_text(text):
    return text.encode('utf-8', 'ignore')

developers_md = ""

developers = ["AriyaHidayat", "BrendanEich", "DotProto", "John_Papa", "MatiasArriola", "PascalPrecht", "SachaGreif", "SimoAhava", "addyosmani", "alexnavasardyan", "amasad", "brianleroux", "codepo8", "codylindley", "codingcarlos", "davidwalshblog", "dejan_dimic", "dshaw", "ducntq", "elijahmanor", "erikthedev_", "firede", "garu_rj", "gavrisimo", "gibson042", "greybax", "idangazit", "jamsyoung", "jdalton", "jeresig", "jfroffice", "kahliltweets", "kentcdodds", "kom_256", "l0ckys", "ladyleet", "leobetosouza", "marcotrulla", "marocchino", "mathias", "mihaipaun", "nataliemac", "nicksalloum_", "okuryu", "os_weekends", "ossreleasefeed", "paul_irish", "rauschma", "rem", "remotesynth", "rmurphey", "roebuk", "rwaldron", "stephanlindauer", "tomdale", "trevnorris", "umaar", "wecodesign", "yotamofek"]

for developer in developers:

    url = "https://twitter.com/" + developer
    request = urllib.urlopen(url)

    print "status code: " + str(request.getcode())
    if request.getcode() == 200:
        request = request.read()
        soup = BeautifulSoup(request, "html5lib")
        print "url: " + url

        # Profile name and bio, as rendered in Twitter's profile header markup
        name = soup.findAll("a", {"class": "ProfileHeaderCard-nameLink"})
        bio = soup.findAll("p", {"class": "ProfileHeaderCard-bio"})

        if name and bio:
            name = name[0].text.encode('utf-8')
            bio = bio[0].text.encode('utf-8')

            print "current: " + name
            print "bio: " + bio

            developers_md += "- **[" + name + "](" + url + ")**\n\n"
            developers_md += "\t@" + developer + ": *" + bio + "*\n\n"
    print "-------------------------------------"

text_file = open("developers.md", "w")
text_file.write(developers_md)
text_file.close()
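For reference, a minimal sketch of the entry format written to developers.md; name and bio are invented placeholders for the scraped values:

developer = "SomeHandle"      # hypothetical handle
url = "https://twitter.com/" + developer
name = "Jane Doe"             # hypothetical scraped profile name
bio = "JavaScript developer"  # hypothetical scraped profile bio

entry = "- **[" + name + "](" + url + ")**\n\n"
entry += "\t@" + developer + ": *" + bio + "*\n\n"
print entry
# - **[Jane Doe](https://twitter.com/SomeHandle)**
#
#     @SomeHandle: *JavaScript developer*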
