# REGEX

In [None]:
# # Previous method
# hand = open('data/mbox-short.txt')
# for line in hand:
#     line = line.rstrip()
#     if line.find('From:') >= 0:
#         print(line)

# Regex method
import re

# Print every line of the mailbox file that contains 'From:'.
# The 'with' block guarantees the file is closed, even if an error occurs.
with open('data/mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # drop the trailing newline so print() does not double it
        if re.search('From:', line):
            print(line)

In [None]:
# Matching and extracting data
x = 'My 2 favourite numbers are 19 and 42'

# [0-9]+ = one or more digits. The class must include 0; with [1-9] a number
# such as 10 or 2008 would be split up or lose its zeros.
digit_pattern = r'[0-9]+'
print(re.findall(digit_pattern, x))

# [a-h] = every single character between 'a' and 'h' (one match per character)
print(re.findall('[a-h]', x))

In [None]:

# Greedy vs. non-greedy matching on the same input.
x = 'From: Using the : character'

# Both patterns start at 'F' (^ anchors to the start of the string) and end at a colon.
#   '^F.+:'  -> greedy: .+ grabs as much as possible, so the match ends at the LAST colon
#   '^F.+?:' -> non-greedy: the ? makes .+ stop as early as possible, at the FIRST colon
for pattern in ('^F.+:', '^F.+?:'):
    print(re.findall(pattern, x))


In [None]:
# Fine-tuning string extraction: the same line is searched with four patterns.
x = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'

patterns = (
    # \S = non-whitespace character, + = one or more: the whole e-mail address
    r'\S+@\S+',
    # ^ = start of line; the parentheses select only the address for extraction
    r'^From (\S+@\S+)',
    # .* = any characters up to the @; ([^ ]*) extracts the non-blank run after it (the domain)
    r'^From .*@([^ ]*)',
    # +? = non-greedy; the match still starts at the first non-whitespace
    # character before the @, so the whole address is returned again
    r'\S+?@\S+',
)
for pattern in patterns:
    print(re.findall(pattern, x))


In [None]:
# Collect every X-DSPAM-Confidence value in the mailbox file and print the largest.
# The pattern is compiled once, outside the loop; the parentheses extract the number.
confidence_pattern = re.compile(r'^X-DSPAM-Confidence: ([0-9.]+)')

numlist = list()
# 'with' closes the file automatically, even if an error occurs while reading
with open('data/mbox-short.txt') as hand:
    for line in hand:
        stuff = confidence_pattern.findall(line.rstrip())
        if len(stuff) != 1:
            continue  # not an X-DSPAM-Confidence line
        numlist.append(float(stuff[0]))

# max() raises ValueError on an empty list, so handle the "no matches" case explicitly
if numlist:
    print('Maximum:', max(numlist))
else:
    print('No X-DSPAM-Confidence values found')

In [None]:
# Regex assignment
import re
# Sum every run of digits found anywhere in the assignment file.
numlist = list()
# 'with' closes the file automatically, even if an error occurs while reading
with open('data/regex_assignment.txt') as text:
    for line in text:
        # findall returns all digit runs in the line as strings; convert each to int
        numlist.extend(int(number) for number in re.findall(r'[0-9]+', line))
print(sum(numlist))

# Networks and sockets

In [None]:
import socket
# Fetch a document over a raw TCP socket by speaking HTTP by hand.
# The 'with' block closes the socket even if connect/send/recv raises.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
    mysock.connect(('data.pr4e.org', 80))
    cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()  # encode() converts string to bytes
    # sendall() keeps sending until the whole request is transmitted;
    # send() may transmit only part of it
    mysock.sendall(cmd)

    while True:
        data = mysock.recv(512)  # read up to 512 bytes at a time
        if len(data) < 1:  # an empty result means the server closed the connection
            break
        # decode() converts bytes to string (Unicode); end='' keeps the
        # chunks contiguous instead of adding a newline after each one
        print(data.decode(), end='')

In [None]:
import socket
# Same raw-socket request as before, for a different document.
# The 'with' block closes the socket even if connect/send/recv raises.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
    mysock.connect(('data.pr4e.org', 80))
    cmd = 'GET /intro-short.txt HTTP/1.0\r\n\r\n'.encode()
    # sendall() keeps sending until the whole request is transmitted
    mysock.sendall(cmd)

    while True:
        data = mysock.recv(512)
        if len(data) < 1:  # an empty result means the server closed the connection
            break
        # decode() converts bytes to string. end='' avoids inserting a newline
        # at every 512-byte chunk boundary, which would break lines mid-text.
        print(data.decode(), end='')

In [None]:
# Anatomy of the URL http://www.dr-chuck.com/page1.htm
# http:// is the protocol
# www.dr-chuck.com is the host
# /page1.htm is the document



### Real-life example

In [None]:
# Example of connecting to an API and extracting data from it

import websocket
import json

# Callback that receives each incoming message and displays its data
def on_message(ws, message):
    """Parse one JSON message and print the price it carries.

    ws is the connection object handed to the callback (unused here);
    message is the JSON text that was received.
    """
    payload = json.loads(message)
    # 'p' is the price field; fall back to 'No data' when it is missing
    price = payload.get('p', 'No data')
    print(f'Precio de Bitcoin: {price} USD')

# Connect to the Binance API (real-time prices)
url = 'wss://stream.binance.com:9443/ws/btcusdt@trade'

# Create the WebSocket connection; on_message is registered as the message callback
ws = websocket.WebSocketApp(url, on_message=on_message)
# NOTE(review): the call below is left commented out, presumably so the cell
# does not block while streaming — uncomment it to start receiving messages.
# ws.run_forever()


# Programs that surf the web

ASCII Mapping vs UTF-8

In [None]:
# In an 8-bit character mapping each character is stored as a number
# between 0 and 255 (one byte). ord() returns the number that is the
# Unicode code of a given character.
for ch in ('H', 'e', '\n'):
    print(ord(ch))

# chr() is the inverse: it returns the character for a given Unicode code.
for code in (72, 101, 10):
    print(chr(code))
print(*(chr(code) for code in (108, 105, 110, 101)))

# In Python 3, strings are Unicode by default

We need to understand the differences between the types of character mapping when we talk to a network, because the data we receive might use a different mapping than the one our program uses. That's why we need to encode and decode. However, the vast majority (around 99%) of the text on the web is already UTF-8, which makes this a lot easier.

In [None]:
# Show the type of a bytes literal (b'...'), a u-prefixed string literal
# and a plain string literal.
for x in (b'abc', u'abc', 'abc'):
    print(type(x))


In [None]:
import socket
# Fetch a document over a raw socket, highlighting the bytes <-> str conversions.
# The 'with' block closes the socket when done (the connection was previously left open).
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
    mysock.connect(('data.pr4e.org', 80))
    cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()  # encode() converts string to bytes
    # sendall() keeps sending until the whole request is transmitted
    mysock.sendall(cmd)

    while True:
        data = mysock.recv(512)  # Data is received as bytes
        if len(data) < 1:  # an empty result means the server closed the connection
            break
        mystring = data.decode()  # decode() interprets the bytes as UTF-8 by default and returns a str
        # Data is now Unicode; end='' avoids inserting a newline at every chunk boundary
        print(mystring, end='')

## Using urllib in Python
Since HTTP is so common, we have a library that does all the socket work for us and makes web pages look like files.

In [None]:
import urllib.request, urllib.parse, urllib.error

# Open the URL like a file and print it line by line.
# The 'with' block closes the HTTP response when the loop finishes.
with urllib.request.urlopen('http://data.pr4e.org/romeo.txt') as fhand:
    for line in fhand:
        # each line arrives as bytes: decode to str, then strip the newline
        print(line.decode().strip())
# Iterating does not return the HTTP headers, but they were read and
# remain available on the response object (fhand.headers)

In [None]:
# Count how many times each word appears in a web file
counts = dict()

# The 'with' block closes the HTTP response when the loop finishes
with urllib.request.urlopen('http://data.pr4e.org/romeo.txt') as fhand:
    for line in fhand:
        words = line.decode().split()  # bytes -> str, then split on whitespace
        for word in words:
            # get() returns 0 the first time a word is seen
            counts[word] = counts.get(word, 0) + 1
print(counts)

In [None]:
# Example with another link (an HTML page this time).
# The 'with' block closes the HTTP response when the loop finishes.
with urllib.request.urlopen('http://www.dr-chuck.com/page1.htm') as fhand:
    for line in fhand:
        print(line.decode().strip())

## Extra: requests library + BeautifulSoup

> Note: using the `requests` library is a better practice for talking to the web with Python, as it is a high-level library that automates many of the manual steps done with the socket or urllib libraries.

In [None]:
import requests

url = 'https://es.wikipedia.org/wiki/Wikipedia:Portada'

# Send a GET request and get the response text.
# requests has no default timeout, so without one the call can wait forever
# on an unresponsive server.
response = requests.get(url, timeout=10)

# Other useful attributes: .status_code, .headers, .json(), .content, .url,
# .encoding, .apparent_encoding, .history, .cookies, .elapsed, .request
print(response.text)

You can also use BeautifulSoup to scrape information from an HTML web page, even one that has issues with its syntax (syntax errors).

In [None]:
from bs4 import BeautifulSoup as bs

# A small HTML document to parse
html_doc = '<html><body><h1>Hello, World!</h1></body></html>'

# Build the parse tree; 'html.parser' selects the parser to use
soup = bs(markup=html_doc, features='html.parser')

# Extract all the text contained in the document
print(soup.get_text())

Using both at once:

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://es.wikipedia.org/wiki/Wikipedia:Portada'

# Send a GET request. requests has no default timeout, so set one to avoid
# waiting forever on an unresponsive server.
response = requests.get(url, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Print the page title; soup.title is None when the page has no <title>
    # tag, so guard the attribute access
    title = soup.title.string if soup.title else None
    print('Page Title:', title)

    # Find all links on the page
    links = soup.find_all('a')  # Finds all <a> (anchor) tags

    # Print the first 5 links
    for link in links[:5]:
        print(link.get('href'))  # Get the 'href' attribute (URL)
else:
    print('Failed to fetch the page:', response.status_code)


## Back to BeautifulSoup and urllib

In [None]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input('Enter url: ')

# Download the whole page as bytes; the 'with' block closes the HTTP response
with urllib.request.urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # get() returns the value of the 'href' attribute, or None if the
    # attribute is not present
    print(tag.get('href', None))

An example code using urllib

In [None]:
# Retrieve the anchor tags of a page with urllib, using a custom SSL context
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl 

# NOTE(security): this context turns off hostname checking and certificate
# verification, so HTTPS connections made with it are NOT authenticated.
# Only use it for trusted course/demo sites; never for sensitive data.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'http://www.dr-chuck.com/page1.htm'
# Download the whole page as bytes; the 'with' block closes the HTTP response
with urllib.request.urlopen(url, context=ctx) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # get() returns the value of the 'href' attribute, or None if
    # the attribute is not present
    print(tag.get('href', None))

Same example code using requests

In [None]:
import requests
from bs4 import BeautifulSoup
import ssl

# Define the URL
url = 'https://www.dr-chuck.com/page1.htm'  # Replace with the actual URL

# Send a GET request. Unlike the urllib example, certificate verification
# stays enabled here (the requests default). A timeout is set because
# requests would otherwise wait forever on an unresponsive server.
response = requests.get(url, timeout=10)

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Retrieve all anchor tags
tags = soup.find_all('a')  # Finds all <a> elements

# Print their href attributes
for tag in tags:
    print(tag.get('href', None))  # get() returns None if 'href' doesn't exist


In [None]:
# Show the type of the object returned by urlopen();
# the 'with' block closes the HTTP response afterwards
with urllib.request.urlopen('http://data.pr4e.org/romeo.txt') as x:
    print(type(x))

### Assignment

In [None]:
# Reading an internet table and adding up the numbers
import urllib.request
from bs4 import BeautifulSoup

url = 'https://py4e-data.dr-chuck.net/comments_2144910.html'

# Download the whole page as bytes; the 'with' block closes the HTTP response
with urllib.request.urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Find all the numbers inside <span class="comments">
numbers = [int(tag.text) for tag in soup.find_all('span', class_='comments')]

total = sum(numbers)

print(total)

In [None]:

# Following links multiple times until I retrieve the 18th link
import urllib.request
from bs4 import BeautifulSoup
import re

url = 'https://py4e-data.dr-chuck.net/known_by_Sambrid.html'

LINK_POSITION = 18  # which link to follow on each page (1-based)
REPEAT_COUNT = 7    # how many times to follow a link

# Start at the initial page; on every pass, download the current page and
# move to the link found at LINK_POSITION. Only the pages whose links are
# actually needed get downloaded (the last link is reported, not fetched).
link = url
for i in range(REPEAT_COUNT):
    # the 'with' block closes the HTTP response after reading it
    with urllib.request.urlopen(link) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    link = tags[LINK_POSITION - 1].get('href', None)

# Extract the name from the last link; the dot is escaped so it only
# matches a literal '.html' suffix
print(re.findall(r'known_by_(.+)\.html', link)[0])