# The world’s simplest web browser


In [2]:
import socket

mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80))
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
mysock.send(cmd)

while True:
    data = mysock.recv(512)
    if len(data) < 1:
        break
    print(data.decode(),end='')

mysock.close()

HTTP/1.1 200 OK
Date: Mon, 11 Sep 2023 21:21:25 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "a7-54f6609245537"
Accept-Ranges: bytes
Content-Length: 167
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief


# Retrieving an image over HTTP


In [23]:
import socket
import time

HOST = 'data.pr4e.org'
PORT = 80
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect((HOST, PORT)) 
mysock.sendall(b'GET http://data.pr4e.org/cover3.jpg HTTP/1.0\r\n\r\n') #il b funziona da .encode() e trasforma il tutto in bytes
count = 0
picture = b""  #bytes nullo

while True:
    data = mysock.recv(5120)
    if len(data) < 1: break  #si ferma quando trova uno spazio vuoto
    #time.sleep(0.25)
    #count = count + len(data)
    #print(len(data), count)
    picture = picture + data

mysock.close()

# Look for the end of the header (2 CRLF)
pos = picture.find(b"\r\n\r\n")
print('Header length', pos)
print(picture[:pos].decode())

# Skip past the header and save the picture data
picture = picture[pos+4:] #tolgo i 4 caratteri \r\n\r\n 
fhand = open("stuff.jpg", "wb") #write in bynary 
fhand.write(picture)
fhand.close()

# Code: http://www.py4e.com/code3/urljpeg.py

Header length 394
HTTP/1.1 200 OK
Date: Mon, 11 Sep 2023 16:10:29 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Mon, 15 May 2017 12:27:40 GMT
ETag: "38342-54f8f2e5b6277"
Accept-Ranges: bytes
Content-Length: 230210
Vary: Accept-Encoding
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: image/jpeg


# Retrieving web pages with urllib


In [24]:
import urllib.request, urllib.parse, urllib.error

fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for line in fhand:
    print(line.decode().rstrip())


But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief


# Reading binary files using urllib


In [25]:
import urllib.request, urllib.parse, urllib.error

img = urllib.request.urlopen('http://data.pr4e.org/cover3.jpg').read()
fhand = open('cover3.jpg', 'wb')
fhand.write(img)


230210

In [None]:
import urllib.request, urllib.parse, urllib.error

img = urllib.request.urlopen('http://data.pr4e.org/cover3.jpg')
fhand = open('cover3.jpg', 'wb')
size = 0
while True: 
    info = img.read(100000)  #you onlu read 100,000 characters at a time and then write those characters to the cover3.jpg file before retrieving the next 100,000 characters of data from the web.
    if len(info) < 1: break
    size = size + len(info)
    fhand.write(info)

print(size, 'characters copied.')
fhand.close()

230210 characters copied.


KeyboardInterrupt: 

# Parsing HTML using regular expressions



One simple way to parse HTML is to use regular expressions to repeatedly search for and extract substrings that match a particular pattern.

Here a simpe web page:


In [None]:
<h1>The First Page</h1>
<p>
If you like, you can switch to the
<a href="http://www.dr-chuck.com/page2.htm">
Second Page </a>.
</p>

SyntaxError: invalid syntax (1875027082.py, line 1)

We can construct a well-formed regular expression to match and extract the link values from the above text as follows:

regexp = 'href="http[s]?://.+?"'

In [None]:
# Search for link values within URL input
import urllib.request, urllib.parse, urllib.error
import re
import ssl  #per accedere online? 

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
links = re.findall(b'href="(http[s]?://.+?)"', html)
for link in links:
    print(link.decode())


https://docs.python.org/3/index.html
https://www.python.org/
https://docs.python.org/3.13/
https://docs.python.org/3.12/
https://docs.python.org/3.11/
https://docs.python.org/3.10/
https://docs.python.org/3.9/
https://docs.python.org/3.8/
https://docs.python.org/3.7/
https://docs.python.org/3.6/
https://docs.python.org/3.5/
https://docs.python.org/2.7/
https://www.python.org/doc/versions/
https://www.python.org/dev/peps/
https://wiki.python.org/moin/BeginnersGuide
https://wiki.python.org/moin/PythonBooks
https://www.python.org/doc/av/
https://devguide.python.org/
https://www.python.org/
https://devguide.python.org/docquality/#helping-with-documentation
https://docs.python.org/3.13/
https://docs.python.org/3.12/
https://docs.python.org/3.11/
https://docs.python.org/3.10/
https://docs.python.org/3.9/
https://docs.python.org/3.8/
https://docs.python.org/3.7/
https://docs.python.org/3.6/
https://docs.python.org/3.5/
https://docs.python.org/2.7/
https://www.python.org/doc/versions/
https://

# Parsing HTML using BeautifulSoup


In [1]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('a')  #è un dizionario circa
for tag in tags:
    print('TAG:', tag),
    print('URL', tag.get('href', None)) 
    print('CONTENTS', tag.contents)
    print('ATT', tag.attrs)
    
    

# Code: http://www.py4e.com/code3/urllinks.py

TAG: <a href="http://www.dr-chuck.com/page2.htm">
Second Page</a>
URL http://www.dr-chuck.com/page2.htm
CONTENTS ['\nSecond Page']
ATT {'href': 'http://www.dr-chuck.com/page2.htm'}


# Exercise 1:

 Change the socket program socket1.py to prompt the user for the URL so it can read any web page. You can use split('/') to break the URL into its component parts so you can extract the host name for the socket connect call. Add error checking using try and except to handle the condition where the user enters an improperly formatted or non-existent URL.

In [9]:
import socket
import re

url = input('Enter url - ')

url_prefix = url.split('/')  # if the url starts with http(s)    
if len(url_prefix)> 1 and url_prefix[0] == re.findall('http[s]?:', url)[0]:
    host = re.findall('http[s]?://(.+\..+?)/.+', url)[0]

else: # if it starts with wwww.something
    host = url


# port number: 80 for HTTP
portnum = 80
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# testing the url's validity
try:
    mysock.connect((host, portnum))
    # 'cmd' is edited to incorporate user input
    cmd = f'GET {url} HTTP/1.0\r\n\r\n'.encode()
    mysock.send(cmd)

    while True:
        data = mysock.recv(512)
        if len(data) < 1:
            break
            
        print(data.decode(),end='')

    mysock.close()

except:
    print("Please enter an existing URL!")
    exit()
    



   




Enter url - http://data.pr4e.org/romeo.txt


# Exercise 12.2

Change your socket program so that it counts the number of characters it has received and stops displaying any text after it has shown 3000 characters. The program should retrieve the entire document and count the total number of characters and display the count of the number of characters at the end of the document.

In [11]:
import socket
import re

url = input('Enter url - ')

url_prefix = url.split('/')  # if the url starts with http(s)    
if len(url_prefix)> 1 and url_prefix[0] == re.findall('http[s]?:', url)[0]:
    host = re.findall('http[s]?://(.+\..+?)/.+', url)[0]

else: # if it starts with wwww.something
    host = url


# port number: 80 for HTTP
portnum = 80
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

#new variables for the assignment 
displayCount = 0 
totalCount = 0 
charLimit = 3000


# testing the url's validity
try:
    mysock.connect((host, portnum))
    # 'cmd' is edited to incorporate user input
    cmd = f'GET {url} HTTP/1.0\r\n\r\n'.encode()
    mysock.send(cmd)

    while True:
        data = mysock.recv(512)
        totalCount +=len(data)
        
        if len(data) < 1:
            break
        elif totalCount > charLimit
            displayCount = charLimit
            break
            
        print(data.decode(),end='')
        
    # output as per requested
    print(f"\n\nDisplay stopped at character count of {displayCount}")
    print(f"Total number of characters received: {totalCount}")

    mysock.close()

except:
    print("Please enter an existing URL!")
    exit()
    

    



   





Enter url - http://data.pr4e.org/romeo.txt
HTTP/1.1 200 OK
Date: Mon, 11 Sep 2023 22:14:08 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "a7-54f6609245537"
Accept-Ranges: bytes
Content-Length: 167
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief
