# The world’s simplest web browser


In [2]:
import socket

mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80))
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
mysock.send(cmd)

while True:
    data = mysock.recv(512)
    if len(data) < 1:
        break
    print(data.decode(),end='')

mysock.close()

HTTP/1.1 200 OK
Date: Mon, 11 Sep 2023 21:21:25 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "a7-54f6609245537"
Accept-Ranges: bytes
Content-Length: 167
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief


# Retrieving an image over HTTP


In [23]:
import socket
import time

HOST = 'data.pr4e.org'
PORT = 80
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect((HOST, PORT)) 
mysock.sendall(b'GET http://data.pr4e.org/cover3.jpg HTTP/1.0\r\n\r\n') #il b funziona da .encode() e trasforma il tutto in bytes
count = 0
picture = b""  #bytes nullo

while True:
    data = mysock.recv(5120)
    if len(data) < 1: break  #si ferma quando trova uno spazio vuoto
    #time.sleep(0.25)
    #count = count + len(data)
    #print(len(data), count)
    picture = picture + data

mysock.close()

# Look for the end of the header (2 CRLF)
pos = picture.find(b"\r\n\r\n")
print('Header length', pos)
print(picture[:pos].decode())

# Skip past the header and save the picture data
picture = picture[pos+4:] #tolgo i 4 caratteri \r\n\r\n 
fhand = open("stuff.jpg", "wb") #write in bynary 
fhand.write(picture)
fhand.close()

# Code: http://www.py4e.com/code3/urljpeg.py

Header length 394
HTTP/1.1 200 OK
Date: Mon, 11 Sep 2023 16:10:29 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Mon, 15 May 2017 12:27:40 GMT
ETag: "38342-54f8f2e5b6277"
Accept-Ranges: bytes
Content-Length: 230210
Vary: Accept-Encoding
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: image/jpeg


# Retrieving web pages with urllib


In [24]:
import urllib.request, urllib.parse, urllib.error

fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for line in fhand:
    print(line.decode().rstrip())


But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief


# Reading binary files using urllib


In [25]:
import urllib.request, urllib.parse, urllib.error

img = urllib.request.urlopen('http://data.pr4e.org/cover3.jpg').read()
fhand = open('cover3.jpg', 'wb')
fhand.write(img)


230210

In [17]:
import urllib.request, urllib.parse, urllib.error

img = urllib.request.urlopen('http://data.pr4e.org/cover3.jpg')
fhand = open('cover3.jpg', 'wb')
size = 0
while True: 
    info = img.read(100000)  #you onlu read 100,000 characters at a time and then write those characters to the cover3.jpg file before retrieving the next 100,000 characters of data from the web.
    if len(info) < 1: break
    size = size + len(info)
    fhand.write(info)

print(size, 'characters copied.')
fhand.close()

230210 characters copied.


# Parsing HTML using regular expressions



One simple way to parse HTML is to use regular expressions to repeatedly search for and extract substrings that match a particular pattern.

Here a simpe web page:


In [None]:
<h1>The First Page</h1>
<p>
If you like, you can switch to the
<a href="http://www.dr-chuck.com/page2.htm">
Second Page </a>.
</p>

SyntaxError: invalid syntax (1875027082.py, line 1)

We can construct a well-formed regular expression to match and extract the link values from the above text as follows:

regexp = 'href="http[s]?://.+?"'

In [None]:
# Search for link values within URL input
import urllib.request, urllib.parse, urllib.error
import re
import ssl  #per accedere online? 

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
links = re.findall(b'href="(http[s]?://.+?)"', html)
for link in links:
    print(link.decode())


https://docs.python.org/3/index.html
https://www.python.org/
https://docs.python.org/3.13/
https://docs.python.org/3.12/
https://docs.python.org/3.11/
https://docs.python.org/3.10/
https://docs.python.org/3.9/
https://docs.python.org/3.8/
https://docs.python.org/3.7/
https://docs.python.org/3.6/
https://docs.python.org/3.5/
https://docs.python.org/2.7/
https://www.python.org/doc/versions/
https://www.python.org/dev/peps/
https://wiki.python.org/moin/BeginnersGuide
https://wiki.python.org/moin/PythonBooks
https://www.python.org/doc/av/
https://devguide.python.org/
https://www.python.org/
https://devguide.python.org/docquality/#helping-with-documentation
https://docs.python.org/3.13/
https://docs.python.org/3.12/
https://docs.python.org/3.11/
https://docs.python.org/3.10/
https://docs.python.org/3.9/
https://docs.python.org/3.8/
https://docs.python.org/3.7/
https://docs.python.org/3.6/
https://docs.python.org/3.5/
https://docs.python.org/2.7/
https://www.python.org/doc/versions/
https://

# Parsing HTML using BeautifulSoup


In [1]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('a')  #è un dizionario circa
for tag in tags:
    print('TAG:', tag),
    print('URL', tag.get('href', None)) 
    print('CONTENTS', tag.contents)
    print('ATT', tag.attrs)
    
    

# Code: http://www.py4e.com/code3/urllinks.py

TAG: <a href="http://www.dr-chuck.com/page2.htm">
Second Page</a>
URL http://www.dr-chuck.com/page2.htm
CONTENTS ['\nSecond Page']
ATT {'href': 'http://www.dr-chuck.com/page2.htm'}


# Exercise 1:

 Change the socket program socket1.py to prompt the user for the URL so it can read any web page. You can use split('/') to break the URL into its component parts so you can extract the host name for the socket connect call. Add error checking using try and except to handle the condition where the user enters an improperly formatted or non-existent URL.

In [23]:
import socket
import re

url = input('Enter url - ')

url_prefix = url.split('/')  # if the url starts with http(s)    
if len(url_prefix)> 1 and url_prefix[0] == re.findall('http[s]?:', url)[0]:
    host = re.findall('http[s]?://(.+\..+?)/.+', url)[0]

else: # if it starts with wwww.something
    host = url


# port number: 80 for HTTP
portnum = 80
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# testing the url's validity
try:
    mysock.connect((host, portnum))
    # 'cmd' is edited to incorporate user input
    cmd = f'GET {url} HTTP/1.0\r\n\r\n'.encode()
    mysock.send(cmd)

    while True:
        data = mysock.recv(512)
        if len(data) < 1:
            break
            
        print(data.decode(),end='')

    mysock.close()

except:
    print("Please enter an existing URL!")
    exit()
    



   




HTTP/1.1 200 OK
Date: Tue, 12 Sep 2023 11:44:59 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "1d3-54f6609240717"
Accept-Ranges: bytes
Content-Length: 467
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

Why should you learn to write programs?

Writing programs (or programming) is a very creative 
and rewarding activity.  You can write programs for 
many reasons, ranging from making your living to solving
a difficult data analysis problem to having fun to helping
someone else solve a problem.  This book assumes that 
everyone needs to know how to program, and that once 
you know how to program you will figure out what you want 
to do with your newfound skills.  


# Exercise 12.2

Change your socket program so that it counts the number of characters it has received and stops displaying any text after it has shown 3000 characters. The program should retrieve the entire document and count the total number of characters and display the count of the number of characters at the end of the document.

In [10]:
import socket
import re

url = input('Enter url - ')

url_prefix = url.split('/')  # if the url starts with http(s)    
if len(url_prefix)> 1 and url_prefix[0] == re.findall('http[s]?:', url)[0]:
    host = re.findall('http[s]?://(.+\..+?)/.+', url)[0]

else: # if it starts with wwww.something
    host = url


# port number: 80 for HTTP
portnum = 80
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

#new variables for the assignment 
displayCount = 0 #conta i caratteri che sono sul display, alla fine deve essere uguale charLimit
totalCount = 0 #conta tutti i caratteri del file
charLimit =3000


# testing the url's validity
try:
    mysock.connect((host, portnum))
    # 'cmd' is edited to incorporate user input
    cmd = f'GET {url} HTTP/1.0\r\n\r\n'.encode()
    mysock.send(cmd)
    

    while True:
        data = mysock.recv(500)
        totalCount +=len(data) 
        
        if len(data) < 1:
            break
        elif totalCount > charLimit:
            displayCount = charLimit
            continue
            
        print(data.decode()[:charLimit],end='')
    
    # output as per requested
    print(f"\n\nDisplay stopped at character count of {displayCount}")
    print(f"Total number of characters received: {totalCount}")
    
except:
    print("Please enter an existing URL!")
    exit()


HTTP/1.1 200 OK
Date: Tue, 12 Sep 2023 10:50:09 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "22a0-54f6609245537"
Accept-Ranges: bytes
Content-Length: 8864
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

Romeo and Juliet
Act 2, Scene 2 

SCENE II. Capulet's orchard.

Enter ROMEO

ROMEO

He jests at scars that never felt a wound.
JULIET appears above at a window

But, soft! what light through yonder window breaks?
It is the east, and Juliet is the sun.
Arise, fair sun, and kill the envious moon,
Who is already sick and pale with grief,
That thou her maid art far more fair than she:
Be not her maid, since she is envious;
Her vestal livery is but sick and green
And none but fools do wear it; cast it off.
It is my lady, O, it is my love!
O, that she knew she were!
She speaks yet she says nothing: what of that?
Her eye discourses; I will 

# Exercise 3

Use urllib to replicate the previous exercise of (1) retrieving the document from a URL, (2) displaying up to 3000 characters, and (3) counting the overall number of characters in the document. Don’t worry about the headers for this exercise, simply show the first 3000 characters of the document contents

In [22]:
import urllib.request, urllib.parse, urllib.error
import re

url = input('Enter url - ')

#file handle with the url 
fhand = urllib.request.urlopen(url)
content = fhand.read() 
content = content.decode()

charLimit = 3000

print(content[:charLimit])
print('Total characters in the document:', len(content))
    





Why should you learn to write programs?

Writing programs (or programming) is a very creative 
and rewarding activity.  You can write programs for 
many reasons, ranging from making your living to solving
a difficult data analysis problem to having fun to helping
someone else solve a problem.  This book assumes that 
everyone needs to know how to program, and that once 
you know how to program you will figure out what you want 
to do with your newfound skills.  

Total characters in the document: 467


# Exercise 12.4
Change the urllinks.py program to extract and count paragraph (p) tags from the retrieved HTML document and display the count of the paragraphs as the output of your program. Do not display the paragraph text, only count them. Test your program on several small web pages as well as some larger web pages.

In [26]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('p')
count = len(tags)
   
print('Total number of paragraphs', count)

Total number of paragraphs 7


# Exercise 12.5

(Advanced) Change the socket program so that it only shows data after the headers and a blank line have been received. Remember that recv receives characters (newlines and all), not lines.

In [21]:
import socket
import re

url = input('Enter url - ')

url_prefix = url.split('/')  # if the url starts with http(s)    
if len(url_prefix)> 1 and url_prefix[0] == re.findall('http[s]?:', url)[0]:
    host = re.findall('http[s]?://(.+\..+?)/.+', url)[0]

else: # if it starts with wwww.something
    host = url


# port number: 80 for HTTP
portnum = 80
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# testing the url's validity
mysock.connect((host, portnum))
    # 'cmd' is edited to incorporate user input
cmd = f'GET {url} HTTP/1.0\r\n\r\n'.encode()
mysock.send(cmd)

text = b''
while True:
    data = mysock.recv(512)
    if len(data) < 1:
            break
    text +=data 
header_pos = text.find(b'\r\n\r\n')
content = text[header_pos+4:].decode()

print(content)
    
mysock.close()


    


But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief



# Exercise 12.6

Scraping Numbers from HTML using BeautifulSoup In this assignment you will write a Python program similar to http://www.py4e.com/code3/urllink2.py. The program will use urllib to read the HTML from the data files below, and parse the data, extracting numbers and compute the sum of the numbers in the file.

We provide two files for this assignment. One is a sample file where we give you the sum for your testing and the other is the actual data you need to process for the assignment.

Sample data: http://py4e-data.dr-chuck.net/comments_42.html (Sum=2553)
Actual data: http://py4e-data.dr-chuck.net/comments_1881372.html (Sum ends with 31)

In [28]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the anchor tags
tags = soup('span')
countSum = 0
for tag in tags:
    # Look at the parts of a tag
    countSum += int(tag.contents[0])
print( 'Sum', countSum)
    


Sum 2531


# Exercise 12.7

In this assignment you will write a Python program that expands on http://www.py4e.com/code3/urllinks.py. The program will use urllib to read the HTML from the data files below, extract the href= vaues from the anchor tags, scan for a tag that is in a particular position relative to the first name in the list, follow that link and repeat the process a number of times and report the last name you find.

We provide two files for this assignment. One is a sample file where we give you the name for your testing and the other is the actual data you need to process for the assignment

Sample problem: Start at http://py4e-data.dr-chuck.net/known_by_Fikret.html 
Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve.

Sequence of names: Fikret Montgomery Mhairade Butchi Anayah 

Last name in sequence: Anayah

Actual problem: Start at: http://py4e-data.dr-chuck.net/known_by_Parmin.html 

Find the link at position 18 (the first name is 1). Follow that link. Repeat this process 7 times. The answer is the last name that you retrieve.

**_Hint_**: The first character of the name of the last page that you will load is: R

_Strategy_

The web pages tweak the height between the links and hide the page after a few seconds to make it difficult for you to do the assignment without writing a Python program. But frankly with a little effort and patience you can overcome these attempts to make it a little harder to complete the assignment without writing a Python program. But that is not the point. The point is to write a clever Python program to solve the program.

**Sample execution**

Here is a sample execution of a solution:
```
$ python3 solution.py
Enter URL: http://py4e-data.dr-chuck.net/known_by_Fikret.html
Enter count: 4
Enter position: 3
Retrieving: http://py4e-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Mhairade.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Butchi.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Anayah.html
``` 
The answer to the assignment for this execution is "Anayah".

In [13]:
import urllib.request, urllib.parse, urllib.error
import ssl 
from bs4 import BeautifulSoup

#Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

#chiedere l'url 
url = input('Enter the url - ')
#chiedo quante volte devo ripetere il loop - repeatCount- e da quale posiione partire - startPosition. 
repeatCount = int(input('Enter count:'))
startPosition = int(input('Enter position:'))

i = 0 
while i <= repeatCount:
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    print('Retrieving:', url) 
    
    #cerco il tag anchor (a)
    tags = soup('a') #lista con <a href =link> contenuto </a>
    #avrò una serie di tag, mi interessa il tag[startPosition-1] il link sarà quindi tag[startPosition - 1].get('href')
    #rendo il link nuovo il nuovo url e itero il processo
    url = tags[startPosition - 1].get('href')
    i += 1 
    





Retrieving: http://py4e-data.dr-chuck.net/known_by_Parmin.html 
Retrieving: http://py4e-data.dr-chuck.net/known_by_Ali.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Aimeeleigh.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Maya.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Naima.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Caidie.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Rahma.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Rihanna.html
