<a href="https://colab.research.google.com/github/UniVR-DH/ADHLab/blob/main/lecture01-solutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Crawling with BeautifulSoup4 and the Wikipedia Python APIs to create a document collection

<img src="https://drive.google.com/uc?export=view&id=1m_EMdnI5C826kgqK7r5vB4TXnB0-Wq7W" alt="Intestazione con loghi istituzionali" width="525"/>

| Docente      | Insegnamento | Anno Accademico    |
| :---        |    :----   |          ---: |
| Matteo Lissandrini      | Laboratorio Avanzato di Informatica Umanistica       | 2023/2024   |

### Installing additional packages

In [None]:
# Third-party packages used later in this notebook.
# wikipedia-api: provides the `wikipediaapi` module imported further below
%pip install wikipedia-api
# beautifulsoup4: provides the `bs4` module (BeautifulSoup HTML parsing)
%pip install beautifulsoup4
# nltk: provides the tokenizer, stemmers and lemmatizer used in the last section
%pip install nltk

### Importing some basic required packages

In [None]:
import gzip
import string
import numpy as np
import requests
import regex as re

### Crawling content with Beautifulsoup4
#### Select a webpage, download its content, parse the HTML to extract the text

In [None]:
from bs4 import BeautifulSoup

# Download the raw HTML of the Wikipedia article
page = requests.get('https://en.wikipedia.org/wiki/New_York_City')

# Parse the downloaded markup with Python's built-in HTML parser
soup = BeautifulSoup(page.text, 'html.parser')

# Collect every <p> tag found inside the element with class `mw-body`
# (`mw-body` was chosen by manually inspecting the HTML code of the page)
all_p_items = soup.find(class_='mw-body').find_all('p')
print(len(all_p_items))

# Show the first two paragraphs: the raw tag first, then its plain text
for idx in (0, 1):
  if idx == 1:
    print('    ----    ')
  paragraph = all_p_items[idx]
  print(paragraph)
  print(paragraph.get_text())

In [None]:
# Background reading on regular expressions:
# - https://en.wikipedia.org/wiki/Regular_expression
# - https://web.stanford.edu/~jurafsky/slp3/2.pdf
# - https://regexone.com/

# Character class listing every symbol in `string.punctuation`; the symbols
# are escaped so that none of them is interpreted as regex syntax
escaped_punct = re.escape(string.punctuation)
punct_regex = re.compile('[{}]'.format(escaped_punct))

# One or more consecutive space characters (only ' ', not tabs or newlines)
space_regex = re.compile(' +')

##### Example of the effect of Regexp and `strip` method

In [None]:

# Sample string with punctuation at both ends and uneven spacing
test_string = "??This . is A test String!!"

# Step 1: every punctuation symbol is replaced by a space
test_string_parsed1 = punct_regex.sub(' ', test_string)
# Step 2: each run of spaces is collapsed into a single space
test_string_parsed2 = space_regex.sub(' ', test_string_parsed1)
# Step 3: leading and trailing whitespace is removed
test_string_parsed3 = test_string_parsed2.strip()

# Quotes are added around each result so that the surrounding spaces are visible
for label, value in (
    ("(1)", test_string_parsed1),
    ("(2)", test_string_parsed2),
    ("(3)", test_string_parsed3),
):
  print(label, "'" + value + "'")

#### Use regex to clean the lines

In [None]:
# Replace every punctuation symbol in the page text with a space
text = punct_regex.sub(' ', soup.find(class_='mw-body').get_text())
# Collapse each run of spaces into a *single* space, then convert to lowercase
text = space_regex.sub(' ', text).lower()

# Split the text on the newline symbol \n, trim every line and skip the empty ones
lines = [line for line in map(str.strip, text.split("\n")) if line != ""]

print(len(lines))

# Show a few sample lines. The page is downloaded live and its length changes
# over time, so each index is checked before use instead of assuming it exists
for idx in (0, 1, 1290):
  if idx < len(lines):
    print(lines[idx])
  else:
    print("line {} is not available: the page has only {} lines".format(idx, len(lines)))

In [None]:
######
# TODO: Open the wikipedia page for New York, select a sentence, can you find at which line it appears?
######

# Example of selected sentence. `lines` holds lowercase text without
# punctuation, so the sentence must be written the same way to be found
f = 'substantially by human intervention'

# Scan the lines by position and report every line that contains the sentence
for pos in range(len(lines)):
  line = lines[pos]
  if f not in line:
    continue
  print(pos, ":", line)


In [None]:
######
# TODO: Complete the code,
#   a) split a line in single words, compute word frequency
#   b) compute word frequency of all words across all lines
#
# Try out: https://docs.python.org/3/library/collections.html#collections.Counter
#
######

from collections import Counter

# Pick one sample line. The page content changes over time, so the index is
# clamped to the last available line instead of assuming that line 1290 exists
sample_line = lines[min(1290, len(lines) - 1)] if lines else ""

# a) Split the line into single words wherever a space ' ' symbol is found
word_list = sample_line.split(' ')
print(len(word_list))

# A set only tells WHICH words occur: duplicates are removed,
# so it cannot tell how many times each word appears
words = set(word_list)
print(len(words))
print(words)

# A Counter maps every distinct word to its number of occurrences
word_count = Counter(word_list)
print(word_count)

# b) To compute the frequency of all words across all lines,
# iterate line by line and keep updating the same Counter object
word_count = Counter()
for line in lines:
  word_count.update(line.split(' '))

print(len(word_count))
# most_common(n) returns the n most frequent words together with their counts
word_count.most_common(10)


#### Accessing Links in the page

In [None]:
# Collect every <a> (anchor) tag found inside the `mw-body` element
all_a_items = soup.find(class_='mw-body').find_all('a')
print(len(all_a_items))

# Print the URL each anchor points to, keeping only internal Wikipedia pages
for a in all_a_items:
  href = a.get('href')
  if href is None:
    continue  # anchor without a target URL
  # URLs starting with `/wiki/` are internal to Wikipedia; those containing
  # `File:` point to files (e.g., images) and are left out
  is_internal = href.startswith('/wiki/')
  if is_internal and 'File:' not in href:
    print(href)

In [None]:
######
# TODO: Create a dictionary of /wiki/ links, and count how many times they appear in the page, which are the top-5 most frequent links?
######

wiki_links = []
wiki_links_count = Counter()

# Keep the internal links (`/wiki/` prefix), leaving out anchors without a
# target and links to files (e.g., images)
for a in all_a_items:
  href = a.get('href')
  if href is None or not href.startswith('/wiki/') or 'File:' in href:
    continue
  wiki_links.append(href)

# Count the occurrences of every link and show the five most frequent ones
wiki_links_count.update(wiki_links)
display(wiki_links_count.most_common(5))

In [None]:
######
# TODO: Pick the most frequent /wiki/ link from the above dictionary,
# download its page content and extract all links,
# do you find links in common ?
######


# Most frequent wiki link from the counter.
# most_common(1) returns a list holding a single (link, count) tuple:
# the first [0] selects that tuple, the second [0] selects the link inside it
most_frequent = wiki_links_count.most_common(1)[0][0]
print(most_frequent)

# Download its page content.
# `most_frequent` already starts with '/', so the host is written without a
# trailing slash to avoid a double slash in the URL
page2 = requests.get('https://en.wikipedia.org' + most_frequent)


# Extract all links
soup2 = BeautifulSoup(page2.text, 'html.parser')
all_a_items2 = soup2.find(class_='mw-body').find_all('a')

wiki_links2 = []

for a in all_a_items2:
  href = a.get('href')
  # keep only URLs that start with `/wiki/` because they are internal to Wikipedia,
  # excluding those that point to Files -- e.g., images
  if href is not None and href.startswith('/wiki/') and 'File:' not in href:
    wiki_links2.append(href)

# Display most common links
wiki_links_count2 = Counter(wiki_links2)
display(wiki_links_count2.most_common(5))

In [None]:
# Distinct links found on each of the two pages (the keys of the two counters)
links_page1 = wiki_links_count.keys()
links_page2 = wiki_links_count2.keys()

# `&` computes the set intersection: the links that appear on both pages
common_links = links_page1 & links_page2
display(common_links)

### For those that have not seen '&' as a set intersection operator:

> `a & b` returns a new set containing only the elements that appear in both `a` and `b`.



In [None]:
# Two small sets of words that share some elements
company_names = set((
    'apple', 'amazon', 'adobe', 'shell',
    'orange', 'lotus', 'microsoft',
))
organic_objects = set((
    'apple', 'banana', 'mango', 'rose',
    'shell', 'orange', 'lotus',
))

# `&` keeps only the words that belong to both sets
shared_words = company_names & organic_objects
print(shared_words)

### Extract content from Wikipedia with the Wikipedia APIs

In [None]:
import wikipediaapi
# Client object that connects to Wikipedia and downloads page content
# already parsed into text form, i.e. without HTML tags.
## EDIT the user agent below: put your name and email for the Wikipedia logs
wapi_text = wikipediaapi.Wikipedia(
    user_agent='MyTestProjectName (my.name@univr.it)',
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)

In [None]:
# Request the article through the plain-text client
page_py = wapi_text.page('New York City')
print("Page - Exists: {}".format(page_py.exists()))

# Print the size of each part of the page, in this order:
# summary, full text, language links, links
for attr_name in ('summary', 'text', 'langlinks', 'links'):
  print(len(getattr(page_py, attr_name)))

In [None]:
separator = "   ---   "

# First 140 characters of the summary
print(page_py.summary[:140])
print(separator)

# Last 140 characters of the full text
print(page_py.text[-140:])
print(separator)

# Keys of the language links, in sorted order
print(sorted(page_py.langlinks))
print(separator)

# The 'it' entry links to the same page in the Italian Wikipedia
page_py_it = page_py.langlinks['it']
print(page_py_it.summary[:140])

In [None]:
# All links of the original (English) page, keyed by the title of the linked page
links = page_py.links

# Walk the titles in sorted order and print only the short ones
# (at most 4 characters) to keep the output small
for title in sorted(links):
  if len(title) <= 4:
    print("{}".format(title))

In [None]:
# Titles of the Wikipedia pages used as starting points for the crawl below
test_pages = [
    'Addis Ababa',
    'Tom Sawyer',
    'Johannes Gutenberg',
]

In [None]:
from urllib.parse import quote
from collections import deque

# Example of simple crawling code

## make a queue of pages initialized to some pages of interest
page_queue = deque(wapi_text.page(tp) for tp in test_pages)

# prepare auxiliary data structures
page_stored = {}      # page URL -> full text of the page
page_visited = set()  # URLs of the pages already downloaded

# we put a limit to stop after a bit to avoid downloading too many pages
max_iterations = 10


# Here starts the "crawling", which continues while there are pages to extract
#   and the maximum number of iterations has not been reached
while len(page_queue) > 0 and max_iterations > 0:

  # get the first page in the queue
  _page = page_queue.popleft()

  # The same page can be queued more than once (when several pages link to it
  # before it is visited): skip it without consuming an iteration
  if _page.fullurl in page_visited:
    continue

  # save its full text
  page_stored[_page.fullurl] = _page.text

  # we add it to the set of pages visited (to avoid visiting it again)
  page_visited.add(_page.fullurl)

  # just print something to show progress
  print(max_iterations, _page.title, _page.fullurl)

  # Update the number of iterations
  max_iterations = max_iterations - 1

  # Find all links, add *some* of those pages to the queue for crawling
  for next_page in _page.links.values():
    try: # We use a "try" to skip in case of errors
      if not next_page.exists():
        continue

      _next_page_url = next_page.fullurl
      _next_page_url_fragment = _next_page_url.split('/')[-1]
      # just a random filter to avoid downloading too many pages: keep only the
      # pages whose last URL segment is between 6 and 13 characters long.
      # The two tests are joined with `or`: a length can never be both
      # smaller than 6 and greater than 13, so `and` would never skip anything
      if len(_next_page_url_fragment) < 6 or len(_next_page_url_fragment) > 13:
        continue # skip this page

      # of course, if a page has been already visited, we ignore that link
      if _next_page_url in page_visited:
        continue # skip this page

      # otherwise we add it to the pages to visit next
      page_queue.append(next_page)
    except Exception as ex:
      # report the problem and move on to the next link
      print("\tError retrieving", next_page.title, "-", ex)


print(len(page_stored))
print(page_stored.keys())

In [None]:
######
# TODO: Create the bag of words for all page texts, remember to transform the text in lowercase and remove punctuation
######

# Same regular expressions defined earlier in the notebook
punct_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))  # any punctuation symbol
space_regex = re.compile(' +')  # one or more consecutive spaces


# `page_stored` maps each page URL to the full text of that page;
# `page_bow` maps each page URL to the Counter of the words in its text
page_bow = {}
for url, text in page_stored.items():
  lowered = text.lower()
  no_punct = punct_regex.sub(' ', lowered)
  single_spaced = space_regex.sub(' ', no_punct)
  tokens = single_spaced.strip().split()
  page_bow[url] = Counter(tokens)


In [None]:
# For every stored page, print its URL followed by its five most frequent words
for url in page_bow:
  bow = page_bow[url]
  print(url, bow.most_common(5))

#### The following declaration extracts unparsed HTML instead of already parsed text

In [None]:
# Client configured with the HTML extract format instead of the text one
wapi_html = wikipediaapi.Wikipedia('MyProjectName (name@studenti.univr.it)',
                              'en',
                              extract_format=wikipediaapi.ExtractFormat.HTML)

# Request the page through the HTML client (`wapi_html`), so that the HTML
# extract format declared above is actually the one in use.
# The result gets its own name, so `page_py` keeps referring to the text page.
page_py_html = wapi_html.page('New York City')
print("Page - Exists: {}".format(page_py_html.exists()))
print(len(page_py_html.summary))


### Stemming and lemmatization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer


## Download resources needed by methods
# - punkt, punkt_tab: tokenizer data for `word_tokenize`
#   (recent NLTK releases look for "punkt_tab" instead of "punkt",
#   so both are downloaded to work with old and new versions)
# - wordnet, omw-1.4: WordNet data, needed for lemmatization
for resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4"):
  nltk.download(resource)

# More info:
# -- https://www.nltk.org/howto/stem.html
# -- https://www.nltk.org/howto/wordnet.html

In [None]:
# Initialize the two stemmers to compare: Porter and Snowball (English)
ps = PorterStemmer()
sn = SnowballStemmer("english")

example_sentence = """Programming is an art and a job.
Python programmers often tend to like programming in python
because it's like english.
This is a better language than many others and an incredibly
useful property that makes things easier.
We called people who program in python pythonistas."""

# Convert to lowercase, then remove punctuation: the translation table
# deletes every character listed in `string.punctuation`
example_sentence_no_punct = example_sentence.lower().translate(
    str.maketrans("", "", string.punctuation)
    )

# Create tokens
word_tokens = word_tokenize(example_sentence_no_punct)

# Perform stemming: one header per printed column
# (the word, its Porter stem, its Snowball stem)
print("{0:20}{1:20}{2:20}".format("--Word--", "--Porter--", "--Snowball--"))
for word in word_tokens:
    print("{0:20}{1:20}{2:20}".format(word, ps.stem(word), sn.stem(word)))


In [None]:
# Initialize wordnet lemmatizer
wnl = WordNetLemmatizer()

# Parts of speech tried for every word, each with the label printed for it
# (wordnet.NOUN is another available tag)
pos_tags = (
    ("ADJ", wordnet.ADJ),
    ("VERB", wordnet.VERB),
    ("ADV", wordnet.ADV),
)

# Perform lemmatization: the header names the three printed columns
print("{0:6}{1:20}{2:20}".format("POS", "--Word--", "--Lemma--"))
for word in word_tokens:
  for pos_label, pos_tag in pos_tags:
    # lemmatize the word as if it had this part of speech
    lemmatized = wnl.lemmatize(word, pos=pos_tag)
    # print only the words that the lemmatizer actually changed
    if word != lemmatized:
      print("{0:6}{1:20}{2:20}".format(pos_label, word, lemmatized))


In [None]:
######
# TODO: Text stemming and lemmatization with a wikipedia page summary
######

