# Web Scraper

Running this notebook scrapes the saved web pages inside the <b>RawWebPages</b> folder. Each HTML file found there is scraped, and its cleaned text is saved to a corresponding TXT file inside the <b>ProcessedData</b> folder.

---

In [9]:
from bs4 import BeautifulSoup
import requests
import csv
import urllib
import os
import re
import string

In [23]:
# Folder layout, resolved against the directory the notebook is run from

file_path = os.getcwd()

raw_html = 'RawWebPages'          # input: saved HTML pages
processed_html = 'ProcessedData'  # output: scraped TXT files

html_path, scraped_path = (
    os.path.join(file_path, folder) for folder in (raw_html, processed_html)
)

In [24]:
#Get unprocessed HTML files

# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the processing order (and the printed output) reproducible between runs.
# Only files ending in '.html' are kept.
web_pages = sorted(                 #Saves the html name of each file
    name for name in os.listdir(html_path) if name.endswith('.html')
)

# Built from web_pages so both lists stay index-aligned
web_html_path = [                   #Saves the full path of all the html files
    os.path.join(html_path, name) for name in web_pages
]

Each passage/paragraph is pre-processed before being saved!

In [55]:
# Demo: encoding to ASCII with errors="ignore" drops every non-ASCII character
s = '\u1744 ksad'.encode("ascii", "ignore").decode()
s

' ksad'

In [58]:
def pre_process_paragraph(paragraph):
    """Clean the HTML source of one paragraph into plain ASCII text.

    Steps, in order:
      1. Drop every non-ASCII character.
      2. Remove e-mail addresses (matched case-insensitively).
      3. Remove every span that opens with '<' or '[' and closes with the
         nearest '>' or ']' on the same line. This strips HTML tags (their
         inner text is kept) and bracketed references such as '[1]'.
      4. Delete line breaks ('\\n' and '\\r').

    Parameters
    ----------
    paragraph : str
        Raw paragraph markup, e.g. ``str(tag)`` of a ``<p>`` element.

    Returns
    -------
    str
        The cleaned text. Whitespace other than line breaks is left as-is.
    """
    # Raw strings avoid the invalid-escape warnings that '\.' and '\<' raise
    # in ordinary string literals on recent Python versions.
    email = r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}'
    markup = r'[<\[].*?[>\]]'

    #remove unicode
    ascii_text = paragraph.encode("ascii", "ignore").decode()

    # IGNORECASE so addresses containing capitals are removed as well
    processed_text = re.sub(email, '', ascii_text, flags=re.IGNORECASE)    #Removes email

    # One generic pass handles every tag and bracketed reference; separate
    # substitutions for <a>, <b>, <p>, ... would find nothing left to match.
    processed_text = re.sub(markup, '', processed_text)

    # Line breaks are deleted, not replaced by a space, so words separated
    # only by a newline end up joined.
    processed_text = re.sub(r'[\r\n]', '', processed_text)

    return processed_text

In [59]:
def pre_process_title(title):
    """Clean a page title by removing unwanted characters and extra spaces.

    Deletes the listed punctuation and special characters, then collapses
    every run of whitespace into a single space and trims both ends.

    Parameters
    ----------
    title : str
        The raw title text.

    Returns
    -------
    str
        The cleaned title; empty if nothing but removed characters and
        whitespace was present.
    """
    # Raw string: the set contains a literal backslash followed by ']'
    punctuation = r"#$()*+-/:;<=>@[\]^_`{|}?!"
    special_characters = 'å¼«¥ª°©ð±§µæ¹¢³¿®ä£⊙'

    # A single translation table deletes both character sets in one pass
    removal_table = str.maketrans('', '', punctuation + special_characters)
    processed_title = title.translate(removal_table)       #Removes punctuation and special characters
    processed_title = ' '.join(re.split(r'\s+', processed_title.strip(), flags=re.UNICODE)) #Removes duplicated spaces

    return processed_title

In [60]:
# Display the collected full paths of the HTML files
web_html_path

['c:\\Users\\vlad1\\Documents\\Cenas\\uu\\B5\\LM_Proj\\Code\\WebPages\\Stars _ Science Mission Directorate.html',
 'c:\\Users\\vlad1\\Documents\\Cenas\\uu\\B5\\LM_Proj\\Code\\WebPages\\Visible Light _ Science Mission Directorate.html',
 'c:\\Users\\vlad1\\Documents\\Cenas\\uu\\B5\\LM_Proj\\Code\\WebPages\\What Is a Black Hole_ _ NASA.html',
 'c:\\Users\\vlad1\\Documents\\Cenas\\uu\\B5\\LM_Proj\\Code\\WebPages\\What makes stars shine.html']

In [61]:
# Scrape every collected HTML page and write its cleaned paragraphs to a TXT file.

# open(..., 'w') does not create missing folders, so make sure the output folder exists
os.makedirs(scraped_path, exist_ok=True)

for web_page in web_html_path:
    with open(web_page, 'rb') as html_file:
        soup = BeautifulSoup(html_file, 'lxml')

    text = soup.find_all('p')

    # A page may have no <title>, or one without a single text node (.string is
    # then None); fall back to the HTML file name instead of crashing.
    title_tag = soup.find('title')
    raw_title = title_tag.string if title_tag is not None else None
    if not raw_title:
        raw_title = os.path.splitext(os.path.basename(web_page))[0]
    title = pre_process_title(raw_title)

    processed_content = []
    for p in text:
        content = pre_process_paragraph(str(p))
        # Keep only passages containing at least two spaces
        if content.count(' ') > 1 and content != 'None':
            processed_content.append(content)

    #write content to txt
    print(len(processed_content))
    # NOTE(review): the unicode_escape codec writes the '\n ' separator as a
    # literal backslash-n rather than a real line break. Kept unchanged here;
    # confirm what downstream readers expect before switching encodings.
    with open(os.path.join(scraped_path, title)+'.txt', 'w', encoding="unicode_escape") as f:
        for p in processed_content:
            f.write(p + '\n ')

    print(f'{title} - was saved successfully')
    


9
Stars Science Mission Directorate - was saved successfully
16
Visible Light Science Mission Directorate - was saved successfully
20
What Is a Black Hole NASA - was saved successfully
13
What makes stars shine - was saved successfully
