# PART 1

### Importing required libraries for part 1

In [None]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from pymongo import MongoClient

### Creating a connection to MongoDB

In [None]:
# Open a client against the local MongoDB server (default port 27017).
client = MongoClient(host='localhost', port=27017)

# Scraped articles are stored in the 'elespectador' collection of the
# 'news' database.
db = client['news']
collection = db['elespectador']

### Base URL of the site to be analyzed

In [None]:
# Root of the site being scraped; make_request() prepends it to every
# relative path it is asked to load.
SITE_URL = 'https://www.elespectador.com'

### Creating a new chrome window

In [None]:
# Download (or reuse) a matching chromedriver binary and wrap it in a Service.
driver_path = ChromeDriverManager().install()
serv = Service(driver_path)

# Launch Chrome through that service and open the site's home page.
browser = webdriver.Chrome(service=serv)
browser.get(SITE_URL)

### Function to make the request and render the page in the browser

In [None]:
def make_request(browser, relative_path, clic=False, login=False, scroll=False):
    """Load ``SITE_URL + relative_path`` in ``browser`` and return the result.

    Args:
        browser: Selenium WebDriver used to load and interact with the page.
        relative_path: Path (and optional query string) appended to
            ``SITE_URL``.
        clic: When true, click the header burger menu and wait 2 seconds so
            the menu markup is present in the page source.
        login: When true, fill in and submit the login form and return
            immediately; ``clic`` and ``scroll`` are ignored in that case.
        scroll: When true, keep scrolling to the bottom of the page until the
            document height stops growing, so lazily loaded content is
            rendered.

    Returns:
        The string ``'logged'`` when ``login`` is true; otherwise a
        ``BeautifulSoup`` object built from the rendered page source.

    Raises:
        IndexError: If ``login`` is true and the page has fewer than two
            iframes, so the login form cannot be located.
    """
    import os

    browser.get(SITE_URL + relative_path)

    if login:
        iframes = browser.find_elements(By.TAG_NAME, 'iframe')
        # The login form lives in the second iframe of the page (index 1), so
        # the driver must switch into it before the form fields are reachable.
        if len(iframes) < 2:
            raise IndexError(
                'login iframe not found: expected at least 2 iframes, '
                'found %d' % len(iframes)
            )
        browser.switch_to.frame(iframes[1])

        # Credentials can be supplied through the environment.
        # NOTE(review): the literal fallbacks keep the notebook working as
        # before, but credentials should not live in source control; set the
        # environment variables and remove the literals.
        email = os.environ.get('ELESPECTADOR_EMAIL', 'oromeror@unbosque.edu.co')
        secret = os.environ.get('ELESPECTADOR_PASSWORD', 'YeABPLxJVYct3FR')

        username = browser.find_element(By.ID, 'email')
        username.send_keys(email)

        password = browser.find_element(By.ID, 'password')
        password.send_keys(secret)

        browser.find_element(By.CLASS_NAME, 'btn-login').click()

        return 'logged'

    if clic:
        # Open the burger menu so its items are rendered into the page source.
        browser.find_element(By.CLASS_NAME, 'Header-BurgerMenu').click()
        time.sleep(2)

    if scroll:
        # Simulate vertical scrolling to trigger lazy loading: stop once a
        # scroll to the bottom no longer increases the document height.
        check_height = browser.execute_script('return document.body.scrollHeight;')
        while True:
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(2)
            height = browser.execute_script('return document.body.scrollHeight;')
            if height == check_height:
                break
            check_height = height

    # Hand the rendered HTML to BeautifulSoup for scraping.
    return BeautifulSoup(browser.page_source, 'html.parser')

### Logging in to the page to avoid being blocked by El Espectador during the web-scraping process

In [None]:
# Submit the login form; as the last expression of the cell, the returned
# value ('logged') is shown as the cell output.
make_request(
    browser,
    '/login/?utm_source=interno&utm_medium=boton&utm_campaign=login&utm_content=boton_login_header',
    clic=False,
    login=True,
)

### Getting HTML content for news categories container

In [None]:
# Load the home page, open the burger menu and scroll to the bottom so the
# whole page (including the menu) is present in the parsed HTML.
soup = make_request(browser, '/', clic=True, login=False, scroll=True)

### Finding the section where menu items are contained 

In [None]:
# First element carrying the 'Header-BurgerMenuField' class; the category
# entries are looked up inside it in the next step.
menu = soup.find(class_ = 'Header-BurgerMenuField')

### Getting items from the menu categories

In [None]:
# Every element with the 'Menu-ItemContent' class inside the menu container;
# each one is treated as a candidate category entry.
items = menu.find_all(class_ ='Menu-ItemContent')
# Quick sanity check of how many entries were found.
print(len(items))

### Saving the title and relative path of each category in a list

In [None]:
# Build the list of categories as {'title', 'relative_path'} dicts.
# Menu entries without a usable link are reported and skipped instead of
# being hidden behind a bare ``except``.
categories = []
for categorie in items:
    title_tag = categorie.find(class_='Menu-ItemTitle')
    link_tag = categorie.find(class_='Menu-ItemLink')

    # Either lookup may come back empty; resolve both to plain values first
    # so nothing below dereferences a missing element.
    title = title_tag.get_text() if title_tag is not None else None
    href = link_tag.get('href') if link_tag is not None else None

    if title is None or href is None:
        # Entries with no link (or no title at all) cannot be navigated to.
        print('Los siguientes elementos no tienen href: ', title)
        continue

    categories.append({
        'title': title,
        'relative_path': href,
    })

### Categories values

In [None]:
# Cell output: show the collected category dicts for inspection.
categories

### Selecting 5 categories

In [None]:
# Positions (within `categories`) of the five categories chosen for scraping,
# in the order they will be processed.
indexes = [2, 7, 4, 5, 6]

# Pick the category dict stored at each chosen position.
five_categories = list(map(categories.__getitem__, indexes))

# Cell output: show the selection.
five_categories

In [None]:
# Accumulator for the scraped articles: one dict per article, created with
# 'title' and 'relative_path' and later completed with 'category',
# 'datetime' and 'full_text' by the scraping loop below.
news = []

### Getting HTML content for news listing pages in each category

In [None]:
# For every selected category: read its archive listing, collect the article
# cards, then visit each article of THAT category to extract its metadata and
# full text. Articles are added to `news` only once they are complete.
for fc in five_categories:
    # Getting HTML content for the archive listing page of this category
    soup = make_request(browser, '/archivo' + fc['relative_path'])
    # Short pause between the listing request and the article requests.
    time.sleep(3)

    # Finding the section where the news cards are contained
    layout = soup.find(class_='Layout-flexAds')
    section = layout.find('section') if layout is not None else None

    # Direct children of the section; in the observed layout there are
    # 3 blocks: 2 with news cards and 1 with the pagination.
    blocks = section.find_all(recursive=False) if section is not None else []
    print('number of blocks: ', len(blocks))

    # Collect the cards of the first two blocks (fewer if the page has fewer).
    cards = []
    for block in blocks[:2]:
        cards.extend(block.find_all(class_='Card'))
    print('number of cards: ', len(cards))

    # Articles listed for this category only. Keeping them apart from the
    # global `news` list ensures earlier categories are not re-downloaded and
    # re-labelled on every later iteration.
    category_news = []
    for card in cards:
        heading = card.find('h2', class_='Card-Title')
        link = heading.find('a') if heading is not None else None
        if link is None or not link.get('href'):
            # A card without a title link cannot be visited; skip it.
            continue
        category_news.append({
            'title': link.get_text(),
            'relative_path': link['href'],
        })

    for n in category_news:
        # Getting HTML content for the article page
        soup = make_request(browser, n['relative_path'])

        # Extracting news metadata
        n['category'] = fc['title']

        # The date element may be missing altogether or be empty; both cases
        # fall back to the placeholder text.
        date_tag = soup.find(class_='ArticleHeader-Date')
        date_text = date_tag.get_text() if date_tag is not None else ''
        n['datetime'] = date_text or 'Sin Fecha de publicación'

        # Extracting and concatenating the article's full text
        paragraphs = soup.find_all(class_='font--secondary')
        n['full_text'] = ' '.join([p.get_text() for p in paragraphs])

        news.append(n)

In [None]:
# Cell output: show the scraped articles for inspection.
news

### News obtained

In [None]:
# Cell output: show the final list of article dicts that will be stored.
news

### Storing extracted information for further analysis

In [None]:
# Persist the scraped articles. insert_many() rejects an empty document list
# with a TypeError, so only call it when something was actually scraped.
if news:
    insert_result = collection.insert_many(news)
    print('documents inserted: ', len(insert_result.inserted_ids))
else:
    print('documents inserted: ', 0)