In [233]:
import config
from toolbox import DatabaseInterface

# python
import pandas as pd
import time
from datetime import date
from datetime import timedelta

# sql
import psycopg2

# selenium 4
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromiumService
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType

# driver = webdriver.Chrome(service=ChromiumService(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()))

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# python
import pandas as pd
import time

In [229]:
from os import environ

# Name of the database used by the user-level connection string below.
database_name = "main"

# Azure PostgreSQL connection strings (space-separated key=value pairs).
# The password is read from the POSTGRES_PASSWORD environment variable, so this
# cell raises KeyError if that variable is not set.
# `azure_conn_admin` connects to the `postgres` database, `azure_conn_user`
# connects to `database_name`.
azure_conn_admin = f"host=vinci-db.postgres.database.azure.com port=5432 dbname=postgres user=postgres password={environ['POSTGRES_PASSWORD']} sslmode=require"
azure_conn_user = f"host=vinci-db.postgres.database.azure.com port=5432 dbname={database_name} user=postgres password={environ['POSTGRES_PASSWORD']} sslmode=require"

# Number of pages to scrape (0 means "all pages").
# NOTE(review): not read by any code visible in this part of the notebook.
nb_page = 0

# Number of contents exported per batch.
# NOTE(review): not read by any code visible in this part of the notebook.
batch_size = 10

# Whether the selenium driver should run in headless mode.
# NOTE(review): not read by any code visible in this part of the notebook.
headless = True

# Choose which tables to update.
# NOTE(review): presumably consumed by later cells; confirm.
article = True
content = True

# Export targets: CSV file and/or database.
# NOTE(review): presumably consumed by later cells; confirm.
csv = False
database = True

# Scrape only the articles published since the previous run.
# NOTE(review): not read by any code visible in this part of the notebook.
only_new_articles = True

In [256]:
def scrap_articles_liberation(max_range: int = 500, time_sleep: float = 5, end_date: str = "31/03/2023"):
    """Scrape Libération search results for "ukraine" and export them to the database.

    Opens the Libération search page in Chrome, accepts the cookie banner,
    closes the bottom bar, filters by publication date, sorts by most recent
    and then walks through the result pages collecting one row per article.

    Parameters
    ----------
    max_range : int
        Maximum number of result pages to visit.
    time_sleep : float
        Seconds to wait before parsing each result page, to let results load.
    end_date : str
        Upper bound of the publication date filter, formatted ``DD/MM/YYYY``.
        The lower bound is the default of ``select_dates`` (``01/02/2021``).

    Returns
    -------
    pandas.DataFrame
        Columns ``journal``, ``title``, ``article_date`` and ``link``. When at
        most 20 rows have an empty field, those rows are dropped and
        ``article_date`` is converted to ``pandas.Timestamp`` (unparseable
        dates become an ``"error date : ..."`` string). An empty frame is
        returned, without exporting, when nothing was scraped.

    Raises
    ------
    selenium.common.exceptions.WebDriverException
        If the cookie banner, bottom bar, date filter or sort control cannot
        be found. The browser is closed before the exception propagates.
    """
    # Function-scope imports: these names are not part of the notebook's import cell.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.support.ui import Select

    search_url = 'https://www.liberation.fr/recherche/?query=ukraine'
    next_button_xpath = '/html/body/div[2]/section/div/div[2]/div/div[2]/div/div/div[2]/a'
    columns = ['journal', 'title', 'article_date', 'link']
    text_columns = ['title', 'article_date', 'link']
    max_missing_rows = 20   # above this, the scrape is considered suspicious
    max_link_length = 255   # links longer than this are truncated before export

    # French month names as displayed on the site -> month number.
    month_dict = {
        "janvier": "01",
        "février": "02",
        "mars": "03",
        "avril": "04",
        "mai": "05",
        "juin": "06",
        "juillet": "07",
        "août": "08",
        "septembre": "09",
        "octobre": "10",
        "novembre": "11",
        "décembre": "12",
    }

    def close_popup() -> bool:
        """Try to close the newsletter popup; return True when it was closed."""
        print('Closing popup')
        driver.switch_to.default_content()
        time.sleep(10)
        try:
            # The frame id carries a variable suffix, so match on its prefix.
            # (An XPath `@id="...-*"` comparison is literal, not a wildcard.)
            popup_frame = driver.find_element(
                By.XPATH, '//*[starts-with(@id, "mailmunch-popover-frame-")]')
            driver.switch_to.frame(popup_frame)
            driver.find_element(
                By.CSS_SELECTOR,
                'html body.contacts.new div.step-container.live a#close-icon').click()
            print("Popup closed")
            return True
        except WebDriverException:
            print("No popup")
            return False
        finally:
            # Always return to the main document, even if the click failed
            # after switching into the popup frame.
            driver.switch_to.default_content()

    def select_dates(start="01/02/2021", end="31/03/2023"):
        """Fill the publication date filter (DD/MM/YYYY) and submit it."""
        time.sleep(2)
        # CSS selectors for the date text boxes and the submit button
        start_box = "#datepicker_from"
        end_box = "#datepicker_to"
        submit_button = "#pubDate_filter > div:nth-child(7) > button:nth-child(3)"

        driver.find_element(By.CSS_SELECTOR, start_box).send_keys(start)
        driver.find_element(By.CSS_SELECTOR, end_box).send_keys(end)
        driver.find_element(By.CSS_SELECTOR, submit_button).click()

        print(f"Dates submitted ({start} to {end})")
        time.sleep(2)

    def select_sort(type_sort="Récent"):
        """Choose the sort order by visible text: "Récent" or "Pertinent"."""
        Select(driver.find_element(By.CSS_SELECTOR, '#sortby')).select_by_visible_text(type_sort)
        print(f"Sort articles with : {type_sort}")
        time.sleep(2)

    def transform_dates(date):
        """Convert a "31 mars 2023"-style string to a Timestamp.

        Timestamps are returned unchanged. Anything that cannot be parsed is
        returned as an "error date : ..." string so the row is kept.
        """
        if isinstance(date, pd.Timestamp):
            return date
        try:
            day, month_name, year = date.split()[:3]
            # French writes the first day of the month as "1er".
            if day.endswith("er"):
                day = day[:-2]
            return pd.Timestamp(
                year=int(year),
                month=int(month_dict[month_name.lower()]),
                day=int(day),
            )
        except (AttributeError, KeyError, TypeError, ValueError):
            return f"error date : {date}"

    def parse_one_page(time_sleep=5):
        """Return one dict per article found on the current result page."""
        print('Parsing one page')
        time.sleep(time_sleep)
        rows = []

        for article in driver.find_elements(By.CSS_SELECTOR, "div.queryly_item_row"):
            try:
                title = article.find_element(By.CLASS_NAME, 'queryly_item_title').text
                description = article.find_element(By.CSS_SELECTOR, 'div.queryly_item_description')
                # The first inner div reads "<date> / <...>"; keep the date part.
                article_date = description.find_element(By.CSS_SELECTOR, 'div').text.split(" / ")[0]
                # get_attribute returns None when href is absent; normalise to ''
                # so the row is caught by the missing-data check.
                link = article.find_element(By.CSS_SELECTOR, 'a').get_attribute('href') or ''
            except WebDriverException:
                print('FAILED PARSING PAGE')
                continue
            rows.append({
                'journal': 'liberation',
                'title': title,
                'article_date': article_date,
                'link': link,
            })

        print('Page end')
        return rows

    def click_next() -> bool:
        """Click the "next page" link; return False when it cannot be clicked."""
        try:
            driver.find_element(By.XPATH, next_button_xpath).click()
            return True
        except WebDriverException:
            return False

    print("Start scraping Libération")
    start_time = time.time()

    parsed = []
    driver = webdriver.Chrome()
    try:
        driver.get(search_url)

        # The cookie banner lives in its own iframe.
        time.sleep(5)
        driver.switch_to.frame(
            driver.find_element(By.XPATH, '//*[@id="sp_message_iframe_726760"]'))
        driver.find_element(By.XPATH, '//*[@id="notice"]/div[3]/div/button[1]').click()
        driver.switch_to.default_content()
        print('Cookies accepted')

        # Close the bottom bar (also inside an iframe). The window size is set
        # once here; it does not need to be re-applied on every page.
        driver.set_window_size(1920, 1080)
        time.sleep(5)
        driver.switch_to.frame(driver.find_element(By.XPATH, '/html/body/div[1]/iframe'))
        driver.find_element(By.XPATH, '//*[@id="close-icon"]').click()
        driver.switch_to.default_content()
        print('Bottom bar closed')

        select_dates(end=end_date)
        select_sort(type_sort="Récent")

        popup_closed = False
        for page_number in range(1, max_range + 1):
            print(f"Scraping page {page_number}")
            parsed.extend(parse_one_page(time_sleep))

            if click_next():
                continue

            # The first failed click may be caused by the newsletter popup
            # covering the button: close it and retry immediately, so the
            # current page is not parsed a second time.
            if not popup_closed:
                close_popup()
                popup_closed = True
                if click_next():
                    continue

            print("End of scraping (no more page)")
            break
        else:
            print("End of scraping (end of loop)")
    finally:
        # Release the browser even when a step above raised.
        driver.quit()

    # Explicit columns keep the frame well-formed when nothing was scraped.
    df = pd.DataFrame(parsed, columns=columns)

    print(f"Time of scraping : {time.time() - start_time}")
    print(f"Number of articles scraped : {df.shape[0]}")

    if df.empty:
        print("No article scraped, nothing to export.")
        return df

    # check for missing data
    print("Check for missing data...")
    incomplete = (df[text_columns] == '').any(axis=1)
    missing_data = int(incomplete.sum())
    if missing_data > max_missing_rows:
        # NOTE(review): in this case the rows are kept as-is, dates are not
        # converted, and the frame is still exported below; confirm this is intended.
        print(f"{missing_data} rows with missing data, check for scraping errors.")
    else:
        # .copy() so the assignment below does not write into a view of the original frame
        df = df[~incomplete].copy()
        print(f"{df.shape[0]} rows of data scraped")

        # convert dates to Pandas Timestamp
        df['article_date'] = df['article_date'].apply(transform_dates)

    # truncate links that are too long
    too_long = df['link'].str.len() > max_link_length
    print(f"{int(too_long.sum())} links are too long, these will be cuted")
    df['link'] = df['link'].str[:max_link_length]

    # export to postgresql database
    databaseInterface = DatabaseInterface()
    databaseInterface.export_to_database(df=df, table="articles")

    return df

# Working

In [128]:
# Run the Libération scraper over at most 270 result pages, waiting 2 seconds
# before parsing each page, with the date filter ending on 31/03/2023.
# The returned DataFrame is displayed as the cell output.
result = scrap_articles_liberation(
    end_date="31/03/2023",
    time_sleep=2,
    max_range=270,
)
result

Start scraping Libération
Cookies accepted
Bottom bar closed
Dates submitted (01/02/2021 to 31/03/2023)
Sort articles with : Récent
Scraping page 1
Parsing one page
Page end
Scraping page 2
Parsing one page
Page end
Scraping page 3
Parsing one page
Page end
Closing popup
Step 1
No popup
Scraping page 4
Parsing one page
Page end
Scraping page 5
Parsing one page
Page end
Scraping page 6
Parsing one page
Page end
Scraping page 7
Parsing one page
Page end
Scraping page 8
Parsing one page
Page end
Scraping page 9
Parsing one page
Page end
Scraping page 10
Parsing one page
Page end
Scraping page 11
Parsing one page
Page end
Scraping page 12
Parsing one page
Page end
Scraping page 13
Parsing one page
Page end
Scraping page 14
Parsing one page
Page end
Scraping page 15
Parsing one page
Page end
Scraping page 16
Parsing one page
Page end
Scraping page 17
Parsing one page
Page end
Scraping page 18
Parsing one page
Page end
Scraping page 19
Parsing one page
Page end
Scraping page 20
Parsing one p

Scraping page 186
Parsing one page
Page end
Scraping page 187
Parsing one page
Page end
Scraping page 188
Parsing one page
Page end
Scraping page 189
Parsing one page
Page end
Scraping page 190
Parsing one page
Page end
Scraping page 191
Parsing one page
Page end
Scraping page 192
Parsing one page
Page end
Scraping page 193
Parsing one page
Page end
Scraping page 194
Parsing one page
Page end
Scraping page 195
Parsing one page
Page end
Scraping page 196
Parsing one page
Page end
Scraping page 197
Parsing one page
Page end
Scraping page 198
Parsing one page
Page end
Scraping page 199
Parsing one page
Page end
Scraping page 200
Parsing one page
Page end
Scraping page 201
Parsing one page
Page end
Scraping page 202
Parsing one page
Page end
Scraping page 203
Parsing one page
Page end
Scraping page 204
Parsing one page
Page end
Scraping page 205
Parsing one page
Page end
Scraping page 206
Parsing one page
Page end
Scraping page 207
Parsing one page
Page end
Scraping page 208
Parsing one pa

Unnamed: 0,journal,title,article_date,link
0,liberation,L’inflation française ralentit en mars mais le...,31 mars 2023,https://www.liberation.fr/economie/linflation-...
1,liberation,Donald Trump premier président américain incul...,31 mars 2023,https://www.liberation.fr/international/europe...
2,liberation,"En Allemagne, une première visite d’Etat pour ...",30 mars 2023,https://www.liberation.fr/international/europe...
3,liberation,"Avec la pénurie de Perrier dans les rayons, to...",30 mars 2023,https://www.liberation.fr/lifestyle/gastronomi...
4,liberation,Résolution «historique» de l’ONU sur le climat...,29 mars 2023,https://www.liberation.fr/international/resolu...
...,...,...,...,...
5392,liberation,"Entre la Chine et l’UE, l’art de la guerre dip...",23 mars 2021,https://www.liberation.fr/international/europe...
5393,liberation,"Léo Dubois, filer droit",23 mars 2021,https://www.liberation.fr/sports/football/leo-...
5394,liberation,Foot : devant le refus des clubs de libérer ce...,20 mars 2021,https://www.liberation.fr/sports/football/foot...
5395,liberation,Près de 80 pays dans le monde ont maintenu l’u...,18 mars 2021,https://www.liberation.fr/checknews/pres-de-80...
