In [1]:
import os
import sys

import re
import requests
from bs4 import BeautifulSoup

import sqlite3
import easyocr
import torch
import spacy

import scraping_v2_utils

sys.path.append(os.getcwd() + "/../../db")
sys.path.append(os.getcwd() + "/../../labos/nlp")

import sql_utils
import preprocessing
import model_utils

In [2]:
# Refs of catalog that dates already scrapped
catalog_dates_already_scraped = []  # refs
with open("./catalog_dates.txt") as catalog_dates_file:
    [
        catalog_dates_already_scraped.append(line.split("|")[0])
        for line in catalog_dates_file.read().split("\n")
    ]
print(catalog_dates_already_scraped)

# Refs of catalog with images already scrapped
catalog_images_already_scrapped = os.listdir("../../prod_dataset")
print(catalog_images_already_scrapped)

['']
[]


In [3]:
# Get the page

url = "https://lapub.re/"
response = requests.get(url=url, timeout=(5,15))

In [4]:
# Get the refs

soup = BeautifulSoup(response.text, "html.parser")

regex_pattern = re.compile(r"^https:\/\/lapub\.re\/imgpromo\/\d+.*maxi\.jpg$")
matching_imgs = soup.find_all("img", src=regex_pattern)

In [5]:
# Database

database_file_name = "db_prod_final.sqlite"

con = sqlite3.connect(os.getcwd() + f"/../../db/{database_file_name}")
cur = con.cursor()

sql_insert_catalog = sql_utils.get_sql_statement("insert_catalog_db_prod.sql")
sql_insert_page = sql_utils.get_sql_statement("insert_page_db_prod.sql") # To modify
sql_update_text = sql_utils.get_sql_statement("update_text_db_prod.sql")
sql_select_inference = sql_utils.get_sql_statement("select_pages_inference_v1.sql")
sql_update_category = sql_utils.get_sql_statement("update_category_db_prod.sql") # To modify

In [6]:
# TODO: Clean and refactor
# TODO: Fix conditions => put on top if already "traité"
# Toute la pipeline se fait d'un coup !
for img in matching_imgs[:1]:
    catalog_ref = img["src"][26:-9]

    # * Check if dates not already scraped
    if catalog_ref not in catalog_dates_already_scraped:
        # Get dates
        (start_date, end_date) = scraping_v2_utils.get_dates(img.parent.get("href"))

        # Save in `catalog_dates.txt`
        scraping_v2_utils.save_dates(catalog_ref, start_date, end_date)

        # Save in db
        cur.execute(sql_insert_catalog, (catalog_ref, start_date, end_date))
    
    con.commit()
    

    # * Check if images not already downloaded
    if catalog_ref not in catalog_images_already_scrapped:
        # Get full links
        full_links = scraping_v2_utils.get_full_links(catalog_ref)
        print("full_links", full_links)
        # Download images
        scraping_v2_utils.download_all_images(full_links, catalog_ref)
        print("save in db")
        # Save in db
        for image_name in os.listdir(
            os.getcwd() + f"/../../prod_dataset/{catalog_ref}"
        ):
            cur.execute(
                sql_insert_page,
                (
                    f"./prod_dataset/{catalog_ref}/{image_name}",
                    catalog_ref,
                ),
            )
    con.commit()

    # * Use OCR model to save raw text content in db
    # TODO: If champs text pas déjà renseigné
    reader = easyocr.Reader(lang_list=["en"])
    for image_path in os.listdir(os.getcwd() + f"/../../prod_dataset/{catalog_ref}"):
        raw_text = reader.readtext(image_path)

        cur.execute(sql_update_text, (raw_text, f"./prod_dataset/{catalog_ref}/{image_name}",))
    
    con.commit()

    # * Use NLP model to save insert page categories in db
    response = cur.execute(sql_select_inference).fetchall()
    """
    Format: list[(id: int, text: str)]
    """
    vocabulary = preprocessing.get_vocabulary(os.getcwd() + "/../../labos/nlp/vocab_v1.txt")
    labels = [i for i in range(16)]
    model = model_utils.PageClassifier(len(vocabulary), len(labels))
    model.load_state_dict(torch.load(os.getcwd() + "/../../labos/nlp/model_weight_v1.pth"))

    nlp = spacy.load("fr_core_news_lg")

    input = preprocessing.pipeline_from_raw_text_to_vectors(raw_text, nlp, vocabulary)
    # TODO: Rename output 
    category = model_utils.predict(model, input)

    cur.execute(sql_update_category, (category, response[0]))

    con.commit()
print("saved in db")


2024-02-23 09:23:37.242992 => https://lapub.re/prospectus/24769797950543861-1/HTML/files/assets/common/page-html5-substrates/
catalog_dict {'24769797950543861-1': ['page0001_1.jpg', 'page0001_2.jpg', 'page0001_3.jpg', 'page0001_4.jpg', 'page0002_1.jpg', 'page0002_2.jpg', 'page0002_3.jpg', 'page0002_4.jpg', 'page0003_1.jpg', 'page0003_2.jpg', 'page0003_3.jpg', 'page0003_4.jpg', 'page0004_1.jpg', 'page0004_2.jpg', 'page0004_3.jpg', 'page0004_4.jpg', 'page0005_1.jpg', 'page0005_2.jpg', 'page0005_3.jpg', 'page0005_4.jpg', 'page0006_1.jpg', 'page0006_2.jpg', 'page0006_3.jpg', 'page0006_4.jpg', 'page0007_1.jpg', 'page0007_2.jpg', 'page0007_3.jpg', 'page0007_4.jpg', 'page0008_1.jpg', 'page0008_2.jpg', 'page0008_3.jpg', 'page0008_4.jpg', 'page0009_1.jpg', 'page0009_2.jpg', 'page0009_3.jpg', 'page0009_4.jpg', 'page0010_1.jpg', 'page0010_2.jpg', 'page0010_3.jpg', 'page0010_4.jpg', 'page0011_1.jpg', 'page0011_2.jpg', 'page0011_3.jpg', 'page0011_4.jpg', 'page0012_1.jpg', 'page0012_2.jpg', 'page001

In [None]:
# TODO: Use AI model to add label into db_prod in another file ?!