In [1]:
import os
import sys

import re
import requests
from bs4 import BeautifulSoup

import sqlite3
import easyocr
import torch
import spacy

import scraping_v2_utils

sys.path.append(os.getcwd() + "/../../db")
sys.path.append(os.getcwd() + "/../../labos/nlp")

import sql_utils
import preprocessing
import model_utils

In [2]:
# Database

database_file_name = "db_prod_final.sqlite"

con = sqlite3.connect(os.getcwd() + f"/../../db/{database_file_name}")
cur = con.cursor()

sql_select_already_existing_catalog = sql_utils.get_sql_statement("sql_select_already_existing_catalog.sql")
sql_insert_catalog = sql_utils.get_sql_statement("insert_catalog_db_prod.sql")
sql_insert_page = sql_utils.get_sql_statement("insert_page_db_prod.sql")
sql_update_text = sql_utils.get_sql_statement("update_text_db_prod.sql")
sql_select_inference = sql_utils.get_sql_statement("select_pages_inference_final.sql")
sql_select_image_without_text = sql_utils.get_sql_statement("select_path_of_image_without_text.sql")
sql_insert_page_categories = sql_utils.get_sql_statement("sql_insert_page_categories.sql")
sql_update_category = sql_utils.get_sql_statement("update_category_db_prod.sql") # To modify

In [3]:
# Refs of catalog that dates already scrapped
response = cur.execute(sql_select_already_existing_catalog).fetchall()
catalog_dates_already_scraped = [elt[0] for elt in response]  # refs
print(catalog_dates_already_scraped)

# Refs of catalog with images already scrapped
catalog_images_already_scrapped = os.listdir("../../prod_dataset")
print(catalog_images_already_scrapped)

['24769797950543995-2', '24769797950543971-3', '24769797950543969-1', '24769797950543903-1', '24769797950544019-1', '24769797950543931-1', '24769797950544006-1', '24769797950544022-1', '24769797950544000-5', '24769797950543927-2', '24769797950544001-1', '24769797950544025-1', '24769797950544005-1', '24769797950544030-1', '24769797950544023-2', '24769797950544016-1', '24769797950543889-3', '24769797950544026-1', '24769797950543926-2', '24769797950544024-1', '24769797950543987-2', '24769797950544018-3', '24769797950544034-1', '24769797950543912-1', '24769797950543968-3', '24769797950544033-1', '24769797950543999-1', '24769797950544049-1', '24769797950544002-1', '24769797950544035-1', '24769797950544011-1', '24769797950543955-1', '24769797950543911-3', '24769797950543918-1', '24769797950543909-1', '24769797950544012-2', '24769797950544047-1', '24769797950544027-1', '24769797950543928-2', '24769797950544010-1', '24769797950544045-1', '24769797950544021-1', '24769797950544014-1', '247697979

In [4]:
# Get the refs

url = "https://lapub.re/"
response = requests.get(url=url, timeout=(5,15))

soup = BeautifulSoup(response.text, "html.parser")

regex_pattern = re.compile(r"^https:\/\/lapub\.re\/imgpromo\/\d+.*maxi\.jpg$")
matching_imgs = soup.find_all("img", src=regex_pattern)

## Insert catalog and page into db

In [6]:
# TODO: Clean and refactor
# TODO: Fix conditions => put on top if already "traité"
# Toute la pipeline se fait d'un coup !
for img in matching_imgs[:90]:
    catalog_ref = img["src"][26:-9]

    # * Check if dates not already scraped
    if catalog_ref not in catalog_dates_already_scraped:
        print("in")
        # Get dates
        (start_date, end_date) = scraping_v2_utils.get_dates(img.parent.get("href"))

        # Save in `catalog_dates.txt`
        # scraping_v2_utils.save_dates(catalog_ref, start_date, end_date)

        # Save in db
        cur.execute(sql_insert_catalog, (catalog_ref, start_date, end_date))
    
        con.commit()
    

    # * Check if images not already downloaded
    if catalog_ref not in catalog_images_already_scrapped:
        print("in")
        # Get full links
        full_links = scraping_v2_utils.get_full_links(catalog_ref)
        print("full_links", full_links)
        # Download images
        scraping_v2_utils.download_all_images(full_links, catalog_ref)
        print("save in db")
        # Save in db
        for image_name in os.listdir(
            os.getcwd() + f"/../../prod_dataset/{catalog_ref}"
        ):
            # TODO: Verify not already in db ?
            cur.execute(
                sql_insert_page,
                (
                    f"./prod_dataset/{catalog_ref}/{image_name}",
                    catalog_ref,
                ),
            )
        con.commit()

in
in
2024-03-24 22:31:12.167968 => https://lapub.re/prospectus/24769797950543956-2/HTML/files/assets/common/page-html5-substrates/
catalog_dict {'24769797950543956-2': ['page0001_1.jpg', 'page0001_2.jpg', 'page0001_3.jpg', 'page0002_1.jpg', 'page0002_2.jpg', 'page0002_3.jpg', 'page0003_1.jpg', 'page0003_2.jpg', 'page0003_3.jpg', 'page0004_1.jpg', 'page0004_2.jpg', 'page0004_3.jpg']}
catalog_dict keys dict_keys(['24769797950543956-2'])
full_links ['https://lapub.re/prospectus/24769797950543956-2/HTML/files/assets/common/page-html5-substrates/page0001_3.jpg', 'https://lapub.re/prospectus/24769797950543956-2/HTML/files/assets/common/page-html5-substrates/page0002_3.jpg', 'https://lapub.re/prospectus/24769797950543956-2/HTML/files/assets/common/page-html5-substrates/page0003_3.jpg', 'https://lapub.re/prospectus/24769797950543956-2/HTML/files/assets/common/page-html5-substrates/page0004_3.jpg']
download image
image downloaded
download image
image downloaded
download image
image downloaded


## Insert text into db

In [7]:
# * Use OCR model to save raw text content in db
reader = easyocr.Reader(lang_list=["en"])

response = cur.execute(sql_select_image_without_text).fetchall()
page_missing_text_in_db = [elt[0] for elt in response]

for path in page_missing_text_in_db:
    ocr_output = reader.readtext(
        os.getcwd() + f"/../../{path[2:]}"
    )
    raw_text = "".join([elt[1] + " " for elt in ocr_output])

    print("Inserting text to db =>", path)
    print("raw text =>", raw_text)
    cur.execute(
        sql_update_text,
        (
            raw_text,
            path,
        ),
    )

    con.commit()

Inserting text to db => ./prod_dataset/24769797950543956-2/page0001_3.jpg
raw text => OceanOR MERCI POUR CES SOURIRES INSPIRE PAR LES FEMMES 
Inserting text to db => ./prod_dataset/24769797950543956-2/page0003_3.jpg
raw text => NOUVEAUTE 25,90€ COLLIER NOUVEAUTE DORE* 35,90€ 52,90€ COLLIER COLLIER DORE* DORE* COLLECTION TALISMAN De bonhewiv L U N A B | J 0 U X NOUVEAUTE NOUVEAUTE 25,90€ 29,90€ 23,906 BOUCLES D'OREILLES BRACELET DOREES* BRACELET DORE" DORE PLUS FORTES, ENSEMBLE La Journee Internationale des Droits des Femmes est une occasion pour OceanOr NOUVEAUTE de saluer ses collaboratrices & travers le 25,90€ 29,90€ catalogue "Inspire par les Femmes"' une BRACELET BOUCLES D'OREILLES initiative visant a celebrer leur engagement DORE" DOREES MANDALA* DORE A L'OR 23 CARATS 
Inserting text to db => ./prod_dataset/24769797950543956-2/page0002_3.jpg
raw text => 155€ COLLIER ORI & DIAMANT Annick & Audrey Julie & Sylvaine Responsables de magasin Responsables de magasin 64,90€ BOUCLES I(` CR

## Insert Page_categories

In [10]:
response = cur.execute(sql_select_inference).fetchall()
print("response", response[0])

IndexError: list index out of range

In [9]:
# * Use NLP model to save insert page categories in db
response = cur.execute(sql_select_inference).fetchall()
"""
Format: list[(id: int, text: str)]
"""
vocabulary = preprocessing.get_vocabulary(os.getcwd() + "/../../labos/nlp/vocab_multi-class.txt")
labels = [i for i in range(16)]
model = model_utils.PageClassifier(len(vocabulary), len(labels))
model.load_state_dict(torch.load(os.getcwd() + "/../../labos/nlp/model_weight_multi-class.pth"))

nlp = spacy.load("fr_core_news_lg")

for response_elt in response:
    id = response_elt[0]
    raw_text = response_elt[1]


    input = preprocessing.pipeline_from_raw_text_to_vectors(raw_text, nlp, vocabulary)
    # print(input)
    # TODO: Rename output
    categories = model_utils.predict_multi_class_raw(model, input)
    print(categories)
    for category_value in categories:

        cur.execute(sql_insert_page_categories, (id,category_value,))
    con.commit()

# cur.execute(sql_update_category, (category, response[0]))

# con.commit()

[10]
[7]
[8]
[10]
[8]
[5]
[7]
[10]
[4]
[10]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[1]


In [7]:
# TODO: Use AI model to add label into db_prod in another file ?!