In [1]:
# Standard library: filesystem paths and module search path handling.
import os
import sys

# Scraping: regular expressions, HTTP client and HTML parser.
import re
import requests
from bs4 import BeautifulSoup

# Storage (SQLite), OCR (EasyOCR) and NLP / inference (PyTorch, spaCy).
import sqlite3
import easyocr
import torch
import spacy

# Imported BEFORE sys.path is extended below, so this module has to be
# resolvable from the default search path (presumably it sits next to the
# notebook -- TODO confirm).
import scraping_v2_utils

# Make two project directories importable. Both are resolved relative to the
# current working directory, so the notebook must be launched from its own
# folder for these paths to be valid.
sys.path.append(os.getcwd() + "/../../db")
sys.path.append(os.getcwd() + "/../../labos/nlp")

# Project-local helpers; presumably found through the directories appended
# above (sql_utils in db, preprocessing / model_utils in labos/nlp) -- TODO confirm.
import sql_utils
import preprocessing
import model_utils

In [2]:
# Database connection and SQL statements used throughout the notebook.

database_file_name = "db_prod_final.sqlite"

# The database file is resolved relative to the current working directory.
database_path = os.getcwd() + f"/../../db/{database_file_name}"
con = sqlite3.connect(database_path)
cur = con.cursor()

# One SQL statement per pipeline step, each obtained from sql_utils by file name.
sql_select_already_existing_catalog = sql_utils.get_sql_statement(
    "sql_select_already_existing_catalog.sql"
)
sql_insert_catalog = sql_utils.get_sql_statement(
    "insert_catalog_db_prod.sql"
)
sql_insert_page = sql_utils.get_sql_statement(
    "insert_page_db_prod.sql"
)
sql_update_text = sql_utils.get_sql_statement(
    "update_text_db_prod.sql"
)
sql_select_inference = sql_utils.get_sql_statement(
    "select_pages_inference_final.sql"
)
sql_select_image_without_text = sql_utils.get_sql_statement(
    "select_path_of_image_without_text.sql"
)
sql_insert_page_categories = sql_utils.get_sql_statement(
    "sql_insert_page_categories.sql"
)
# TODO: this statement still has to be modified.
sql_update_category = sql_utils.get_sql_statement(
    "update_category_db_prod.sql"
)

In [3]:
# Catalog refs returned by the "already existing catalog" query, i.e. the
# catalogs whose dates have already been scraped. The ref is the first
# column of every row.
response = cur.execute(sql_select_already_existing_catalog).fetchall()
catalog_dates_already_scraped = [row[0] for row in response]
print(catalog_dates_already_scraped)

# Catalog refs whose images have already been scraped: every entry of the
# prod_dataset directory is taken as one ref.
catalog_images_already_scrapped = os.listdir("../../prod_dataset")
print(catalog_images_already_scrapped)

['24769797950543995-2', '24769797950543971-3', '24769797950543969-1', '24769797950543903-1', '24769797950544019-1', '24769797950543931-1', '24769797950544006-1', '24769797950544022-1', '24769797950544000-5', '24769797950543927-2', '24769797950544001-1', '24769797950544025-1', '24769797950544005-1', '24769797950544030-1', '24769797950544023-2', '24769797950544016-1', '24769797950543889-3', '24769797950544026-1', '24769797950543926-2']
['24769797950544025-1', '24769797950544005-1', '24769797950543903-1', '24769797950544016-1', '24769797950544026-1', '24769797950543995-2', '24769797950543969-1', '24769797950544023-2', '24769797950543971-3', '24769797950544019-1', '24769797950543927-2', '24769797950544006-1', '24769797950543931-1', '24769797950544030-1', '24769797950544001-1', '24769797950543889-3', '24769797950544022-1', '24769797950544000-5', '24769797950543926-2']


In [4]:
# Get the refs: download the landing page and collect the catalog cover images.

url = "https://lapub.re/"
response = requests.get(url=url, timeout=(5, 15))
# Fail loudly on an HTTP error (4xx / 5xx). Without this check an error page
# would be parsed like a normal one, no image would match, and the rest of the
# pipeline would silently do nothing.
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

# Keep only <img> tags whose src is an absolute "imgpromo" URL that starts
# with digits and ends with "maxi.jpg".
regex_pattern = re.compile(r"^https:\/\/lapub\.re\/imgpromo\/\d+.*maxi\.jpg$")
matching_imgs = soup.find_all("img", src=regex_pattern)

## Insert catalog and page into db

In [5]:
# TODO: Clean and refactor
# TODO: Fix conditions => skip early (at the top) when the catalog is already processed
# The whole pipeline (dates, image download, page rows) runs in one pass.
for img in matching_imgs:
    # The ref is what remains of the src once the 26-character
    # "https://lapub.re/imgpromo/" prefix and the last 9 characters are dropped
    # (the regex guarantees the src ends with the 8-character "maxi.jpg", so
    # the single character in front of it is removed as well).
    catalog_ref = img["src"][26:-9]

    # * Check if dates not already scraped
    if catalog_ref not in catalog_dates_already_scraped:
        print("in")
        # The dates are read from the page linked by the parent of the <img> tag.
        (start_date, end_date) = scraping_v2_utils.get_dates(img.parent.get("href"))

        # Save in db
        cur.execute(sql_insert_catalog, (catalog_ref, start_date, end_date))
        con.commit()

        # Remember the ref so that a duplicated <img> on the page, or a re-run
        # of this cell on the same kernel, does not insert the catalog twice.
        catalog_dates_already_scraped.append(catalog_ref)

    # * Check if images not already downloaded
    if catalog_ref not in catalog_images_already_scrapped:
        print("in")
        # Get full links
        full_links = scraping_v2_utils.get_full_links(catalog_ref)
        print("full_links", full_links)
        # Download images
        scraping_v2_utils.download_all_images(full_links, catalog_ref)
        print("save in db")

        # Save in db: one page row per file found in the catalog directory.
        # The listing is sorted because os.listdir returns entries in
        # arbitrary order; sorting makes the insertion order reproducible.
        catalog_dir = os.getcwd() + f"/../../prod_dataset/{catalog_ref}"
        for image_name in sorted(os.listdir(catalog_dir)):
            # TODO: Verify not already in db ?
            cur.execute(
                sql_insert_page,
                (
                    f"./prod_dataset/{catalog_ref}/{image_name}",
                    catalog_ref,
                ),
            )
        con.commit()

        # Only mark the catalog as done once its page rows are committed.
        catalog_images_already_scrapped.append(catalog_ref)

in
in
2024-03-23 16:31:16.880092 => https://lapub.re/prospectus/24769797950544024-1/HTML/files/assets/common/page-html5-substrates/
catalog_dict {'24769797950544024-1': ['page0001_1.jpg', 'page0001_2.jpg', 'page0001_3.jpg', 'page0001_4.jpg', 'page0002_1.jpg', 'page0002_2.jpg', 'page0002_3.jpg', 'page0002_4.jpg', 'page0003_1.jpg', 'page0003_2.jpg', 'page0003_3.jpg', 'page0003_4.jpg', 'page0004_1.jpg', 'page0004_2.jpg', 'page0004_3.jpg', 'page0004_4.jpg', 'page0005_1.jpg', 'page0005_2.jpg', 'page0005_3.jpg', 'page0005_4.jpg', 'page0006_1.jpg', 'page0006_2.jpg', 'page0006_3.jpg', 'page0006_4.jpg', 'page0007_1.jpg', 'page0007_2.jpg', 'page0007_3.jpg', 'page0007_4.jpg', 'page0008_1.jpg', 'page0008_2.jpg', 'page0008_3.jpg', 'page0008_4.jpg', 'page0009_1.jpg', 'page0009_2.jpg', 'page0009_3.jpg', 'page0009_4.jpg', 'page0010_1.jpg', 'page0010_2.jpg', 'page0010_3.jpg', 'page0010_4.jpg', 'page0011_1.jpg', 'page0011_2.jpg', 'page0011_3.jpg', 'page0011_4.jpg']}
catalog_dict keys dict_keys(['2476979

KeyboardInterrupt: 

## Insert text into db

In [6]:
# * Run the OCR model on every page that has no text yet and store the raw
# * text in the db.
reader = easyocr.Reader(lang_list=["en"])

# The first column of each row is the stored image path.
response = cur.execute(sql_select_image_without_text).fetchall()
page_missing_text_in_db = [row[0] for row in response]

for path in page_missing_text_in_db:
    # Drop the first two characters of the stored path (the page-insertion
    # cell writes paths starting with "./") and resolve the rest from the
    # current working directory.
    image_path = os.getcwd() + f"/../../{path[2:]}"
    ocr_output = reader.readtext(image_path)

    # Element 1 of each OCR result is concatenated, each one followed by a
    # single space (so the stored text keeps a trailing space).
    text_fragments = [detection[1] + " " for detection in ocr_output]
    raw_text = "".join(text_fragments)

    print("Inserting text to db =>", path)
    print("raw text =>", raw_text)
    cur.execute(sql_update_text, (raw_text, path))

    # Commit after every page so already processed pages are kept if the
    # loop is interrupted.
    con.commit()

Inserting text to db => ./prod_dataset/24769797950544024-1/page0001_4.jpg
raw text => Carrefour Du lundi 18 mars au dimanche 31 mars 2024 Reunion MoDe PAQUES DECO CROYABLE HIGH TECH MAISON Je 888 PRIX CAGNOTTE DEDUITE 6490 SUR MA CARTE PRIX PAYE EN CAISSE 5e 6990 Dont 0,27€ d'eco-participation Friteuse sans huile FAGOR FG5136 (HORS AVANTAGE SUP RSRFdaHS= SUR HRRRESOSKOITRE P3REE RESEAVRLHOUE HERBREOR DE Voon KaPSSRBEOHRGE GigHo8 DE DFFRovHHS eIHMQGEE NARONAHES BYPRROBURSE = DE GARraequa YARRYFOR ET DE PREMIERS PRIX A DES PRIX PARMI LES MOINS CHERS DU MARCHE choisis MOINS CHER 
Inserting text to db => ./prod_dataset/24769797950544024-1/page0004_4.jpg
raw text => 3 2 TEX Lor Lou 3 2 2 0 TEX TEX 3 8 S 1 c Te Te 3 SUR MA CARTE" SUR MA CARTE PRIX PAyE EN CAISSE PRIX PAYE EN CAISSE 8 Salopette Tee-shirt 1285 TEX BABY 1095 Lot de 2 Pyiamas PRIX CAGNOTTE DEDUITE  Du 6 qu 36 mois, PriX CAGNOTTE DEDUITE TEX BAB Lot de 2 Combicourt 60% Coton; Du 9 au 36 mois BIO TEX BABY 1195 40% Polyester 985 10

## Insert Page_categories

In [7]:
# Peek at the first row returned by the inference query.
response = cur.execute(sql_select_inference).fetchall()
if response:
    print("response", response[0])
else:
    # The query can legitimately return nothing; indexing an empty list would
    # raise IndexError and stop a "Run All".
    print("response", None)

response (314, "Carrefour Du lundi 18 mars au dimanche 31 mars 2024 Reunion MoDe PAQUES DECO CROYABLE HIGH TECH MAISON Je 888 PRIX CAGNOTTE DEDUITE 6490 SUR MA CARTE PRIX PAYE EN CAISSE 5e 6990 Dont 0,27€ d'eco-participation Friteuse sans huile FAGOR FG5136 (HORS AVANTAGE SUP RSRFdaHS= SUR HRRRESOSKOITRE P3REE RESEAVRLHOUE HERBREOR DE Voon KaPSSRBEOHRGE GigHo8 DE DFFRovHHS eIHMQGEE NARONAHES BYPRROBURSE = DE GARraequa YARRYFOR ET DE PREMIERS PRIX A DES PRIX PARMI LES MOINS CHERS DU MARCHE choisis MOINS CHER ", None)


In [8]:
# * Use the NLP model to insert page categories in db
# Each row of the inference query is indexed as: [0] page id, [1] raw text
# (the rows may carry further columns, which are not used here).
response = cur.execute(sql_select_inference).fetchall()

vocabulary = preprocessing.get_vocabulary(os.getcwd() + "/../../labos/nlp/vocab_multi-class.txt")
# The classifier is built with 16 output labels, numbered 0..15.
labels = list(range(16))
model = model_utils.PageClassifier(len(vocabulary), len(labels))
# map_location="cpu": the model is never moved off the CPU in this notebook,
# and this lets weights saved from a GPU be loaded on a machine without CUDA.
# NOTE(review): torch.load unpickles the file -- only load trusted weight files.
model.load_state_dict(
    torch.load(
        os.getcwd() + "/../../labos/nlp/model_weight_multi-class.pth",
        map_location="cpu",
    )
)
# Inference only: switch layers such as dropout / batch-norm to evaluation mode
# (assumes PageClassifier is a torch.nn.Module, as load_state_dict suggests).
model.eval()

nlp = spacy.load("fr_core_news_lg")

for response_elt in response:
    # Named page_id / model_input so the builtins id() and input() are not shadowed.
    page_id = response_elt[0]
    raw_text = response_elt[1]

    model_input = preprocessing.pipeline_from_raw_text_to_vectors(raw_text, nlp, vocabulary)
    # TODO: Rename output
    categories = model_utils.predict_multi_class_raw(model, model_input)
    print(categories)
    # One row per predicted category for this page.
    for category_value in categories:
        cur.execute(sql_insert_page_categories, (page_id, category_value))
    # Commit after every page so finished pages survive an interruption.
    con.commit()

# TODO: sql_update_category is not used yet (statement still to be modified).

[7]
[12]
[7]
[1]
[1]
[7]
[1]
[7]
[10]
[3]
[12]
[9]
[9]
[9]
[9]
[9]
[5]
[1]
[9]
[9]
[6]
[6]
[9]
[6]
[5]
[1]
[10]
[1]
[1]
[1]
[1]
[1]
[1]
[0]
[7]
[7]
[7]
[1]
[0]
[1]
[0]
[3]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[8]
[10]
[10]
[12]
[15]
[15]
[10]
[0]
[12]
[8]
[8]
[7]
[8]
[8]
[10]
[8]
[8]
[6]
[10]
[10]
[10]
[10]
[10]
[0]
[8]
[8]
[10]
[3]
[7]
[7]
[7]
[7]
[7]
[7]
[1]
[7]
[7]
[10]
[7]
[7]
[7]
[1]
[7]
[7]
[9]
[7]
[7]
[7]
[7]
[7]
[1]
[7]
[1]
[7]
[7]
[7]
[7]
[10]
[7]
[2]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[1]
[8]
[7]
[2]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[9]
[5]
[7]
[7]
[7]
[7]
[1]
[7]
[7]
[7]
[7]
[3]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[0]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[7]
[8]
[7]
[7]
[7]
[2]
[7]
[7]
[7]
[7]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[3]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[8]
[0]
[0]
[0]
[0]
[3]
[1]
[3]
[8]
[0]
[8]
[8]
[0]
[0]
[3]
[3]
[0]
[1]
[8]
[8]
[8]
[8]
[3]
[8]
[8]
[8]
[3]
[8]
[8]
[8]
[8]
[3]
[8]


In [7]:
# TODO: Should the AI-model labelling of db_prod be moved to a separate file?