In [1]:
# OCR
import easyocr
from PIL import Image, ImageDraw, ImageFont
import pandas as pd

# NLP
import spacy
import torch


import sys
sys.path.append('..')
from nlp import preprocessing, model_utils

In [2]:
# TODO: Use a sample page on which the OCR model performs better (e.g. a page with a large font, so the result is legible on the diagram).
# TODO: Add the clustering step.

# 1. OCR

In [3]:
# Page image fed to the OCR step; the commented-out line below is an
# alternative sample page.
image_path = "../../datas/02/page0004_5.jpg"
# image_path = "../../datas/01/page0002_4.jpg"


# EasyOCR reader restricted to English for now.
# NOTE(review): the recognised text in the output is French, so it is an open
# question whether "fr" should be added to lang_list.
reader = easyocr.Reader(lang_list=["en"])  # or "fr"?
# As the cell output shows, each entry holds four [x, y] corner points, the
# recognised text and a score between 0 and 1 (presumably the confidence).
results = reader.readtext(image_path)

results

[([[220, 75], [920, 75], [920, 186], [220, 186]],
  'REDMI 12C',
  0.9431594842359052),
 ([[522, 184], [916, 184], [916, 286], [522, 286]],
  '6/128G0',
  0.7904448339485585),
 ([[1165, 161], [1205, 161], [1205, 270], [1165, 270]],
  '2',
  0.07480668169648741),
 ([[218, 372], [899, 372], [899, 440], [218, 440]],
  'Mediateck Helio G85',
  0.7956549168575436),
 ([[369, 450], [899, 450], [899, 519], [369, 519]],
  'ecran 6.71" LCD',
  0.9979231421001593),
 ([[267, 518], [906, 518], [906, 611], [267, 611]],
  'capteur 50 mpixels',
  0.9994344017238052),
 ([[283, 604], [901, 604], [901, 669], [283, 669]],
  'batterie SOOOMAH',
  0.8736896506456936),
 ([[408, 788], [932, 788], [932, 974], [408, 974]], '150', 0.9857664877309339),
 ([[498, 968], [885, 968], [885, 1018], [498, 1018]],
  'au lieu de 210€',
  0.5069783764983952),
 ([[1001, 1051], [1061, 1051], [1061, 1095], [1001, 1095]],
  'Mi',
  0.40643546321609453),
 ([[1050, 1299], [1760, 1299], [1760, 1400], [1050, 1400]],
  'REDMI 12 46'

In [4]:
# Keep only the recognised text and the first corner of its bounding box
# (the first [x, y] pair of each detection), cast to plain ints.
words_data = [
    {'word': text, "position": (int(corners[0][0]), int(corners[0][1]))}
    for corners, text, *_ in results
]

words_data

[{'word': 'REDMI 12C', 'position': (220, 75)},
 {'word': '6/128G0', 'position': (522, 184)},
 {'word': '2', 'position': (1165, 161)},
 {'word': 'Mediateck Helio G85', 'position': (218, 372)},
 {'word': 'ecran 6.71" LCD', 'position': (369, 450)},
 {'word': 'capteur 50 mpixels', 'position': (267, 518)},
 {'word': 'batterie SOOOMAH', 'position': (283, 604)},
 {'word': '150', 'position': (408, 788)},
 {'word': 'au lieu de 210€', 'position': (498, 968)},
 {'word': 'Mi', 'position': (1001, 1051)},
 {'word': 'REDMI 12 46', 'position': (1050, 1299)},
 {'word': '4/128G0', 'position': (1374, 1407)},
 {'word': 'MediaTek Helio G88', 'position': (1049, 1604)},
 {'word': 'ecran 6.79" 90HZ', 'position': (1045, 1682)},
 {'word': 'capteur 50 mpixels', 'position': (1039, 1754)},
 {'word': 'batterie SOOOMAH', 'position': (1047, 1834)},
 {'word': '2z08', 'position': (1032, 1987)},
 {'word': 'au lieu de 250€', 'position': (1067, 2185)},
 {'word': '!', 'position': (144, 2239)},
 {'word': 'M', 'position': (1

In [12]:
# Tabular view of the OCR words: one column per field of `words_data`,
# displayed without the index column.
df = pd.DataFrame({
    column: [entry[key] for entry in words_data]
    for column, key in (("Mots", "word"), ("Position", "position"))
})

styled_df = df.style.hide()
styled_df

Mots,Position
REDMI 12C,"(220, 75)"
6/128G0,"(522, 184)"
2,"(1165, 161)"
Mediateck Helio G85,"(218, 372)"
"ecran 6.71"" LCD","(369, 450)"
capteur 50 mpixels,"(267, 518)"
batterie SOOOMAH,"(283, 604)"
150,"(408, 788)"
au lieu de 210€,"(498, 968)"
Mi,"(1001, 1051)"


# 2. Texte sur fond blanc

In [6]:

def create_image_with_words(image_path, words_data, font_size=70,
                            output_path="ocr_output_drawn.png"):
    """Redraw OCR'd words on a blank white canvas shaped like the source image.

    Args:
        image_path: Path of the source image. Only its mode and size are used
            to create the canvas; its pixels are not copied.
        words_data: Iterable of dicts, each with a 'word' entry (the text to
            draw) and a 'position' entry (an ``(x, y)`` pair passed to
            ``ImageDraw.text`` as the text position).
        font_size: Size requested from the default font via ``font_variant``.
        output_path: File the rendered image is saved to.

    Returns:
        None. The rendered image is written to ``output_path``.
    """
    # Only the mode and size of the source are needed, so the file handle is
    # released as soon as the blank canvas has been created.
    with Image.open(image_path) as original_image:
        new_image = Image.new(original_image.mode, original_image.size, color='white')

    draw = ImageDraw.Draw(new_image)

    # The font is the same for every word: build it once, outside the loop.
    font = ImageFont.load_default().font_variant(size=font_size)

    # Write each word at its recorded position.
    for word_info in words_data:
        x, y = word_info['position']
        draw.text((x, y), word_info['word'], fill='black', font=font)

    new_image.save(output_path)

In [7]:
# Redraw the OCR'd words on a white canvas; the function saves the result to
# "ocr_output_drawn.png" and returns nothing, so this cell has no output.
create_image_with_words(image_path, words_data)

# 3. Nettoyage du texte

In [8]:
# Flatten the OCR entries into a single space-separated string.
words = [entry["word"] for entry in words_data]
raw_text = " ".join(words)

# French spaCy pipeline handed to the project's cleaning routine.
nlp = spacy.load("fr_core_news_lg")
cleaned_text = preprocessing.full_cleaning(raw_text, nlp)

cleaned_text

'redmi ecran lcd capteur batterie au lieu de mi redmi ecran hz capteur batterie au lieu de notre'

# 4. NLP - Catégorisation

In [9]:
# Number of output classes of the classifier. Careful: this is a static value
# and has to agree with the weights loaded below.
NUM_CLASSES = 3

# The vocabulary file holds a single "|"-separated string. The `with` block
# closes the file on exit, so no explicit close() is needed.
with open("../nlp/vocab.txt", "r") as vocab_file:
    vocabulary = vocab_file.read().split("|")

# Named `input_vector` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
input_vector = preprocessing.get_bags_of_words_vector(cleaned_text, vocabulary)

model = model_utils.PageClassifier(len(vocabulary), NUM_CLASSES)
# map_location="cpu" lets a checkpoint saved from a GPU session load on a
# CPU-only machine. NOTE: torch.load unpickles the file, so only load weight
# files that come from a trusted source.
model.load_state_dict(torch.load("../nlp/model_weight.pth", map_location="cpu"))
# Inference only: switch off training-time behaviour such as dropout.
# (Assumes PageClassifier is a torch.nn.Module, as load_state_dict suggests.)
model.eval()

model_utils.predict(model, input_vector)

result [[0.10481512 0.66739035 0.22779457]]
class_prediction => Électroménager  | Ustensiles
