## 00. Preliminary

### Install and Import Libraries

In [None]:
!sudo apt update
!sudo apt install tesseract-ocr
!pip install pytesseract

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Waiting for headers] [C[0m                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
0% [Working][0m^C
Reading package lists.

In [None]:
import pytesseract
from PIL import Image, ImageDraw, ImageFont
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import nltk
from nltk.corpus import wordnet
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [None]:
!pip install tensorflow
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import to_categorical



### Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read
# and written from this runtime (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder on Drive the working directory; the relative paths used
# later in this notebook (e.g. 'data.csv') are resolved against it.
%cd /content/drive/My Drive/FinalCapstone/

/content/drive/My Drive/FinalCapstone


### Read Datasets

In [None]:
# Load the item catalogue. The columns used later in this notebook are
# 'nama' (item name), 'harga' (price) and 'kategori' (category label).
data = pd.read_csv('data.csv')

# Preview the first rows
data.head()

Unnamed: 0,nama,harga,kategori
0,Biskuit Roma Kelapa,13200,Food
1,Twister Wafer Roll,8600,Food
2,Susu Vanilla,7800,Food
3,Susu Cokelat,7900,Food
4,Susu Strawberry,7900,Food


Shuffle the dataset to mix up the data

In [None]:
# Randomly reorder the rows; random_state fixes the permutation so reruns give the same order.
df = shuffle(data, random_state=42)

# Display the shuffled frame
df

Unnamed: 0,nama,harga,kategori
1116,DVD Box Set Serial TV,466000,Entertainment
1368,Kaos Panjang,654000,Clothing
422,Printer,1000000,Others
413,Set Top Box,250000,Others
451,Pisau,20000,Others
...,...,...,...
1130,Kaset Musik,671000,Entertainment
1294,Buku Teknik Film Making,282000,Entertainment
860,Tinta Pulpen,12000,Stationery
1459,Rompi Reflective,868000,Clothing


## 01. Build & Train NLP Model

### Feature Engineering

In [None]:
# Separate the features and the labels:
# the item name ('nama') is the model input, the category ('kategori') is the target.
features = df['nama']
labels = df['kategori']

In [None]:
# Tokenize the features: build a word -> integer id vocabulary from the item names,
# then convert every name into its sequence of word ids.
# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before the
# train/validation split performed further down.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(features)
sequences = tokenizer.texts_to_sequences(features)
# Vocabulary mapping; its size is used for the Embedding layer's input_dim
word_index = tokenizer.word_index

In [None]:
# Pad every sequence to the length of the longest one so all model inputs share one shape
max_length = max(map(len, sequences))
data_padded = pad_sequences(sequences, maxlen=max_length)

In [None]:
# Encode labels to numerical: LabelBinarizer turns each category name into an indicator vector
label_binarizer = LabelBinarizer()
train_labels = label_binarizer.fit_transform(labels)
# NOTE(review): despite the names, `train_labels` and `test_labels` are both encodings of the
# full, unsplit `labels` series; only `test_labels` is passed to train_test_split afterwards.
test_labels = label_binarizer.transform(labels)

### Split Data into Training and Validation Sets (80:20)

In [None]:
# Split the data into a training set and a validation set (test_size=0.2 -> 80% / 20%).
# stratify=labels splits in a stratified fashion using the category labels;
# random_state makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(data_padded, test_labels, test_size=0.2, stratify=labels, random_state=42)

### Defining Model

In [None]:
# Text classifier: embedding -> two stacked bidirectional LSTMs -> softmax over the categories.
model = tf.keras.models.Sequential([
  # One 8-dimensional vector per vocabulary entry; input_dim is the vocabulary size + 1
  tf.keras.layers.Embedding(input_dim=len(word_index) + 1, output_dim=8, input_length=max_length),
  # return_sequences=True passes the per-timestep outputs on to the second recurrent layer
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, activation='tanh', return_sequences=True, dropout=0.3)),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, activation='tanh', dropout=0.3)),
  # One output unit per category known to the label binarizer
  tf.keras.layers.Dense(len(label_binarizer.classes_), activation='softmax')
])

# Print the layer-by-layer overview (output shapes and parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 6, 8)              12184     
                                                                 
 bidirectional (Bidirection  (None, 6, 200)            87200     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 200)               240800    
 onal)                                                           
                                                                 
 dense (Dense)               (None, 7)                 1407      
                                                                 
Total params: 341591 (1.30 MB)
Trainable params: 341591 (1.30 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Adam optimizer with categorical cross-entropy (the labels are LabelBinarizer indicator
# vectors and the output layer is a softmax); accuracy is tracked as the reported metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the training split and evaluate on the validation split after every epoch;
# `history` keeps the per-epoch metrics read by the next cell.
# NOTE(review): the saved output shows "Epoch n/100", so it was produced with a different
# `epochs` value than the 200 set here — re-run to bring code and output back in sync.
history = model.fit(x_train, y_train, epochs=200, verbose=1, validation_data=(x_val, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Get the per-epoch training accuracy and validation accuracy from the training history
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Print the training accuracy and validation accuracy (one value per epoch)
print("Accuracy:", accuracy)
print("Validation Accuracy:", val_accuracy)

Accuracy: [0.9883333444595337, 0.9916666746139526, 0.9866666793823242, 0.9858333468437195, 0.9825000166893005, 0.9866666793823242, 0.9866666793823242, 0.98416668176651, 0.9858333468437195, 0.9800000190734863, 0.9833333492279053, 0.9833333492279053, 0.9850000143051147, 0.987500011920929, 0.9833333492279053, 0.9858333468437195, 0.9891666769981384, 0.9858333468437195, 0.9883333444595337, 0.9883333444595337, 0.9808333516120911, 0.9908333420753479, 0.98416668176651, 0.9850000143051147, 0.98416668176651, 0.9825000166893005, 0.98416668176651, 0.9858333468437195, 0.9858333468437195, 0.9858333468437195, 0.9858333468437195, 0.9858333468437195, 0.9858333468437195, 0.9866666793823242, 0.9900000095367432, 0.98416668176651, 0.9858333468437195, 0.987500011920929, 0.9858333468437195, 0.98416668176651, 0.987500011920929, 0.987500011920929, 0.9883333444595337, 0.9900000095367432, 0.987500011920929, 0.9858333468437195, 0.987500011920929, 0.9866666793823242, 0.9816666841506958, 0.9891666769981384, 0.98416

### Predict New Items

In [None]:
# Predict the category of a new text
new_text = ["Kursi kantor"]
# Apply the same preprocessing as for training: word ids from the fitted tokenizer,
# padded to the sequence length the model was built with.
new_sequence = tokenizer.texts_to_sequences(new_text)
new_padded = pad_sequences(new_sequence, maxlen=max_length)
prediction = model.predict(new_padded)

# Work on the first (and only) row explicitly: np.argmax over the whole 2-D prediction
# array returns an index into the flattened array, which is a valid class index only
# while the batch holds exactly one sample.
class_probabilities = prediction[0]
predicted_label_index = np.argmax(class_probabilities)
confidence = class_probabilities[predicted_label_index]

# Predictions whose top probability is below this threshold are reported as 'Others'
confidence_threshold = 0.6

if confidence < confidence_threshold:
    predicted_label = 'Others'
else:
    predicted_label = label_binarizer.classes_[predicted_label_index]

print(f'Text "{new_text[0]}" classified as category: {predicted_label}')
print(f'Confidence: {confidence:.4f}')

Text "Kursi kantor" classified as category: Others
Confidence: 0.9999


Hasil percobaan:
- Kemeja Stripes (Clothing), Nasi goreng (Food), Anti dandruff shampoo (Toiletries) **SUDAH BENAR**

- Voucher Spotify (Entertainment) -> diprediksi sebagai Others

- Stiker Lucu (Others) -> diprediksi sebagai Stationery

### Saving Model

In [None]:
# Save the model to an .h5 file (relative path, so it lands in the current working directory)
# NOTE(review): only the network is written here; the fitted `tokenizer` and `label_binarizer`
# are not saved by this cell, yet the prediction cell needs both to preprocess text and to
# turn the output index back into a category name.
model.save('nlp_model_eldira.h5')

## 02. Make Receipt Datasets

In [None]:
def draw_text(draw, text, position, font, max_width):
    """
    Draws text within a specified width on an image, wrapping at spaces.

    Words are packed greedily onto a line until adding the next word would exceed
    ``max_width``. A single word wider than ``max_width`` is placed on a line of
    its own (it is not split). Text measurement uses the current Pillow API
    (``ImageDraw.textlength`` / ``ImageFont.getbbox``) and falls back to the
    legacy ``textsize`` / ``getsize`` calls, which were removed in Pillow 10.

    :param draw: ImageDraw object to draw on the image.
    :param text: The text to be drawn. Assumed to be single-line (no newline
        characters) — TODO confirm against callers.
    :param position: Tuple (x, y) where the text will start.
    :param font: The font of the text.
    :param max_width: The maximum width allowed for the text.
    :return: The y position after the text is drawn.
    """
    def text_width(candidate):
        # Prefer the non-deprecated API; keep the old one for Pillow versions without it.
        if hasattr(draw, 'textlength'):
            return draw.textlength(candidate, font=font)
        return draw.textsize(candidate, font=font)[0]

    def text_height(candidate):
        if hasattr(font, 'getbbox'):
            # Bottom edge of the bounding box, measured from the drawing origin.
            # NOTE(review): chosen to approximate the height the legacy getsize()
            # reported — confirm the line spacing visually.
            return font.getbbox(candidate)[3]
        return font.getsize(candidate)[1]

    # Split the text into lines that fit within the max_width
    if text_width(text) <= max_width:
        lines = [text]
    else:
        lines = []
        current_words = []
        for word in text.split(' '):
            if text_width(' '.join(current_words + [word])) <= max_width:
                current_words.append(word)
            else:
                # Only flush a line that has content; otherwise an over-long first
                # word would produce a blank leading line.
                if current_words:
                    lines.append(' '.join(current_words))
                current_words = [word]
        lines.append(' '.join(current_words))

    x, y = position[0], position[1]
    # Draw each line on the image, advancing y by the height of the line just drawn
    for line in lines:
        draw.text((x, y), line, font=font, fill='black')
        y += text_height(line)

    return y

In [None]:
def generate_random_receipts(list_items, num_receipts, font_path, logo_path):
    """
    Generates random receipt images with varying items and quantities.

    Each receipt contains between 1 and 50 distinct items (never more than
    ``list_items`` holds), each with a random quantity of 1 to 3, followed by
    the grand total. Images are saved as ``data/receipt_NN.jpg`` relative to
    the current working directory; the ``data`` directory is created if it
    does not exist yet.

    :param list_items: A list of dictionaries containing item details. Each
        dictionary must provide 'nama' (item name) and 'harga' (unit price).
    :param num_receipts: The number of receipts to generate.
    :param font_path: Path to the font file used for drawing text.
    :param logo_path: Path to the logo image file. If it cannot be opened the
        receipts are generated without a logo.
    :raises ValueError: If receipts are requested but ``list_items`` is empty.
    """
    import os  # local import so this cell does not depend on another cell for it

    if num_receipts > 0 and len(list_items) == 0:
        raise ValueError("list_items must contain at least one item to generate receipts.")

    # Load fonts
    font = ImageFont.truetype(font_path, 20)
    total_font = ImageFont.truetype(font_path, 24)

    # Set image dimensions and styling parameters
    image_width = 800
    margin = 20
    line_height = 30
    max_items_per_receipt = 50
    output_dir = "data"

    # image.save() does not create missing directories
    os.makedirs(output_dir, exist_ok=True)

    # Load and scale the logo once; it is identical for every receipt.
    # The context manager closes the underlying file handle.
    logo = None
    try:
        with Image.open(logo_path) as logo_file:
            logo_file.thumbnail((200, 200))
            logo = logo_file.copy()
    except IOError:
        print(f"Logo file not found at {logo_path}, skipping logo.")

    # Generate each receipt image
    for receipt_id in range(1, num_receipts + 1):
        # random.sample raises if asked for more items than the catalogue contains,
        # so the upper bound is capped at the catalogue size.
        num_items = random.randint(1, min(max_items_per_receipt, len(list_items)))
        chosen_items = random.sample(list_items, num_items)
        total_price = 0

        # Calculate receipt height based on number of items
        receipt_height = (num_items + 6) * line_height + 6 * margin
        image = Image.new('RGB', (image_width, receipt_height), 'white')
        draw = ImageDraw.Draw(image)

        y = margin
        if logo is not None:
            image.paste(logo, (margin + 278, margin))

        # Skip space for logo height
        y += line_height + 50

        # Header information for the receipt
        header_info = [
            f"{'JL. WR. SUPRATMAN, LABUHANBATU':^102}",
            '-' * 114,
            f"{'16.06.18-17:00'}{'1.6.24':^77}{'031153/JOKO/5501'}",
            '-' * 114
        ]

        # Draw header information on the receipt
        for info in header_info:
            draw.text((margin, y), info, font=font, fill='black')
            y += line_height

        # Draw each item with its quantity, unit price and line total on the receipt
        for item in chosen_items:
            quantity = random.randint(1, 3)
            item_total = quantity * item['harga']
            total_price += item_total

            draw.text((margin, y), item['nama'], font=font, fill='black')
            draw.text((margin + 450, y), "{:.0f}".format(quantity), font=font, fill='black')
            draw.text((margin + 525, y), "{:,.0f}".format(item['harga']), font=font, fill='black')
            draw.text((margin + 650, y), "{:,.0f}".format(item_total), font=font, fill='black')
            y += line_height

        # Draw a line before the total
        draw.text((margin + 450, y), '-' * 46, font=font, fill='black')
        y += line_height

        # Draw the total price of all items
        draw.text((margin + 450, y), "Total:", font=total_font, fill='black')
        draw.text((margin + 630, y), "{:,.0f}".format(total_price), font=total_font, fill='black')

        # Save the receipt image with a unique filename
        image_filename = os.path.join(output_dir, f"receipt_{receipt_id:02d}.jpg")
        image.save(image_filename)

    print("Receipt images generated.")

In [None]:
# Load data from a CSV file into a DataFrame and convert it to a list of dictionaries
# (one dictionary per row, keyed by column name). generate_random_receipts reads the
# 'nama' and 'harga' keys of each dictionary.
receipts_df = pd.read_csv('data.csv')
list_of_dicts = receipts_df.to_dict('records')

In [None]:
# Parameters for receipt generation
# (both paths are relative, so they are resolved against the current working directory)
font_path = "arial.ttf"  # Path to the font file
logo_path = "indomaret_logo.png"  # Path to the logo image file
num_receipts = 10  # Number of receipts to generate

In [None]:
# Generate random receipts images; the files are written as data/receipt_NN.jpg
generate_random_receipts(list_of_dicts, num_receipts, font_path, logo_path)

## 03. [OCR] Text Localization & Text Recognition

## 04. Key Information Extraction (Convert to JSON)