# Taking the Images to Create the Dataset

In [1]:
import os

import cv2


# Capture webcam frames and store them as numbered JPEGs, one sub-directory per class.
#
# NOTE(review): the dataset-building cell further down reads from './data', while this
# cell writes to './z' -- confirm which directory is intended before running both.
DATA_DIR = './z'
os.makedirs(DATA_DIR, exist_ok=True)

number_of_classes = 1
dataset_size = 1000  # number of NEW images captured per class on every run

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError('Could not open camera 0')

try:
    for j in range(number_of_classes):
        class_dir = os.path.join(DATA_DIR, str(j))
        os.makedirs(class_dir, exist_ok=True)

        print('Collecting data for class {}'.format(j))

        # Preview loop: show the live feed until the user presses "q" to start capturing.
        while True:
            ret, frame = cap.read()
            if not ret:
                # Without this check a failed grab passes frame=None into putText/imshow.
                raise RuntimeError('Failed to read a frame from the camera')
            cv2.putText(frame, 'Ready? Press "Q" ! :)', (100, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 0), 3,
                        cv2.LINE_AA)
            cv2.imshow('frame', frame)
            # Mask to the low byte so the comparison is robust to platform-specific high bits.
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break

        # Continue numbering after the highest existing "<n>.jpg" so nothing is overwritten.
        existing_numbers = []
        for name in os.listdir(class_dir):
            stem = name.split('.')[0]
            if name.endswith('.jpg') and stem.isdigit():
                existing_numbers.append(int(stem))
        counter = max(existing_numbers) + 1 if existing_numbers else 0

        # Stop relative to the starting index. Using len(existing_numbers) as the offset
        # captured too few images whenever the existing numbering had gaps.
        stop_at = counter + dataset_size

        while counter < stop_at:
            ret, frame = cap.read()
            if not ret:
                raise RuntimeError('Failed to read a frame from the camera')
            cv2.imshow('frame', frame)
            cv2.waitKey(25)
            filename = os.path.join(class_dir, f'{counter}.jpg')
            cv2.imwrite(filename, frame)
            counter += 1
finally:
    # Always free the camera and close the preview window, even after an error.
    cap.release()
    cv2.destroyAllWindows()


Collecting data for class 0


# 

# 

# 

# 

# Creating the Dataset from the Captured Images

In [2]:
import os
import pickle

import mediapipe as mp
import cv2
import matplotlib.pyplot as plt


# Convert every captured image into a feature vector of hand-landmark coordinates
# (x and y per landmark, shifted so the smallest x and y are 0) and pickle the result.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# max_num_hands=1: every sample must have the same number of features. With the default
# of two hands, a frame containing both hands produced a vector twice as long, which
# makes the dataset ragged and breaks np.asarray / model.fit in the training cell.
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.3)

DATA_DIR = './data'

data = []
labels = []
# sorted() only makes the sample order deterministic across runs and platforms.
for dir_ in sorted(os.listdir(DATA_DIR)):
    class_dir = os.path.join(DATA_DIR, dir_)
    if not os.path.isdir(class_dir):
        # Skip stray files (e.g. .DS_Store) that would make os.listdir() raise.
        continue

    for img_path in sorted(os.listdir(class_dir)):
        img = cv2.imread(os.path.join(class_dir, img_path))
        if img is None:
            # cv2.imread returns None for unreadable / non-image files.
            continue
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        results = hands.process(img_rgb)
        if not results.multi_hand_landmarks:
            # No hand detected: the image contributes no sample.
            continue

        # Use exactly one hand per image (see max_num_hands above).
        landmarks = results.multi_hand_landmarks[0].landmark

        x_ = [lm.x for lm in landmarks]
        y_ = [lm.y for lm in landmarks]
        min_x, min_y = min(x_), min(y_)

        # Interleave (x - min_x, y - min_y) so features are relative to the hand's
        # top-left corner rather than its position in the frame.
        data_aux = []
        for lm in landmarks:
            data_aux.append(lm.x - min_x)
            data_aux.append(lm.y - min_y)

        data.append(data_aux)
        labels.append(dir_)  # the class label is the sub-directory name (a string)

with open('data.pickle', 'wb') as f:
    pickle.dump({'data': data, 'labels': labels}, f)


# 

# 

# 

# 

# Building the Classifier and Training on the Dataset

In [3]:
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


# Train a random-forest classifier on the pickled landmark features and save it.

# NOTE: pickle can execute arbitrary code on load -- only open files produced by this notebook.
with open('./data.pickle', 'rb') as f:
    data_dict = pickle.load(f)

data = np.asarray(data_dict['data'])
labels = np.asarray(data_dict['labels'])

# Fixed seed so the split, the trained model and the reported score are reproducible
# under "Restart Kernel -> Run All".
SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, shuffle=True, stratify=labels, random_state=SEED
)

model = RandomForestClassifier(random_state=SEED)

model.fit(x_train, y_train)

y_predict = model.predict(x_test)

# scikit-learn's documented argument order is (y_true, y_pred).
score = accuracy_score(y_test, y_predict)

print('{}% of samples were classified correctly !'.format(score * 100))

with open('model.p', 'wb') as f:
    pickle.dump({'model': model}, f)


100.0% of samples were classified correctly !


# 

# 

# 

# 

# Testing the Model with Real-Time Inference

In [5]:
import pickle
import cv2
import mediapipe as mp
import numpy as np
import time
from collections import deque, Counter
from PIL import ImageFont, ImageDraw, Image
import arabic_reshaper
from bidi.algorithm import get_display

# ------------------------------
# Load Arabic model
# ------------------------------
# NOTE: pickle can execute arbitrary code on load -- only open a model file you created yourself.
with open('./model.p', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']

# ------------------------------
# Camera
# ------------------------------
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError('Could not open camera 0')

# Mediapipe setup
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

# ------------------------------
# Arabic Labels
# ------------------------------
arabic_letters = [
    "ا", "ب", "ت", "ث", "ج", "ح", "خ",
    "د", "ذ", "ر", "ز", "س", "ش", "ص", "ض",
    "ط", "ظ", "ع", "غ", "ف", "ق", "ك", "ل",
    "م", "ن", "ه", "و", "ي"
]
# Class index -> text to emit; the two indices after the letters are control gestures.
labels_dict = dict(enumerate(arabic_letters))
labels_dict[len(arabic_letters)] = "Space"
labels_dict[len(arabic_letters) + 1] = "Backspace"

# ------------------------------
# Sentence tracking
# ------------------------------
sentence = ""
predictions_queue = deque(maxlen=20)  # sliding window of the most recent per-frame predictions
last_added_char = ""
last_time_added = time.time()
ADD_LETTER_DELAY = 3.0  # seconds between letters

# ------------------------------
# Scanning effect variables
# ------------------------------
scan_start_time = 0
scan_duration = 0.6  # seconds
scanning = False

# ------------------------------
# Function to draw Arabic text using PIL
# ------------------------------
# Loaded fonts keyed by (font_path, font_size). The function runs on every video frame,
# so re-reading the TrueType file from disk on each call is avoidable work.
_FONT_CACHE = {}


def render_arabic_text(img, text, position=(620, 405), font_size=40, rtl=True, font_path="arial.ttf"):
    """Draw (Arabic) text onto an OpenCV BGR image using Pillow and return the new image.

    Args:
        img: Image array in OpenCV BGR channel order.
        text: Text to draw. It is passed through arabic_reshaper and the bidi
            algorithm before drawing.
        position: (x, y) anchor in pixels. When ``rtl`` is true, x is the RIGHT edge
            of the text; otherwise it is the left edge.
        font_size: Font size handed to ``ImageFont.truetype``.
        rtl: Right-align the text so that it ends at ``position[0]``.
        font_path: TrueType font file to use. The default needs "arial.ttf" to be
            resolvable by Pillow; pass another path on systems that lack it.

    Returns:
        A new BGR image array with the text drawn in black. ``img`` is not modified.

    Raises:
        OSError: If the font file cannot be opened by Pillow.
    """
    # Reshape the letters, then reorder them for visual display.
    reshaped = arabic_reshaper.reshape(text)
    bidi_text = get_display(reshaped)

    # Convert cv2 (BGR array) -> PIL (RGB image).
    img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(img_pil)

    # Load the font once per (path, size) and reuse it afterwards.
    cache_key = (font_path, font_size)
    font = _FONT_CACHE.get(cache_key)
    if font is None:
        font = ImageFont.truetype(font_path, font_size)
        _FONT_CACHE[cache_key] = font

    if rtl:
        # Prefer textbbox (added in Pillow 8.0); textsize was removed in Pillow 10.0.
        try:
            bbox = draw.textbbox((0, 0), bidi_text, font=font)
            text_width = bbox[2] - bbox[0]
        except AttributeError:
            text_width, _ = draw.textsize(bidi_text, font=font)  # fallback for older Pillow

        # Shift the start point left by the text width so the text ends at the anchor.
        x, y = position
        position = (x - text_width, y)

    # Draw the text in black.
    draw.text(position, bidi_text, font=font, fill=(0, 0, 0))

    # Convert PIL (RGB) -> cv2 (BGR array).
    return cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)



# ------------------------------
# Function to draw box
# ------------------------------
def draw_camera_box(img, x1, y1, x2, y2, color=(0, 0, 255), thickness=3, corner_len=150, full_box=True):
    """Draw a bounding box on ``img`` in place, either as a full rectangle or as corner brackets.

    Args:
        img: Image passed straight to the cv2 drawing calls.
        x1, y1: One corner of the box (used as top-left by the callers in this file).
        x2, y2: The opposite corner of the box.
        color: Line colour tuple handed to cv2.
        thickness: Line thickness handed to cv2.
        corner_len: Length of each bracket arm; only used when ``full_box`` is false.
        full_box: When true draw one rectangle, otherwise draw two short lines per corner.
    """
    if full_box:
        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)
        return

    # Each entry: (corner x, corner y, horizontal arm direction, vertical arm direction).
    # The arms always point towards the inside of the box.
    corners = (
        (x1, y1, 1, 1),
        (x2, y1, -1, 1),
        (x1, y2, 1, -1),
        (x2, y2, -1, -1),
    )
    for cx, cy, dir_x, dir_y in corners:
        cv2.line(img, (cx, cy), (cx + dir_x * corner_len, cy), color, thickness)
        cv2.line(img, (cx, cy), (cx, cy + dir_y * corner_len), color, thickness)

# ------------------------------
# Main loop
# ------------------------------
# Real-time loop: grab a frame, classify the hand pose, debounce the prediction over the
# recent-predictions window, append the result to the sentence and draw the overlay.
try:
    while True:
        data_aux = []
        x_, y_ = [], []
        ret, frame = cap.read()
        if not ret:
            # A failed grab returns frame=None; stop cleanly instead of crashing on frame.shape.
            break
        H, W, _ = frame.shape
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        current_time = time.time()

        if results.multi_hand_landmarks:
            # Draw the skeleton of every detected hand.
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style())

            # Classify only the first hand. Accumulating features across hands made the
            # vector twice as long for a second hand, which makes model.predict raise.
            hand_landmarks = results.multi_hand_landmarks[0]

            for landmark in hand_landmarks.landmark:
                x_.append(landmark.x)
                y_.append(landmark.y)

            min_x, min_y = min(x_), min(y_)
            for landmark in hand_landmarks.landmark:
                data_aux.append(landmark.x - min_x)
                data_aux.append(landmark.y - min_y)

            # Landmark coordinates are scaled by frame width/height to get pixels; pad by 20 px.
            x1 = int(min_x * W) - 20
            y1 = int(min_y * H) - 20
            x2 = int(max(x_) * W) + 20
            y2 = int(max(y_) * H) + 20

            prediction = model.predict([np.asarray(data_aux)])
            predicted_character = labels_dict[int(prediction[0])]
            predictions_queue.append(predicted_character)

            most_common_char, count = Counter(predictions_queue).most_common(1)[0]

            # Commit a character only when it dominates the window (more than 15 of the last
            # 20 predictions) and it is either a new character or the repeat delay has elapsed.
            if count > 15 and (most_common_char != last_added_char
                               or current_time - last_time_added > ADD_LETTER_DELAY):
                if most_common_char == 'Space':
                    sentence += ' '
                elif most_common_char == 'Backspace':
                    sentence = sentence[:-1]
                else:
                    sentence += most_common_char
                last_added_char = most_common_char
                last_time_added = current_time
                scan_start_time = current_time
                scanning = True

            # Draw box
            draw_camera_box(frame, x1, y1, x2, y2, color=(0, 0, 255))

            # Scanning effect: a green line sweeping from the top to the bottom of the box
            # for scan_duration seconds after a character has been committed.
            if scanning and current_time - scan_start_time < scan_duration:
                progress = (current_time - scan_start_time) / scan_duration
                scan_y = int(y1 + progress * (y2 - y1))
                cv2.line(frame, (x1, scan_y), (x2, scan_y), (0, 255, 0), 2)
            else:
                scanning = False

            # Show predicted char (small overlay)
            frame = render_arabic_text(frame, most_common_char, position=(x1, y1 - 40), font_size=40)

        # Show sentence box
        cv2.rectangle(frame, (20, 400), (620, 450), (255, 255, 255), -1)
        frame = render_arabic_text(frame, sentence, position=(600, 405), font_size=40, rtl=True)

        # Show frame
        cv2.imshow('frame', frame)

        # Keyboard input (masked to the low byte for cross-platform key codes)
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
        elif key == ord('c'):  # Clear the whole sentence
            sentence = ""
            last_added_char = ""
        elif key == 32:  # Space
            sentence += ' '
            last_added_char = ""
            predictions_queue.clear()
        elif key == ord('z'):  # Backspace
            if sentence:
                sentence = sentence[:-1]
                last_added_char = ""
                predictions_queue.clear()
finally:
    # Release the camera and close the window even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()
