In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset archive
# from it and save the trained model back to it.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import zipfile
import os

# Source archive on the mounted Drive and the local directory it is unpacked into.
zip_path = "/content/drive/My Drive/archive.zip"  # Update if needed
extract_path = "/content/sample_data/ML_DATA"

# Make sure the destination exists; exist_ok lets this cell be re-run safely.
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into the destination directory.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Extraction completed!")

Extraction completed!


In [None]:
import os
import json
import tensorflow as tf
import cv2
import numpy as np

# Dataset layout: one image folder ("img_<split>") and one annotation folder
# ("words_<split>") per split, all directly under DATASET_PATH.
DATASET_PATH = "/content/sample_data/ML_DATA/subset"  # Change this to your dataset path
_SPLITS = ("train", "val", "test")

# split name -> directory holding that split's images
IMAGE_DIRS = {split: os.path.join(DATASET_PATH, "img_" + split) for split in _SPLITS}

# split name -> directory holding that split's JSON annotations
ANNOTATION_DIRS = {split: os.path.join(DATASET_PATH, "words_" + split) for split in _SPLITS}





In [None]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import Sequence


# Step 1: Data Generator for Table Detection
class DataGenerator(Sequence):
    """Keras ``Sequence`` yielding ``(images, bboxes)`` batches from two folders.

    Each image in ``img_folder`` is paired with one JSON file in ``json_folder``
    by file stem (see ``_pair_files``). The image is converted to RGB, resized
    to ``img_size`` and scaled to [0, 1]. The target is the ``'bbox'`` of the
    first entry in the JSON list, divided by the original image width/height.

    Args:
        img_folder: Directory containing the image files.
        json_folder: Directory containing the JSON annotation files.
        batch_size: Maximum number of samples per batch.
        img_size: ``(height, width)`` of the arrays fed to the model. This
            matches the ``(*img_size, 3)`` layout of the batch array; for the
            default square size the order makes no difference.
        **kwargs: Forwarded to the Keras base class (e.g. ``workers``).

    Raises:
        ValueError: At construction if an image has no matching annotation
            file; from ``__getitem__`` if a JSON file is not a non-empty list.
    """

    def __init__(self, img_folder, json_folder, batch_size, img_size=(128, 128), **kwargs):
        # Keras warns when the base initializer is skipped, so always call it.
        super().__init__(**kwargs)
        self.img_folder = img_folder
        self.json_folder = json_folder
        self.batch_size = batch_size
        self.img_size = img_size
        # os.listdir order is arbitrary, so two raw listings cannot be paired
        # by position; build explicitly aligned lists instead.
        self.img_files, self.json_files = self._pair_files(
            os.listdir(img_folder), os.listdir(json_folder)
        )
        self.indexes = np.arange(len(self.img_files))

    @staticmethod
    def _pair_files(img_names, json_names):
        """Return aligned ``(img_files, json_files)`` lists matched by stem.

        An annotation belongs to an image when its stem equals the image stem,
        or equals the image stem followed by a suffix introduced by ``_``,
        ``-`` or ``.`` (e.g. ``page1.png`` <-> ``page1_words.json``). When an
        annotation stem extends several image stems, the longest image stem
        wins; when several annotations map to one image, the shortest
        annotation name wins. Annotations without an image are ignored.
        """
        img_stems = {os.path.splitext(name)[0] for name in img_names}

        json_for_stem = {}
        # Shortest names first so an exact-stem annotation beats a suffixed one.
        for json_name in sorted(json_names, key=lambda n: (len(n), n)):
            candidate = os.path.splitext(json_name)[0]
            # Strip trailing separator-delimited segments until an image stem is hit.
            while candidate and candidate not in img_stems:
                cut = max(candidate.rfind(sep) for sep in "_-.")
                candidate = candidate[:cut] if cut > 0 else ""
            if candidate:
                json_for_stem.setdefault(candidate, json_name)

        img_files, json_files, missing = [], [], []
        for img_name in sorted(img_names):
            json_name = json_for_stem.get(os.path.splitext(img_name)[0])
            if json_name is None:
                missing.append(img_name)
            else:
                img_files.append(img_name)
                json_files.append(json_name)

        if missing:
            raise ValueError(
                f"{len(missing)} image file(s) have no matching annotation file, "
                f"e.g. {missing[:5]}"
            )
        return img_files, json_files

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.img_files) / self.batch_size))

    def __getitem__(self, index):
        """Load batch ``index`` and return ``(X, y)`` as float32 arrays.

        ``X`` has shape ``(n, height, width, 3)`` with values in [0, 1];
        ``y`` has shape ``(n, 4)`` holding the normalized bbox of each sample.
        """
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        height, width = self.img_size

        X = np.zeros((len(batch_indexes), height, width, 3), dtype=np.float32)
        y = np.zeros((len(batch_indexes), 4), dtype=np.float32)

        for i, k in enumerate(batch_indexes):
            img_path = os.path.join(self.img_folder, self.img_files[k])
            json_path = os.path.join(self.json_folder, self.json_files[k])

            # Load the image, remember its original size for bbox normalization,
            # and force 3 channels so grayscale/RGBA files fit the batch array.
            with Image.open(img_path) as img:
                orig_width, orig_height = img.size
                # PIL expects (width, height), the reverse of the array layout.
                resized = img.convert("RGB").resize((width, height))
            X[i] = np.asarray(resized, dtype=np.float32) / 255.0  # Normalize to [0, 1]

            # Load JSON and extract the bbox of the first entry.
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not (isinstance(data, list) and len(data) > 0):
                raise ValueError(f"Invalid JSON format in {json_path}. Expected a list of dictionaries with 'bbox' key.")

            # Assumes bbox is [x_min, y_min, x_max, y_max] in pixels of the
            # original (un-resized) image - TODO confirm against the dataset.
            bbox = data[0]['bbox']
            y[i] = (
                bbox[0] / orig_width,   # x_min / width
                bbox[1] / orig_height,  # y_min / height
                bbox[2] / orig_width,   # x_max / width
                bbox[3] / orig_height,  # y_max / height
            )

        return X, y

    def on_epoch_end(self):
        """Reshuffle the sample order in place after each epoch."""
        np.random.shuffle(self.indexes)

In [None]:
# Step 2: Define the CNN Model for Table Detection
def create_cnn_model(input_shape):
    """Build the (uncompiled) CNN that regresses one bounding box per image.

    Three Conv2D + MaxPooling2D stages (32, 64, 128 filters) are followed by a
    128-unit dense layer and a linear 4-unit output.

    Args:
        input_shape: Shape of a single input image, e.g. ``(128, 128, 3)``.

    Returns:
        A ``Sequential`` model whose output has 4 values per image.
    """
    model = Sequential([
        # Declare the input explicitly; passing input_shape= to the first layer
        # is deprecated in current Keras and triggers a UserWarning.
        tf.keras.Input(shape=input_shape),
        Conv2D(32, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(4)  # Output 4 values for bbox [x_min, y_min, x_max, y_max]
    ])
    return model

# Step 3: Train the Table Detection Model
def train_table_detection_model(train_img_folder, train_json_folder, val_img_folder, val_json_folder, batch_size=12, img_size=(128, 128), epochs=5):
    """Train the bounding-box regression CNN and return it with its history.

    Args:
        train_img_folder: Directory with the training images.
        train_json_folder: Directory with the training annotations.
        val_img_folder: Directory with the validation images.
        val_json_folder: Directory with the validation annotations.
        batch_size: Batch size used by both data generators.
        img_size: Image size handed to the generators; the model input shape
            is ``(*img_size, 3)``.
        epochs: Number of training epochs.

    Returns:
        ``(model, history)``: the fitted model and the object returned by
        ``model.fit``.
    """
    model_input_shape = (*img_size, 3)

    # One generator per split; both share the batch size and image size.
    train_batches, val_batches = (
        DataGenerator(img_dir, json_dir, batch_size, img_size)
        for img_dir, json_dir in (
            (train_img_folder, train_json_folder),
            (val_img_folder, val_json_folder),
        )
    )

    # Bounding-box prediction is a regression problem, hence the MSE loss.
    network = create_cnn_model(model_input_shape)
    network.compile(optimizer='adam', loss='mean_squared_error')

    fit_history = network.fit(
        train_batches,
        validation_data=val_batches,
        epochs=epochs,
    )
    return network, fit_history


# Step 4: Predict Bounding Boxes for Test Images
def predict_bboxes(model, img_folder, output_csv, img_size=(128, 128)):
    """Predict one bounding box per image in a folder and write them to a CSV.

    Args:
        model: Object with a Keras-style ``predict(batch, verbose=0)`` method
            returning one 4-value row per input image.
        img_folder: Directory whose files are all opened as images.
        output_csv: Path of the CSV file to write.
        img_size: ``(height, width)`` the images are resized to before
            prediction; it should match the size the model was trained with.

    The CSV has the columns ``image`` (file name) and ``bbox`` (a
    ``[x_min, y_min, x_max, y_max]`` list as output by the model). The header
    is written even when the folder is empty.
    """
    import pandas as pd  # local import: only needed for the CSV export

    height, width = img_size
    results = []

    # Sorted so the CSV rows come out in a reproducible order.
    for img_file in sorted(os.listdir(img_folder)):
        img_path = os.path.join(img_folder, img_file)

        # Load and preprocess the image: force 3 channels, resize, scale to [0, 1].
        with Image.open(img_path) as img:
            # PIL expects (width, height), the reverse of the array layout.
            resized = img.convert("RGB").resize((width, height))
        img_array = np.asarray(resized, dtype=np.float32) / 255.0
        img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension

        # Predict the bounding box; verbose=0 avoids one progress bar per image.
        bbox = model.predict(img_array, verbose=0)[0]

        # Store plain Python floats: a raw numpy array would be written to the
        # CSV as a space-separated, precision-truncated repr.
        results.append({
            'image': img_file,
            'bbox': [float(value) for value in bbox]
        })

    # Save results to a CSV file (explicit columns keep the header for empty input).
    df = pd.DataFrame(results, columns=['image', 'bbox'])
    df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

In [None]:
  # Step 5: Run the Pipeline
if __name__ == "__main__":
    # Reuse the per-split directories from IMAGE_DIRS / ANNOTATION_DIRS instead
    # of re-typing the same literals, so a change to DATASET_PATH propagates here.
    train_img_folder = IMAGE_DIRS["train"]
    train_json_folder = ANNOTATION_DIRS["train"]
    val_img_folder = IMAGE_DIRS["val"]
    val_json_folder = ANNOTATION_DIRS["val"]
    test_img_folder = IMAGE_DIRS["test"]
    # output_csv = os.path.join(DATASET_PATH, 'output_bboxes.csv')

    # Set batch size
    batch_size = 12  # You can change this value

    # Step 1: Train the table detection model
    print("Training table detection model...")
    model, history = train_table_detection_model(
        train_img_folder,
        train_json_folder,
        val_img_folder,
        val_json_folder,
        batch_size=batch_size,
    )

    # Persist the trained model on Drive so it survives the Colab session.
    # NOTE(review): '.h5' is the legacy HDF5 format; the path is kept unchanged
    # so anything that loads this file keeps working.
    model.save('/content/drive/My Drive/table_detection_model.h5')

    # Step 2 (disabled): predict bounding boxes for the test images.
    # Uncomment together with output_csv above to enable.
    # print("Predicting bounding boxes for test images...")
    # predict_bboxes(model, test_img_folder, output_csv)


Training table detection model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


Epoch 1/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 88ms/step - loss: 0.4804 - val_loss: 0.4733
Epoch 2/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m423s[0m 85ms/step - loss: 0.4457 - val_loss: 0.4759
Epoch 3/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m377s[0m 75ms/step - loss: 0.4487 - val_loss: 0.4776
Epoch 4/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 73ms/step - loss: 0.4485 - val_loss: 0.4759
Epoch 5/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 72ms/step - loss: 0.4447 - val_loss: 0.4749


