In [22]:
import pandas as pd

# Load the CSV file
file_path = 'data/cropped-by-semantic-tag/_images_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe to understand its structure
data.head()

Unnamed: 0,id,tag,img_path
0,0,a,..\..\..\data\cropped-by-semantic-tag\0.png
1,1,a,..\..\..\data\cropped-by-semantic-tag\1.png
2,2,header,..\..\..\data\cropped-by-semantic-tag\2.png
3,3,button,..\..\..\data\cropped-by-semantic-tag\3.png
4,4,button,..\..\..\data\cropped-by-semantic-tag\4.png


In [23]:
from sklearn.model_selection import train_test_split

# Split the data into training and remaining data with a 70-30 split
train_data, remaining_data = train_test_split(data, test_size=0.3, stratify=data['tag'], random_state=42)

# Split the remaining data equally into validation and test sets
validation_data, test_data = train_test_split(remaining_data, test_size=0.5, stratify=remaining_data['tag'], random_state=42)

# Display the size of each set
(len(train_data), len(validation_data), len(test_data))


(9323, 1998, 1998)

In [24]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [25]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
IMG_WIDTH, IMG_HEIGHT = 150, 150  # Define the target width and height of images

train_datagen = ImageDataGenerator(
    rescale=1./255,  # Rescale pixel values from [0, 255] to [0, 1]
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Note: No data augmentation should be applied to validation and test sets
validation_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

In [28]:
# //Create generators
def update_image_path(old_path):
    # Split the path and get the last part (the filename)
    filename = old_path.split('\\')[-1]
    # Construct the new path
    new_path = f'data/cropped-by-semantic-tag/{filename}'
    return new_path


train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_data,
    x_col='img_path',
    y_col='tag',
    target_size=(150, 150),  # Adjust based on your image size
    batch_size=32,
    class_mode='categorical'
)

validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_data,
    x_col='img_path',
    y_col='tag',
    target_size=(150, 150),
    batch_size=32,
    class_mode='categorical'
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_data,
    x_col='img_path',
    y_col='tag',
    target_size=(150, 150),
    batch_size=32,
    class_mode='categorical'
)

# Apply this transformation to each DataFrame
# train_data['img_path'] = train_data['img_path'].apply(update_image_path)
# validation_data['img_path'] = validation_data['img_path'].apply(update_image_path)
# test_data['img_path'] = test_data['img_path'].apply(update_image_path)





Found 9323 validated image filenames belonging to 11 classes.
Found 1998 validated image filenames belonging to 11 classes.
Found 1998 validated image filenames belonging to 11 classes.


In [35]:
train_tag_counts = train_data['tag'].value_counts()
print("Training Data Tag Counts:")
print(train_tag_counts)

# Count instances of each tag in the validation data
validation_tag_counts = validation_data['tag'].value_counts()
print("\nValidation Data Tag Counts:")
print(validation_tag_counts)

# Count instances of each tag in the test data
test_tag_counts = test_data['tag'].value_counts()
print("\nTest Data Tag Counts:")
print(test_tag_counts)


Training Data Tag Counts:
tag
h3          1885
button      1837
h2          1656
a           1376
input        863
h4           573
header       373
h1           303
form         288
footer       161
textarea       8
Name: count, dtype: int64

Validation Data Tag Counts:
tag
h3          404
button      394
h2          354
a           295
input       185
h4          123
header       80
h1           65
form         62
footer       34
textarea      2
Name: count, dtype: int64

Test Data Tag Counts:
tag
h3          404
button      394
h2          355
a           295
input       185
h4          122
header       80
h1           65
form         61
footer       35
textarea      2
Name: count, dtype: int64


Collecting imbalanced-learn==0.8.0
  Obtaining dependency information for imbalanced-learn==0.8.0 from https://files.pythonhosted.org/packages/80/98/dc784205a7e3034e84d41ac4781660c67ad6327f2f5a80c568df31673d1c/imbalanced_learn-0.8.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl.metadata (11 kB)
Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.5/206.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.12.0
    Uninstalling imbalanced-learn-0.12.0:
      Successfully uninstalled imbalanced-learn-0.12.0
Successfully installed imbalanced-learn-0.8.0
Note: you may need to restart the kernel to use updated packages.


In [53]:
from sklearn.utils import resample

# Identify the majority class
majority_class_size_train = train_data['tag'].value_counts().max()

# Create a new DataFrame for the oversampled data
oversampled_data_train = pd.DataFrame()

for tag, group in train_data.groupby('tag'):
    oversampled_group = resample(group,
                                  replace=True,  # sample with replacement
                                  n_samples=majority_class_size_train,  # match number in majority class
                                  random_state=42)  # reproducible results
    oversampled_data_train = pd.concat([oversampled_data, oversampled_group])

# Now, `oversampled_data` contains a balanced dataset

print(oversampled_data_train['tag'].value_counts())



tag
textarea    1885
Name: count, dtype: int64


In [46]:
train_tag_counts = train_data['tag'].value_counts()
print("Training Data Tag Counts:")
print(train_tag_counts)

# Count instances of each tag in the validation data
validation_tag_counts = validation_data['tag'].value_counts()
print("\nValidation Data Tag Counts:")
print(validation_tag_counts)

# Count instances of each tag in the test data
test_tag_counts = test_data['tag'].value_counts()
print("\nTest Data Tag Counts:")
print(test_tag_counts)

Training Data Tag Counts:
tag
h3          1885
button      1837
h2          1656
a           1376
input        863
h4           573
header       373
h1           303
form         288
footer       161
textarea       8
Name: count, dtype: int64

Validation Data Tag Counts:
tag
h3          404
button      394
h2          354
a           295
input       185
h4          123
header       80
h1           65
form         62
footer       34
textarea      2
Name: count, dtype: int64

Test Data Tag Counts:
tag
h3          404
button      394
h2          355
a           295
input       185
h4          122
header       80
h1           65
form         61
footer       35
textarea      2
Name: count, dtype: int64


In [11]:
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(len(train_generator.class_indices), activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


2024-03-04 22:35:02.286227: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-03-04 22:35:02.286263: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-03-04 22:35:02.286268: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-03-04 22:35:02.286303: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-04 22:35:02.286321: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=10,  # You can adjust the number of epochs
    validation_data=validation_generator,
    validation_steps=len(validation_generator)
)


Epoch 1/10


2024-03-04 22:35:06.104938: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 