In [77]:
import pandas as pd

# Location of the CSV manifest that lists every cropped image
file_path = 'data/cropped-by-semantic-tag/_images_data.csv'

# Read the manifest into a DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows to check the column layout
data.head()

Unnamed: 0,id,tag,img_path
0,0,a,..\..\..\data\cropped-by-semantic-tag\0.png
1,1,a,..\..\..\data\cropped-by-semantic-tag\1.png
2,2,header,..\..\..\data\cropped-by-semantic-tag\2.png
3,3,button,..\..\..\data\cropped-by-semantic-tag\3.png
4,4,button,..\..\..\data\cropped-by-semantic-tag\4.png


In [78]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; stratifying on `tag` keeps the class proportions
# of the full dataset in both parts.
train_data, remaining_data = train_test_split(
    data,
    test_size=0.3,
    stratify=data['tag'],
    random_state=42,
)

# Cut the held-out 30% in half: one half for validation, one half for testing.
validation_data, test_data = train_test_split(
    remaining_data,
    test_size=0.5,
    stratify=remaining_data['tag'],
    random_state=42,
)

# Row counts of the three splits, in train / validation / test order
tuple(len(split) for split in (train_data, validation_data, test_data))


(9323, 1998, 1998)

In [80]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Target width and height (in pixels) of the images
IMG_WIDTH = 150
IMG_HEIGHT = 150

# Random augmentations used for the training images only
augmentation = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Every generator rescales pixel values from [0, 255] to [0, 1]
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation)

# Validation and test images are only rescaled, never augmented
validation_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

In [82]:

# Create generators
def update_image_path(old_path):
    """Return the path of an image inside ``data/cropped-by-semantic-tag/``.

    Only the file name of ``old_path`` is kept; any directory part is dropped
    and replaced by the fixed ``data/cropped-by-semantic-tag/`` prefix.

    Both the backslash and the forward slash are treated as path separators.
    This makes the function idempotent: feeding it a path it already produced
    returns that path unchanged instead of prefixing the directory a second
    time.

    Args:
        old_path: Image path as a string, using either separator style.

    Returns:
        The string ``data/cropped-by-semantic-tag/<file name>``.
    """
    # Normalise separators first so the file name is found for both
    # Windows-style and POSIX-style inputs.
    filename = old_path.replace('\\', '/').rsplit('/', 1)[-1]
    return f'data/cropped-by-semantic-tag/{filename}'


# Rewrite the manifest paths so they point into data/cropped-by-semantic-tag/.
# This must happen BEFORE the generators are built, because
# flow_from_dataframe looks at the file names at the moment it is called.
#
# The raw paths are always read from the untouched `data` frame and matched to
# each split through the index labels that train_test_split preserved, so
# re-running this cell always yields the same paths.
raw_img_paths = data['img_path']
train_data = train_data.assign(
    img_path=raw_img_paths.loc[train_data.index].map(update_image_path)
)
validation_data = validation_data.assign(
    img_path=raw_img_paths.loc[validation_data.index].map(update_image_path)
)
test_data = test_data.assign(
    img_path=raw_img_paths.loc[test_data.index].map(update_image_path)
)

# Augmented batches for training
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_data,
    x_col='img_path',
    y_col='tag',
    target_size=(150, 150),  # Adjust based on your image size
    batch_size=32,
    class_mode='categorical'
)

# Rescaled-only batches for validation
validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_data,
    x_col='img_path',
    y_col='tag',
    target_size=(150, 150),
    batch_size=32,
    class_mode='categorical'
)

# Rescaled-only batches for testing
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_data,
    x_col='img_path',
    y_col='tag',
    target_size=(150, 150),
    batch_size=32,
    class_mode='categorical'
)





Found 9323 validated image filenames belonging to 11 classes.
Found 1998 validated image filenames belonging to 11 classes.
Found 1998 validated image filenames belonging to 11 classes.


In [83]:
# Number of rows per tag in each of the three splits
train_tag_counts = train_data['tag'].value_counts()
validation_tag_counts = validation_data['tag'].value_counts()
test_tag_counts = test_data['tag'].value_counts()

# Print each table under its heading, in train / validation / test order
for heading, counts in (
    ("Training Data Tag Counts:", train_tag_counts),
    ("\nValidation Data Tag Counts:", validation_tag_counts),
    ("\nTest Data Tag Counts:", test_tag_counts),
):
    print(heading)
    print(counts)


Training Data Tag Counts:
tag
h3          1885
button      1837
h2          1656
a           1376
input        863
h4           573
header       373
h1           303
form         288
footer       161
textarea       8
Name: count, dtype: int64

Validation Data Tag Counts:
tag
h3          404
button      394
h2          354
a           295
input       185
h4          123
header       80
h1           65
form         62
footer       34
textarea      2
Name: count, dtype: int64

Test Data Tag Counts:
tag
h3          404
button      394
h2          355
a           295
input       185
h4          122
header       80
h1           65
form         61
footer       35
textarea      2
Name: count, dtype: int64


In [84]:
# Function to oversample a dataset
def oversample_dataset(dataset):
    """Balance ``dataset`` by sampling every tag up to the largest tag's size.

    Rows of each ``tag`` group are drawn with replacement until the group has
    as many rows as the most frequent tag, so all tags end up equally
    represented. Groups are concatenated in ``groupby('tag')`` order and keep
    their original index labels (duplicated rows share a label).

    The sampling uses pandas' own ``DataFrame.sample`` so the function depends
    only on pandas, which the notebook already imports.

    Args:
        dataset: DataFrame with a ``tag`` column.

    Returns:
        A new DataFrame holding ``n_tags * majority_class_size`` rows. An
        empty input yields an empty copy of ``dataset``.
    """
    if dataset.empty:
        # No groups to sample from; keep the columns and return nothing.
        return dataset.copy()

    majority_class_size = int(dataset['tag'].value_counts().max())

    # Sample every group first and concatenate once, instead of growing a
    # DataFrame inside the loop. The fixed seed keeps the result reproducible.
    oversampled_groups = [
        group.sample(n=majority_class_size, replace=True, random_state=42)
        for _, group in dataset.groupby('tag')
    ]
    return pd.concat(oversampled_groups)

# Balance each split so that every tag has as many rows as its largest tag.
# NOTE(review): the validation and test splits are oversampled as well, so
# metrics computed on them are based on duplicated rows - confirm this is
# intended.
oversampled_train, oversampled_validation, oversampled_test = (
    oversample_dataset(split)
    for split in (train_data, validation_data, test_data)
)

# Show the image paths of the balanced training split
print(oversampled_train['img_path'])


631        data/cropped-by-semantic-tag/631.png
8262      data/cropped-by-semantic-tag/8262.png
9380      data/cropped-by-semantic-tag/9380.png
1609      data/cropped-by-semantic-tag/1609.png
2929      data/cropped-by-semantic-tag/2929.png
                          ...                  
5797      data/cropped-by-semantic-tag/5797.png
12296    data/cropped-by-semantic-tag/12296.png
8041      data/cropped-by-semantic-tag/8041.png
10035    data/cropped-by-semantic-tag/10035.png
8041      data/cropped-by-semantic-tag/8041.png
Name: img_path, Length: 20735, dtype: object


In [85]:
# Create generators from the oversampled splits


# Options shared by the three generators
generator_options = dict(
    x_col='img_path',
    y_col='tag',
    target_size=(150, 150),  # Adjust based on your image size
    batch_size=32,
    class_mode='categorical',
)

# Augmented batches drawn from the balanced training split
train_generator = train_datagen.flow_from_dataframe(
    dataframe=oversampled_train, **generator_options
)

# Rescaled-only batches drawn from the balanced validation split
validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=oversampled_validation, **generator_options
)

# Rescaled-only batches drawn from the balanced test split
test_generator = test_datagen.flow_from_dataframe(
    dataframe=oversampled_test, **generator_options
)

Found 20735 validated image filenames belonging to 11 classes.
Found 4444 validated image filenames belonging to 11 classes.
Found 4444 validated image filenames belonging to 11 classes.


In [86]:
from tensorflow.keras import layers, models

# Small CNN classifier: three Conv2D + MaxPooling2D stages followed by a dense
# head with one softmax output per tag.
model = models.Sequential([
    # Keras image tensors are ordered (height, width, channels), the same
    # order as the generators' target_size=(height, width).
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    # One output unit per class discovered by the training generator
    layers.Dense(len(train_generator.class_indices), activation='softmax')
])

# Categorical cross-entropy matches the generators' class_mode='categorical'
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [87]:
# One pass over every batch of each generator per epoch
train_steps = len(train_generator)
validation_steps = len(validation_generator)

# Train for 10 epochs, evaluating on the validation generator after each one
history = model.fit(
    x=train_generator,
    epochs=10,
    steps_per_epoch=train_steps,
    validation_data=validation_generator,
    validation_steps=validation_steps,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [88]:
# Write the trained model to model.keras in the working directory
model.save(filepath='model.keras')