In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
# -------------------------------
# 1. Read CSV and Prepare the Data
# -------------------------------
# CSV file path on Kaggle
csv_path = '/kaggle/input/offensivelabels/Dataset.csv'
df = pd.read_csv(csv_path)

# Create a new column 'category' based on the CSV labels:
# If 'nude' column is 1, then label as 'nude', otherwise label as 'safe'.
df['category'] = df.apply(lambda row: 'nude' if row['nude'] == 1 else 'safe', axis=1)

# Create a 'filepath' column to indicate the relative path from the image directory.
# For example, if category is 'nude', then filepath will be 'nude/imagename.jpg'.
df['filepath'] = df['category'] + '/' + df['image_name']

In [3]:
# -------------------------------
# 2. Split and Oversample the Data
# -------------------------------
# Split the data into training and validation sets (using stratification to preserve class distribution)
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

# Separate training data by category
train_df_nude = train_df[train_df['category'] == 'nude']
train_df_safe = train_df[train_df['category'] == 'safe']

# Oversample the minority class (nude) to match the number of safe images.
train_df_nude_oversampled = resample(train_df_nude,
                                     replace=True,
                                     n_samples=len(train_df_safe),
                                     random_state=42)

# Combine the oversampled nude images with the safe images
train_df_balanced = pd.concat([train_df_safe, train_df_nude_oversampled])

print("After oversampling, class distribution:")
print(train_df_balanced['category'].value_counts())

After oversampling, class distribution:
category
safe    837
nude    837
Name: count, dtype: int64


In [4]:
# -------------------------------
# 3. Set Up Data Generators
# -------------------------------
IMG_HEIGHT, IMG_WIDTH = 224, 224
BATCH_SIZE = 32

# Define a data augmentation generator for the training set.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    horizontal_flip=True,
    rotation_range=20,
    zoom_range=0.2
)

# For the validation set, only preprocessing is applied.
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

In [5]:
# Directory where the images are stored (contains subfolders 'nude' and 'safe')
images_dir = '/kaggle/input/offensiveimg/dataset'

In [6]:
# Create the training generator from the balanced training DataFrame.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df_balanced,
    directory=images_dir,  # base directory with subfolders
    x_col='filepath',
    y_col='category',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',  # since we have two classes (nude and safe)
    shuffle=True
)

# Create the validation generator.
validation_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=images_dir,
    x_col='filepath',
    y_col='category',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False
)

Found 1362 validated image filenames belonging to 2 classes.




Found 260 validated image filenames belonging to 2 classes.




In [None]:
# -------------------------------
# 4. Build the Model Using Transfer Learning
# -------------------------------
# Load MobileNetV2 with pretrained ImageNet weights (without the top layer)
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))
base_model.trainable = False  # Freeze the base model

# Add custom layers for classification
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
predictions = Dense(2, activation='softmax')(x)  # Two classes: nude and safe

# Define the full model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [8]:
# -------------------------------
# 5. Train the Model
# -------------------------------
# Define callbacks for saving the best model and early stopping
callbacks = [
    ModelCheckpoint('model_best.keras', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1),
    EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1)
]

EPOCHS = 20
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // BATCH_SIZE,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks
)

Epoch 1/20


  self._warn_if_super_not_called()


[1m24/42[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m23s[0m 1s/step - accuracy: 0.6147 - loss: 0.6976



[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.6991 - loss: 0.5764




Epoch 1: val_accuracy improved from -inf to 0.96484, saving model to model_best.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - accuracy: 0.7025 - loss: 0.5713 - val_accuracy: 0.9648 - val_loss: 0.1269
Epoch 2/20
[1m 1/42[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 38ms/step - accuracy: 1.0000 - loss: 0.0504

  self.gen.throw(typ, value, traceback)



Epoch 2: val_accuracy improved from 0.96484 to 1.00000, saving model to model_best.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 79ms/step - accuracy: 1.0000 - loss: 0.0504 - val_accuracy: 1.0000 - val_loss: 0.1020
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 958ms/step - accuracy: 0.9847 - loss: 0.0837
Epoch 3: val_accuracy did not improve from 1.00000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 1s/step - accuracy: 0.9846 - loss: 0.0837 - val_accuracy: 0.9766 - val_loss: 0.0775
Epoch 4/20
[1m 1/42[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 45ms/step - accuracy: 0.9688 - loss: 0.0806
Epoch 4: val_accuracy did not improve from 1.00000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9688 - loss: 0.0806 - val_accuracy: 1.0000 - val_loss: 0.1124
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 912ms/step - accuracy: 0.9811 - loss: 0.0670
Epoch 5: v

In [9]:
# -------------------------------
# 6. Save and Evaluate the Model
# -------------------------------
model.save('final_model_oversampled.keras')

val_loss, val_acc = model.evaluate(validation_generator)
print(f'Validation accuracy after oversampling: {val_acc:.4f}')

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 839ms/step - accuracy: 0.9761 - loss: 0.1240
Validation accuracy after oversampling: 0.9654
