<a href="https://colab.research.google.com/github/Quixowo/AI-vs.-Real-Images-Classifier-/blob/main/classifer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# --- Environment Setup & Imports ---
# Standard libraries
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import kagglehub

# Deep Learning libraries
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV3Large
from tensorflow.keras.applications.mobilenet_v3 import preprocess_input

# --- Data Acquisition ---
# Download dataset (only runs if not already present)
path = kagglehub.dataset_download("alessandrasala79/ai-vs-human-generated-dataset")
print("Dataset base path:", path)

TensorFlow Version: 2.19.0
Resuming download from 463470592 bytes (10011918990 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/alessandrasala79/ai-vs-human-generated-dataset?dataset_version_number=4 (463470592/10475389582) bytes left.


100%|██████████| 9.76G/9.76G [01:41<00:00, 99.0MB/s]

Extracting files...





Dataset base path: /root/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4


In [8]:
# ---- Data Preparation ----

# Load train CSV
train_csv = os.path.join(path, "train.csv")
train_df = pd.read_csv(train_csv)

# Create filtered DataFrames with full paths

real_df = train_df[train_df['label'] == 0].reset_index(drop=True)
fake_df = train_df[train_df['label'] == 1].reset_index(drop=True)

real_df['full_path'] = real_df['file_name'].apply(lambda x: os.path.join(path, x))
fake_df['full_path'] = fake_df['file_name'].apply(lambda x: os.path.join(path, x))
real_df = real_df.drop(columns = ['Unnamed: 0', 'file_name'])
fake_df = fake_df.drop(columns = ['Unnamed: 0', 'file_name'])

In [9]:
# --- 1. Combine and Shuffle ---
# Combine both DataFrames into one.
all_data_df = pd.concat([real_df, fake_df])

# IMPORTANT: Must shuffle the combined data
all_data_df = all_data_df.sample(frac=1).reset_index(drop=True)

print(f"\nTotal combined images: {len(all_data_df)}")
print(all_data_df.head())

# This makes it easier for the generator to understand the class names.
# It will see '0' and '1' as two distinct class names.
all_data_df['label'] = all_data_df['label'].astype(str)

# --- 2. Improved Configuration ---
TARGET_IMG_SIZE = (256, 256)
BATCH_SIZE = 32

# --- 3. Create the ImageDataGenerator ---
# This generator will read paths from the DataFrame and load images in batches.
# It also automatically:
# - Rescales pixels (divides by 255)
# - Splits data into training (80%) and validation (20%)
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.2,
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1,
    fill_mode='nearest',
)


print("\nCreating Training Generator...")
train_generator = datagen.flow_from_dataframe(
    dataframe=all_data_df,
    x_col='full_path',        # Column in your DF with file paths
    y_col='label',            # Column in your DF with labels
    target_size=TARGET_IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',      # Use 'binary' for 2 classes (0 and 1)
    subset='training'         # Specify this is the training set
)

print("\nCreating Validation Generator...")
validation_generator = datagen.flow_from_dataframe(
    dataframe=all_data_df,
    x_col='full_path',
    y_col='label',
    target_size=TARGET_IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='validation'       # Specify this is the validation set
)


Total combined images: 79950
   label                                          full_path
0      0  /root/.cache/kagglehub/datasets/alessandrasala...
1      1  /root/.cache/kagglehub/datasets/alessandrasala...
2      0  /root/.cache/kagglehub/datasets/alessandrasala...
3      0  /root/.cache/kagglehub/datasets/alessandrasala...
4      1  /root/.cache/kagglehub/datasets/alessandrasala...

Creating Training Generator...
Found 63960 validated image filenames belonging to 2 classes.

Creating Validation Generator...
Found 15990 validated image filenames belonging to 2 classes.


In [None]:
# --- 4. Build Optimized MobileNetV3Large ---
base_model = MobileNetV3Large(
    input_shape=(TARGET_IMG_SIZE[0], TARGET_IMG_SIZE[1], 3),
    include_top=False,
    weights='imagenet'
)

# Freeze all layers initially
base_model.trainable = False

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = BatchNormalization()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=base_model.input, outputs=output)

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# --- 5. Training Phase 1: The "Warm Up" (1 Epoch only) ---
print("\nTraining phase 1: Transfer Learning (Base Frozen)")
model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=1
)

# --- 6. Training Phase 2: Aggressive Fine-Tuning (9 Epochs) ---
# Unfreeze top 50 layers of MobileNetV3Large
for layer in base_model.layers[-50:]:
    layer.trainable = True

# Add a scheduler: If validation loss plateaus, drop LR to squeeze out accuracy
lr_reducer = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=2,
    min_lr=1e-6,
    verbose=1
)

model.compile(
    optimizer=Adam(learning_rate=5e-5),  # very low LR for fine tuning
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("\nTraining phase 2: Fine-tuning")
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=9,
    callbacks=[lr_reducer]
)

print("\nTraining Complete.")

In [13]:
from tensorflow.keras.models import load_model

# 1. Load model, model was downloaded when training completed
model = load_model('ai_vs_human_mobilenetv3.keras') # or .h5

# 2. Run evaluation on validation set
print("Evaluating model on validation data...")
val_loss, val_acc = model.evaluate(validation_generator)

print(f"\nFinal Results on Validation Set:")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Validation Loss: {val_loss:.4f}")

Evaluating model on validation data...
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 524ms/step - accuracy: 0.9992 - loss: 0.0025

Final Results on Validation Set:
Validation Accuracy: 0.9989
Validation Loss: 0.0038
