<a href="https://colab.research.google.com/github/Ritviks21/Ai-image-detection-/blob/main/AI_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Day 1-2: Environment Setup

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

print("TensorFlow version:", tf.__version__)
print("GPU available:", tf.config.list_physical_devices('GPU'))

# Expected output:
# TensorFlow version: X.X.X (some version number)
# GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
# If GPU available is [], it means your GPU runtime is not enabled or recognized. Go back to step 3 above.

TensorFlow version: 2.18.0
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [10]:
# Day 3-5: Get Datasets - CIFAKE Dataset (Corrected for /kaggle.json path)

import os # Ensure os is imported

# 1. Install Kaggle library
print("Installing Kaggle library...")
!pip install kaggle -q

# 2. Set up Kaggle API key
print("\nSetting up Kaggle API key...")
# --- IMPORTANT CHANGE HERE: PATH IS NOW /kaggle.json ---
kaggle_json_source_path = '/kaggle.json' # Look in the root directory
kaggle_config_dir = os.path.expanduser('~/.kaggle') # This correctly resolves to /root/.kaggle/

if not os.path.exists(kaggle_json_source_path):
    print(f"ERROR: '{kaggle_json_source_path}' not found. Please ensure it's uploaded to the root directory '/'.")
else:
    # Create the target directory if it doesn't exist
    !mkdir -p {kaggle_config_dir}
    # Copy from the root directory to the correct Kaggle config directory
    !cp {kaggle_json_source_path} {kaggle_config_dir}/kaggle.json
    !chmod 600 {kaggle_config_dir}/kaggle.json # Set permissions for security
    print("Kaggle API key setup complete.")

# 3. Try to download the CIFAKE dataset using Kaggle API
# This will only succeed if kaggle.json was correctly set up.
print("\nAttempting to download CIFAKE dataset via Kaggle API...")
# We use the '--force' flag to ensure it tries to download even if a partial file exists.
# Download will go to /content/ by default.
!kaggle datasets download -d birdy654/cifake-real-and-ai-generated-synthetic-images --force

# 4. Determine the correct zip file path
# We check for both possible names: the default Kaggle API download name and 'archive.zip'
zip_file_name_kag = 'cifake-real-and-ai-generated-synthetic-images.zip'
zip_file_name_arc = 'archive.zip'

downloaded_zip_path = os.path.join('/content/', zip_file_name_kag)
uploaded_zip_path = os.path.join('/content/', zip_file_name_arc) # Assuming 'archive.zip' is in /content/

final_zip_path = None
if os.path.exists(downloaded_zip_path):
    final_zip_path = downloaded_zip_path
    print(f"\nFound Kaggle-downloaded zip file: {final_zip_path}")
elif os.path.exists(uploaded_zip_path):
    final_zip_path = uploaded_zip_path
    print(f"\nFound manually uploaded zip file: {final_zip_path}")
else:
    print("\nERROR: No expected dataset zip file found. Neither the Kaggle download nor 'archive.zip' is present.")
    print("Please ensure the download succeeded OR you have manually uploaded 'archive.zip' to /content/.")


# 5. Unzip the dataset if a zip file was found
if final_zip_path:
    dataset_extract_path = '/content/cifake_dataset'
    print(f"Unzipping dataset from {final_zip_path} to {dataset_extract_path}...")
    !unzip -q -o {final_zip_path} -d {dataset_extract_path}
    print("Dataset unzipping complete.")

    # 6. Verify the contents after unzipping
    print(f"\n--- Verifying contents of {dataset_extract_path} ---")
    if os.path.exists(dataset_extract_path):
        print(f"Directory '{dataset_extract_path}' created successfully.")
        print("Contents of cifake_dataset:")
        !ls -F {dataset_extract_path}/
        print("\nContents of cifake_dataset/train:")
        !ls -F {dataset_extract_path}/train/
        print("\nContents of cifake_dataset/test:")
        !ls -F {dataset_extract_path}/test/
    else:
        print(f"ERROR: Directory '{dataset_extract_path}' was NOT created after unzipping.")
        print("Something might be wrong with the zip file content or extraction process.")

Installing Kaggle library...

Setting up Kaggle API key...
Kaggle API key setup complete.

Attempting to download CIFAKE dataset via Kaggle API...
Dataset URL: https://www.kaggle.com/datasets/birdy654/cifake-real-and-ai-generated-synthetic-images
License(s): other
Downloading cifake-real-and-ai-generated-synthetic-images.zip to /content
  0% 0.00/105M [00:00<?, ?B/s]
100% 105M/105M [00:00<00:00, 1.40GB/s]

Found Kaggle-downloaded zip file: /content/cifake-real-and-ai-generated-synthetic-images.zip
Unzipping dataset from /content/cifake-real-and-ai-generated-synthetic-images.zip to /content/cifake_dataset...
Dataset unzipping complete.

--- Verifying contents of /content/cifake_dataset ---
Directory '/content/cifake_dataset' created successfully.
Contents of cifake_dataset:
test/  train/

Contents of cifake_dataset/train:
FAKE/  REAL/

Contents of cifake_dataset/test:
FAKE/  REAL/


In [11]:
# Day 6-7: Data Exploration (Modified to avoid memory issues)

import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator # We'll use this later

# --- IMPORTANT: Define the base path to your dataset ---
# This path is where you unzipped the CIFAKE dataset in the previous step.
# It should be 'cifake_dataset' relative to your current Colab directory.
dataset_base_path = '/content/cifake_dataset' # This is the default location if you used `!unzip -d cifake_dataset`

# Verify that the train and test directories exist within the base path
train_dir = os.path.join(dataset_base_path, 'train')
test_dir = os.path.join(dataset_base_path, 'test') # CIFAKE uses 'test' for validation

if not os.path.exists(train_dir):
    print(f"ERROR: Training directory not found at {train_dir}")
    print("Please check your 'dataset_base_path' or how you unzipped the dataset.")
if not os.path.exists(test_dir):
    print(f"ERROR: Test/Validation directory not found at {test_dir}")
    print("Please check your 'dataset_base_path' or how you unzipped the dataset.")

print(f"Dataset base path: {dataset_base_path}")
print(f"Train directory: {train_dir}")
print(f"Test (Validation) directory: {test_dir}")

# You can check the number of images in each category
print(f"\nImages in Train/FAKE: {len(os.listdir(os.path.join(train_dir, 'FAKE')))}")
print(f"Images in Train/REAL: {len(os.listdir(os.path.join(train_dir, 'REAL')))}")
print(f"Images in Test/FAKE: {len(os.listdir(os.path.join(test_dir, 'FAKE')))}")
print(f"Images in Test/REAL: {len(os.listdir(os.path.join(test_dir, 'REAL')))}")

# We will NOT load all images into NumPy arrays here to save RAM.
# Instead, we'll use TensorFlow's ImageDataGenerator to load them in batches during training.
print("\nProceeding to Week 2. No large image arrays loaded into RAM yet.")

Dataset base path: /content/cifake_dataset
Train directory: /content/cifake_dataset/train
Test (Validation) directory: /content/cifake_dataset/test

Images in Train/FAKE: 50000
Images in Train/REAL: 50000
Images in Test/FAKE: 10000
Images in Test/REAL: 10000

Proceeding to Week 2. No large image arrays loaded into RAM yet.


In [12]:
# Day 8-10: Build Base Model

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model

# Create the model using transfer learning
def create_model():
    # Load pre-trained EfficientNetB0 model.
    # 'imagenet' weights provide a strong starting point.
    # 'include_top=False' means we don't include the classification head of EfficientNet,
    # as we'll add our own for binary classification (Real/AI).
    # 'input_shape' must match the image size we'll use (224x224 pixels, 3 color channels).
    base_model = EfficientNetB0(
        weights='imagenet',
        include_top=False,
        input_shape=(224, 224, 3)
    )

    # Freeze the layers of the base model.
    # This means their weights won't be updated during the initial training phase.
    # This prevents destroying the learned features and speeds up training.
    base_model.trainable = False

    # Add custom classification head on top of the base model
    x = base_model.output # Output of the EfficientNet base model
    x = GlobalAveragePooling2D()(x) # Reduces spatial dimensions to a single vector
    x = Dropout(0.2)(x) # Helps prevent overfitting by randomly setting some inputs to 0
    x = Dense(128, activation='relu')(x) # A fully connected layer with ReLU activation
    x = Dropout(0.2)(x) # Another dropout layer
    predictions = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification
                                                     # Sigmoid activation outputs a probability between 0 and 1

    # Combine base model and custom head into a single model
    model = Model(inputs=base_model.input, outputs=predictions)
    return model

# Create and compile the model
model = create_model()

# Compile the model:
# 'optimizer='adam'' is a popular and effective optimizer.
# 'loss='binary_crossentropy'' is suitable for binary classification problems (0 or 1).
# 'metrics=['accuracy']' tells TensorFlow to track accuracy during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Print a summary of the model's architecture
model.summary()

print("\nBase model created and compiled. Ready for data preprocessing.")

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step



Base model created and compiled. Ready for data preprocessing.


In [14]:
# Day 11-12: Data Preprocessing (MODIFIED for ImageDataGenerator)

from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os # Ensure os is imported for path joining

# --- Define image dimensions and batch size ---
image_size = (224, 224) # This must match the 'input_shape' defined in your model
batch_size = 32         # Number of images to process at once.
                        # Adjust this if you encounter Out-Of-Memory (OOM) errors during training.
                        # Common values: 32, 16, 8 (smaller if memory is an issue)

# --- Define the paths to your dataset folders (already verified) ---
dataset_base_path = '/content/cifake_dataset'
train_dir = os.path.join(dataset_base_path, 'train')
test_dir = os.path.join(dataset_base_path, 'test') # CIFAKE uses 'test' for validation

# --- Data Augmentation for Training Data ---
# 'rescale=1./255': Normalizes pixel values from [0, 255] to [0, 1]. This is essential for neural networks.
# 'rotation_range', 'width_shift_range', 'height_shift_range', 'horizontal_flip':
#   These randomly apply transformations to training images, creating slightly altered versions.
#   This helps the model generalize better and reduces overfitting.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)

# --- No Augmentation for Validation/Test Data ---
# Validation data should represent real-world conditions, so we only rescale/normalize it.
test_datagen = ImageDataGenerator(rescale=1./255)

# --- Create Data Generators ---
# 'flow_from_directory': This function reads images directly from your specified folders.
#   It automatically infers class labels from subfolder names (e.g., 'FAKE', 'REAL').
# 'target_size': All images will be resized to this dimension.
# 'color_mode': 'rgb' for color images.
# 'batch_size': How many images to yield at a time.
# 'class_mode': 'binary' because we have two classes (0 or 1).
# 'shuffle': 'True' for training data to randomize batch order, 'False' for test data for consistent evaluation.

print("Creating training data generator...")
train_generator = train_datagen.flow_from_directory(
    directory=train_dir,
    target_size=image_size,
    color_mode='rgb',
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True
)

print("\nCreating validation (test) data generator...")
test_generator = test_datagen.flow_from_directory(
    directory=test_dir, # Using the 'test' folder for validation
    target_size=image_size,
    color_mode='rgb',
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False # Important: Do NOT shuffle validation/test data
)

print("\n--- Data Generators Summary ---")
print(f"Found {train_generator.samples} training images belonging to {train_generator.num_classes} classes.")
print(f"Found {test_generator.samples} validation images belonging to {test_generator.num_classes} classes.")
print("Class indices (how labels are mapped to 0/1):", train_generator.class_indices)
# Expected output might be: {'FAKE': 0, 'REAL': 1} or {'REAL': 0, 'FAKE': 1}.
# Remember these mappings for interpreting your model's predictions later in the Streamlit app.

Creating training data generator...
Found 100000 images belonging to 2 classes.

Creating validation (test) data generator...
Found 20000 images belonging to 2 classes.

--- Data Generators Summary ---
Found 100000 training images belonging to 2 classes.
Found 20000 validation images belonging to 2 classes.
Class indices (how labels are mapped to 0/1): {'FAKE': 0, 'REAL': 1}


In [None]:
# Day 13-14: Train Model (MODIFIED to use Data Generators)

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt # Import for plotting history

# --- Setup Callbacks ---
# Callbacks are functions that run during training to perform actions.
early_stopping = EarlyStopping(
    monitor='val_loss', # Monitors the validation loss (how well the model performs on unseen data)
    patience=5,         # If 'val_loss' does not improve for 5 consecutive epochs, training stops.
    restore_best_weights=True # After stopping, loads the model weights from the epoch with the best 'val_loss'.
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', # Also monitors validation loss
    factor=0.2,       # If 'val_loss' plateaus, reduce the learning rate by 80% (new_lr = old_lr * 0.2).
    patience=3,       # If 'val_loss' does not improve for 3 consecutive epochs.
    min_lr=0.0001     # Ensures the learning rate doesn't go below this value.
)

print("\n--- Starting Model Training ---")
print("This will take a while. Monitor the 'accuracy' and 'val_accuracy' metrics in the output.")
print(f"Each epoch will process {train_generator.samples // train_generator.batch_size} training steps and {test_generator.samples // test_generator.batch_size} validation steps.")

# Train the model using the generators
# 'model.fit()' now takes generators as input instead of full X_train, y_train arrays.
# 'steps_per_epoch': Number of batches to draw from the generator to complete one epoch.
#   It's usually total_samples // batch_size.
# 'validation_data': The generator for the validation set.
# 'epochs': Maximum number of times to go through the entire training dataset.
#   EarlyStopping will likely stop it sooner if performance plateaus.
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    validation_data=test_generator, # Use test_generator as validation_data
    validation_steps=test_generator.samples // test_generator.batch_size,
    epochs=20, # You can start with 10-20. EarlyStopping will prevent unnecessary epochs.
    callbacks=[early_stopping, reduce_lr], # Apply the defined callbacks
    verbose=1 # Show training progress
)

# --- Save the Trained Model ---
model_save_path = 'ai_image_detector.h5'
model.save(model_save_path)
print(f"\nModel training complete and saved to {model_save_path}")

# Optional: Plot training history to visualize performance
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


--- Starting Model Training ---
This will take a while. Monitor the 'accuracy' and 'val_accuracy' metrics in the output.
Each epoch will process 3125 training steps and 625 validation steps.


  self._warn_if_super_not_called()


Epoch 1/20
[1m2784/3125[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m2:00[0m 352ms/step - accuracy: 0.4984 - loss: 0.6957