In [20]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Set file paths
image_dir = '/content/drive/MyDrive/Stats201_FinalProject/Final_Project/images'
train_path = '/content/drive/MyDrive/Stats201_FinalProject/Final_Project/train_filtered.csv'
test_path = '/content/drive/MyDrive/Stats201_FinalProject/Final_Project/test_filtered.csv'
val_path = '/content/drive/MyDrive/Stats201_FinalProject/Final_Project/val_filtered.csv'

# Load the CSV files into pandas DataFrames
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
val_df = pd.read_csv(val_path)

# Prepare image paths and labels
train_image_paths = [os.path.join(image_dir, f"{img_id}.jpg") for img_id in train_df['ID']]
test_image_paths = [os.path.join(image_dir, f"{img_id}.jpg") for img_id in test_df['ID']]
val_image_paths = [os.path.join(image_dir, f"{img_id}.jpg") for img_id in val_df['ID']]

train_labels = train_df['label'].values
test_labels = test_df['label'].values
val_labels = val_df['label'].values

# Load the pre-trained ResNet50 model without the top layer for feature extraction
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = base_model.output
x = GlobalAveragePooling2D()(x)  # Add global average pooling to reduce spatial dimensions
x = Dense(1024, activation='relu')(x)  # Dense layer for non-linearity
predictions = Dense(1, activation='sigmoid')(x)  # Sigmoid activation for binary classification
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze the layers of the base model to prevent retraining
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Function to load and preprocess images
def load_and_preprocess_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (224, 224))
    img = preprocess_input(img)
    return img

# Create a tf.data.Dataset for the images and labels
def create_dataset(image_paths, labels, batch_size=32):
    # Create a tf.data.Dataset from the image paths and labels
    image_paths_ds = tf.data.Dataset.from_tensor_slices(image_paths)
    labels_ds = tf.data.Dataset.from_tensor_slices(labels)

    # Map the image loading and preprocessing function to the dataset
    dataset = tf.data.Dataset.zip((image_paths_ds, labels_ds))
    dataset = dataset.map(lambda img_path, label: (load_and_preprocess_image(img_path), label),
                          num_parallel_calls=tf.data.AUTOTUNE)

    # Shuffle, batch, and prefetch to optimize the loading pipeline
    dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Prepare datasets for training, validation, and testing
train_dataset = create_dataset(train_image_paths, train_labels)
val_dataset = create_dataset(val_image_paths, val_labels)
test_dataset = create_dataset(test_image_paths, test_labels)

# Train the model
model.fit(train_dataset, epochs=5, validation_data=val_dataset)

# Evaluate the model on the test set
test_preds = model.predict(test_dataset)
test_preds = (test_preds > 0.5).astype(int)  # Convert probabilities to class labels

# Compute accuracy
accuracy = accuracy_score(test_labels, test_preds)
print(f'Test Set Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("\nClassification Report on Test Set:")
print(classification_report(test_labels, test_preds, target_names=['Authentic', 'Machine-Generated']))


Epoch 1/5
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 92ms/step - accuracy: 0.8762 - loss: 0.4137 - val_accuracy: 0.9595 - val_loss: 0.1025
Epoch 2/5
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 65ms/step - accuracy: 0.9684 - loss: 0.0834 - val_accuracy: 0.9702 - val_loss: 0.0820
Epoch 3/5
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 64ms/step - accuracy: 0.9752 - loss: 0.0604 - val_accuracy: 0.9665 - val_loss: 0.0879
Epoch 4/5
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 64ms/step - accuracy: 0.9810 - loss: 0.0469 - val_accuracy: 0.9709 - val_loss: 0.0900
Epoch 5/5
[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 64ms/step - accuracy: 0.9925 - loss: 0.0224 - val_accuracy: 0.9738 - val_loss: 0.0926
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 72ms/step
Test Set Accuracy: 50.37%

Classification Report on Test Set:
                   precision    recall  f1-score