In [5]:
import glob
import os

import cv2
import imagehash
import joblib
import numpy as np
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Define a function to compute image hashes using dHash (perceptual hashing)
def compute_image_hash(image_path):
    """Compute the dHash of an image and return it as a list of bits.

    The image is read in grayscale with OpenCV, hashed with
    ``imagehash.dhash``, and the hash's string form is expanded into a
    binary vector by ``hash_to_binary_vector``.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        list[int]: Four 0/1 values per character of the hash string.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports a missing/corrupt file by returning None instead
        # of raising; fail here with the offending path rather than letting
        # Image.fromarray blow up with an unrelated-looking error.
        raise ValueError(f'Could not read image: {image_path!r}')
    hash_value = imagehash.dhash(Image.fromarray(img))
    return hash_to_binary_vector(str(hash_value))

# Define a function to extract features from the dataset
def extract_features_and_labels(dataset_path):
    """Collect hash features and labels for every ``*.jpg`` in the dataset.

    Images are looked up in ``<dataset_path>/real`` and
    ``<dataset_path>/fake``. Real images come first in the output and are
    labelled 1; fake images follow and are labelled 0.

    Args:
        dataset_path: Directory containing the ``real`` and ``fake`` folders.

    Returns:
        tuple[list, list[int]]: ``(hashes, labels)`` where ``hashes[i]`` is the
        result of ``compute_image_hash`` for the i-th image and ``labels[i]``
        is its class. Both lists are empty when no ``*.jpg`` files are found.
    """
    # glob yields matches in arbitrary filesystem order; sorting makes the
    # sample order (and therefore any seeded split done on it) reproducible.
    fake_images = sorted(glob.glob(os.path.join(dataset_path, 'fake', '*.jpg')))
    real_images = sorted(glob.glob(os.path.join(dataset_path, 'real', '*.jpg')))

    # Combine real and fake images and compute their respective image hashes
    images = real_images + fake_images
    labels = [1] * len(real_images) + [0] * len(fake_images)
    hashes = [compute_image_hash(img) for img in images]

    return hashes, labels

# Define a function to convert hash strings to binary vectors
def hash_to_binary_vector(hash_str):
    """Expand a hexadecimal string into a flat list of 0/1 integers.

    Every character of ``hash_str`` is parsed as a single base-16 digit and
    contributes exactly four bits, most significant bit first.

    Args:
        hash_str: String of hexadecimal digits.

    Returns:
        list[int]: ``4 * len(hash_str)`` bits.

    Raises:
        ValueError: If a character is not a valid hexadecimal digit.
    """
    return [
        int(bit)
        for hex_char in hash_str
        # '04b' renders the nibble as a zero-padded 4-character binary string
        for bit in format(int(hex_char, 16), '04b')
    ]


# Load the dataset
dataset_path = 'path_to_your_dataset_directory'
hashes, labels = extract_features_and_labels(dataset_path)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(hashes, labels, test_size=0.2, random_state=42)

# compute_image_hash already returns binary vectors, so the split samples are
# the features. Passing them through hash_to_binary_vector again would call
# int(<int>, 16) on each bit and raise TypeError.
X_train_features = np.asarray(X_train)
X_test_features = np.asarray(X_test)

In [None]:
# Optional: Fine-tune and experiment with different feature extraction and classification techniques for better performance
# Create a pipeline for experimentation: scale -> reduce dimensionality -> classify.
# The step names ('scaler', 'reduce_dim', 'clf') are the keys the parameter grid refers to.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # You can experiment with different scalers
    ('reduce_dim', PCA()),         # You can experiment with dimensionality reduction techniques
    ('clf', RandomForestClassifier(random_state=42))  # You can experiment with different classifiers
])

# Define hyperparameters to search over.
# A bare step name ('scaler', 'clf') swaps the whole step for each listed estimator;
# '<step>__<param>' sets one parameter on that step.
param_grid = {
    'scaler': [StandardScaler(), MinMaxScaler()],  # Experiment with different scalers
    'reduce_dim__n_components': [None, 10, 20],     # None leaves PCA at its default component count
    'clf': [RandomForestClassifier(random_state=42),  # Experiment with different classifiers
            DecisionTreeClassifier(random_state=42)]  # lives in sklearn.tree
}

In [None]:
# Exhaustively evaluate every combination in param_grid with 5-fold
# cross-validation, running the fits in parallel (n_jobs=-1) and logging
# progress (verbose=2).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only
grid_search.fit(X_train_features, y_train)

# Keep the pipeline that the search selected as best
best_model = grid_search.best_estimator_

# Score the selected pipeline on the held-out test split
y_pred = best_model.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Model Accuracy: {accuracy}')

In [None]:
# Persist the selected pipeline to disk with joblib so it can be reloaded later
best_model_filename = 'best_fake_image_detection_model.pkl'
joblib.dump(best_model, filename=best_model_filename)