<a href="https://colab.research.google.com/github/Shreshta001/ML_DL_AI_vs_REAL/blob/main/aivsreal16octo_one_image_ka_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision transformers opencv-python



In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import resnet50
from transformers import CLIPProcessor, CLIPModel
import cv2
import numpy as np
from scipy.fftpack import dct


In [None]:
def dct_2d(image_patch):
    """Return the orthonormal 2-D DCT-II of a patch over its two leading axes.

    Args:
        image_patch: array of shape (H, W) or (H, W, C). Any trailing axes
            (e.g. colour channels) are transformed independently.

    Returns:
        Floating-point array of DCT coefficients with the same shape as
        `image_patch`.
    """
    # The axes are named explicitly. A double-transpose formulation is only
    # correct for 2-D input: on an (H, W, C) patch `.T` reverses all three
    # axes, so the second pass would run along the channel axis, not the width.
    along_rows = dct(image_patch, axis=0, norm='ortho')
    return dct(along_rows, axis=1, norm='ortho')

def select_patches(image, patch_size=32, top_k=2):
    """Tile an image into square patches and pick the extremes by DCT score.

    The image is cut into non-overlapping `patch_size` x `patch_size` tiles.
    Each tile is scored by the sum of the absolute values of its DCT
    coefficients (as returned by `dct_2d`), and the tiles are ranked by that
    score.

    Args:
        image: array of shape (H, W) or (H, W, C).
        patch_size: side length of each square tile, in pixels.
        top_k: number of tiles to return from each end of the ranking.

    Returns:
        A tuple `(top_high_freq, top_low_freq)` of two lists of patches: the
        `top_k` highest-scoring tiles and the `top_k` lowest-scoring tiles.
        Both lists are empty when `top_k` is 0 or when the image is smaller
        than one full tile.

    Raises:
        ValueError: if `patch_size` is not positive or `top_k` is negative.
    """
    if patch_size <= 0:
        raise ValueError("patch_size must be a positive integer")
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    h, w = image.shape[:2]  # works for grayscale and colour images alike

    # Only full tiles are considered. Truncated tiles at the right/bottom
    # border have fewer coefficients, so they would dominate the "lowest score"
    # end of the ranking, and patches of different shapes cannot be stacked
    # into one batch afterwards.
    scored_patches = []
    for i in range(0, h - patch_size + 1, patch_size):
        for j in range(0, w - patch_size + 1, patch_size):
            patch = image[i:i + patch_size, j:j + patch_size]
            # Keep only the scalar score; the DCT array itself is not needed.
            score = np.sum(np.abs(dct_2d(patch)))
            scored_patches.append((score, patch))

    # `some_list[-0:]` is the whole list, so top_k == 0 needs its own exit.
    if top_k == 0 or not scored_patches:
        return [], []

    # Highest score first (stable sort keeps scan order among ties).
    scored_patches.sort(key=lambda item: item[0], reverse=True)

    top_high_freq = [patch for _, patch in scored_patches[:top_k]]
    top_low_freq = [patch for _, patch in scored_patches[-top_k:]]

    return top_high_freq, top_low_freq


In [None]:
def srm_filter(image):
    """Convolve `image` with a fixed 3x3 high-pass kernel and return the result.

    The kernel is a simplified, demo-only stand-in for an SRM residual filter.
    Its coefficients sum to zero.

    Args:
        image: array accepted by `cv2.filter2D`.

    Returns:
        The filtered image, with the same depth as the input.
    """
    SAME_DEPTH_AS_INPUT = -1  # ddepth=-1 makes filter2D keep the source depth

    # NOTE(review): because the output depth follows the input, a uint8 image
    # cannot hold negative filter responses (OpenCV saturates them) - confirm
    # this is acceptable for the residual features.
    residual_kernel = np.array(
        [
            [-1, 2, -1],
            [2, -4, 2],
            [-1, 2, -1],
        ],
        dtype=np.float32,
    )
    return cv2.filter2D(image, SAME_DEPTH_AS_INPUT, residual_kernel)


In [None]:
class PatchwiseFeatureExtractor(nn.Module):
    """Frozen ResNet-50 backbone that turns a list of patches into one feature.

    The ImageNet-pretrained ResNet-50 is used without its last child module
    (the classification layer). The per-patch outputs are averaged over the
    batch dimension.
    """

    def __init__(self):
        super().__init__()
        # `weights="IMAGENET1K_V1"` selects the same checkpoint as the
        # deprecated `pretrained=True` flag.
        backbone = resnet50(weights="IMAGENET1K_V1")
        self.resnet = nn.Sequential(*list(backbone.children())[:-1])  # Remove the final layer

        # The backbone is only ever run under no_grad, i.e. as a fixed feature
        # extractor. It must therefore be in eval mode: in training mode its
        # BatchNorm layers normalise with the statistics of the current (tiny)
        # batch and keep updating their running averages, even under no_grad.
        self.resnet.eval()

        # Build the converter once instead of once per patch.
        self.to_tensor = transforms.ToTensor()

    def train(self, mode=True):
        """Set the training flag but keep the frozen backbone in eval mode.

        Without this override a parent module's `.train()` call would switch
        the BatchNorm layers back to training behaviour.
        """
        super().train(mode)
        self.resnet.eval()
        return self

    def forward(self, patches):
        """Extract and average backbone features for a list of patches.

        Args:
            patches: non-empty sequence of equally sized patches, each
                convertible by `transforms.ToTensor()`.

        Returns:
            The backbone output averaged over the patch (batch) dimension.

        Raises:
            ValueError: if `patches` is empty.
        """
        if len(patches) == 0:
            raise ValueError("patches must contain at least one patch")

        # Stack patches into a batch and extract features
        batch = torch.stack([self.to_tensor(p) for p in patches])
        with torch.no_grad():
            features = self.resnet(batch)
        return features.mean(dim=0)  # Average pooling


In [None]:
class SemanticFeatureExtractor(nn.Module):
    """Wraps a pretrained CLIP model to embed a whole image.

    Both the model and its preprocessing pipeline are loaded from the
    "openai/clip-vit-base-patch32" checkpoint.
    """

    def __init__(self):
        super().__init__()
        checkpoint = "openai/clip-vit-base-patch32"
        self.model = CLIPModel.from_pretrained(checkpoint)
        self.processor = CLIPProcessor.from_pretrained(checkpoint)

    def forward(self, image):
        """Return CLIP image features for `image`.

        Args:
            image: an image (or images) in a form accepted by `CLIPProcessor`.

        Returns:
            The result of `CLIPModel.get_image_features`, computed without
            gradient tracking.
            NOTE(review): presumably shape (1, 512) for a single image with
            this checkpoint - confirm against the classifier input width.
        """
        # The processor handles preprocessing and returns PyTorch tensors.
        clip_inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            return self.model.get_image_features(**clip_inputs)


In [None]:
class AIDEModel(nn.Module):
    """Binary real-vs-AI-generated classifier over fused image features.

    Three feature vectors are concatenated and fed to a small MLP:
      * patch features of the highest-scoring DCT patches,
      * patch features of the lowest-scoring DCT patches,
      * semantic features of the whole image.
    """

    # Widths of the individual feature vectors; together they give the
    # classifier input width of 2 * 2048 + 512 = 4608.
    PATCH_FEATURE_DIM = 2048
    SEMANTIC_FEATURE_DIM = 512

    def __init__(self):
        super().__init__()
        self.patch_extractor = PatchwiseFeatureExtractor()
        self.semantic_extractor = SemanticFeatureExtractor()

        fused_dim = 2 * self.PATCH_FEATURE_DIM + self.SEMANTIC_FEATURE_DIM

        # The layer order is part of the state-dict layout (fc.0, fc.2).
        self.fc = nn.Sequential(
            nn.Linear(fused_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1),     # Binary classification (Real or AI-generated)
            nn.Sigmoid()
        )

    def forward(self, image):
        """Score a single image.

        Args:
            image: image array passed to `select_patches` and to the semantic
                extractor.

        Returns:
            Tensor of shape [1, 1] holding the sigmoid output of the classifier.
        """
        # Extract high and low-frequency patches
        top_high_freq, top_low_freq = select_patches(image)

        # Apply SRM filtering and extract patchwise features
        high_features = self.patch_extractor([srm_filter(p) for p in top_high_freq])
        low_features = self.patch_extractor([srm_filter(p) for p in top_low_freq])

        # Extract semantic features from the whole image
        semantic_features = self.semantic_extractor(image)

        # If the semantic features come back as [batch, seq_len, feature_dim],
        # average over the sequence axis to obtain one vector per image.
        if semantic_features.dim() == 3:
            semantic_features = semantic_features.mean(dim=1)

        # Flatten the patch features to [1, feature_dim] so that all three
        # tensors are 2-D before concatenation.
        high_features = high_features.reshape(1, -1)
        low_features = low_features.reshape(1, -1)

        features = torch.cat([high_features, low_features, semantic_features], dim=1)

        # Pass through the MLP classifier
        return self.fc(features)


In [None]:
# Path of the image to classify - change this to your own file.
IMAGE_PATH = '/content/image_name.jpg'

# cv2.imread reports failure by returning None instead of raising, so check it
# here rather than letting cvtColor fail with an unrelated-looking error.
image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Could not read an image from {IMAGE_PATH!r}")

# Convert BGR (OpenCV format) to RGB
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Initialize the AIDE model and switch it to inference mode so that layers
# such as BatchNorm use their stored statistics.
aide_model = AIDEModel()
aide_model.eval()

# Forward pass to get prediction (no gradients are needed for inference)
with torch.no_grad():
    output = aide_model(image_rgb)

# The model output is a sigmoid score; scores above 0.5 are reported as 'AI',
# everything else as 'Real'.
# NOTE(review): no trained weights are loaded into `aide_model.fc` in the cells
# shown, so the score comes from randomly initialised layers. Load a trained
# checkpoint before trusting the label - this is the likely reason the labels
# previously looked swapped.
print('AI' if output.item() > 0.5 else 'Real')


Concatenated feature size: torch.Size([1, 4608])
Real
