In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.utils import save_image
from deepface import DeepFace
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array

# ---- Runtime configuration -------------------------------------------------
# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
latent_dim = 100      # length of the noise vector fed to the generator
batch_size = 32
epochs = 50
lr = 0.0002           # Adam learning rate
betas = (0.5, 0.999)  # Adam (beta1, beta2)

# ---- Paths -----------------------------------------------------------------
# NOTE(review): machine-specific absolute path; adjust for your environment.
celeba_root = r"C:\Users\thiba\Downloads\archive\img_align_celeba"
vanilla_image_dir = "vanilla_gan_images"
os.makedirs(vanilla_image_dir, exist_ok=True)

# ---- Data pipeline ---------------------------------------------------------
transform = transforms.Compose([
    # Resize the *shorter* side to 64 px (keeps the aspect ratio), then crop
    # the central 64x64 square.  The previous Resize((64, 64)) squashed every
    # image to a square, which distorted it and made the CenterCrop a no-op.
    transforms.Resize(64),
    transforms.CenterCrop(64),
    transforms.ToTensor(),
    # Maps pixel values from [0, 1] to [-1, 1], the range of the generator's
    # final Tanh.
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
# ImageFolder expects the images to live in sub-directories of `celeba_root`.
dataset = datasets.ImageFolder(root=celeba_root, transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

class VanillaGenerator(nn.Module):
    """Fully connected (MLP) generator that maps noise vectors to images.

    Args:
        latent_dim: Length of the input noise vector.
        img_shape: Shape of one generated image as
            ``(channels, height, width)``.  Defaults to ``(3, 64, 64)``,
            the shape that used to be hard-coded.
    """

    def __init__(self, latent_dim, img_shape=(3, 64, 64)):
        super().__init__()
        self.img_shape = tuple(img_shape)
        channels, height, width = self.img_shape
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(True),
            nn.Linear(256, 512),
            nn.ReLU(True),
            nn.Linear(512, 1024),
            nn.ReLU(True),
            nn.Linear(1024, channels * height * width),
            nn.Tanh()  # outputs lie in [-1, 1]
        )

    def forward(self, z):
        """Return a batch of images shaped ``(batch, *img_shape)`` for noise ``z``."""
        out = self.model(z)
        return out.view(-1, *self.img_shape)

class VanillaDiscriminator(nn.Module):
    """Fully connected (MLP) discriminator scoring images as real or fake.

    Args:
        img_shape: Shape of one input image as
            ``(channels, height, width)``.  Defaults to ``(3, 64, 64)``,
            the shape that used to be hard-coded.
    """

    def __init__(self, img_shape=(3, 64, 64)):
        super().__init__()
        channels, height, width = img_shape
        self.model = nn.Sequential(
            nn.Linear(channels * height * width, 1024),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(1024, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 1),
            nn.Sigmoid()  # probability-like score in [0, 1]
        )

    def forward(self, img):
        """Return a ``(batch, 1)`` tensor of scores for the image batch ``img``."""
        # reshape() instead of view(): view() raises on non-contiguous input
        # (e.g. a permuted tensor), reshape() copies only when it has to.
        return self.model(img.reshape(img.size(0), -1))
def train_gan(generator, discriminator, dataloader, epochs, latent_dim, lr=0.0002, betas=(0.5, 0.999), run_name="GAN", device=None):
    """Train a generator/discriminator pair with the standard BCE GAN loss.

    Args:
        generator: Module mapping ``(batch, latent_dim)`` noise to images.
        discriminator: Module mapping images to ``(batch, 1)`` scores in
            [0, 1] (``nn.BCELoss`` is applied to its output directly).
        dataloader: Sized iterable yielding ``(images, labels)`` pairs; the
            labels are ignored.
        epochs: Number of passes over ``dataloader``.
        latent_dim: Length of the noise vector sampled for the generator.
        lr: Adam learning rate used for both networks.
        betas: Adam ``(beta1, beta2)`` used for both networks.
        run_name: Tag printed in front of every progress line.
        device: Device to train on.  ``None`` (default) selects CUDA when
            available and the CPU otherwise.

    Returns:
        ``(generator_losses, discriminator_losses)``: two lists holding the
        per-epoch mean loss of each network.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Fail early with a clear message instead of a ZeroDivisionError when the
    # epoch averages are computed.
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader is empty: nothing to train on")

    criterion = nn.BCELoss()
    optim_G = optim.Adam(generator.parameters(), lr=lr, betas=betas)
    optim_D = optim.Adam(discriminator.parameters(), lr=lr, betas=betas)

    generator.to(device)
    discriminator.to(device)
    # Make sure both networks are in training mode (they may have been
    # switched to eval() by an earlier sampling step).
    generator.train()
    discriminator.train()

    generator_losses = []
    discriminator_losses = []

    for epoch in range(1, epochs + 1):
        g_loss_epoch = 0.0
        d_loss_epoch = 0.0

        for imgs, _ in dataloader:
            batch_size_curr = imgs.size(0)
            real_labels = torch.ones(batch_size_curr, 1, device=device)
            fake_labels = torch.zeros(batch_size_curr, 1, device=device)
            real_imgs = imgs.to(device)

            z = torch.randn(batch_size_curr, latent_dim, device=device)
            fake_imgs = generator(z)

            # --- Discriminator step: real -> 1, fake -> 0 ---
            d_real_loss = criterion(discriminator(real_imgs), real_labels)
            # detach() keeps this step from back-propagating into the generator.
            d_fake_loss = criterion(discriminator(fake_imgs.detach()), fake_labels)
            d_loss = d_real_loss + d_fake_loss
            optim_D.zero_grad()
            d_loss.backward()
            optim_D.step()

            # --- Generator step: make the updated discriminator say "real" ---
            g_loss = criterion(discriminator(fake_imgs), real_labels)
            optim_G.zero_grad()
            g_loss.backward()
            optim_G.step()

            d_loss_epoch += d_loss.item()
            g_loss_epoch += g_loss.item()

        d_loss_epoch /= num_batches
        g_loss_epoch /= num_batches
        discriminator_losses.append(d_loss_epoch)
        generator_losses.append(g_loss_epoch)
        print(f"[{run_name}] Epoch [{epoch}/{epochs}] "
              f"D Loss: {d_loss_epoch:.4f} | G Loss: {g_loss_epoch:.4f}")

    return generator_losses, discriminator_losses

def plot_training_losses(generator_losses, discriminator_losses, approach_name):
    """Show the generator and discriminator loss curves on one chart.

    Args:
        generator_losses: Sequence of generator loss values, one per epoch.
        discriminator_losses: Sequence of discriminator loss values, one per
            epoch.
        approach_name: Label inserted into the chart title.
    """
    _, axes = plt.subplots(figsize=(10, 6))

    curves = (
        (generator_losses, "Generator Loss"),
        (discriminator_losses, "Discriminator Loss"),
    )
    for values, curve_label in curves:
        axes.plot(values, label=curve_label)

    axes.set_title(f"Training Losses ({approach_name})")
    axes.set_xlabel("Epoch")
    axes.set_ylabel("Loss")
    axes.legend()
    axes.grid(True)
    plt.show()

# Build the two MLP networks and train them on the CelebA dataloader using
# the hyper-parameters configured at the top of the cell.
vanilla_generator = VanillaGenerator(latent_dim)
vanilla_discriminator = VanillaDiscriminator()

vanilla_g_losses, vanilla_d_losses = train_gan(
    generator=vanilla_generator,
    discriminator=vanilla_discriminator,
    dataloader=dataloader,
    epochs=epochs,
    latent_dim=latent_dim,
    lr=lr,
    betas=betas,
    run_name="VanillaGAN",
)

# Visualise the per-epoch mean losses returned by the training loop.
plot_training_losses(
    generator_losses=vanilla_g_losses,
    discriminator_losses=vanilla_d_losses,
    approach_name="Vanilla GAN",
)

# Sample `num_to_generate` images from the trained generator and write them
# to `vanilla_image_dir` as image_1.png ... image_N.png.
vanilla_generator.eval()
num_to_generate = 300
count = 0
with torch.no_grad():
    # Draw full batches of `batch_size`; the last batch is shrunk to whatever
    # is still missing, so exactly `num_to_generate` images are produced.
    while count < num_to_generate:
        current_batch = min(batch_size, num_to_generate - count)
        z = torch.randn(current_batch, latent_dim, device=device)
        gen_imgs = vanilla_generator(z)
        for img in gen_imgs:
            count += 1
            save_path = os.path.join(vanilla_image_dir, f"image_{count}.png")
            # normalize=True rescales each image to [0, 1] before saving.
            save_image(img, save_path, normalize=True)




In [None]:
from sklearnex import patch_sklearn
patch_sklearn()
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from deepface import DeepFace
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# File extensions treated as images when scanning a directory.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".bmp")


def _dominant_label(value):
    """Return the highest-scoring key of a score dict, or ``value`` itself otherwise."""
    if isinstance(value, dict):
        return max(value, key=value.get)
    return value


def _encode_gender(gender_data):
    """Encode a DeepFace gender result as 1 (male) or 0 (anything else)."""
    label = str(_dominant_label(gender_data)).strip().lower()
    # Exact match on purpose: a substring test such as `"man" in label` is
    # also true for "woman" and would encode every face as male.
    return 1 if label in ("man", "male") else 0


def analyze_images_deepface_cached(image_dir, csv_path="deepface_analysis.csv", enforce_detection=False):
    """Run DeepFace attribute analysis on every image in a directory, with CSV caching.

    If ``csv_path`` already exists it is loaded and returned without analysing
    anything.  Otherwise each image file in ``image_dir`` is analysed for age,
    gender, emotion and race, and the results are written to ``csv_path``.

    Args:
        image_dir: Directory containing the images to analyse.
        csv_path: Location of the cache file to read from / write to.
        enforce_detection: Forwarded to ``DeepFace.analyze``.

    Returns:
        DataFrame with columns ``image``, ``age``, ``gender`` (1 = male,
        0 = female), ``emotion`` and ``race``; one row per successfully
        analysed image.
    """
    if os.path.exists(csv_path):
        print(f"[Cache] Loading cached DeepFace results from {csv_path}")
        return pd.read_csv(csv_path)

    # Collect the image files to process (sorted for a reproducible row
    # order).  This list was previously never defined, so every cache miss
    # ended in a NameError.
    image_files = sorted(
        name for name in os.listdir(image_dir)
        if name.lower().endswith(_IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(image_dir, name))
    )

    rows = []
    for img_file in image_files:
        img_path = os.path.join(image_dir, img_file)
        try:
            analysis = DeepFace.analyze(
                img_path,
                actions=['age', 'gender', 'emotion', 'race'],
                enforce_detection=enforce_detection
            )

            # The result may be a list with one entry per detected face;
            # only the first face is used.
            if isinstance(analysis, list):
                if len(analysis) == 0:
                    print(f"[DeepFace] No face data for {img_path}")
                    continue
                analysis = analysis[0]

            age = analysis['age']
            gender = _encode_gender(analysis['gender'])
            emotion = _dominant_label(analysis['emotion'])
            race = _dominant_label(analysis['race'])

            rows.append([img_file, age, gender, emotion, race])

        except Exception as e:
            # Best effort: a single unreadable image must not abort the run.
            print(f"[DeepFace] Error analyzing {img_path}: {e}")

    df = pd.DataFrame(rows, columns=['image', 'age', 'gender', 'emotion', 'race'])
    print(f"[Cache] Saving DeepFace results to {csv_path}")
    df.to_csv(csv_path, index=False)
    return df

def find_optimal_k_elbow_silhouette(data_array, min_k=2, max_k=10):
    """Fit KMeans for every k in ``[min_k, max_k]`` and plot two diagnostics.

    For each k the model's inertia and the silhouette score of its labels are
    recorded; both curves are then shown in separate figures.

    Args:
        data_array: Feature matrix passed to ``KMeans.fit``.
        min_k: Smallest number of clusters to try.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        ``(k_values, inertias, silhouettes)`` as three lists of equal length.
    """
    candidate_ks = range(min_k, max_k + 1)
    inertias, silhouettes = [], []

    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters, random_state=42)
        model.fit(data_array)
        score = silhouette_score(data_array, model.labels_)
        inertias.append(model.inertia_)
        silhouettes.append(score)

    # (values, extra plot kwargs, title, y-axis label) for each diagnostic.
    charts = (
        (inertias, {}, "Elbow Method (Inertia vs. k)",
         "Inertia (Sum of Squared Distances)"),
        (silhouettes, {"color": 'orange'}, "Silhouette Score vs. k",
         "Average Silhouette"),
    )
    for values, extra_style, title, y_label in charts:
        plt.figure()
        plt.plot(candidate_ks, values, 'o--', **extra_style)
        plt.title(title)
        plt.xlabel("k")
        plt.ylabel(y_label)
        plt.show()

    return list(candidate_ks), inertias, silhouettes

def cluster_data(data_array, n_clusters=5):
    """Cluster ``data_array`` with KMeans (fixed ``random_state=42``).

    Args:
        data_array: Feature matrix to cluster.
        n_clusters: Number of clusters to fit.

    Returns:
        ``(labels, model)``: the cluster label of every row and the fitted
        ``KMeans`` instance.
    """
    model = KMeans(random_state=42, n_clusters=n_clusters)
    labels = model.fit_predict(data_array)
    return labels, model

def plot_feature_distributions(features_df, approach_name):
    """Show one distribution chart per available attribute column.

    Draws a histogram for ``age`` and count plots for ``gender``, ``emotion``
    and ``race``; columns missing from ``features_df`` are skipped.

    Args:
        features_df: DataFrame holding any of the attribute columns above.
        approach_name: Label inserted into every chart title.
    """
    available = features_df.columns

    if 'age' in available:
        ages = features_df['age'].dropna()
        plt.figure()
        sns.histplot(ages, kde=True)
        plt.title(f"Age Distribution ({approach_name})")
        plt.xlabel("Age")
        plt.ylabel("Frequency")
        plt.show()

    if 'gender' in available:
        with_gender = features_df.dropna(subset=['gender'])
        plt.figure()
        sns.countplot(x='gender', data=with_gender)
        plt.title(f"Gender Distribution ({approach_name})")
        plt.xlabel("Gender (0=Female, 1=Male)")
        plt.ylabel("Count")
        plt.show()

    # Emotion and race share the same chart layout: bars ordered by
    # frequency, with rotated tick labels.
    for column, pretty_name in (('emotion', "Emotion"), ('race', "Race")):
        if column not in available:
            continue
        plt.figure()
        by_frequency = features_df[column].value_counts().index
        sns.countplot(
            x=column,
            data=features_df.dropna(subset=[column]),
            order=by_frequency,
        )
        plt.title(f"{pretty_name} Distribution ({approach_name})")
        plt.xlabel(pretty_name)
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.show()

def plot_cluster_compositions(features_df, approach_name):
    """Show how each attribute is distributed across the clusters.

    Draws an age box plot, a stacked gender bar chart and emotion / race
    count plots, all grouped by the ``cluster`` column.  Attribute columns
    missing from ``features_df`` are skipped; if there is no ``cluster``
    column the function returns without plotting.

    Args:
        features_df: DataFrame with a ``cluster`` column plus any of
            ``age``, ``gender``, ``emotion``, ``race``.
        approach_name: Label inserted into every chart title.
    """
    if 'cluster' not in features_df.columns:
        return

    if 'age' in features_df.columns:
        plt.figure()
        sns.boxplot(x='cluster', y='age', data=features_df.dropna(subset=['age']))
        plt.title(f"Age Distribution by Cluster ({approach_name})")
        plt.xlabel("Cluster")
        plt.ylabel("Age")
        plt.show()

    if 'gender' in features_df.columns:
        # Rows = cluster, columns = gender value, cells = number of images.
        gender_cluster = (
            features_df.dropna(subset=['gender'])
            .groupby(['cluster', 'gender'])
            .size()
            .unstack(fill_value=0)
        )
        # Draw on an explicit axes.  DataFrame.plot() opens a figure of its
        # own when no `ax` is given, so the plt.figure() call that used to
        # precede it left a stray blank figure behind.
        _, ax = plt.subplots(figsize=(8, 5))
        gender_cluster.plot(kind='bar', stacked=True, ax=ax)
        plt.title(f"Gender Composition by Cluster ({approach_name})")
        plt.xlabel("Cluster")
        plt.ylabel("Count")
        plt.show()

    if 'emotion' in features_df.columns:
        plt.figure()
        sns.countplot(x='cluster', hue='emotion', data=features_df.dropna(subset=['emotion']))
        plt.title(f"Emotion Composition by Cluster ({approach_name})")
        plt.xlabel("Cluster")
        plt.ylabel("Count")
        plt.legend(title="Emotion")
        plt.show()

    if 'race' in features_df.columns:
        plt.figure()
        sns.countplot(x='cluster', hue='race', data=features_df.dropna(subset=['race']))
        plt.title(f"Race Composition by Cluster ({approach_name})")
        plt.xlabel("Cluster")
        plt.ylabel("Count")
        plt.legend(title="Race")
        plt.show()

def plot_cluster_centers(kmeans, data_array, approach_name, feature_names=None):
    """Show the KMeans cluster centers as an annotated heatmap.

    Nothing is drawn (a warning is printed instead) when ``data_array`` has
    more than 10 feature columns.

    Args:
        kmeans: Fitted model exposing ``cluster_centers_``.
        data_array: 2-D feature matrix; only its column count is used.
        approach_name: Label inserted into the chart title.
        feature_names: Column labels for the heatmap; defaults to
            ``feature_0``, ``feature_1``, ...
    """
    n_features = data_array.shape[1]

    # Guard clause: a heatmap with too many columns is unreadable.
    if n_features > 10:
        print("[Warning] Too many features for a neat heatmap. Skipping.")
        return

    labels = feature_names
    if labels is None:
        labels = [f"feature_{idx}" for idx in range(n_features)]

    centers_df = pd.DataFrame(kmeans.cluster_centers_, columns=labels)

    plt.figure(figsize=(10, 6))
    sns.heatmap(centers_df, cmap="YlGnBu", annot=True, fmt=".2f")
    plt.title(f"Cluster Centers Heatmap ({approach_name})")
    plt.xlabel("Features")
    plt.ylabel("Cluster")
    plt.show()

def plot_pca_projection(data_array, features_df, approach_name):
    """Scatter-plot the clustered samples in their first two principal components.

    ``data_array`` may hold either one row per row of ``features_df`` or one
    row per *clustered* row (rows whose ``cluster`` value is not NaN), in the
    same order.  If ``features_df`` has no ``cluster`` column the function
    returns without plotting.

    Args:
        data_array: 2-D feature matrix to project.
        features_df: DataFrame with a ``cluster`` column.
        approach_name: Label inserted into the chart title.

    Raises:
        ValueError: If the number of rows in ``data_array`` matches neither
            ``features_df`` nor its clustered subset.
    """
    if 'cluster' not in features_df.columns:
        return

    # Align rows by *position*.  Indexing `data_array` with the DataFrame's
    # index labels (as was done before) selects the wrong rows, or raises
    # IndexError, whenever rows were dropped before scaling or the index is
    # not a default 0..n-1 range.
    has_cluster = features_df['cluster'].notna().to_numpy()
    data_array = np.asarray(data_array)
    n_rows = len(data_array)
    n_clustered = int(has_cluster.sum())

    if n_rows == len(features_df):
        data_sub = data_array[has_cluster]
    elif n_rows == n_clustered:
        data_sub = data_array
    else:
        raise ValueError(
            f"data_array has {n_rows} rows, expected {len(features_df)} "
            f"(all rows) or {n_clustered} (rows with a cluster)"
        )

    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(data_sub)

    temp_df = features_df.loc[has_cluster].copy()
    temp_df['pca1'] = pca_result[:, 0]
    temp_df['pca2'] = pca_result[:, 1]

    plt.figure()
    sns.scatterplot(x='pca1', y='pca2', hue='cluster', data=temp_df, palette='viridis')
    plt.title(f"PCA Projection of Clusters ({approach_name})")
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.show()

def calculate_cluster_averages(features_df):
    """Summarise every cluster: mean age / gender, most frequent emotion / race.

    Only the attribute columns present in ``features_df`` are aggregated.
    The summary is printed and returned.

    Args:
        features_df: DataFrame with a ``cluster`` column plus any of
            ``age``, ``gender``, ``emotion``, ``race``.

    Returns:
        DataFrame with one row per cluster, or ``None`` when ``features_df``
        has no ``cluster`` column.
    """
    def mode(series):
        # Most frequent non-null value.  A cluster whose values are all
        # missing has no mode: return NaN instead of raising IndexError.
        counts = series.value_counts()
        if counts.empty:
            return np.nan
        return counts.index[0]

    if 'cluster' not in features_df.columns:
        return None

    # Aggregation applied to each attribute column when it is present.
    aggregations = {
        'age': 'mean',
        'gender': 'mean',  # share of rows encoded as 1
        'emotion': mode,
        'race': mode,
    }
    agg_dict = {
        column: how
        for column, how in aggregations.items()
        if column in features_df.columns
    }

    cluster_stats = features_df.groupby('cluster').agg(agg_dict).reset_index()
    print(cluster_stats)
    return cluster_stats


if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path; adjust for your environment.
    image_dir = r"C:\Users\thiba\vanilla_gan_images"

    # Step 1: per-image attributes (loaded from the CSV cache when present).
    df = analyze_images_deepface_cached(
        image_dir,
        csv_path="deepface_analysis.csv",
        enforce_detection=False,
    )
    print("Sample of DeepFace analysis:\n", df.head())

    # Step 2: standardise the numeric attributes used for clustering.
    df_numeric = df[['age', 'gender']].copy().dropna()
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(df_numeric)

    # Step 3: show the elbow / silhouette diagnostics, then cluster with a
    # fixed k (the diagnostics are informational; k is not derived from them).
    k_values, inertias, silhouettes = find_optimal_k_elbow_silhouette(
        data_scaled, min_k=2, max_k=10
    )
    best_k = 5
    clusters, kmeans = cluster_data(data_scaled, n_clusters=best_k)

    # Step 4: attach the labels to the full table; rows that were dropped
    # before scaling end up with a NaN cluster.
    valid_idx = df_numeric.index
    cluster_series = pd.Series(clusters, index=valid_idx, name='cluster')
    df_final = df.join(cluster_series, how='left')

    # Step 5: charts and per-cluster summary.
    plot_feature_distributions(df_final, "MyGAN - DeepFace")
    plot_cluster_compositions(df_final, "MyGAN - DeepFace")
    plot_cluster_centers(
        kmeans,
        data_scaled,
        "MyGAN - DeepFace",
        feature_names=["Age", "Gender"],
    )
    plot_pca_projection(data_scaled, df_final, "MyGAN - DeepFace")
    _ = calculate_cluster_averages(df_final)

    # Step 6: persist the labelled table.
    out_csv = "mygan_deepface_clusters.csv"
    df_final.to_csv(out_csv, index=False)
    print(f"\nResults saved to {out_csv} ===")
