# In [None]:
# %%
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Create output folder for all model files and results.
OUTPUT_FOLDER = 'model7_gender_only'
# exist_ok=True makes this idempotent and avoids the check-then-create race
# of testing os.path.exists() before calling makedirs().
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization, Input, Concatenate
from tensorflow.keras.applications import MobileNetV2, ResNet50V2
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
import seaborn as sns

# Constants
IMG_SIZE = 128            # side length (pixels) of the square model input
BATCH_SIZE = 32
EPOCHS = 30
NUM_CLASSES_GENDER = 2
RANDOM_STATE = 42         # seed used for the train/test split

# 1. Data preparation
# Root folder holding the fold_*_data.txt metadata files and the "faces" tree.
data_parent = 'C:/Users/focus/copy_age_gender_mo2/all_path'  # Adjust path as needed

# Read the five tab-separated fold files; each frame keeps its original
# module-level name (data, data1, ..., data4).
_fold_files = (
    'fold_0_data.txt',
    'fold_1_data.txt',
    'fold_2_data.txt',
    'fold_3_data.txt',
    'fold_4_data.txt',
)
data, data1, data2, data3, data4 = (
    pd.read_csv(os.path.join(data_parent, fold_file), sep='\t')
    for fold_file in _fold_files
)

# Stack all folds into one frame with a fresh 0..n-1 index.
total_data = pd.concat([data, data1, data2, data3, data4], ignore_index=True)

# Build the on-disk location of every face crop:
#   <data_parent>/faces/<user_id>/coarse_tilt_aligned_face.<face_id>.<original_image>
img_path = [
    os.path.join(
        data_parent,
        "faces",
        record.user_id,
        f"coarse_tilt_aligned_face.{str(record.face_id)}.{record.original_image}",
    )
    for _, record in total_data.iterrows()
]

# Keep only the columns used downstream and attach the image paths.
df = total_data[['gender', 'x', 'y', 'dx', 'dy']].copy()
df['img_path'] = img_path

# Clean data: drop rows with any missing value, then rows whose gender is 'u'.
df = df.dropna()
unbiased_data = df[df.gender != 'u'].copy()

# Save the cleaned table for reference (written before the label column is added).
unbiased_data.to_csv(os.path.join(OUTPUT_FOLDER, 'clean_data.csv'), index=False)

# Label mappings: dataset gender code -> integer class id.
gender_to_label_map = {'f': 0, 'm': 1}

# Reverse mapping used to turn a predicted class id into a display name.
label_to_gender_map = {0: 'Female', 1: 'Male'}

# Convert gender codes to numerical labels (raises KeyError on an unmapped code).
unbiased_data['gender_label'] = unbiased_data['gender'].apply(
    gender_to_label_map.__getitem__
)

# 2. Data analysis for imbalance
def analyze_data_imbalance():
    """Print and plot the gender class distribution of ``unbiased_data``.

    Prints the per-gender row counts and saves a bar chart to
    ``<OUTPUT_FOLDER>/gender_distribution.png``.

    Returns:
        The ``value_counts()`` Series of the ``gender`` column.
    """
    print("Data Imbalance Analysis:")

    print("\nGender Distribution:")
    gender_counts = unbiased_data['gender'].value_counts()
    print(gender_counts)

    plt.figure(figsize=(8, 6))
    # Pin the bar order explicitly. Without ``order`` the bars follow the order
    # in which the codes first appear in the data, so the hard-coded tick
    # labels below could end up attached to the wrong bars.
    sns.countplot(data=unbiased_data, x='gender', order=['f', 'm'])
    plt.title('Gender Distribution')
    plt.xlabel('Gender')
    plt.ylabel('Count')
    plt.xticks(ticks=[0, 1], labels=['Female (f)', 'Male (m)'])
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_FOLDER, 'gender_distribution.png'))

    return gender_counts

# 3. Enhanced image loading and preprocessing
def _zoom_about_center(img, zoom_factor):
    """Zoom ``img`` about its centre, returning an image of the same size.

    ``zoom_factor >= 1`` crops the central ``1 / zoom_factor`` window and
    scales it back up; ``zoom_factor < 1`` shrinks the image and pastes it in
    the middle of a black canvas.
    """
    h, w = img.shape[:2]

    if zoom_factor >= 1:
        crop_h = max(1, int(round(h / zoom_factor)))
        crop_w = max(1, int(round(w / zoom_factor)))
        top = (h - crop_h) // 2
        left = (w - crop_w) // 2
        cropped = img[top:top + crop_h, left:left + crop_w]
        # cv2.resize takes the target size as (width, height).
        return cv2.resize(cropped, (w, h))

    new_h = max(1, int(round(h * zoom_factor)))
    new_w = max(1, int(round(w * zoom_factor)))
    shrunk = cv2.resize(img, (new_w, new_h))
    canvas = np.zeros_like(img)
    top = (h - new_h) // 2
    left = (w - new_w) // 2
    canvas[top:top + new_h, left:left + new_w] = shrunk
    return canvas


def load_and_preprocess_image(img_path, augment=False):
    """Load an image, equalise its contrast and resize it to the model input.

    The image is converted to grayscale, histogram-equalised, expanded back to
    three identical channels and resized to ``IMG_SIZE x IMG_SIZE``. Pixel
    values are left in the 0-255 ``uint8`` range; callers do the scaling.

    Args:
        img_path: Path of the image file to read.
        augment: When True, apply random rotation, brightness/contrast,
            horizontal flip, zoom and Gaussian noise.

    Returns:
        A ``uint8`` array of shape ``(IMG_SIZE, IMG_SIZE, 3)``. An all-zero
        image is returned when the file cannot be read or processing fails,
        so a data generator never stops on a single bad file.
    """
    try:
        img = cv2.imread(img_path)
        if img is None:
            return np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)

        # Enhanced face detection and alignment could be added here

        # Histogram equalization for improved contrast
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img_gray_eq = cv2.equalizeHist(img_gray)
        img = cv2.cvtColor(img_gray_eq, cv2.COLOR_GRAY2RGB)

        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

        if augment:
            # Random rotation about the image centre
            if np.random.random() > 0.5:
                angle = np.random.uniform(-25, 25)
                height, width = img.shape[:2]
                matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
                img = cv2.warpAffine(img, matrix, (width, height))

            # Random brightness and contrast adjustment
            if np.random.random() > 0.5:
                alpha = np.random.uniform(0.7, 1.3)  # Contrast
                beta = np.random.uniform(-20, 20)  # Brightness
                img = cv2.convertScaleAbs(img, alpha=alpha, beta=beta)

            # Random horizontal flip
            if np.random.random() > 0.5:
                img = cv2.flip(img, 1)

            # Random zoom in or out around the centre
            if np.random.random() > 0.7:
                zoom_factor = np.random.uniform(0.8, 1.2)
                img = _zoom_about_center(img, zoom_factor)

            # Random Gaussian noise. Add it in floating point and clip:
            # casting signed noise straight to uint8 wraps negative samples
            # to values near 255 and blows pixels out to white.
            if np.random.random() > 0.8:
                noise = np.random.normal(0, 5, img.shape)
                noisy = img.astype(np.float32) + noise
                img = np.clip(noisy, 0, 255).astype(np.uint8)

        return img
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back a blank image.
        print(f"Error loading image {img_path}: {e}")
        return np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)

# 4. Improved balanced data generator
def balanced_data_generator(img_paths, labels, batch_size=32, augment=True):
    """Yield endless class-balanced batches of images and one-hot labels.

    Every batch draws the same number of samples from each class present in
    ``labels`` (the first classes receive one extra sample when ``batch_size``
    is not divisible by the class count). A class with fewer samples than its
    quota is sampled with replacement.

    Args:
        img_paths: pandas Series of image file paths, positionally aligned
            with ``labels``.
        labels: pandas Series of integer class labels.
        batch_size: Number of samples per yielded batch.
        augment: Passed through to ``load_and_preprocess_image``.

    Yields:
        ``(batch_images, batch_labels)`` where ``batch_images`` holds the
        images scaled to [0, 1] and ``batch_labels`` is one-hot encoded with
        ``NUM_CLASSES_GENDER`` columns.
    """
    y_integers = labels.values

    # Positions of the samples belonging to each class.
    classes = np.unique(y_integers)
    class_indices = {cls: np.where(y_integers == cls)[0] for cls in classes}

    # The per-class quota never changes between batches, so compute it once.
    samples_per_class, remainder = divmod(batch_size, len(class_indices))
    quotas = {
        cls: samples_per_class + (1 if position < remainder else 0)
        for position, cls in enumerate(class_indices)
    }

    while True:
        batch_images = []
        batch_labels = []

        for cls, indices in class_indices.items():
            n_samples = quotas[cls]
            # Fall back to sampling with replacement for under-populated classes.
            sampled_indices = np.random.choice(
                indices, size=n_samples, replace=len(indices) < n_samples
            )

            for idx in sampled_indices:
                img = load_and_preprocess_image(img_paths.iloc[idx], augment=augment)
                batch_images.append(img)
                batch_labels.append(y_integers[idx])

        # Scale pixel values from 0-255 to [0, 1].
        batch_images = np.array(batch_images) / 255.0

        # one-hot encoding
        batch_labels = to_categorical(batch_labels, NUM_CLASSES_GENDER)

        yield batch_images, batch_labels

# 5. Gender model with residual connections and more regularization
def create_gender_model():
    """Build and compile the gender classifier.

    An ImageNet-initialised ResNet50V2 backbone (without its top) feeds a
    global-average-pooled feature vector into two regularised dense stages
    (256 and 128 units) and a softmax output with ``NUM_CLASSES_GENDER`` units.

    Returns:
        The compiled Keras ``Model`` taking ``(IMG_SIZE, IMG_SIZE, 3)`` inputs.
    """
    backbone = ResNet50V2(
        weights='imagenet',
        include_top=False,
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )

    # Everything except the last 30 backbone layers is frozen; only the tail
    # of the backbone is fine-tuned.
    for frozen_layer in backbone.layers[:-30]:
        frozen_layer.trainable = False

    input_img = Input(shape=(IMG_SIZE, IMG_SIZE, 3))

    # Backbone feature maps -> one feature vector per image.
    features = backbone(input_img)
    features = GlobalAveragePooling2D()(features)

    # Two identical head stages: BatchNorm -> L2-regularised Dense -> Dropout.
    for units, drop_rate in ((256, 0.5), (128, 0.4)):
        features = BatchNormalization()(features)
        features = Dense(units, activation='relu', kernel_regularizer=l2(0.001))(features)
        features = Dropout(drop_rate)(features)

    predictions = Dense(NUM_CLASSES_GENDER, activation='softmax')(features)
    model = Model(inputs=input_img, outputs=predictions)

    # Small learning rate for fine-tuning.
    # NOTE(review): in the legacy optimizer API `decay` appears to be a
    # learning-rate decay, not L2 weight decay -- confirm for the installed TF.
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.00005, decay=1e-6)

    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# 6. Train gender model with more robust approach
def _plot_gender_history(history):
    """Save the train/validation accuracy and loss curves to OUTPUT_FOLDER."""
    panels = (
        ('accuracy', 'Gender Model Accuracy', 'Accuracy'),
        ('loss', 'Gender Model Loss', 'Loss'),
    )
    plt.figure(figsize=(12, 4))
    for position, (metric, title, ylabel) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label='Train')
        plt.plot(history.history[f'val_{metric}'], label='Validation')
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_FOLDER, 'gender_model_training.png'))


def _evaluate_gender_model(model, X_test, y_test):
    """Print a classification report and save the confusion matrix for the test split."""
    predictions = []
    for start in range(0, len(X_test), BATCH_SIZE):
        batch_img_paths = X_test.iloc[start:start + BATCH_SIZE]
        batch_images = np.array(
            [load_and_preprocess_image(path) for path in batch_img_paths]
        ) / 255.0
        batch_preds = model.predict(batch_images)
        predictions.extend(np.argmax(batch_preds, axis=1))

    true_labels = list(y_test)

    # labels=[0, 1] keeps the report and matrix 2x2 even if one class is
    # missing from the predictions.
    print(classification_report(true_labels, predictions,
                                labels=[0, 1],
                                target_names=['Female', 'Male']))

    cm = confusion_matrix(true_labels, predictions, labels=[0, 1])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Female', 'Male'],
                yticklabels=['Female', 'Male'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Gender Prediction Confusion Matrix')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_FOLDER, 'gender_confusion_matrix.png'))


def train_gender_model():
    """Train the gender model, save it, and evaluate it on a held-out test split.

    The data is split with stratification into train / validation / test
    (roughly 70% / 10% / 20%). The validation split drives checkpointing,
    early stopping and learning-rate reduction; the test split is used only
    for the final report, so the reported metrics are not biased by model
    selection.

    Side effects: writes ``gender_model_best.h5``, ``gender_model_final.h5``,
    ``gender_model_training.png`` and ``gender_confusion_matrix.png`` into
    ``OUTPUT_FOLDER`` and prints progress and the classification report.

    Returns:
        ``(model, history)``: the best checkpoint reloaded from disk and the
        Keras ``History`` object returned by ``fit``.
    """
    # Hold out the test split first.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        unbiased_data['img_path'],
        unbiased_data['gender_label'],
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=unbiased_data['gender_label']
    )

    # Carve a validation split out of the training data (0.125 * 0.8 = 0.1 of
    # all samples). Selecting the best epoch on the test split would leak it
    # into model selection.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full,
        y_train_full,
        test_size=0.125,
        random_state=RANDOM_STATE,
        stratify=y_train_full
    )

    # Print split information
    print(f"Training data: {len(X_train)} samples")
    print(f"Validation data: {len(X_val)} samples")
    print(f"Testing data: {len(X_test)} samples")

    # Create model
    model = create_gender_model()

    # Callbacks
    checkpoint = ModelCheckpoint(os.path.join(OUTPUT_FOLDER, 'gender_model_best.h5'),
                                 monitor='val_accuracy',
                                 save_best_only=True,
                                 verbose=1)

    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   patience=20,
                                   verbose=1,
                                   restore_best_weights=True)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.1,
                                  patience=10,
                                  min_lr=0.000001,
                                  verbose=1)

    # Create generators
    train_generator = balanced_data_generator(X_train, y_train, BATCH_SIZE, augment=True)
    validation_generator = balanced_data_generator(X_val, y_val, BATCH_SIZE, augment=False)

    # At least one step per epoch, even when a split is smaller than a batch.
    steps_per_epoch = max(1, len(X_train) // BATCH_SIZE)
    validation_steps = max(1, len(X_val) // BATCH_SIZE)

    # No class_weight here: the generator already yields class-balanced
    # batches, so weighting the loss as well would compensate twice and
    # favour the minority class.
    history = model.fit(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=EPOCHS,
        validation_data=validation_generator,
        validation_steps=validation_steps,
        callbacks=[checkpoint, early_stopping, reduce_lr]
    )

    # Save model
    model.save(os.path.join(OUTPUT_FOLDER, 'gender_model_final.h5'))

    # Plot performance
    _plot_gender_history(history)

    # Evaluate the best checkpoint on the untouched test split
    print("\nGender Model Evaluation:")
    model = load_model(os.path.join(OUTPUT_FOLDER, 'gender_model_best.h5'))
    _evaluate_gender_model(model, X_test, y_test)

    return model, history

# 7. Prediction function for gender only
def predict_gender(image_path, gender_model):
    """Detect faces in an image, predict each face's gender and save an annotated copy.

    Each face crop is preprocessed like the training images in
    ``load_and_preprocess_image`` (grayscale, histogram equalisation, three
    channels, resize, scale to [0, 1]) so the model sees the input
    distribution it was trained on.

    Args:
        image_path: Path of the image to analyse.
        gender_model: Model exposing ``predict`` that returns class
            probabilities for a batch of ``(IMG_SIZE, IMG_SIZE, 3)`` inputs.

    Returns:
        The path of the annotated image written to ``OUTPUT_FOLDER``, or the
        string ``"Cannot read image"`` when the file cannot be loaded.
    """
    # Read image
    img = cv2.imread(image_path)
    if img is None:
        return "Cannot read image"

    # Detect faces
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)

    # Create result image
    result_img = img.copy()

    # Process each detected face
    for (x, y, w, h) in faces:
        x1, y1, x2, y2 = x, y, x + w, y + h

        # Crop the face from the grayscale image computed for detection.
        face_gray = gray[y1:y2, x1:x2]

        # Check if face is valid
        if face_gray.size == 0:
            continue

        # Same pipeline as training: equalise, expand to 3 channels, resize.
        # Feeding colour crops to a model trained on equalised grayscale
        # would be a train/inference mismatch.
        face_eq = cv2.equalizeHist(face_gray)
        face_rgb = cv2.cvtColor(face_eq, cv2.COLOR_GRAY2RGB)
        face_rgb = cv2.resize(face_rgb, (IMG_SIZE, IMG_SIZE))
        face_batch = np.expand_dims(face_rgb / 255.0, axis=0)

        # Predict gender
        gender_pred = gender_model.predict(face_batch)
        gender_class = int(np.argmax(gender_pred[0]))
        gender_prob = np.max(gender_pred[0]) * 100
        gender_label = label_to_gender_map[gender_class]

        # Draw bounding box
        cv2.rectangle(result_img, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Add prediction results; keep the text inside the image when the
        # face touches the top edge.
        label_text = f"Gender: {gender_label} ({gender_prob:.1f}%)"
        cv2.putText(result_img, label_text, (x1, max(y1 - 10, 20)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    # Save result image
    result_path = os.path.join(OUTPUT_FOLDER, 'result_' + os.path.basename(image_path))
    cv2.imwrite(result_path, result_img)

    return result_path

# 8. Test with image function
def test_with_image(image_path):
    """Run the saved best gender model on one image and save the annotated result.

    Loads ``gender_model_best.h5`` from ``OUTPUT_FOLDER``, detects faces,
    predicts each face's gender and writes both the annotated image
    (``result_<basename>``) and a matplotlib rendering
    (``prediction_result.png``) to ``OUTPUT_FOLDER``.

    Face crops are preprocessed like the training images in
    ``load_and_preprocess_image`` (grayscale, histogram equalisation, three
    channels, resize, scale to [0, 1]).

    Args:
        image_path: Path of the image to analyse.

    Returns:
        The path of the annotated image, or ``None`` (after printing a
        message) when the image cannot be read.
    """
    # Read image
    img = cv2.imread(image_path)
    if img is None:
        print("Cannot read image")
        return

    # Detect faces
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)

    # Create result image
    result_img = img.copy()

    # Load model
    gender_model = load_model(os.path.join(OUTPUT_FOLDER, 'gender_model_best.h5'))

    # Process each detected face
    for (x, y, w, h) in faces:
        # Crop the face from the grayscale image computed for detection.
        face_gray = gray[y:y + h, x:x + w]

        # Skip degenerate detections, as predict_gender does.
        if face_gray.size == 0:
            continue

        # Same pipeline as training: equalise, expand to 3 channels, resize.
        # Feeding colour crops to a model trained on equalised grayscale
        # would be a train/inference mismatch.
        face_eq = cv2.equalizeHist(face_gray)
        face_rgb = cv2.cvtColor(face_eq, cv2.COLOR_GRAY2RGB)
        face_rgb = cv2.resize(face_rgb, (IMG_SIZE, IMG_SIZE))
        face_batch = np.expand_dims(face_rgb / 255.0, axis=0)

        # Predict
        gender_pred = gender_model.predict(face_batch)
        gender_class = int(np.argmax(gender_pred[0]))
        gender_prob = np.max(gender_pred[0]) * 100
        gender_label = label_to_gender_map[gender_class]

        # Draw bounding box and label; keep the text inside the image when
        # the face touches the top edge.
        cv2.rectangle(result_img, (x, y), (x + w, y + h), (0, 255, 0), 2)

        label_text = f"Gender: {gender_label} ({gender_prob:.1f}%)"
        cv2.putText(result_img, label_text, (x, max(y - 10, 20)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    # Save the annotated image
    result_path = os.path.join(OUTPUT_FOLDER, 'result_' + os.path.basename(image_path))
    cv2.imwrite(result_path, result_img)

    # Render the result with matplotlib (OpenCV images are BGR) and save it
    plt.figure(figsize=(12, 8))
    plt.imshow(cv2.cvtColor(result_img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.title('Gender Prediction Result')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_FOLDER, 'prediction_result.png'))

    return result_path

# 9. Main function
def main():
    """Run the full pipeline: analyse the data, train the model, test on one image."""
    print("=" * 50)
    print("Gender Prediction from Face")
    print("=" * 50)

    # 1. Analyze data
    print("\n1. Analyzing data...")
    analyze_data_imbalance()
    print("\nData Summary:")
    print(f"Total samples: {len(unbiased_data)}")
    print(f"Number of gender classes: {NUM_CLASSES_GENDER}")

    # 2. Train gender model (the best checkpoint is saved to OUTPUT_FOLDER)
    print("\n2. Training gender model...")
    train_gender_model()

    # 3. Test with image
    test_img = "C:/Users/focus/Pictures/USE_to_test_ml/image_th.png"
    result_path = test_with_image(test_img)

    # test_with_image returns None when the image cannot be read; do not
    # report a saved result in that case.
    if result_path is None:
        print(f"\nPrediction skipped: could not read test image {test_img}")
    else:
        print(f"\nPrediction completed. Result saved at: {result_path}")


if __name__ == "__main__":
    main()