In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import os
from PIL import Image
import logging
import time
import joblib

# Configure the root logger once: INFO level, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Dataset locations (absolute, machine-specific paths).
CSV_PATH = r"C:\Users\ahmed\Downloads\ML-Project\Dataset\image_labels.csv"
IMG_DIR = r"C:\Users\ahmed\Downloads\ML-Project\Dataset\interior"

# Input resolution handed to ResNet50 and the batch size of the tf.data pipelines.
IMG_HEIGHT = 224
IMG_WIDTH = 224
BATCH_SIZE = 32

def regenerate_csv(image_dir, output_file):
    """
    Scan an image directory, infer each image's class from its filename, and
    save the resulting path/label mapping to a CSV file.

    Only files ending in .jpg, .jpeg or .png (case-insensitive) are considered.
    A file is assigned the first class whose keyword occurs anywhere in the
    lower-cased filename; image files matching no keyword are skipped and
    counted in a warning. Any existing file at ``output_file`` is overwritten.

    Args:
        image_dir (str): Directory containing the images.
        output_file (str): Path to save the generated CSV file.

    Returns:
        pd.DataFrame: DataFrame with columns ``image_path`` (``image_dir`` joined
        with the filename) and ``label``. The columns are present even when no
        image matched.
    """
    # Classes are tried in insertion order and the first substring hit wins, so
    # an ambiguous filename resolves to the earliest class listed here.
    class_variations = {
        'bath': ['bath', 'bathroom'],
        'bed': ['bed', 'bedroom'],
        'dining room': ['dining', 'dining_room', 'diningroom', 'din'],
        'kitchen': ['kitchen'],
        'living room': ['living', 'living_room', 'livingroom'],
    }
    image_extensions = ('.jpg', '.jpeg', '.png')

    records = []
    unmatched = 0
    # os.listdir() returns entries in arbitrary order; sorting makes the CSV
    # (and the seeded train/validation split built from it) reproducible.
    for filename in sorted(os.listdir(image_dir)):
        lowered = filename.lower()
        if not lowered.endswith(image_extensions):
            continue
        label = next(
            (
                cls
                for cls, variations in class_variations.items()
                if any(variation in lowered for variation in variations)
            ),
            None,
        )
        if label is None:
            unmatched += 1
            continue
        records.append({'image_path': os.path.join(image_dir, filename), 'label': label})

    # Explicit columns keep the header row even when nothing matched, so the
    # CSV can still be read back with pd.read_csv.
    df = pd.DataFrame(records, columns=['image_path', 'label'])
    df.to_csv(output_file, index=False)
    if unmatched:
        logging.warning(f"Skipped {unmatched} image files whose names match no known class")
    logging.info(f"Regenerated CSV with {len(df)} images")
    return df

def _image_is_readable(full_path):
    """Return True when ``full_path`` exists and passes PIL's ``verify()`` check."""
    if not os.path.exists(full_path):
        logging.warning(f"Image file not found: {full_path}")
        return False
    try:
        with Image.open(full_path) as img:
            img.verify()
    except Exception as exc:
        # Deliberately broad: this is a best-effort filter and any failure to
        # open/verify means the image is unusable. The reason is logged rather
        # than silently discarded.
        logging.warning(f"Unreadable image {full_path}: {exc}")
        return False
    return True


def load_and_validate_data(csv_path, img_dir):
    """
    Load the CSV and validate image paths, removing invalid or corrupted images.

    A row is dropped when its ``image_path`` is not a string (e.g. a blank CSV
    cell), when the file does not exist, or when PIL cannot open and verify it.
    Relative paths are resolved against ``img_dir`` for the check only; the
    values stored in the DataFrame are left as they appear in the CSV.

    Side effect: if at least one row is dropped, the CSV at ``csv_path`` is
    overwritten with the cleaned rows.

    Args:
        csv_path (str): Path to the CSV file containing image paths and labels.
        img_dir (str): Directory containing the images.

    Returns:
        pd.DataFrame: Cleaned DataFrame with valid image paths and labels.
    """
    df = pd.read_csv(csv_path)

    def is_valid(img_path):
        # Blank cells are read back as NaN (a float); os.path functions would
        # raise TypeError on them, so treat them as invalid up front.
        if not isinstance(img_path, str):
            logging.warning(f"Ignoring row with non-string image path: {img_path!r}")
            return False
        full_path = img_path if os.path.isabs(img_path) else os.path.join(img_dir, img_path)
        return _image_is_readable(full_path)

    keep = pd.Series([is_valid(p) for p in df['image_path']], index=df.index, dtype=bool)
    removed = int((~keep).sum())
    if removed:
        # .copy() detaches the result from the original frame so callers can
        # add columns without triggering SettingWithCopyWarning.
        df = df[keep].copy()
        df.to_csv(csv_path, index=False)
        logging.info(f"Removed {removed} invalid images. Updated CSV has {len(df)} images.")
    return df

def prepare_dataset(csv_path, img_dir):
    """
    Prepare the dataset by loading data, encoding labels, splitting into train/validation sets,
    and creating TensorFlow datasets with ResNet50 preprocessing.

    Each image is decoded to 3 channels, resized to (IMG_HEIGHT, IMG_WIDTH) and
    passed through ``preprocess_input``; labels are one-hot encoded. Both
    datasets are cached, batched with BATCH_SIZE and prefetched; only the
    training dataset is shuffled.

    Args:
        csv_path (str): Path to the CSV file.
        img_dir (str): Directory containing the images; relative paths in the
            CSV are resolved against it.

    Returns:
        tuple: (train_dataset, val_dataset, num_classes, label_encoder, train_df, val_df)

    Raises:
        ValueError: If no valid images remain after validation.
    """
    # Load and clean the dataset. The copy guarantees that adding a column
    # below never writes into a filtered slice of another frame.
    df = load_and_validate_data(csv_path, img_dir).copy()
    if df.empty:
        raise ValueError("No valid images remain after validation. Check image paths and integrity.")

    # Encode the string labels into integers
    label_encoder = LabelEncoder()
    df['label_encoded'] = label_encoder.fit_transform(df['label'])
    num_classes = len(label_encoder.classes_)

    # Split the data into training (80%) and validation (20%) sets with stratification
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

    def resolve_paths(frame):
        # Validation resolves relative paths against img_dir; the loader must
        # read from the same resolved location.
        return [
            path if os.path.isabs(path) else os.path.join(img_dir, path)
            for path in frame['image_path']
        ]

    def load_image(image_path, label):
        raw = tf.io.read_file(image_path)
        # decode_image detects the encoding from the file contents, so the PNG
        # files accepted upstream are handled as well as JPEGs.
        # expand_animations=False keeps the result a single 3-D image.
        img = tf.io.decode_image(raw, channels=3, expand_animations=False)
        img = tf.cast(img, tf.float32)
        # Batching needs uniformly shaped tensors and the feature extractor is
        # built for (IMG_HEIGHT, IMG_WIDTH, 3) inputs.
        img = tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH])
        # ResNet50 preprocessing: RGB -> BGR and per-channel zero-centering on
        # the ImageNet means (no rescaling to a fixed range).
        img = preprocess_input(img)
        return img, label

    def build_dataset(frame, shuffle):
        one_hot = tf.keras.utils.to_categorical(frame['label_encoded'], num_classes)
        ds = tf.data.Dataset.from_tensor_slices((resolve_paths(frame), one_hot))
        ds = ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE).cache()
        if shuffle:
            ds = ds.shuffle(1000)
        return ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    train_dataset = build_dataset(train_df, shuffle=True)
    val_dataset = build_dataset(val_df, shuffle=False)

    return train_dataset, val_dataset, num_classes, label_encoder, train_df, val_df

def extract_features(dataset):
    """
    Extracts features from images using the ResNet50 base model with global average pooling.

    Args:
        dataset: TensorFlow dataset yielding (images, labels) batches; images
            are expected to be (IMG_HEIGHT, IMG_WIDTH, 3) and already
            preprocessed for ResNet50.

    Returns:
        tuple: (features, labels) as numpy arrays, one row per image, with the
        per-batch results stacked in iteration order.

    Raises:
        ValueError: If the dataset yields no batches.
    """
    # pooling='avg' makes the network itself end in global average pooling, so
    # one forward pass per batch yields the pooled feature vectors and no
    # pooling layer has to be constructed inside the loop.
    feature_extractor = ResNet50(
        weights='imagenet',
        include_top=False,
        pooling='avg',
        input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
    )

    features = []
    labels = []
    for images, lbls in dataset:
        # training=False states explicitly that this is inference
        # (batch-norm layers use their stored statistics).
        feats = feature_extractor(images, training=False)
        features.append(feats.numpy())
        labels.append(lbls.numpy())

    if not features:
        raise ValueError("Cannot extract features from an empty dataset.")
    return np.vstack(features), np.vstack(labels)

def _fit_and_evaluate_lr(X_train, y_train, X_val, y_val):
    """
    Run the Logistic Regression grid search and evaluate the best model.

    Private helper behind ``train_and_evaluate_lr``; it additionally returns the
    fitted search object and the fit duration, which ``main`` needs for the
    training log.

    Args:
        X_train (np.array): Training features.
        y_train (np.array): One-hot training labels.
        X_val (np.array): Validation features.
        y_val (np.array): One-hot validation labels.

    Returns:
        tuple: (lr_model, y_pred, accuracy, grid, lr_training_time)
    """
    # Logistic Regression expects integer class ids, not one-hot rows
    y_train_flat = np.argmax(y_train, axis=1)
    y_val_flat = np.argmax(y_val, axis=1)

    # Define hyperparameter grid for Logistic Regression
    param_grid = {'C': [0.1, 1, 10, 100], 'solver': ['lbfgs', 'liblinear']}
    # NOTE(review): recent scikit-learn releases deprecate the multi_class
    # argument — confirm against the installed version before upgrading.
    grid = GridSearchCV(LogisticRegression(multi_class='ovr', max_iter=1000), param_grid, cv=5, verbose=3)

    # Train the Logistic Regression with hyperparameter tuning
    start_time = time.time()
    grid.fit(X_train, y_train_flat)
    lr_training_time = time.time() - start_time
    logging.info(f"Logistic Regression training took {lr_training_time:.2f} seconds with best params: {grid.best_params_}")

    # Get the best model
    lr_model = grid.best_estimator_

    # Evaluate the model
    y_pred = lr_model.predict(X_val)
    accuracy = accuracy_score(y_val_flat, y_pred)
    logging.info(f"Validation Accuracy: {accuracy:.4f}")

    return lr_model, y_pred, accuracy, grid, lr_training_time


def train_and_evaluate_lr(X_train, y_train, X_val, y_val):
    """
    Trains a Logistic Regression classifier with hyperparameter tuning and evaluates its performance.

    Args:
        X_train (np.array): Training features.
        y_train (np.array): One-hot training labels.
        X_val (np.array): Validation features.
        y_val (np.array): One-hot validation labels.

    Returns:
        tuple: (trained_lr_model, y_pred, accuracy)
    """
    lr_model, y_pred, accuracy, _grid, _lr_training_time = _fit_and_evaluate_lr(
        X_train, y_train, X_val, y_val
    )
    return lr_model, y_pred, accuracy


def plot_confusion_matrix(y_true, y_pred, classes):
    """
    Generates a confusion matrix heatmap and saves it as 'confusion_matrix.png'
    in the current working directory.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        classes: List of class names used as axis tick labels.
    """
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.savefig('confusion_matrix.png')
    # Close the figure so it is released after saving
    plt.close()


def save_results(lr_model, y_val_flat, y_pred, accuracy, grid, feature_extraction_time, lr_training_time, label_encoder):
    """
    Saves the trained model, training log, and classification report.

    Writes 'logistic_regression_model.pkl' and 'training_log.txt' to the
    current working directory, overwriting existing files.

    Args:
        lr_model: Trained Logistic Regression model.
        y_val_flat (np.array): Flattened validation labels.
        y_pred (np.array): Predicted labels.
        accuracy (float): Validation accuracy.
        grid: GridSearchCV object with cross-validation results.
        feature_extraction_time (float): Time taken for feature extraction.
        lr_training_time (float): Time taken for Logistic Regression training.
        label_encoder: LabelEncoder object for decoding labels.
    """
    # Save the model
    joblib.dump(lr_model, 'logistic_regression_model.pkl')
    logging.info("Logistic Regression model saved as 'logistic_regression_model.pkl'")

    # Write training log
    with open('training_log.txt', 'w', encoding='utf-8') as f:
        f.write(f"Feature Extraction Time: {feature_extraction_time:.2f} seconds\n")
        f.write(f"Logistic Regression Training Time: {lr_training_time:.2f} seconds\n")
        f.write(f"Best Logistic Regression Params: {grid.best_params_}\n")
        f.write(f"Validation Accuracy: {accuracy:.4f}\n")
        f.write("\nClassification Report:\n")
        f.write(classification_report(y_val_flat, y_pred, target_names=label_encoder.classes_))
        f.write("\nCross-Validation Results:\n")
        for mean_score, params in zip(grid.cv_results_['mean_test_score'], grid.cv_results_['params']):
            f.write(f"Mean CV Score: {mean_score:.4f} with params: {params}\n")


def main():
    """Run the full pipeline: build the CSV, extract features, train, evaluate, save."""
    # Rebuild the label CSV from the image directory (overwrites CSV_PATH on every run)
    regenerate_csv(IMG_DIR, CSV_PATH)

    # Prepare the dataset
    train_dataset, val_dataset, _num_classes, label_encoder, _train_df, _val_df = prepare_dataset(CSV_PATH, IMG_DIR)

    # Extract features
    start_time = time.time()
    X_train, y_train = extract_features(train_dataset)
    X_val, y_val = extract_features(val_dataset)
    feature_extraction_time = time.time() - start_time
    logging.info(f"Feature extraction took {feature_extraction_time:.2f} seconds.")

    # Train and evaluate Logistic Regression. The detailed helper is used here
    # because save_results needs the fitted search object and the fit
    # duration, which the public 3-tuple API does not expose.
    lr_model, y_pred, accuracy, grid, lr_training_time = _fit_and_evaluate_lr(X_train, y_train, X_val, y_val)

    # Prepare labels for evaluation
    y_val_flat = np.argmax(y_val, axis=1)

    # Generate and save confusion matrix
    plot_confusion_matrix(y_val_flat, y_pred, label_encoder.classes_)

    # Save training results
    save_results(lr_model, y_val_flat, y_pred, accuracy, grid, feature_extraction_time, lr_training_time, label_encoder)


if __name__ == "__main__":
    main()

2025-05-21 02:33:06,300 - INFO - Feature extraction took 55.84 seconds.


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ...............C=0.1, solver=lbfgs;, score=0.880 total time=   7.2s




[CV 2/5] END ...............C=0.1, solver=lbfgs;, score=0.883 total time=   8.3s




[CV 3/5] END ...............C=0.1, solver=lbfgs;, score=0.883 total time=   9.9s




[CV 4/5] END ...............C=0.1, solver=lbfgs;, score=0.878 total time=   9.1s




[CV 5/5] END ...............C=0.1, solver=lbfgs;, score=0.881 total time=   9.6s




[CV 1/5] END ...........C=0.1, solver=liblinear;, score=0.879 total time=  34.2s




[CV 2/5] END ...........C=0.1, solver=liblinear;, score=0.882 total time=  30.2s




[CV 3/5] END ...........C=0.1, solver=liblinear;, score=0.884 total time=  37.3s




[CV 4/5] END ...........C=0.1, solver=liblinear;, score=0.878 total time=  31.4s




[CV 5/5] END ...........C=0.1, solver=liblinear;, score=0.880 total time=  31.7s




[CV 1/5] END .................C=1, solver=lbfgs;, score=0.866 total time=  13.1s




[CV 2/5] END .................C=1, solver=lbfgs;, score=0.868 total time=  13.6s




[CV 3/5] END .................C=1, solver=lbfgs;, score=0.877 total time=  12.1s




[CV 4/5] END .................C=1, solver=lbfgs;, score=0.875 total time=  12.9s




[CV 5/5] END .................C=1, solver=lbfgs;, score=0.871 total time=  12.3s




[CV 1/5] END .............C=1, solver=liblinear;, score=0.866 total time= 1.1min




[CV 2/5] END .............C=1, solver=liblinear;, score=0.869 total time=  57.2s




[CV 3/5] END .............C=1, solver=liblinear;, score=0.878 total time= 1.0min




[CV 4/5] END .............C=1, solver=liblinear;, score=0.876 total time= 1.1min




[CV 5/5] END .............C=1, solver=liblinear;, score=0.871 total time= 1.2min




[CV 1/5] END ................C=10, solver=lbfgs;, score=0.863 total time=  10.9s




[CV 2/5] END ................C=10, solver=lbfgs;, score=0.857 total time=  10.4s




[CV 3/5] END ................C=10, solver=lbfgs;, score=0.871 total time=  12.6s




[CV 4/5] END ................C=10, solver=lbfgs;, score=0.869 total time=  11.9s




[CV 5/5] END ................C=10, solver=lbfgs;, score=0.865 total time=  11.1s




[CV 1/5] END ............C=10, solver=liblinear;, score=0.859 total time= 1.4min




[CV 2/5] END ............C=10, solver=liblinear;, score=0.859 total time= 1.5min




[CV 3/5] END ............C=10, solver=liblinear;, score=0.871 total time= 1.4min




[CV 4/5] END ............C=10, solver=liblinear;, score=0.870 total time= 1.5min




[CV 5/5] END ............C=10, solver=liblinear;, score=0.862 total time= 1.4min




[CV 1/5] END ...............C=100, solver=lbfgs;, score=0.859 total time=   5.8s




[CV 2/5] END ...............C=100, solver=lbfgs;, score=0.860 total time=   7.6s




[CV 3/5] END ...............C=100, solver=lbfgs;, score=0.867 total time=   7.1s




[CV 4/5] END ...............C=100, solver=lbfgs;, score=0.867 total time=   7.7s




[CV 5/5] END ...............C=100, solver=lbfgs;, score=0.866 total time=   7.5s




[CV 1/5] END ...........C=100, solver=liblinear;, score=0.852 total time= 1.4min




[CV 2/5] END ...........C=100, solver=liblinear;, score=0.859 total time= 1.6min




[CV 3/5] END ...........C=100, solver=liblinear;, score=0.866 total time= 1.4min




[CV 4/5] END ...........C=100, solver=liblinear;, score=0.867 total time= 1.4min




[CV 5/5] END ...........C=100, solver=liblinear;, score=0.862 total time= 1.4min


2025-05-21 02:59:18,956 - INFO - Logistic Regression training took 1572.65 seconds with best params: {'C': 0.1, 'solver': 'lbfgs'}
2025-05-21 02:59:19,052 - INFO - Validation Accuracy: 0.8926



Classification Report:
               precision    recall  f1-score   support

        bath       0.93      0.94      0.94       486
         bed       0.90      0.89      0.90       489
 dining room       0.90      0.87      0.88       521
     kitchen       0.91      0.88      0.90       447
 living room       0.83      0.88      0.85       524

    accuracy                           0.89      2467
   macro avg       0.89      0.89      0.89      2467
weighted avg       0.89      0.89      0.89      2467



2025-05-21 02:59:19,445 - INFO - Logistic Regression model saved as 'logistic_regression_model.pkl'
