# Applying log and finding sample size

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import cv2
import os

# Define the directory paths for girls and boys images
girls_dir = '/content/drive/MyDrive/dataset/dataset/girls_data'
boys_dir = '/content/drive/MyDrive/dataset/dataset/boys_data'

# Function to calculate mean, log mean, and log standard deviation for a single image
def calculate_log_stats(image_path):
  image = cv2.imread(image_path)
  pixels = image.flatten()
  pixel_values = np.array(pixels)
  mean = np.mean(pixel_values)
  std_dev = np.std(pixel_values)
  # Calculate log of mean and standard deviation
  log_mean = np.log(mean)
  log_std_dev = np.log(std_dev)
  return mean, log_mean, log_std_dev

# Function to process all images in a directory
def process_images(directory):
  image_means = []
  log_means = []
  log_std_devs = []
  for file in os.listdir(directory):
    if file.endswith(('jpg', 'jpeg', 'png')):
      image_path = os.path.join(directory, file)
      mean, log_mean, log_std_dev = calculate_log_stats(image_path)
      image_means.append(mean)
      log_means.append(log_mean)
      log_std_devs.append(log_std_dev)
  return np.array(image_means), np.array(log_means), np.array(log_std_devs)

# Process images for girls and boys
girls_means, girls_log_means, girls_log_std_devs = process_images(girls_dir)
boys_means, boys_log_means, boys_log_std_devs = process_images(boys_dir)

# Calculate statistics
girls_mean_log_mean = np.mean(girls_log_means)
girls_std_dev_log_std_dev = np.std(girls_log_std_devs)  # Standard deviation of log standard deviations for girls
boys_mean_log_mean = np.mean(boys_log_means)
boys_std_dev_log_std_dev = np.std(boys_log_std_devs)  # Standard deviation of log standard deviations for boys



n1 = 24
n2 = 25

# Pooled standard deviation (assuming equal variances for simplicity)
pooled_std_dev_log_std_dev =  (((n1)*girls_std_dev_log_std_dev**2 + (n2)*boys_std_dev_log_std_dev**2) /n1+n2)
# Sample size (using the pooled standard deviation of log-transformed values)
sample_size = 4 * pooled_std_dev_log_std_dev * ((1.96 + 1.282)**2) / ((np.exp(boys_mean_log_mean) - np.exp(girls_mean_log_mean))**2)

# Print results
print("Mean Image Values for Girls:")
print(girls_means)  # Print all mean values for girls' images
print("Mean of Mean Image Values for Girls:", np.mean(girls_means))

print("\nLog Mean Image Values for Girls:")
print(girls_log_means)  # Print all log mean values for girls' images
print("Mean of Log Mean Image Values for Girls:", girls_mean_log_mean)
print("Standard Deviation of Log Standard Deviation Values for Girls:", girls_std_dev_log_std_dev)

print("\nMean Image Values for Boys:")
print(boys_means)  # Print all mean values for boys' images
print("Mean of Mean Image Values for Boys:", np.mean(boys_means))

print("\nLog Mean Image Values for Boys:")
print(boys_log_means)  # Print all log mean values for boys' images
print("Mean of Log Mean Image Values for Boys:", boys_mean_log_mean)
print("Standard Deviation of Log Standard Deviation Values for Boys:", boys_std_dev_log_std_dev)

print("\nPooled Standard Deviation of Log Standard Deviations:", pooled_std_dev_log_std_dev)
print("Sample Size for hypothesis testing:", sample_size)


Mean Image Values for Girls:
[149.61694678 170.20668093 139.1240222  147.83747453 149.54764257
 157.69512856 151.92007673 148.76973092 147.32033783 142.23708514
 139.83948587 147.89647367 144.75631039 148.81111679 139.37115801
 144.73871817 155.33496462 154.21946372 150.90545656 144.2063519
 139.13610782 146.5836082  155.09413068 109.22502164]
Mean of Mean Image Values for Girls: 146.8497289254283

Log Mean Image Values for Girls:
[5.00807834 5.13701347 4.93536578 4.99611353 5.00761502 5.0606636
 5.02335457 5.00239968 4.99260938 4.95749528 4.94049524 4.99651253
 4.97505171 5.00267783 4.93714058 4.97493017 5.04558385 5.03837668
 5.01665353 4.97124527 4.93545265 4.98759597 5.04403223 4.69341017]
Mean of Log Mean Image Values for Girls: 4.986661126934616
Standard Deviation of Log Standard Deviation Values for Girls: 0.3768977545046952

Mean Image Values for Boys:
[121.4385101  163.58040355 156.29971046 153.51149425 167.47992591
 139.39419902 139.13174778 157.36128592 112.57254334 162.3681

# Clustering

In [None]:
import os
import cv2
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

def load_data(folder_path):
    image_paths = []
    try:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                if file.endswith(('.jpg', '.jpeg', '.png', '.gif')):
                    image_path = os.path.join(root, file)
                    image_paths.append(image_path)
    except FileNotFoundError:
        print("Folder not found. Please check the folder path.")
    except Exception as e:
        print("An error occurred while loading the data:", e)
    return image_paths

def load_and_preprocess_images(image_paths):
    images = []
    for image_path in image_paths:
        # Load image
        image = cv2.imread(image_path)
        if image is None:
            print(f"Error loading image: {image_path}")
            continue
        # Preprocess image
        # Resize images to a fixed size
        resized_image = cv2.resize(image, (224, 224))

        images.append(resized_image)
    return images

# Load images
folder_path = '/content/drive/MyDrive/dataset/dataset/mini_dataset/'
image_paths = load_data(folder_path)

# Load and preprocess images
X = load_and_preprocess_images(image_paths)

# Print the number of loaded images
print("Number of loaded images:", len(X))

# Split data into initially labeled and unlabeled sets
X_labeled, X_unlabeled = train_test_split(X, test_size=0.8, random_state=42)

# Clustering (Unsupervised Learning)
n_clusters = 3  # Number of clusters (since you have 3 skin types)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_ids = kmeans.fit_predict(np.array(X_unlabeled).reshape(len(X_unlabeled), -1))
cluster_labels = {0: 'oily', 1: 'dry', 2: 'normal'}

# Assign labels to unlabelled data based on cluster assignments
y_pred_unlabeled = [cluster_labels[cluster_id] for cluster_id in cluster_ids]

# Print cluster assignments for review
for i, label in enumerate(y_pred_unlabeled):
    print(f"Image {i + 1}: Predicted skin type: {label}")


Number of loaded images: 49




Image 1: Predicted skin type: oily
Image 2: Predicted skin type: oily
Image 3: Predicted skin type: dry
Image 4: Predicted skin type: normal
Image 5: Predicted skin type: normal
Image 6: Predicted skin type: oily
Image 7: Predicted skin type: oily
Image 8: Predicted skin type: oily
Image 9: Predicted skin type: oily
Image 10: Predicted skin type: oily
Image 11: Predicted skin type: oily
Image 12: Predicted skin type: oily
Image 13: Predicted skin type: oily
Image 14: Predicted skin type: oily
Image 15: Predicted skin type: oily
Image 16: Predicted skin type: oily
Image 17: Predicted skin type: oily
Image 18: Predicted skin type: oily
Image 19: Predicted skin type: dry
Image 20: Predicted skin type: oily
Image 21: Predicted skin type: oily
Image 22: Predicted skin type: oily
Image 23: Predicted skin type: oily
Image 24: Predicted skin type: oily
Image 25: Predicted skin type: oily
Image 26: Predicted skin type: dry
Image 27: Predicted skin type: oily
Image 28: Predicted skin type: oily


# Splitting the data

In [None]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def load_data(folder_path):
    image_paths = []
    labels = []
    try:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                if file.endswith(('.jpg', '.jpeg', '.png', '.gif')):  # Add more image extensions if needed
                    image_path = os.path.join(root, file)
                    image_paths.append(image_path)
                    labels.append(os.path.basename(os.path.dirname(image_path)))  # Use parent folder name as label
    except FileNotFoundError:
        print("Folder not found. Please check the folder path.")
    except Exception as e:
        print("An error occurred while loading the data:", e)
    return image_paths, labels

def load_and_preprocess_image(image_path):
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading image: {image_path}")
        return None
    resized_image = cv2.resize(image, (224, 224))  # Resize to 224x224
    # You can add more preprocessing steps here
    return resized_image

# Load images and labels
folder_path = '/content/drive/MyDrive/Dataset'
image_paths, labels = load_data(folder_path)

import random

# Shuffle the data
combined = list(zip(image_paths, labels))
random.shuffle(combined)
image_paths[:], labels[:] = zip(*combined)

# Print loaded data
print("Number of images:", len(image_paths))
print("Number of labels:", len(labels))
print("First ten labels:", labels[:40])

# Split data into training and testing sets
X_train_paths, X_test, y_train, y_test = train_test_split(image_paths, labels, test_size=0.2, random_state=0)


Number of images: 40
Number of labels: 40
First ten labels: ['oily', 'oily', 'oily', 'dry', 'oily', 'oily', 'oily', 'dry', 'oily', 'normal', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'dry', 'normal', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily', 'dry', 'normal', 'oily', 'oily', 'oily', 'oily', 'oily', 'oily']


In [None]:
X_train_paths

['/content/drive/MyDrive/Dataset/normal/img 33.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 1.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 24.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 38.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 14.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 31.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 34.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 13.jpeg',
 '/content/drive/MyDrive/Dataset/dry/img 26.jpeg',
 '/content/drive/MyDrive/Dataset/dry/img 35.jpeg',
 '/content/drive/MyDrive/Dataset/dry/img 3.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 9.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 28.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 40.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 7.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 21.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 16.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 11.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 17.jpeg',
 '/content/drive/

In [None]:
X_test

['/content/drive/MyDrive/Dataset/oily/img 17.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 37.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 32.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 16.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 30.jpeg',
 '/content/drive/MyDrive/Dataset/normal/img 33.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 18.jpeg',
 '/content/drive/MyDrive/Dataset/oily/img 28.jpeg']

# SVM

In [None]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Define the paths to your data folders
base_dir = '/content/drive/MyDrive/Dataset'
train_oily_dir = os.path.join(base_dir, 'oily')
train_dry_dir = os.path.join(base_dir, 'dry')
train_normal_dir = os.path.join(base_dir, 'normal')

# Collect paths to all images in each class folder
oily_imgs = [os.path.join(train_oily_dir, img) for img in os.listdir(train_oily_dir)]
dry_imgs = [os.path.join(train_dry_dir, img) for img in os.listdir(train_dry_dir)]
normal_imgs = [os.path.join(train_normal_dir, img) for img in os.listdir(train_normal_dir)]

# Concatenate all paths and assign labels
X_paths = oily_imgs + dry_imgs + normal_imgs
y = ['oily'] * len(oily_imgs) + ['dry'] * len(dry_imgs) + ['normal'] * len(normal_imgs)

# Convert labels to one-hot encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_paths, y_encoded, test_size=0.2, random_state=22)

# Convert training images to arrays
X_train_arrs = np.array([img_to_array(load_img(image_path, target_size=(224, 224))) for image_path in X_train])

# Flatten the arrays
X_train_flattened = X_train_arrs.reshape(X_train_arrs.shape[0], -1)

# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': [0.001, 0.01, 0.1, 1],  # Kernel coeff for 'rbf' kernel
    'kernel': ['rbf']  # Kernel type
}

# Initialize SVM classifier
svm_classif = SVC()

# Perform grid search to find the best parameters
svm_grid_search = GridSearchCV(svm_classif, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
svm_grid_search.fit(X_train_flattened, y_train)

# Get the best parameters
best_params = svm_grid_search.best_params_

# Train the SVM classifier with the best parameters
best_svm_classif = SVC(**best_params)
best_svm_classif.fit(X_train_flattened, y_train)

# Convert test images to arrays
X_test_arrs = np.array([img_to_array(load_img(image_path, target_size=(224, 224))) for image_path in X_test])

# Flatten the arrays
X_test_flattened = X_test_arrs.reshape(X_test_arrs.shape[0], -1)

# Test the classifier
y_pred = best_svm_classif.predict(X_test_flattened)

# Evaluate the classifier
accu = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

# Print the metrics of the model where it achieves the best accuracy
print("\nMetrics for SVM with best parameters:")
print("Accuracy:", accu)
print("Precision:", prec)
print("Recall:", recall)
print("F1:", f1)

# Store metrics in a dataframe
metrics_df = pd.DataFrame({'Accuracy': [accu],
                           'Precision': [prec],
                           'Recall': [recall],
                           'F1 Score': [f1]})

# Save metrics to file
metrics_file = "SVM_metrics.csv"
metrics_df.to_csv(metrics_file, index=False)

print(f"Metrics for SVM saved to {metrics_file}")





Metrics for SVM with best parameters:
Accuracy: 0.625
Precision: 0.765625
Recall: 0.625
F1: 0.4807692307692308
Metrics for SVM saved to SVM_metrics.csv


# KNN

In [None]:
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize KNN classifier
knn_classi = KNeighborsClassifier()

# Split data into training and testing sets
X_train_paths, X_test, y_train, y_test = train_test_split(image_paths, labels, test_size=0.2, random_state=0)

# Data Augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest'
)

# Augment the training data
X_train_augmented = []
y_train_augmented = []
for i, image_path in enumerate(X_train_paths):
    img = load_and_preprocess_image(image_path)
    if img is not None:
        X_train_augmented.append(img)
        y_train_augmented.append(y_train[i])
        for j in range(4):  # Augment each image 4 times
            augmented_img = datagen.random_transform(img)
            X_train_augmented.append(augmented_img)
            y_train_augmented.append(y_train[i])

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)


# Get all parameter combinations
param_grid = {
    'n_neighbors': [3, 5, 7],  # Number of neighbors to use
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm used to compute the nearest neighbors
    'leaf_size': [20, 30, 40],  # Leaf size passed to BallTree or KDTree
    'p': [1, 2]  # Power parameter for the Minkowski metric
}

param_combinations = list(ParameterGrid(param_grid))

# Variables to store best parameters and accuracy
best_params = None
best_accuracy = 0.0

# Initialize tqdm with total number of iterations
with tqdm(total=len(param_combinations)) as pbar:
    # Perform grid search over all parameter combinations
    for params in param_combinations:
        # Set parameters for KNN classifier
        knn_classi.set_params(**params)

        # Convert X_test to numeric arrays and reshape
        X_test_numeric = np.array([load_and_preprocess_image(image_path) for image_path in X_test])
        X_test_reshaped = X_test_numeric.reshape(X_test_numeric.shape[0], -1)

        # Fit KNN classifier
        knn_classi.fit(X_train_augmented.reshape(X_train_augmented.shape[0], -1), y_train_augmented)

        # Test the classifier
        y_pred = knn_classi.predict(X_test_reshaped)

        # Compute accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Update best parameters and accuracy if better
        if accuracy > best_accuracy:
            best_params = params
            best_accuracy = accuracy

        # Update tqdm progress bar
        pbar.update(1)
        pbar.set_description(f"Best accuracy so far: {best_accuracy:.4f}")

# Print results
print("\nK-Nearest Neighbors with GridSearchCV:")
print("Best parameters:", best_params)
print("Best accuracy:", best_accuracy)

# Train the KNN classifier with the best parameters
best_knn_classifier = KNeighborsClassifier(**best_params)
best_knn_classifier.fit(X_train_augmented.reshape(X_train_augmented.shape[0], -1), y_train_augmented)

# Test the classifier
y_pred = best_knn_classifier.predict(X_test_reshaped)

# Calculate metrics for KNN
accuracy_knn = accuracy_score(y_test, y_pred)
precision_knn = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall_knn = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1_knn = f1_score(y_test, y_pred, average='weighted', zero_division=1)

# Print additional results for KNN
print("Accuracy:", accuracy_knn)
print("Precision:", precision_knn)
print("Recall:", recall_knn)
print("F1:", f1_knn)

# Store metrics in a dataframe
metrics_df = pd.DataFrame({'Accuracy': [accuracy_knn],
                           'Precision': [precision_knn],
                           'Recall': [recall_knn],
                           'F1 Score': [f1_knn]})

# Save metrics to file
metrics_file = "KNN_metrics.csv"
metrics_df.to_csv(metrics_file, index=False)

print(f"Metrics for KNN saved to {metrics_file}")


Best accuracy so far: 0.7500: 100%|██████████| 144/144 [01:56<00:00,  1.23it/s]



K-Nearest Neighbors with GridSearchCV:
Best parameters: {'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
Best accuracy: 0.75
Accuracy: 0.75
Precision: 0.875
Recall: 0.75
F1: 0.75
Metrics for KNN saved to KNN_metrics.csv


# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ParameterGrid, train_test_split
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def load_data(folder_path):
    image_paths = []
    labels = []
    try:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                if file.endswith(('.jpg', '.jpeg', '.png', '.gif')):  # Add more image extensions if needed
                    image_path = os.path.join(root, file)
                    image_paths.append(image_path)
                    labels.append(os.path.basename(os.path.dirname(image_path)))  # Use parent folder name as label
    except FileNotFoundError:
        print("Folder not found. Please check the folder path.")
    except Exception as e:
        print("An error occurred while loading the data:", e)
    return image_paths, labels

def load_and_preprocess_image(image_path):
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading image: {image_path}")
        return None
    resized_image = cv2.resize(image, (224, 224))  # Resize to 224x224
    return resized_image

# Load images and labels
folder_path = '/content/drive/MyDrive/Dataset'
image_paths, labels = load_data(folder_path)

# Initialize Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=0)

# Split data into training and testing sets
X_train_paths, X_test, y_train, y_test = train_test_split(image_paths, labels, test_size=0.2, random_state=33)

# Data Augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest'
)

# Augment the training data
X_train_augmented = []
y_train_augmented = []
for i, image_path in enumerate(X_train_paths):
    img = load_and_preprocess_image(image_path)
    if img is not None:
        X_train_augmented.append(img)
        y_train_augmented.append(y_train[i])
        for j in range(4):  # Augment each image 4 times
            augmented_img = datagen.random_transform(img)
            X_train_augmented.append(augmented_img)
            y_train_augmented.append(y_train[i])

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Convert X_test to numeric arrays and reshape
X_test_numeric = np.array([load_and_preprocess_image(image_path) for image_path in X_test])
X_test_reshaped = X_test_numeric.reshape(X_test_numeric.shape[0], -1)

# Get all parameter combinations
param_grid = {
    'criterion': ['gini', 'entropy'],  # Split criterion
    'max_depth': [None, 5, 10, 15],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2']  # Number of features to consider when looking for the best split
}

param_combinations = list(ParameterGrid(param_grid))

# Variables to store best parameters and accuracy
best_params = None
best_accuracy = 0.0

# Perform grid search over all parameter combinations
for params in param_combinations:
    # Set parameters for Decision Tree classifier
    dt_classifier.set_params(**params)

    # Fit Decision Tree classifier
    dt_classifier.fit(X_train_augmented.reshape(X_train_augmented.shape[0], -1), y_train_augmented)

    # Test the classifier
    y_pred = dt_classifier.predict(X_test_reshaped)

    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Update best parameters and accuracy if better
    if accuracy > best_accuracy:
        best_params = params
        best_accuracy = accuracy


# Train the Decision Tree classifier with the best parameters
best_dt_classifier = DecisionTreeClassifier(**best_params, random_state=12)
best_dt_classifier.fit(X_train_augmented.reshape(X_train_augmented.shape[0], -1), y_train_augmented)

# Test the classifier
y_pred = best_dt_classifier.predict(X_test_reshaped)

# Calculate metrics for Decision Tree
accuracy_dt = accuracy_score(y_test, y_pred)
precision_dt = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall_dt = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1_dt = f1_score(y_test, y_pred, average='weighted', zero_division=1)

# Print metrics for Decision Tree
print("Accuracy:", accuracy_dt)
print("Precision:", precision_dt)
print("Recall:", recall_dt)
print("F1:", f1_dt)

# Store metrics in a dataframe
metrics_df_dt = pd.DataFrame({'Accuracy': [accuracy_dt],
                              'Precision': [precision_dt],
                              'Recall': [recall_dt],
                              'F1 Score': [f1_dt]})

# Save metrics to file
metrics_file_dt = "DecisionTree_metrics.csv"
metrics_df_dt.to_csv(metrics_file_dt, index=False)

print(f"Metrics for Decision Tree saved to {metrics_file_dt}")


Accuracy: 0.375
Precision: 0.65625
Recall: 0.375
F1: 0.47727272727272724
Metrics for Decision Tree saved to DecisionTree_metrics.csv


# ResNet50

In [None]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import load_img, img_to_array
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define the paths to your data folders
base_dir = '/content/drive/MyDrive/Dataset'
train_oily_dir = os.path.join(base_dir, 'oily')
train_dry_dir = os.path.join(base_dir, 'dry')
train_normal_dir = os.path.join(base_dir, 'normal')

# Collect paths to all images in each class folder
oily_images = [os.path.join(train_oily_dir, img) for img in os.listdir(train_oily_dir)]
dry_images = [os.path.join(train_dry_dir, img) for img in os.listdir(train_dry_dir)]
normal_images = [os.path.join(train_normal_dir, img) for img in os.listdir(train_normal_dir)]

# Concatenate all paths and assign labels
X_paths = oily_images + dry_images + normal_images
y = ['oily'] * len(oily_images) + ['dry'] * len(dry_images) + ['normal'] * len(normal_images)

# Convert labels to one-hot encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

# Preprocess images and convert them to arrays
X_images = [load_img(image_path, target_size=(224, 224)) for image_path in X_paths]
X_arrays = [img_to_array(img) for img in X_images]

# Convert lists to numpy arrays
X_arrays = np.array(X_arrays)

# Data augmentation for training data
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

# Load pre-trained ResNet50 model (without the top layers)
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze pre-trained layers
for layer in base_model.layers:
    layer.trainable = False

# Add custom layers for classification
x = base_model.output
x = Flatten()(x)
x = Dense(1024, activation='relu')(x)  # Experiment with the number of units and activation
x = Dropout(0.5)(x)  # Experiment with the dropout rate
predictions = Dense(num_classes, activation='softmax')(x)  # Number of units equals the number of classes, softmax activation

# Create the final model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_arrays, y_encoded, test_size=0.2, random_state=22)

# Train the model
history = model.fit(
    train_datagen.flow(X_train, y_train, batch_size=32),
    steps_per_epoch=len(X_train) // 32,
    epochs=10,
    validation_data=(X_val, y_val)
)

# Evaluate the model
y_pred = model.predict(X_val)
y_pred_labels = np.argmax(y_pred, axis=1)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred_labels)
precision = precision_score(y_val, y_pred_labels, average='weighted')
recall = recall_score(y_val, y_pred_labels, average='weighted')
f1 = f1_score(y_val, y_pred_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Store metrics in a dataframe
metrics_df = pd.DataFrame({'Accuracy': [accuracy],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1 Score': [f1]})

# Save metrics to file
metrics_file = "ResNet50_metrics.csv"
metrics_df.to_csv(metrics_file, index=False)

print(f"Metrics for ResNet50 saved to {metrics_file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.625
Precision: 0.390625
Recall: 0.625
F1 Score: 0.4807692307692308
Metrics for ResNet50 saved to ResNet50_metrics.csv


  _warn_prf(average, modifier, msg_start, len(result))


# VGG16

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from keras.preprocessing.image import load_img, img_to_array
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define the paths to your data folders
base_dir = '/content/drive/MyDrive/Dataset'
train_oily_dir = os.path.join(base_dir, 'oily')
train_dry_dir = os.path.join(base_dir, 'dry')
train_normal_dir = os.path.join(base_dir, 'normal')

# Collect paths to all images in each class folder
oily_images = [os.path.join(train_oily_dir, img) for img in os.listdir(train_oily_dir)]
dry_images = [os.path.join(train_dry_dir, img) for img in os.listdir(train_dry_dir)]
normal_images = [os.path.join(train_normal_dir, img) for img in os.listdir(train_normal_dir)]

# Concatenate all paths and assign labels
X_paths = oily_images + dry_images + normal_images
y = ['oily'] * len(oily_images) + ['dry'] * len(dry_images) + ['normal'] * len(normal_images)

# Convert labels to one-hot encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

# Preprocess images and convert them to arrays
X_images = [load_img(image_path, target_size=(224, 224)) for image_path in X_paths]
X_arrays = [img_to_array(img) for img in X_images]

# Convert lists to numpy arrays
X_arrays = np.array(X_arrays)

# Data augmentation for training data
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

# Load pre-trained VGG16 model (without the top layers)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze pre-trained layers
for layer in base_model.layers:
    layer.trainable = False

# Add custom layers for classification
x = base_model.output
x = Flatten()(x)
x = Dense(1024, activation='relu')(x)  # Experiment with the number of units and activation
x = Dropout(0.5)(x)  # Experiment with the dropout rate
predictions = Dense(num_classes, activation='softmax')(x)  # Number of units equals the number of classes, softmax activation

# Create the final model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_arrays, y_encoded, test_size=0.2, random_state=42)

# Train the model
history = model.fit(
    train_datagen.flow(X_train, y_train, batch_size=32),
    steps_per_epoch=len(X_train) // 32,
    epochs=10,
    validation_data=(X_val, y_val)
)

# Evaluate the model
y_pred = model.predict(X_val)
y_pred_labels = np.argmax(y_pred, axis=1)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred_labels)
precision = precision_score(y_val, y_pred_labels, average='weighted')
recall = recall_score(y_val, y_pred_labels, average='weighted')
f1 = f1_score(y_val, y_pred_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Store metrics in a dataframe
metrics_df = pd.DataFrame({'Accuracy': [accuracy],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1 Score': [f1]})

# Save metrics to file
metrics_file = "VGG16_metrics.csv"
metrics_df.to_csv(metrics_file, index=False)

print(f"Metrics for VGG16 saved to {metrics_file}")


Mounted at /content/drive
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.875
Precision: 0.765625
Recall: 0.875
F1 Score: 0.8166666666666667
Metrics for VGG16 saved to VGG16_metrics.csv


  _warn_prf(average, modifier, msg_start, len(result))


## EfficientNetB0

In [None]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import load_img, img_to_array
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define the paths to your data folders
base_dir = '/content/drive/MyDrive/Dataset'
train_oily_dir = os.path.join(base_dir, 'oily')
train_dry_dir = os.path.join(base_dir, 'dry')
train_normal_dir = os.path.join(base_dir, 'normal')

# Collect paths to all images in each class folder
oily_images = [os.path.join(train_oily_dir, img) for img in os.listdir(train_oily_dir)]
dry_images = [os.path.join(train_dry_dir, img) for img in os.listdir(train_dry_dir)]
normal_images = [os.path.join(train_normal_dir, img) for img in os.listdir(train_normal_dir)]

# Concatenate all paths and assign labels
X_paths = oily_images + dry_images + normal_images
y = ['oily'] * len(oily_images) + ['dry'] * len(dry_images) + ['normal'] * len(normal_images)

# Convert labels to numerical encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

# Preprocess images and convert them to arrays
X_images = [load_img(image_path, target_size=(224, 224)) for image_path in X_paths]
X_arrays = [img_to_array(img) for img in X_images]

# Convert lists to numpy arrays
X_arrays = np.array(X_arrays)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_arrays, y_encoded, test_size=0.2, random_state=5)

# Data augmentation for training data
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

# Load pre-trained EfficientNetB0 model (without the top layers)
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add custom layers for classification
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)  # Experiment with the number of units and activation
x = Dropout(0.5)(x)  # Experiment with the dropout rate
predictions = Dense(num_classes, activation='softmax')(x)  # Number of units equals the number of classes, softmax activation

# Create the final model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(
    train_datagen.flow(X_train, y_train, batch_size=32),
    steps_per_epoch=len(X_train) // 32,
    epochs=10,
    validation_data=(X_test, y_test)
)

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_labels)
precision = precision_score(y_test, y_pred_labels, average='weighted')
recall = recall_score(y_test, y_pred_labels, average='weighted')
f1 = f1_score(y_test, y_pred_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Store metrics in a dataframe
metrics_df = pd.DataFrame({'Accuracy': [accuracy],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1 Score': [f1]})

# Save metrics to file
metrics_file = "EfficientNetB0_metrics.csv"
metrics_df.to_csv(metrics_file, index=False)

print(f"Metrics for EffiNetB0 saved to {metrics_file}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




Accuracy: 0.5
Precision: 0.90625
Recall: 0.5
F1 Score: 0.5750000000000001
Metrics for EffiNetB0 saved to EfficientNetB0_metrics.csv


  _warn_prf(average, modifier, msg_start, len(result))


#Comparing to find the best accurate model

In [None]:
import pandas as pd
import os

# List of model names
model_names = ["SVM", "KNN", "DecisionTree", "VGG16", "ResNet50", "EfficientNetB0"]

# Load metrics for each model into a DataFrame
metrics_data = {}
for model_name in model_names:
    metrics_file = f"{model_name}_metrics.csv"
    metrics_data[model_name] = pd.read_csv(metrics_file)

# Get accuracy for each model
accuracies = {}
for model_name, metrics_df in metrics_data.items():
    accuracies[model_name] = metrics_df['Accuracy'].iloc[0]

# Determine the best accurate model
best_model = max(accuracies, key=accuracies.get)

# Print the best accurate model
print("Best accurate model based on the accuracy:", best_model)


Best accurate model based on the accuracy: VGG16
