# Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from skimage.filters.rank import entropy
from skimage.morphology import disk
from skimage.filters import gaussian, sobel
from skimage.feature import canny

import os
import shutil

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

import pickle

In [None]:
# Mount Google Drive at /content/drive so the dataset and output folders used below are reachable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset Preparation

In [None]:
# Folder on the mounted Drive that holds the augmented face images.
dataset_path = '/content/drive/MyDrive/PBL/dataset/augmented_dataset/'

# List all files.
# os.listdir() returns the entries in arbitrary order, so the listing is sorted:
# the seeded shuffle and train/test split further down are only reproducible
# when they start from the same ordering on every run.
dataset_image_names = sorted(os.listdir(dataset_path))

In [None]:
# Defining a function to return the class label (0 to 10) of the age-range that an age falls into.
def class_labels(age):
    """Map an age to its age-range class label.

    Inclusive ranges and their labels:
        0-5 -> 0, 6-10 -> 1, 11-15 -> 2, 16-20 -> 3, 21-30 -> 4, 31-40 -> 5,
        41-50 -> 6, 51-60 -> 7, 61-70 -> 8, 71-80 -> 9, 81-110 -> 10.

    Any value that matches none of the ranges (e.g. above 110, negative, or a
    non-integer that falls between two ranges) also gets label 10.
    """
    age_ranges = (
        (0, 5), (6, 10), (11, 15), (16, 20), (21, 30), (31, 40),
        (41, 50), (51, 60), (61, 70), (71, 80), (81, 110),
    )

    # The position of the matching range in the table is the class label.
    for label, (lower, upper) in enumerate(age_ranges):
        if lower <= age <= upper:
            return label

    # Fallback for values outside every range above.
    return 10

In [None]:
# Creating a new dataframe to hold all filenames, corresponding ages and class labels.

master_df = pd.DataFrame()
master_df['filename'] = dataset_image_names
# The age is the first "_"-separated token of the filename (e.g. "071_3952_original.png" -> 71).
master_df['age'] = master_df['filename'].map(lambda img_name : np.uint8(img_name.split("_")[0]))
master_df['target'] = master_df['age'].map(class_labels)

# Preview only the first few rows instead of dumping thousands of rows into the cell output.
master_df.head()

Unnamed: 0,filename,age,target
0,071_3952_original.png,71,9
1,071_3952_aug_3331_0.png,71,9
2,071_3952_aug_3331_1.png,71,9
3,071_3952_aug_3331_2.png,71,9
4,071_942_original.png,71,9
...,...,...,...
4995,042_6496_aug_687_2.png,42,6
4996,042_6280_original.png,42,6
4997,042_6280_aug_688_0.png,42,6
4998,042_6280_aug_688_1.png,42,6


In [None]:
# Shuffling the rows of master_df with a fixed seed so that the images are no longer ordered as they were listed,
# then resetting the index so the rows are numbered 0..n-1 again.

master_df = shuffle(master_df, random_state=42).reset_index(drop=True)
master_df.head()

Unnamed: 0,filename,age,target
0,039_3719_aug_477_2.png,39,5
1,014_3488_aug_1712_2.png,14,2
2,019_2802_aug_55_2.png,19,3
3,041_2164_aug_427_2.png,41,6
4,074_9740_aug_3296_1.png,74,9


# Split Data

In [None]:
# Defining the filenames and ages from above master_df as X, and target as y for splitting into train and test datasets later.
# X carries no image data yet - only the filename (used later to load each image) and the age.

X = master_df[['filename', 'age']]
y = master_df['target']

In [None]:
# Splitting the dataset into training and testing datasets with test_size=0.3 and stratify=y.
# stratify=y keeps the class proportions the same in both splits; random_state=42 fixes the split.
# NOTE(review): the split is made per file. Filenames such as "071_3952_original.png" and
# "071_3952_aug_3331_0.png" suggest that augmented variants share a source image, in which case
# variants of the same face can land in both train and test and inflate the test score.
# TODO: confirm the naming scheme and consider splitting by source-image id instead.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [None]:
# Row/column counts of the train and test splits.
X_train.shape, X_test.shape

((3850, 2), (1650, 2))

In [None]:
# Share of each target class in the training split.
y_train.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
9,0.090909
5,0.090909
10,0.090909
4,0.090909
3,0.090909
0,0.090909
8,0.090909
7,0.090909
2,0.090909
6,0.090909


In [None]:
# Share of each target class in the testing split.
y_test.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
8,0.090909
5,0.090909
0,0.090909
2,0.090909
1,0.090909
7,0.090909
4,0.090909
3,0.090909
6,0.090909
9,0.090909


In [None]:
# Creating copies of X and y (both train and test) from above to create a dataframe of filepaths to all images and their target labels.
# These dataframes will be used in the deep learning models later to create dataset input pipelines using the TensorFlow tf.data.Dataset API.

# temp_X_train = X_train.copy()
# temp_X_train['target'] = y_train

# temp_X_test = X_test.copy()
# temp_X_test['target'] = y_test

In [None]:
# Defining a function to append the filepath to each image name as a string.

# dataset_path = "/content/content/face_age"

# def append_path_to_filename(filename):
#     return os.path.join(dataset_path, filename)

In [None]:
# Mapping the above created function on both dataframes created above.

# temp_X_train['filename'] = temp_X_train['filename'].map(append_path_to_filename)
# temp_X_test['filename'] = temp_X_test['filename'].map(append_path_to_filename)

In [None]:
# Create the directory if it doesn't exist
os.makedirs("/content/drive/My Drive/PBL/input_output", exist_ok=True)

# Save filename, age and target label of each split as CSV.
# X_train / X_test only hold filename and age, so the target column is attached here
# (aligned on the shared row index); otherwise the "..._labels_..." files would carry no labels.
X_train.assign(target=y_train).to_csv("/content/drive/My Drive/PBL/input_output/images_filenames_labels_train.csv", index=False)
X_test.assign(target=y_test).to_csv("/content/drive/My Drive/PBL/input_output/images_filenames_labels_test.csv", index=False)

# Feature Extraction

In [None]:
# Defining a function to break-down an image into square sections (10x10 pixels by default),
# and calculate the mean and stdev of each section.
# Function INPUT: An image array (200x200 pixels in this notebook).
# Function OUTPUT: Features array [mean_1, std_1, mean_2, std_2, ...] - 800 values for 400 sections.

def features_grid(img, section_size=10):
    """Summarise an image by the mean and standard deviation of its grid sections.

    The image is walked row by row in steps of ``section_size`` pixels. Sections at
    the right/bottom border are simply smaller when the image size is not a multiple
    of ``section_size``.

    Parameters
    ----------
    img : numpy.ndarray
        Image array; the first two axes are treated as rows and columns.
    section_size : int, default 10
        Edge length of each square section in pixels.

    Returns
    -------
    numpy.ndarray
        1-D float64 array with two values (mean, stdev) per section, in
        row-major section order. Empty if the image has no pixels.
    """
    # Collect in a Python list and convert once at the end: growing an array with
    # np.append copies the whole array on every call.
    features = []

    for y in range(0, img.shape[0], section_size):
        for x in range(0, img.shape[1], section_size):

            # Cropping the image into a section.
            section_img = img[y:y+section_size, x:x+section_size]

            # Calculating the mean and stdev of the sectioned image.
            features.append(np.mean(section_img))
            features.append(np.std(section_img))

    # Returning the features array.
    return np.array(features, dtype='float64')

In [None]:
# Defining a function to loop through images in the dataset and extract the canny edges mean and stdev values from 10x10 pixel sections of each image.

def extract_canny_edges(filename_series):
    """Build the canny-edge feature matrix for a collection of image filenames.

    Each image is read from ``dataset_path``, converted to grayscale, resized to
    200x200 pixels, passed through a canny edge filter (sigma=0.9) and summarised
    with ``features_grid``. The age parsed from the filename prefix is appended
    as the last column.

    Parameters
    ----------
    filename_series : sized iterable of str
        Filenames relative to ``dataset_path``; each must start with "<age>_".

    Returns
    -------
    numpy.ndarray
        One row per image: 400 section means and 400 section stdevs (interleaved)
        followed by the age, i.e. 801 columns. Shape (0, 801) when no filenames
        are given.

    Raises
    ------
    FileNotFoundError
        If an image cannot be read from disk.
    """
    # One 1-D feature row per image; stacked once at the end instead of re-copying
    # a growing 2-D array with np.append on every iteration.
    rows = []

    for progress_counter, img_name in enumerate(filename_series, start=1):

        # Defining a path to the image and reading in the coloured image.
        img_path = os.path.join(dataset_path, img_name)
        img = cv2.imread(img_path)

        # cv2.imread signals an unreadable/missing file by returning None rather than raising.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")

        # NOTE(review): cv2.imread already returns BGR, so this swap turns the data into RGB and
        # the BGR2GRAY call below then weights the red and blue channels the wrong way round.
        # Kept as-is so the features stay identical to the ones the saved models were trained on;
        # dropping this line would give the standard grayscale conversion - confirm before changing.
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # Converting the coloured image to a grayscale image.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Resizing the grayscale image to 200x200 pixels.
        img = cv2.resize(img, (200, 200))

        # Converting the grayscale image to a canny edges filtered image.
        img = canny(img, sigma=0.9)

        # Extracting the mean and stdev values of all 10x10 pixel sections of the edge image.
        img_features = features_grid(img)

        # Adding the actual age value (from the image name) as the last entry of the row.
        age = np.uint8(img_name.split("_")[0])
        rows.append(np.append(img_features, age))

        # Keeping track of progress and printing relevant statements for the user.
        if progress_counter % 1000 == 0:
            print(f"Images processed for features extraction: {progress_counter} of {len(filename_series)}")

    # No images: return an empty matrix with the usual 801 columns.
    if not rows:
        return np.zeros((0, 801), dtype='uint8')

    return np.vstack(rows)

In [None]:
# Extracting the canny edge features from images in the training dataset.
# The result has one row per filename in X_train.

train_imgs = extract_canny_edges(X_train['filename'])

Images processed for features extraction: 1000 of 3850
Images processed for features extraction: 2000 of 3850
Images processed for features extraction: 3000 of 3850


In [None]:
# Exporting the above created training features array as a .npy file for use in the model later.
# Each row holds the section features of one image followed by its age.

with open("/content/drive/My Drive/PBL/input_output/canny_features_age_train.npy", "wb") as f:
    np.save(f, train_imgs, allow_pickle=True)

In [None]:
# Extracting the canny edge features from images in the testing dataset.
# The result has one row per filename in X_test.

test_imgs = extract_canny_edges(X_test['filename'])

Images processed for features extraction: 1000 of 1650


In [None]:
# Exporting the above created testing features array as a .npy file for use in the model later.
# Each row holds the section features of one image followed by its age.

with open("/content/drive/My Drive/PBL/input_output/canny_features_age_test.npy", "wb") as f:
    np.save(f, test_imgs, allow_pickle=True)

In [None]:
# Creating a list of columns names for the features arrays defined above.
# The column names correspond to the sectioned image's mean and stdev values.
# Last column is the age to be converted to target class label in the model later.

# A 200x200 image cut into 10x10 sections yields 20 * 20 = 400 sections, numbered from 1
# in the same row-by-row order in which features_grid emits them (mean first, then stdev).
# Built with a comprehension so that no module-level `x` / `y` loop variables are created:
# a `for y in ...` loop in this cell would overwrite the target Series `y` from the Split Data section.
n_sections = (200 // 10) * (200 // 10)

feature_names = [
    f"sec{section}_{stat}"
    for section in range(1, n_sections + 1)
    for stat in ("mean", "std")
]

feature_names.append('age')

In [None]:
# Exporting the above created list of feature names as a CSV file for use in the model later.
# The file has a single column, headed 'canny_edge_features', with one feature name per row.

pd.Series(feature_names).to_csv("/content/drive/My Drive/PBL/input_output/canny_features_names.csv", index=False, header=['canny_edge_features'])

# Model Prep

In [None]:
# Importing the numpy arrays of train and test datasets.
# These are the same files that were written by the two np.save cells in the Feature Extraction section.

train = np.load("/content/drive/My Drive/PBL/input_output/canny_features_age_train.npy")
test = np.load("/content/drive/My Drive/PBL/input_output/canny_features_age_test.npy")

In [None]:
# Wrapping the feature arrays in dataframes; feature_names supplies the column labels
# (the section mean/std names followed by 'age').
train_df = pd.DataFrame(train, columns=feature_names)
test_df = pd.DataFrame(test, columns=feature_names)

In [None]:
# Casting the age column to an unsigned 8-bit integer before mapping it to class labels.
# Assumes every age fits into 0-255 - TODO confirm for new datasets.
train_df['age'] = train_df['age'].astype(np.uint8)
test_df['age'] = test_df['age'].astype(np.uint8)

In [None]:
# Creating a column of target class values by mapping each age through the class_labels function defined earlier.

train_df['target'] = train_df['age'].map(class_labels)
test_df['target'] = test_df['age'].map(class_labels)

In [None]:
# Splitting the above train and test dataframes into features (X) and target (y).
# 'age' is dropped together with 'target' because the target is derived directly from it.
# NOTE: from here on X_train / X_test / y_train / y_test hold the canny features and no longer the
# filename frames from the "Split Data" section, so earlier cells that read X_train['filename']
# cannot be re-run after this cell without re-running the split first.

X_train = train_df.drop(columns=['age', 'target'])
y_train = train_df['target']

X_test = test_df.drop(columns=['age', 'target'])
y_test = test_df['target']

In [None]:
# Scaling X_train to the standard scale.
# The scaler is fitted on the training features only.

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)

In [None]:
# Transforming X_test to the same scale.
# Only transform() is used here, so the test data does not influence the scaler that was fitted on X_train.

X_test_sc = ss.transform(X_test)

# Classification (SVC)

In [None]:
# Creating a SVC object.
# Only random_state is set here; the commented-out arguments are left for the grid search below to choose.

svc = SVC(# class_weight='balanced',
          # C=1.0,
          # kernel='rbf',
          # degree=3,
          random_state=42
         )

In [None]:
# Establishing ranges of hyperparameters of SVC for GridSearchCV.
# NOTE: 'gamma' only affects the 'rbf' and 'poly' kernels; the 'linear' kernel ignores it,
# so the linear candidates are evaluated once per gamma value with identical results.
# The GridSearchCV object itself (svc_gs) is created once, in the next cell.

svc_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1, 10],
    'class_weight': ['balanced', None]
}


In [None]:
# Creating a GridSearchCV object for the SVC object defined above.
# cv=5 runs 5-fold cross-validation per parameter combination; n_jobs=-1 uses all available CPU cores.

svc_gs = GridSearchCV(svc, param_grid=svc_params, n_jobs=-1, cv=5)

In [None]:
# Fitting X_train_sc and y_train on GridSearchCV object with SVC defined above.
# The SVC is trained on the standardized features (X_train_sc), not on the raw X_train.

svc_gs.fit(X_train_sc, y_train)

In [None]:
# Best combination of hyperparameters found by GridSearchCV.

svc_gs.best_params_

{'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}

In [None]:
# Best mean cross-validated accuracy score obtained by the above combination of hyperparameters.

svc_gs.best_score_

0.5031168831168832

In [None]:
# Scoring the model on training dataset.
# Training Accuracy (measured on the same data the model was fitted on).

svc_train_acc = svc_gs.score(X_train_sc, y_train)
svc_train_acc

0.9997402597402597

In [None]:
# Actual Testing Accuracy (measured on the standardized test features).

svc_test_acc = svc_gs.score(X_test_sc, y_test)
svc_test_acc

0.5563636363636364

In [None]:
# Summary scores from GridSearchCV with SVC, each rounded to 3 decimals.

print("SVC summary of accuracy scores:")
print(f"GridSearchCV best accuracy = {round(svc_gs.best_score_, 3)}")
print("\nUsing GridSearchCV best params suggested,")
print(f"Training accuracy = {round(svc_train_acc, 3)}")
# Disabled: svc_est_test_acc is not computed anywhere in this notebook.
# print(f"Est. Test accuracy (cv=5) = {round(svc_est_test_acc , 3)}")
print(f"Testing accuracy = {round(svc_test_acc, 3)}")

SVC summary of accuracy scores:
GridSearchCV best accuracy = 0.503

Using GridSearchCV best params suggested,
Training accuracy = 1.0
Testing accuracy = 0.556


In [None]:
# Saving the SVC model from above in a pickle file for possible use later.

# Build the output filename (with the .pkl extension), tagged with the rounded test accuracy.
svc_pickle = f"/content/drive/My Drive/PBL/input_output/svc_canny_model_acc_{round(svc_test_acc, 3)}.pkl"

# Write the fitted GridSearchCV object to the .pkl file.
with open(svc_pickle, 'wb') as file:
    pickle.dump(svc_gs, file)

# The SVC was trained on standardized features, so the fitted StandardScaler is saved next to it;
# without it, new images cannot be transformed the same way before calling predict().
svc_scaler_pickle = "/content/drive/My Drive/PBL/input_output/svc_canny_scaler.pkl"
with open(svc_scaler_pickle, 'wb') as file:
    pickle.dump(ss, file)

# Classification (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Hyperparameter ranges searched for the Random Forest classifier.
rf_params = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [None, 5, 10, 20],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # Minimum samples per leaf
    'max_features': ['sqrt', 'log2'], # Number of features considered per split
    'class_weight': [None, 'balanced'] # Handling of class imbalance
}

In [None]:
# Base Random Forest with a fixed seed, wrapped in a grid search over rf_params.
rf_model = RandomForestClassifier(random_state=42)

grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    cv=5,                 # 5-fold cross-validation
    scoring='accuracy',   # Use accuracy as the metric
    verbose=1,            # Print the tuning progress
    n_jobs=-1             # Use all CPU cores
)

In [None]:
# Running the grid search on the unscaled feature frame (X_train), unlike the SVC which used X_train_sc.
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


In [None]:
# Best parameters
print("Best Parameters:", grid_search_rf.best_params_)

# Best model
best_rf_model = grid_search_rf.best_estimator_

# Evaluate accuracy on the test data
y_pred = best_rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Classification report
print(classification_report(y_test, y_pred))

Best Parameters: {'class_weight': None, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Test Accuracy: 0.49272727272727274
              precision    recall  f1-score   support

           0       0.54      0.65      0.59       150
           1       0.46      0.46      0.46       150
           2       0.63      0.39      0.48       150
           3       0.57      0.51      0.54       150
           4       0.41      0.54      0.47       150
           5       0.44      0.43      0.43       150
           6       0.45      0.45      0.45       150
           7       0.55      0.42      0.48       150
           8       0.43      0.41      0.42       150
           9       0.48      0.41      0.45       150
          10       0.53      0.75      0.62       150

    accuracy                           0.49      1650
   macro avg       0.50      0.49      0.49      1650
weighted avg       0.50      0.49      0.49      1650



In [None]:
# Saving the Random Forest model from above in a pickle file for possible use later.

# Build the output filename (with the .pkl extension), tagged with the rounded test accuracy.
rf_pickle = f"/content/drive/My Drive/PBL/input_output/rf_canny_model_acc_{round(test_accuracy, 3)}.pkl"

# Save the model to the .pkl file.
# Only the best estimator is stored here, not the whole GridSearchCV object.
with open(rf_pickle, 'wb') as file:
    pickle.dump(grid_search_rf.best_estimator_, file)

# Classification (XGBoost)

In [None]:
from xgboost import XGBClassifier

# Create the XGBoost model (fixed hyperparameters, no grid search)
xgb_model = XGBClassifier(n_estimators=100, max_depth=5, random_state=42)

# Train the model on the unscaled features
xgb_model.fit(X_train, y_train)

# Evaluate the model
y_pred = xgb_model.predict(X_test)
xgb_test_accuracy = accuracy_score(y_test, y_pred)  # Accuracy of the XGBoost model
print("Accuracy XGBoost:", xgb_test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy XGBoost: 0.5139393939393939
              precision    recall  f1-score   support

           0       0.60      0.61      0.60       150
           1       0.50      0.49      0.50       150
           2       0.55      0.47      0.50       150
           3       0.54      0.51      0.52       150
           4       0.43      0.55      0.49       150
           5       0.45      0.46      0.45       150
           6       0.46      0.49      0.47       150
           7       0.56      0.41      0.48       150
           8       0.49      0.49      0.49       150
           9       0.52      0.48      0.50       150
          10       0.59      0.69      0.64       150

    accuracy                           0.51      1650
   macro avg       0.52      0.51      0.51      1650
weighted avg       0.52      0.51      0.51      1650



In [None]:
# Saving the XGBoost model from above in a pickle file for possible use later.
# The filename is tagged with the rounded test accuracy.

xgb_pickle = f"/content/drive/My Drive/PBL/input_output/xgboost_canny_model_acc_{round(xgb_test_accuracy, 3)}.pkl"
with open(xgb_pickle, 'wb') as file:
    pickle.dump(xgb_model, file)