In [1]:
import pandas as pd
import numpy as np
import os

from time import time
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, roc_curve, auc
from sklearn.svm import SVC, LinearSVC
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from skimage.feature import hog

In [3]:
NAMES_PATH = "archive/lfw_allnames.csv"
IMAGES_PATH = "archive/lfw-deepfunneled/lfw-deepfunneled"

In [4]:
df_names = pd.read_csv(NAMES_PATH)
min_faces_per_person = 80
df_names = df_names.loc[df_names["images"] >= min_faces_per_person, :]
names = list(df_names["name"])
names

['Colin_Powell',
 'Donald_Rumsfeld',
 'George_W_Bush',
 'Gerhard_Schroeder',
 'Tony_Blair']

In [5]:
X = []
Y = []
for name in names:
    dir_path = os.path.join(IMAGES_PATH, name)
    list_images_name = os.listdir(dir_path)
    for image_name in list_images_name:
        image_path = os.path.join(dir_path, image_name)
        img_rgb = plt.imread(image_path)
        X.append(img_rgb)
        Y.append(name)
    print(f"Class: {name}, number of samples: {len(list_images_name)}.")
X = np.asarray(X)
Y = np.asarray(Y)

print(X.shape, Y.shape)

Class: Colin_Powell, number of samples: 236.
Class: Donald_Rumsfeld, number of samples: 121.
Class: George_W_Bush, number of samples: 530.
Class: Gerhard_Schroeder, number of samples: 109.
Class: Tony_Blair, number of samples: 144.
(1140, 250, 250, 3) (1140,)


In [6]:
import numpy as np
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image

def create_features_cnn(img, show_cnn=False):
    # Flatten image
    color_features = img.flatten()
    
    # If the image is grayscale, add a third channel
    if len(img.shape) == 2:
        img = np.stack((img,) * 3, axis=-1)
    
    # Preprocess the image for VGG16 model
    img_resized = image.array_to_img(img, scale=False)
    img_resized = img_resized.resize((224, 224))  # Assuming VGG16 input shape
    img_array = image.img_to_array(img_resized)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)

    # Load pre-trained VGG16 model
    model = VGG16(weights='imagenet', include_top=False)
    
    # Extract features using VGG16 model
    cnn_features = model.predict(img_array)
    cnn_features = cnn_features.flatten()

    if show_cnn:
        plt.imshow(img_array[0])
        plt.axis('off')
        plt.title('Original Image')
        plt.show()

    # Combine color and CNN features into a single array
    flat_features = np.hstack((color_features, cnn_features))
    return flat_features


In [10]:
feature_matrix1 = []
for x in X:  # Iterate over each image in X
    features = create_features_cnn(x)  # Extract features using the create_features_cnn function
    feature_matrix1.append(features)  # Append the extracted features to the feature_matrix1 list


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 556ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 421ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 461ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 428ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 414ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 450ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 512ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 580ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 693ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 616ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 781ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 848ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

MemoryError: Unable to allocate 9.00 MiB for an array with shape (3, 3, 512, 512) and data type float32

In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Scale the data
ss = StandardScaler()
X_pca_scaled = ss.fit_transform(feature_matrix1)

# Perform PCA without specifying the number of components
pca = PCA()
pca.fit(X_pca_scaled)

# Plot explained variance ratio
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance Ratio')
plt.grid(True)
plt.show()

# Determine the optimal number of components
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
optimal_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1

print("Optimal number of components for 95% variance:", optimal_components)

# Now use the optimal number of components in PCA
pca = PCA(n_components=optimal_components)
X_pca_transformed = pca.fit_transform(X_pca_scaled)
print('PCA matrix shape is:', X_pca_transformed.shape)


NameError: name 'feature_matrix1' is not defined

In [63]:
 
label_encoder = LabelEncoder() 
  
# Encode labels in column 'species'. 
Y = label_encoder.fit_transform(Y) 

In [64]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, random_state=42)

In [68]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Instantiate SVM Classifier
svm_classifier = SVC(random_state=42)

# Train the model
svm_classifier.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred_svm = svm_classifier.predict(X_test)

# Evaluate accuracy
accuracy_svm = accuracy_score(Y_test, Y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

SVM Accuracy: 0.8175438596491228


In [75]:
parameters = {'C': [0.1, 1, 10],
             'gamma': [1e-3, 1e-6, 1e-9],
             'kernel': ['rbf', 'linear']}
grid_search = GridSearchCV(svm_classifier, parameters, n_jobs=-1, cv=5)

In [70]:
t0 = time()
grid_search.fit(X_train, Y_train)
print("grid_search in %0.fs" % (time() - t0))

grid_search in 4s


In [71]:
print("The best mean accuracy: ", grid_search.best_score_)

The best mean accuracy:  0.9286549707602341


In [72]:
pip install imbalanced-learn


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform Grid Search Cross Validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model with the best parameters
rf_classifier = RandomForestClassifier(random_state=42, **best_params)
rf_classifier.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred = rf_classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.5087719298245614


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Perform Grid Search Cross Validation
grid_search = GridSearchCV(gb_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model with the best parameters
gb_classifier = GradientBoostingClassifier(random_state=42, **best_params)
gb_classifier.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred_gb = gb_classifier.predict(X_test)

# Evaluate accuracy
accuracy_gb = accuracy_score(Y_test, Y_pred_gb)
print("Gradient Boosting Classifier Accuracy:", accuracy_gb)
