In [1]:
import numpy as np
import cv2
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
path_to_data = "./dataset_stars/"
path_to_cr_data = "./dataset_stars/cropped/"

In [3]:
import os
img_dirs = []
for entry in os.scandir(path_to_data):
    if entry.is_dir():
        img_dirs.append(entry.path)

In [4]:
img_dirs


['./dataset_stars/Angelina Jolie',
 './dataset_stars/Brad Pitt',
 './dataset_stars/cropped',
 './dataset_stars/Leonardo DiCaprio',
 './dataset_stars/Lionel Messi',
 './dataset_stars/Tom Cruise']

In [5]:
import shutil
if os.path.exists(path_to_cr_data):
     shutil.rmtree(path_to_cr_data)
os.mkdir(path_to_cr_data)

In [6]:
cascades_path = cv2.data.haarcascades

face_cascade = cv2.CascadeClassifier(os.path.join(cascades_path, 'haarcascade_frontalface_default.xml'))
eye_cascade = cv2.CascadeClassifier(os.path.join(cascades_path, 'haarcascade_eye.xml'))

In [7]:
import cv2

def get_cropped_image_if_2_eyes(image_path):
    img = cv2.imread(image_path)
    
    if img is None:
        print(f"Warning: impossible de lire l'image {image_path}")
        return None
    
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    
    for (x, y, w, h) in faces:
        roi_gray = gray[y:y+h, x:x+w]
        roi_color = img[y:y+h, x:x+w]
        eyes = eye_cascade.detectMultiScale(roi_gray)
        if len(eyes) >= 2:
            return roi_color
    return None


In [8]:
import os
import cv2

cropped_image_dirs = []
celebrity_file_names_dict = {}

for img_dir in img_dirs:
    count = 1
    celebrity_name = os.path.basename(img_dir)
    print(celebrity_name)
    
    celebrity_file_names_dict[celebrity_name] = []
    
    for entry in os.scandir(img_dir):
        if entry.is_file() and entry.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            roi_color = get_cropped_image_if_2_eyes(entry.path)
            if roi_color is not None:
                cropped_folder = os.path.join(path_to_cr_data, celebrity_name)
                if not os.path.exists(cropped_folder):
                    os.makedirs(cropped_folder)
                    cropped_image_dirs.append(cropped_folder)
                    print("Generating cropped images in folder:", cropped_folder)
                
                cropped_file_name = f"{celebrity_name}{count}.png"
                cropped_file_path = os.path.join(cropped_folder, cropped_file_name)
                
                cv2.imwrite(cropped_file_path, roi_color)
                celebrity_file_names_dict[celebrity_name].append(cropped_file_path)
                count += 1


Angelina Jolie
Generating cropped images in folder: ./dataset_stars/cropped/Angelina Jolie
Brad Pitt
Generating cropped images in folder: ./dataset_stars/cropped/Brad Pitt
cropped
Leonardo DiCaprio
Generating cropped images in folder: ./dataset_stars/cropped/Leonardo DiCaprio
Lionel Messi
Generating cropped images in folder: ./dataset_stars/cropped/Lionel Messi
Tom Cruise
Generating cropped images in folder: ./dataset_stars/cropped/Tom Cruise


In [9]:
img_dirs

['./dataset_stars/Angelina Jolie',
 './dataset_stars/Brad Pitt',
 './dataset_stars/cropped',
 './dataset_stars/Leonardo DiCaprio',
 './dataset_stars/Lionel Messi',
 './dataset_stars/Tom Cruise']

In [10]:
import shutil
if os.path.exists(path_to_cr_data):
     shutil.rmtree(path_to_cr_data)
os.mkdir(path_to_cr_data)

In [11]:
cropped_image_dirs = []
celebrity_file_names_dict = {}
for img_dir in img_dirs:
    count = 1
    celebrity_name = img_dir.split('/')[-1]
    celebrity_file_names_dict[celebrity_name] = []
    for entry in os.scandir(img_dir):
        roi_color = get_cropped_image_if_2_eyes(entry.path)
        if roi_color is not None:
            cropped_folder = path_to_cr_data + celebrity_name
            if not os.path.exists(cropped_folder):
                os.makedirs(cropped_folder)
                cropped_image_dirs.append(cropped_folder)
                print("Generating cropped images in folder: ",cropped_folder)
            cropped_file_name = celebrity_name + str(count) + ".png"
            cropped_file_path = cropped_folder + "/" + cropped_file_name
            cv2.imwrite(cropped_file_path, roi_color)
            celebrity_file_names_dict[celebrity_name].append(cropped_file_path)
            count += 1

Generating cropped images in folder:  ./dataset_stars/cropped/Angelina Jolie
Generating cropped images in folder:  ./dataset_stars/cropped/Brad Pitt
Generating cropped images in folder:  ./dataset_stars/cropped/Leonardo DiCaprio
Generating cropped images in folder:  ./dataset_stars/cropped/Lionel Messi
Generating cropped images in folder:  ./dataset_stars/cropped/Tom Cruise


In [12]:
celebrity_file_names_dict = {}
for img_dir in cropped_image_dirs:
    celebrity_name = img_dir.split('/')[-1]
    file_list = []
    for entry in os.scandir(img_dir):
        file_list.append(entry.path)
    celebrity_file_names_dict[celebrity_name] = file_list
celebrity_file_names_dict

{'Angelina Jolie': ['./dataset_stars/cropped/Angelina Jolie\\Angelina Jolie1.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie10.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie11.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie12.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie13.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie14.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie15.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie16.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie17.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie18.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie19.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie2.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie20.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie21.png',
  './dataset_stars/cropped/Angelina Jolie\\Angelina Jolie22.

In [13]:
class_dict = {}
count = 0
for celebrity_name in celebrity_file_names_dict.keys():
    class_dict[celebrity_name] = count
    count = count + 1
class_dict

{'Angelina Jolie': 0,
 'Brad Pitt': 1,
 'Leonardo DiCaprio': 2,
 'Lionel Messi': 3,
 'Tom Cruise': 4}

In [14]:
import numpy as np
import pywt
import cv2    
#transformation en ondelettes discrètes (DWT)
def w2d(img, mode='haar', level=1):
    imArray = img

    imArray = cv2.cvtColor( imArray,cv2.COLOR_RGB2GRAY )
    imArray =  np.float32(imArray)   
    imArray /= 255
    coeffs=pywt.wavedec2(imArray, mode, level=level)

    coeffs_H=list(coeffs)  
    coeffs_H[0] *= 0;  

    imArray_H=pywt.waverec2(coeffs_H, mode)
    imArray_H *= 255
    imArray_H =  np.uint8(imArray_H)

    return imArray_H

In [15]:
X, y = [], []
for celebrity_name, training_files in celebrity_file_names_dict.items():
    for training_image in training_files:
        img = cv2.imread(training_image)
        scalled_raw_img = cv2.resize(img, (32, 32))
        img_har = w2d(img,'db1',5)
        scalled_img_har = cv2.resize(img_har, (32, 32))
        combined_img = np.vstack((scalled_raw_img.reshape(32*32*3,1),scalled_img_har.reshape(32*32,1)))
        X.append(combined_img)
        y.append(class_dict[celebrity_name])     

In [20]:
len(X[0])

4096

In [None]:
X = np.array(X).reshape(len(X),4096).astype(float)
X.shape

In [21]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC(kernel = 'rbf', C = 10))])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7142857142857143

In [23]:
print(classification_report(y_test, pipe.predict(X_test)))


              precision    recall  f1-score   support

           0       1.00      0.77      0.87        13
           1       0.36      0.71      0.48         7
           2       0.67      0.71      0.69        14
           3       1.00      0.62      0.77         8
           4       1.00      0.71      0.83         7

    accuracy                           0.71        49
   macro avg       0.80      0.71      0.73        49
weighted avg       0.81      0.71      0.74        49



In [24]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [25]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto',probability=True),
        'params' : {
            'svc__C': [1,10,100,1000],
            'svc__kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'randomforestclassifier__n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'logisticregression__C': [1,5,10]
        }
    }
}

In [26]:
scores = []
best_estimators = {}
import pandas as pd
for algo, mp in model_params.items():
    pipe = make_pipeline(StandardScaler(), mp['model'])
    clf =  GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': algo,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    best_estimators[algo] = clf.best_estimator_
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df



Unnamed: 0,model,best_score,best_params
0,svm,0.750246,"{'svc__C': 1, 'svc__kernel': 'linear'}"
1,random_forest,0.53399,{'randomforestclassifier__n_estimators': 10}
2,logistic_regression,0.72931,{'logisticregression__C': 1}


In [27]:
best_estimators

{'svm': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('svc',
                  SVC(C=1, gamma='auto', kernel='linear', probability=True))]),
 'random_forest': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_estimators=10))]),
 'logistic_regression': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('logisticregression',
                  LogisticRegression(C=1, multi_class='auto',
                                     solver='liblinear'))])}

In [28]:
best_estimators['svm'].score(X_test,y_test)


0.7551020408163265

In [29]:
best_estimators['random_forest'].score(X_test,y_test)


0.5510204081632653

In [30]:
best_estimators['logistic_regression'].score(X_test,y_test)

0.7755102040816326

In [31]:
best_clf = best_estimators['svm']


In [32]:
import joblib 
# Save the model as a pickle in a file 
joblib.dump(best_clf, 'saved_model.pkl') 

['saved_model.pkl']

In [33]:
import json
with open("class_dictionary.json","w") as f:
    f.write(json.dumps(class_dict))