<a href="https://colab.research.google.com/github/ShedovaNastya/Face-Recognition-Project/blob/main/Test_of_ML_algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Тест классических алгоритмов машиннного обучения

In [1]:
import torch
import os
from IPython.display import clear_output
import numpy as np
from PIL import Image
import torchvision.transforms as T
import matplotlib.pyplot as plt
from torchvision.models import list_models, get_model
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/celebA_train_500.zip
clear_output()

#Датасет

In [None]:
class FaceDataset(torch.utils.data.Dataset):

    def __init__(self, images_folder, label_folder, sample_folder, type_dataset, transform):
      """
      image_folder: путь до папки с изображениями
      label_folder: путь до файла, где какой человек находится
      sample_folder: путь до файла к какому датасету относится конкретное изображение
      type_dataset: датасет, который нас сейчас интересует
      transform: преобразование изображения
      """
      self.images_folder = images_folder
      self.label_folder = label_folder
      self.sample_folder = sample_folder
      self.type_dataset = type_dataset
      self.transform = transform


      with open(self.label_folder, 'r') as file: #человек
        self.labels = {}
        for i in file.readlines():
          key, value = i.split()
          self.labels[key] = int(value)

      with open(self.sample_folder, 'r') as file: #классификация датасетов
        self.samples = {}
        for i in file.readlines():
          key, value = i.split()
          self.samples[key] = int(value)

      self.files = [i for i in os.listdir(self.images_folder) if self.samples.get(i, 5) == self.type_dataset]

    def __len__(self):
      return len(self.files)

    def __getitem__(self, idx):
      image_name = self.images_folder + self.files[idx]
      image = np.array(Image.open(image_name))
      image = Image.fromarray(image[77:-41,45:-50])
      label = int(self.labels[self.files[idx]])
      image = self.transform(image)
      return image, label




In [None]:
train_dataset = FaceDataset('/content/celebA_train_500/celebA_imgs/',
                            '/content/celebA_train_500/celebA_anno.txt',
                            '/content/celebA_train_500/celebA_train_split.txt',
                            0,
                            T.Compose([
                                T.Resize((224, 224)), T.ToTensor()
                            ]))
val_dataset = FaceDataset('/content/celebA_train_500/celebA_imgs/',
                            '/content/celebA_train_500/celebA_anno.txt',
                            '/content/celebA_train_500/celebA_train_split.txt',
                            1,
                            T.Compose([
                                T.Resize((224, 224)), T.ToTensor()
                            ]))
test_dataset = FaceDataset('/content/celebA_train_500/celebA_imgs/',
                            '/content/celebA_train_500/celebA_anno.txt',
                            '/content/celebA_train_500/celebA_train_split.txt',
                            2,
                            T.Compose([
                                T.Resize((224, 224)), T.ToTensor()
                            ]))

In [None]:
train_loader = DataLoader(train_dataset, batch_size = 30, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 30, shuffle = False)
test_loader = DataLoader(test_dataset, batch_size = 30, shuffle = False)

# Модель

In [None]:
# Модель для оценки
m1 = get_model("efficientnet_b0", weights="IMAGENET1K_V1")
m1.classifier[1] = torch.nn.Linear(in_features=1280, out_features=500, bias=True)
device = 'cpu'
m1 = m1.to(device)


filename = '/content/drive/MyDrive/eff_net_param_m1.pth'
m1.load_state_dict(torch.load(filename, map_location=device))

m1.classifier = m1.classifier[:1]

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 41.3MB/s]


In [None]:
def compute_embeddings(model, loader):#функция считает выход ембеддингового слоя
  '''
  compute embeddings from the trained model for list of images.
  params:
    model: trained nn model that takes images and outputs embeddings
    images_list: list of images paths to compute embeddings for
  output:
    list: list of model embeddings. Each embedding corresponds to images
          names from images_list
  '''
  out_data = []
  out_labels = []
  device = "cuda:0"
  model.to(device)
  model.eval()
  with torch.no_grad():
    for data, labels in tqdm(loader):
      data = data.to(device)
      labels = labels.to(device)

      out_data.append(model(data))
      out_labels.append(labels)
  return torch.cat(out_data), torch.cat(out_labels)


In [None]:
train_em, train_lab = compute_embeddings(m1, train_loader)
val_em, val_lab = compute_embeddings(m1, val_loader)
test_em, test_lab = compute_embeddings(m1, test_loader)




embeddings_m1 = {"train":{
    "data": train_em.cpu().numpy(),
    "label": train_lab.cpu().numpy()
},
 "val": {
     "data": val_em.cpu().numpy(),
     "label": val_lab.cpu().numpy()
 },
 "test": {
     "data": test_em.cpu().numpy(),
     "label": test_lab.cpu().numpy()
 }}

import pickle
with open('embeddings_m1.pkl', 'wb') as file:

    # A new file will be created
    pickle.dump(embeddings_m1, file)

In [1]:
import pickle
with open('/content/drive/MyDrive/embeddings_m1.pkl', 'rb') as file:
    # Десериализуем данные из файла
     embeddings_m1= pickle.load(file)
embeddings_m1

{'train': {'data': array([[-0.1953766 ,  2.7521744 ,  0.09171401, ...,  0.27497578,
           0.23698662,  0.04447447],
         [ 0.2088342 , -0.0933115 ,  2.2184422 , ...,  0.01036019,
          -0.11764833, -0.0998802 ],
         [-0.11963991,  1.1833105 , -0.05651244, ...,  0.11861803,
           2.501749  ,  0.09346844],
         ...,
         [ 0.38081095,  0.3446979 ,  0.60753024, ...,  0.0797212 ,
          -0.04109422, -0.08527862],
         [-0.0832733 ,  0.53367627,  0.724545  , ...,  0.01339865,
           0.04168406,  0.15918595],
         [ 0.26264298,  4.1949553 , -0.14428589, ...,  3.961145  ,
           0.21232411,  0.35450253]], dtype=float32),
  'label': array([ 22, 367, 309, ..., 313, 450, 339])},
 'val': {'data': array([[-0.12083704,  0.9092367 ,  0.03618243, ..., -0.12314168,
          -0.09892447, -0.15674733],
         [ 0.02604393,  0.660122  ,  0.00466253, ...,  0.15259533,
          -0.11874288,  0.6394765 ],
         [-0.17250489,  1.1275271 ,  1.4078394 , 

# KNeighborsClassifier 0,7412

Classifier implementing the k-nearest neighbors vote.
Parameters:

**1. n_neighborsint**, default=5
Number of neighbors to use by default for kneighbors queries.



**2. weights**{‘uniform’, ‘distance’}, callable or None, default=’uniform’
Weight function used in prediction. Possible values:

‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.

‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.

[callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.

Refer to the example entitled Nearest Neighbors Classification showing the impact of the weights parameter on the decision boundary.



**3. algorithm**{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, default=’auto’
Algorithm used to compute the nearest neighbors:

‘ball_tree’ will use BallTree

‘kd_tree’ will use KDTree

‘brute’ will use a brute-force search.

‘auto’ will attempt to decide the most appropriate algorithm based on the values passed to fit method.

Note: fitting on sparse input will override the setting of this parameter, using brute force.



**4. leaf_sizeint**, default=30
Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

model = KNeighborsClassifier(n_neighbors = 3)

model.fit(embeddings_m1['train']['data'], embeddings_m1['train']['label'])
predictions = model.predict(embeddings_m1['val']['data'])
print(accuracy_score(embeddings_m1['val']['label'], predictions))

0.715122470713525


In [None]:
acc = 0.72
n_neigh = [3,7,9,12,15,20]
weights = ['uniform', 'distance']
algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_size= [25,30,35,40]
for neighbors in n_neigh:
  for weight in weights:
      for algo in algorithms:
        for size in leaf_size:
            model = KNeighborsClassifier(n_neighbors = neighbors, weights = weight, algorithm = algo, leaf_size = size)
            model.fit(embeddings_m1['train']['data'], embeddings_m1['train']['label'])
            predictions = model.predict(embeddings_m1['val']['data'])
            accuracy = accuracy_score(embeddings_m1['val']['label'], predictions)
            if accuracy > acc:
              acc = accuracy
              print(neighbors, weight, algo, size, accuracy)



3 distance auto 25 0.7369542066027689
7 distance auto 25 0.7401490947816827
9 distance auto 25 0.7412140575079872


#Linear Support Vector Classification 0,7966 THE BEST

Linear Support Vector Classification.

Similar to SVC with parameter kernel=’linear’, but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

The main differences between LinearSVC and SVC lie in the loss function used by default, and in the handling of intercept regularization between those two implementations.

This class supports both dense and sparse input and the multiclass support is handled according to a one-vs-the-rest scheme.

Read more in the User Guide.

Parameters:
1. penalty{‘l1’, ‘l2’}, default=’l2’
Specifies the norm used in the penalization. The ‘l2’ penalty is the standard used in SVC. The ‘l1’ leads to coef_ vectors that are sparse.

2. loss{‘hinge’, ‘squared_hinge’}, default=’squared_hinge’
Specifies the loss function. ‘hinge’ is the standard SVM loss (used e.g. by the SVC class) while ‘squared_hinge’ is the square of the hinge loss. The combination of penalty='l1' and loss='hinge' is not supported.

3. dual“auto” or bool, default=”auto”
Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. dual="auto" will choose the value of the parameter automatically, based on the values of n_samples, n_features, loss, multi_class and penalty. If n_samples < n_features and optimizer supports chosen loss, multi_class and penalty, then dual will be set to True, otherwise it will be set to False.

4. tol float, default=1e-4
Tolerance for stopping criteria.

5. Cfloat, default=1.0
Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see Scaling the regularization parameter for SVCs.

In [8]:
from  sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

model = LinearSVC(penalty='l2', loss='squared_hinge')

model.fit(embeddings_m1['train']['data'], embeddings_m1['train']['label'])
predictions = model.predict(embeddings_m1['val']['data'])
print(accuracy_score(embeddings_m1['val']['label'], predictions))

0.7955271565495208


In [4]:
from  sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
acc = 0
tols = [0.0001, 0.001]
multi_classes = ['ovr', 'crammer_singer']
for t in tols:
  for mc in multi_classes:
        model = LinearSVC(tol = t, multi_class = mc)
        # Обучение модели
        model.fit(embeddings_m1['train']['data'], embeddings_m1['train']['label'])

        # Получение предсказаний
        predictions = model.predict(embeddings_m1['val']['data'])
        accuracy = accuracy_score(embeddings_m1['val']['label'], predictions)
        if acc< accuracy:
          acc = accuracy
          print(t,mc, accuracy)

0.0001 ovr 0.7955271565495208




0.0001 crammer_singer 0.7965921192758253




#BernoulliNB 0.76

In [2]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

model = BernoulliNB(alpha=1.0, force_alpha=True)

model.fit(embeddings_m1['train']['data'], embeddings_m1['train']['label'])
predictions = model.predict(embeddings_m1['val']['data'])
print(accuracy_score(embeddings_m1['val']['label'], predictions))

0.7646432374866879


#GaussianNB 0,*7*

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

model = GaussianNB(priors=None, var_smoothing=1e-09)

model.fit(embeddings_m1['train']['data'], embeddings_m1['train']['label'])
predictions = model.predict(embeddings_m1['val']['data'])
print(accuracy_score(embeddings_m1['val']['label'], predictions))

0.6996805111821086


# RandomForestClassifier 0.67

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(embeddings_m1['train']['data'], embeddings_m1['train']['label'])
predictions = model.predict(embeddings_m1['val']['data'])
print(accuracy_score(embeddings_m1['val']['label'], predictions))

0.6741214057507987


#TEST

Лучше всего себя показала модель Linear Support Vector Classification ~0.8. Посмотрим ее результат на тестовой выборке

In [11]:
from  sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

model = LinearSVC(tol = 0.001, multi_class = 'crammer_singer')

model.fit(embeddings_m1['train']['data'], embeddings_m1['train']['label'])
predictions = model.predict(embeddings_m1['test']['data'])
print(accuracy_score(embeddings_m1['test']['label'], predictions))

0.7872876022655758




RESULT = 0.7872