In [292]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [293]:
########### 2a Personen mit >70 Bildern #############
import os, tarfile
from urllib.request import urlretrieve
if not os.path.isfile('lfw-funneled.tgz'):
    print("Downloading")
    urlretrieve('http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz',filename = 'lfw-funneled.tgz')
    
# Pfad zur heruntergeladenen tgz-Datei
tgz_file = 'lfw-funneled.tgz'

# Zielverzeichnis für die Extraktion
extracted_directory = 'lfw_funneled'

# Extrahiere den Datensatz aus der tgz-Datei
if not os.path.exists(extracted_directory):
    with tarfile.open(tgz_file, 'r:gz') as tar:
        print("Extracting...")
        tar.extractall(os.getcwd())
else: 
    print("Verzeichnis gefunden.")

Verzeichnis gefunden.


In [294]:
# Funktion zum Durchsuchen des Verzeichnisses und Ermitteln der Personen mit mindestens 70 Bildern
def find_persons_with_min_images(dataset_directory, min_images=70):
    persons = {}
    
    for root, dirs, files in os.walk(dataset_directory):
        if len(files) >= min_images:
            person_name = os.path.basename(root)
            persons[person_name] = len(files)

    return {k: v for k, v in persons.items() if v >= min_images}

persons = find_persons_with_min_images(extracted_directory, min_images=70)

# Ausgabe der gefundenen Personen
for person, num_images in persons.items():
    print(f'{person}: {num_images} Bilder')

Ariel_Sharon: 77 Bilder
Colin_Powell: 236 Bilder
Donald_Rumsfeld: 121 Bilder
George_W_Bush: 530 Bilder
Gerhard_Schroeder: 109 Bilder
Hugo_Chavez: 71 Bilder
Tony_Blair: 144 Bilder


In [295]:
########### 2b #############
from skimage import io, transform
lfw_path = os.path.join(os.getcwd(), extracted_directory)
    
all_images = []

for person in persons.keys():
    person_folder = os.path.join(lfw_path, person)
    
    for i, img_name in enumerate(os.listdir(person_folder)):
        img_path = os.path.join(person_folder, img_name)
        img = io.imread(img_path, as_gray=True)
        
        # Bildmitte berechnen
        height, width = img.shape[:2]
        center_x, center_y = width // 2, height // 2
        
        # Ausschnittsgröße festlegen
        crop_size = 96
        
        # Berechne den Ausschnitt
        crop_x1 = center_x - (crop_size // 2)
        crop_x2 = center_x + (crop_size // 2)
        crop_y1 = center_y - (crop_size // 2)
        crop_y2 = center_y + (crop_size // 2)
        
        # Den Ausschnitt ausschneiden
        img = img[crop_y1:crop_y2, crop_x1:crop_x2]
        img = transform.resize(img, (32, 32))
        img = img.flatten()
        
        # Save the test picture
        if person == "George_W_Bush":
            img = np.insert(img, 0, 1.0)
        else:
            img = np.insert(img, 0, -1.0)
            
        all_images.append(img)

print("Anzahl Trainingsbilder: ", len(all_images))

Anzahl Trainingsbilder:  1288


In [296]:
import random

# Randomize and then seperate
random.shuffle(all_images)

n_train = int(len(all_images) * 0.6)
train_data = all_images[:n_train]
test_data = all_images[n_train:]

print(len(train_data), "+", len(test_data), "=", len(train_data) + len(test_data), "<=>", len(all_images))

772 + 516 = 1288 <=> 1288


In [297]:
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

print("Shape Design Matrix: ", df_train.shape)
df_train.head()

Shape Design Matrix:  (772, 1025)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
0,-1.0,0.100032,0.12214,0.235866,0.495822,0.531897,0.620258,0.639195,0.660642,0.680389,...,0.521128,0.552548,0.545123,0.535695,0.529428,0.514658,0.377849,0.423171,0.20205,0.044814
1,-1.0,0.526127,0.555201,0.599809,0.624002,0.646758,0.697131,0.750642,0.773939,0.796575,...,0.64562,0.644525,0.671319,0.676742,0.627658,0.640175,0.628907,0.427381,0.250436,0.262493
2,1.0,0.199988,0.285798,0.30917,0.359026,0.384156,0.412723,0.424768,0.43302,0.429738,...,0.482564,0.487408,0.443085,0.402738,0.341691,0.318077,0.290796,0.241243,0.139942,0.216996
3,1.0,0.240129,0.278447,0.319972,0.37478,0.389146,0.454788,0.484395,0.517182,0.541093,...,0.602813,0.592357,0.582687,0.520182,0.456056,0.518334,0.468444,0.365002,0.334314,0.407794
4,1.0,0.329508,0.349696,0.408826,0.599581,0.681647,0.70152,0.727874,0.75212,0.757378,...,0.566222,0.56464,0.52353,0.471782,0.406601,0.390157,0.412212,0.42884,0.417741,0.405379


In [298]:
 # Get labels back to perform pca
design_labels = df_train.iloc[:, 0]
design_matrix = df_train.iloc[:, 1:]
test_labels = df_test.iloc[:, 0]
test_matrix = df_test.iloc[:, 1:]

print("design_labels shape: ", design_labels.shape)
print("design_matrix shape: ", design_matrix.shape)
print("test_labels shape: ", test_labels.shape)
print("test_matrix shape: ", test_matrix.shape)

design_labels shape:  (772,)
design_matrix shape:  (772, 1024)
test_labels shape:  (516,)
test_matrix shape:  (516, 1024)


In [299]:
###### PCA
def pca(matrix):
    
    # Zentrierung
    matrix = (matrix - matrix.mean()) / matrix.std()
    
    # SVD
    U, D, V = np.linalg.svd(matrix)
    eigenvectors = V
    print("Eigenvectores shape: ", eigenvectors.shape)
    eigenfaces = eigenvectors[:7]
    print("Eigenfaces shape: ", eigenfaces.shape)
    
    # Transform the data to the first 7 eigenfaces
    scores = pd.DataFrame(np.matmul(matrix, eigenfaces.T))
    return scores

train_scores = pca(design_matrix)
test_scores = pca(test_matrix)

# # Zentrierung
# design_matrix = (design_matrix - design_matrix.mean()) / design_matrix.std()
# test_matrix = (test_matrix - test_matrix.mean()) / test_matrix.std()
# 
# # SVD
# U, D, V = np.linalg.svd(design_matrix)
# eigenvectors = V
# print("Eigenvectores shape: ", eigenvectors.shape)
# eigenfaces = eigenvectors[:7]
# print("Eigenfaces shape: ", eigenfaces.shape)
#     
# # Transform the data to the first 7 eigenfaces
# train_scores = pd.DataFrame(np.matmul(design_matrix, eigenfaces.T))
# print(train_scores.head())
# test_scores = pd.DataFrame(np.matmul(test_matrix, eigenfaces.T))
# print(test_scores.head())

Eigenvectores shape:  (1024, 1024)
Eigenfaces shape:  (7, 1024)
Eigenvectores shape:  (1024, 1024)
Eigenfaces shape:  (7, 1024)


In [300]:
### Bayes Classificator
bush = {
    "mean" : [],
    "variance" : []
}

not_bush = {
    "mean" : [],
    "variance" : []
}

train_scores["labels"] = design_labels
bush_scores = train_scores.loc[train_scores["labels"] == 1.0]
not_bush_scores = train_scores.loc[train_scores["labels"] == -1.0]

# Remove label from scores
train_scores = train_scores.drop(columns=["labels"])
bush_scores = bush_scores.drop(columns=["labels"])
not_bush_scores = not_bush_scores.drop(columns=["labels"])

for entry in bush_scores:
    bush["mean"].append(bush_scores[entry].mean())
    bush["variance"].append(bush_scores[entry].var())

for entry in not_bush_scores:
    not_bush["mean"].append(not_bush_scores[entry].mean())
    not_bush["variance"].append(not_bush_scores[entry].var())

bush = pd.DataFrame(bush)
not_bush = pd.DataFrame(not_bush)
print(bush)

       mean    variance
0  0.505394  275.553188
1  0.520161  192.619237
2 -1.999460   64.030502
3 -2.109745   60.431983
4 -0.457527   33.938921
5 -2.109519   27.283246
6  0.417372   23.475302


In [301]:
# Compute the A-Priori of train daata
p_is_bush = design_labels.value_counts().get(1.0, 0) / len(design_labels)
p_not_bush = 1 - p_is_bush

In [302]:
import math

def check_bayes_classificator(scores, labels):
    # Calculate the Gaussian probability values for each feature and class
    true_positive = 0
    false_positive = 0
    true_negativ = 0
    false_negativ = 0
    dates = 0
    for i, score in scores.iterrows():
    
        dates += 1
        likehoods = {
            "P_bush" : [],
            "P_not" : []
        }
        
        for j, value in enumerate(score.values):
    
            mean = bush["mean"][j]
            not_mean = not_bush["mean"][j]
    
            variance = bush["variance"][j]
            not_variance = not_bush["variance"][j]
    
            likehoods["P_not"].append((1 / (math.sqrt(2 * math.pi * not_variance))) * math.exp(-(value - not_mean)**2 / (2 * not_variance)))
            likehoods["P_bush"].append((1 / (math.sqrt(2 * math.pi * variance))) * math.exp(-(value - mean)**2 / (2 * variance)))
        
        # Now we got the pdfs P(x | c), probability of feature x under class c
        # Now look at decision
        prd_bush = 1
        for like in likehoods["P_bush"]:
            prd_bush *= (like * p_is_bush)
    
        prd_not_bush = 1
        for like in likehoods["P_not"]:
            prd_not_bush *= (like * p_not_bush)
        
        # Make the decision
        # print(f"{round(prd_bush,15)} ?= {round(prd_not_bush,15)}")
        if prd_bush > prd_not_bush:
            if labels[i] == 1.0:
                true_positive += 1
            else:
                false_positive += 1
            # print(f"Got Bush! Real value: {test_labels[i]}")
        else:
            if labels[i] == 1.0:
                false_negativ += 1
            else:
                true_negativ += 1
            # print(f"NOT Bush! Real value: {test_labels[i]}")
    
    print(f"""
    True Positive:\t{true_positive/dates}
    False Positive:\t{false_positive/dates}
    False Negativ:\t{false_negativ/dates}
    True Negativ:\t{true_negativ/dates}
    """)

In [303]:
# Check for test data
print("\nChecking results on TRAINING data:")
check_bayes_classificator(train_scores, design_labels)
print("\nChecking results on independent TEST data:")
check_bayes_classificator(test_scores, test_labels)


Checking results on TRAINING data:

    True Positive:	0.009067357512953367
    False Positive:	0.006476683937823834
    False Negativ:	0.41321243523316065
    True Negativ:	0.5712435233160622
    

Checking results on independent TEST data:

    True Positive:	0.007751937984496124
    False Positive:	0.005813953488372093
    False Negativ:	0.3875968992248062
    True Negativ:	0.5988372093023255
    
