In [183]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [184]:
########### 2a Personen mit >70 Bildern #############
import os, tarfile
from urllib.request import urlretrieve
if not os.path.isfile('lfw-funneled.tgz'):
    print("Downloading")
    urlretrieve('http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz',filename = 'lfw-funneled.tgz')
    
# Pfad zur heruntergeladenen tgz-Datei
tgz_file = 'lfw-funneled.tgz'

# Zielverzeichnis für die Extraktion
extracted_directory = 'lfw_funneled'

# Extrahiere den Datensatz aus der tgz-Datei
if not os.path.exists(extracted_directory):
    with tarfile.open(tgz_file, 'r:gz') as tar:
        print("Extracting...")
        tar.extractall(os.getcwd())
else: 
    print("Verzeichnis gefunden.")

Verzeichnis gefunden.


In [185]:
# Funktion zum Durchsuchen des Verzeichnisses und Ermitteln der Personen mit mindestens 70 Bildern
def find_persons_with_min_images(dataset_directory, min_images=70):
    persons = {}
    
    for root, dirs, files in os.walk(dataset_directory):
        if len(files) >= min_images:
            person_name = os.path.basename(root)
            persons[person_name] = len(files)

    return {k: v for k, v in persons.items() if v >= min_images}

persons = find_persons_with_min_images(extracted_directory, min_images=70)

# Ausgabe der gefundenen Personen
for person, num_images in persons.items():
    print(f'{person}: {num_images} Bilder')

Ariel_Sharon: 77 Bilder
Colin_Powell: 236 Bilder
Donald_Rumsfeld: 121 Bilder
George_W_Bush: 530 Bilder
Gerhard_Schroeder: 109 Bilder
Hugo_Chavez: 71 Bilder
Tony_Blair: 144 Bilder


In [186]:
########### 2b #############
from skimage import io, transform
lfw_path = os.path.join(os.getcwd(), extracted_directory)
    
images = []
labels = []

for person in persons.keys():
    person_folder = os.path.join(lfw_path, person)
    
    for i, img_name in enumerate(os.listdir(person_folder)):
        img_path = os.path.join(person_folder, img_name)
        img = io.imread(img_path, as_gray=True)
        
        # Bildmitte berechnen
        height, width = img.shape[:2]
        center_x, center_y = width // 2, height // 2
        
        # Ausschnittsgröße festlegen
        crop_size = 96
        
        # Berechne den Ausschnitt
        crop_x1 = center_x - (crop_size // 2)
        crop_x2 = center_x + (crop_size // 2)
        crop_y1 = center_y - (crop_size // 2)
        crop_y2 = center_y + (crop_size // 2)
        
        # Den Ausschnitt ausschneiden
        img = img[crop_y1:crop_y2, crop_x1:crop_x2]
        img = transform.resize(img, (32, 32))
        img = img.flatten()
        
        # Save the test picture
        if person == "George_W_Bush":
            labels.append(1.0)
        else:
            labels.append(-1.0)
            
        images.append(img)

print("Anzahl Trainingsbilder: ", len(images))

Anzahl Trainingsbilder:  1288


In [187]:
from sklearn.model_selection import train_test_split

X = np.array(images)
y = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=True)

print(len(X_train), "+", len(X_test), "=", len(X_train) + len(X_test), "<=>", len(images))

772 + 516 = 1288 <=> 1288


In [188]:
df_X_train = pd.DataFrame(X_train)
df_X_test = pd.DataFrame(X_test)
df_y_train = pd.DataFrame(y_train)
df_y_test = pd.DataFrame(y_test)

print("Shape Design Matrix: ", df_X_train.shape)
df_X_train.head()

Shape Design Matrix:  (772, 1024)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.109214,0.200172,0.381743,0.489545,0.53646,0.560711,0.575888,0.600522,0.608648,0.611475,...,0.522324,0.528886,0.553327,0.530423,0.49496,0.476641,0.472823,0.410509,0.34478,0.373695
1,0.58293,0.688128,0.779734,0.805643,0.805638,0.795169,0.770843,0.75123,0.744945,0.747546,...,0.567832,0.579502,0.605916,0.63946,0.619121,0.595679,0.592407,0.503467,0.400279,0.313158
2,0.125745,0.305503,0.37934,0.51346,0.643709,0.704966,0.771127,0.809545,0.822585,0.779196,...,0.389338,0.349493,0.341828,0.353841,0.349461,0.29781,0.246619,0.267566,0.240269,0.218483
3,0.459664,0.465377,0.540511,0.612394,0.684091,0.677095,0.689201,0.725177,0.770056,0.800829,...,0.389256,0.397796,0.4359,0.434809,0.471418,0.50819,0.506922,0.46202,0.405478,0.360806
4,0.518882,0.472868,0.432625,0.458356,0.485482,0.514903,0.536523,0.544992,0.569523,0.58865,...,0.469462,0.484506,0.508066,0.512761,0.510656,0.490752,0.479971,0.429558,0.379284,0.461528


In [189]:
###### PCA
def pca(matrix):
    
    # Zentrierung
    matrix = (matrix - matrix.mean()) / matrix.std()
    
    # SVD
    U, D, V = np.linalg.svd(matrix)
    eigenvectors = V
    print("\tEigenvectores shape: ", eigenvectors.shape)
    eigenfaces = eigenvectors[:7]
    print("\tEigenfaces shape: ", eigenfaces.shape)
    
    # Transform the data to the first 7 eigenfaces
    scores = pd.DataFrame(np.matmul(matrix, eigenfaces.T))
    return scores
   
# Transform the data to the first 7 eigenfaces
print("Train PCA:")
design_matrix = df_X_train
df_train_scores = pca(design_matrix)
# print(train_scores.head())
print("\tShape Train Scores: ", df_train_scores.shape)

print("\nTest PCA:")
test_matrix = df_X_test
df_test_scores = pca(test_matrix)
# print(test_scores.head())
print("\tShape Test Scores: ", df_test_scores.shape)

Train PCA:
	Eigenvectores shape:  (1024, 1024)
	Eigenfaces shape:  (7, 1024)
	Shape Train Scores:  (772, 7)

Test PCA:
	Eigenvectores shape:  (1024, 1024)
	Eigenfaces shape:  (7, 1024)
	Shape Test Scores:  (516, 7)


In [190]:
### Bayes Classificator
bush_stats = {
    "mean" : [],
    "variance" : []
}

not_bush_stats = {
    "mean" : [],
    "variance" : []
}

bush_scores = df_train_scores[df_y_train.iloc[:, 0] == 1.0]
not_bush_scores = df_train_scores[df_y_train.iloc[:,0] == -1.0]

for col in bush_scores:
    bush_stats["mean"].append(bush_scores[col].mean())
    bush_stats["variance"].append(bush_scores[col].var())

for col in not_bush_scores:
    not_bush_stats["mean"].append(not_bush_scores[col].mean())
    not_bush_stats["variance"].append(not_bush_scores[col].var())

bush_stats = pd.DataFrame(bush_stats)
not_bush_stats = pd.DataFrame(not_bush_stats)
print("Bush")
print(bush_stats)
print("\nNot Bush")
print(not_bush_stats)

Bush
       mean    variance
0  0.977832  280.662676
1 -0.991274  194.361878
2 -2.470285   51.723610
3 -1.257515   55.267613
4  0.856717   33.037959
5 -1.963028   26.550370
6  0.058712   25.862817

Not Bush
       mean    variance
0 -0.681259  290.264574
1  0.690624  144.782873
2  1.721056   64.264975
3  0.876115   52.193841
4 -0.596878   30.789435
5  1.367648   29.750331
6 -0.040905   24.598042


In [191]:
# Compute the A-Priori of train daata
prior_p_is_bush = df_y_train.value_counts().get(1.0, 0) / len(df_y_train)
prior_p_not_bush = 1.0 - prior_p_is_bush

In [192]:
import math

def predict_with_bayes_classificator(scores):
    # Calculate the Gaussian probability values for each feature and class
    y_pred = []
    
    for i, score in scores.iterrows():
        likehoods = {
            "P_bush" : [],
            "P_not" : []
        }
        
        for j, value in enumerate(score.values):
    
            mean = bush_stats["mean"][j]
            not_mean = not_bush_stats["mean"][j]
    
            var = bush_stats["variance"][j]
            not_var = not_bush_stats["variance"][j]
    
            likehoods["P_not"].append((1 / (math.sqrt(2 * math.pi * not_var))) * math.exp(-(value - not_mean)**2 / (2 * not_var)))
            likehoods["P_bush"].append((1 / (math.sqrt(2 * math.pi * var))) * math.exp(-(value - mean)**2 / (2 * var)))
        
        # Now we got the pdfs P(x | c), probability of feature x under class c
        # Now look at decision
        post_p_bush = prior_p_is_bush
        for like in likehoods["P_bush"]:
            post_p_bush *= like 
    
        post_p_not_bush = prior_p_not_bush
        for like in likehoods["P_not"]:
            post_p_not_bush *= like
        
        # Make the decision
        if post_p_bush > post_p_not_bush:
            y_pred.append(1.0)
        else:
            y_pred.append(-1.0)
    
    return y_pred
                

def evaluate_predictions(y_pred, y_true):
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    
    for (y_p, y_t) in zip(y_pred, y_true):
        if y_p == 1.0 and y_t == 1.0:
                true_positive += 1
        elif y_p == 1.0 and y_t == -1.0:
            false_positive += 1
        elif y_p == -1.0 and y_t == -1.0:
            true_negative += 1
        elif y_p == -1.0 and y_t == 1.0:
            false_negative += 1
    
    n_data = len(y_pred)
    print(f"""
    Accurancy: \t{np.sum(y_pred == y_true) / n_data}
    True Positive:\t{true_positive / n_data}
    False Positive:\t{false_positive / n_data}
    True Negativ:\t{true_negative / n_data}
    False Negativ:\t{false_negative / n_data}
    
    """)

In [193]:
# Check for test data
print("\nChecking results on TRAINING data:")
y_train_pred = predict_with_bayes_classificator(df_train_scores)
evaluate_predictions(y_train_pred, y_train)

print("\nChecking results on independent TEST data:")
y_test_pred = predict_with_bayes_classificator(df_test_scores)
evaluate_predictions(y_test_pred, y_test)


Checking results on TRAINING data:

    Accurancy: 	0.711139896373057
    True Positive:	0.22279792746113988
    False Positive:	0.10103626943005181
    True Negativ:	0.4883419689119171
    False Negativ:	0.1878238341968912
    
    

Checking results on independent TEST data:

    Accurancy: 	0.625968992248062
    True Positive:	0.19186046511627908
    False Positive:	0.15310077519379844
    True Negativ:	0.43410852713178294
    False Negativ:	0.22093023255813954
    
    
