In [290]:
from PIL import Image
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd


def load_object(path: str, size: tuple = (28, 28)):
    """Load an image file as a flat grayscale pixel vector.

    The image is converted to 8-bit grayscale (mode 'L'), resized to ``size``
    and flattened, so every successfully loaded image yields a vector of the
    same length (``size[0] * size[1]``).

    Args:
        path: Location of the image file.
        size: Target ``(width, height)`` in pixels. Defaults to 28x28.

    Returns:
        A 1-D numpy array of pixel values, or ``None`` if the file could not
        be opened or decoded (the error is printed, not raised).
    """
    try:
        # The context manager closes the underlying file handle once the
        # pixel data has been copied into the numpy array.
        with Image.open(path) as image:
            # Image.resize returns a NEW image; the result must be kept,
            # otherwise the original, un-resized image would be flattened.
            resized = image.convert('L').resize(size)
            return np.array(resized).reshape(-1)
    except Exception as e:
        # Best-effort loader: callers treat None as "skip this file".
        print(f"load_object exception: {e}")
        return None


def display_object(path: str):
    """Print ``path`` and draw the image as ``load_object`` sees it.

    ``load_object`` returns a flattened pixel vector, so it is reshaped back
    into a square image before plotting. Passing the 1-D vector straight to
    an image constructor would render a one-pixel-wide strip instead of the
    picture.

    Args:
        path: Location of the image file.

    Returns:
        None. Nothing is drawn when the image cannot be loaded or its pixel
        count is not a perfect square.
    """
    print(path)
    pixels = load_object(path)
    if pixels is None:
        # load_object has already printed the reason for the failure.
        return

    side = int(round(np.sqrt(pixels.size)))
    if side * side != pixels.size:
        print(f"display_object: cannot reshape {pixels.size} pixels into a square image")
        return

    # Fixed 0..255 range keeps brightness comparable between images.
    plt.imshow(pixels.reshape(side, side), cmap='gray', vmin=0, vmax=255)


def display_first_objects(path: str):
    """Show one sample image from every sub-directory of ``path``.

    For each directory directly under ``path`` the first entry returned by
    ``os.listdir`` is displayed (listing order is whatever the OS returns).
    Entries of ``path`` that are not directories are ignored. A figure is
    shown for every sub-directory, even one with no entries.
    """
    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    for dirpath in candidates:
        if os.path.isdir(dirpath):
            entries = os.listdir(dirpath)
            if entries:
                # Only the first listed file is used as the sample.
                display_object(os.path.join(dirpath, entries[0]))

            plt.axis("off")
            plt.show()

In [None]:
# Preview the first listed image of every class folder in the dataset.
display_first_objects('data/letters_small')

In [291]:
# Column name(s) holding the class label of each image.
TARGET_LABELS = ['target']
# Non-pixel feature column(s); 'path' records the source file of each row.
FEATURE_LABELS = ['path']
# Pixel column names ('data_pixel_<i>'); empty until load_data() fills it in.
PIXEL_FEATURE_LABELS = []


def load_data(path: str):
    """Load every image under ``path`` into one DataFrame.

    ``path`` is expected to contain one sub-directory per class; the
    directory name becomes the row's ``target``. Non-directories, non-files,
    empty files and images that ``load_object`` fails to load are skipped.

    Side effect: the module-level ``PIXEL_FEATURE_LABELS`` is replaced with
    the generated pixel column names (``data_pixel_0`` ...).

    Args:
        path: Root directory of the dataset.

    Returns:
        A DataFrame with the ``TARGET_LABELS`` and ``FEATURE_LABELS`` columns
        followed by one column per pixel. When nothing could be loaded, an
        empty frame with only the label/feature columns is returned.

    Raises:
        ValueError: from ``np.stack`` if the loaded pixel vectors do not all
            have the same length.
    """
    global PIXEL_FEATURE_LABELS

    rows = []
    for dirname in os.listdir(path):
        dirpath = os.path.join(path, dirname)
        if not os.path.isdir(dirpath):
            continue

        for filename in os.listdir(dirpath):
            filepath = os.path.join(dirpath, filename)
            if not os.path.isfile(filepath) or os.path.getsize(filepath) == 0:
                continue

            pixels = load_object(filepath)
            if pixels is None:
                # Dropping failed loads here keeps the frame's index contiguous,
                # so the label rows and pixel rows below stay aligned.
                continue
            rows.append({'target': dirname, 'data': pixels, 'path': filepath})

    if not rows:
        PIXEL_FEATURE_LABELS = []
        return pd.DataFrame(columns=TARGET_LABELS + FEATURE_LABELS)

    frame = pd.DataFrame(rows)
    flattened_data = np.stack(frame['data'].values)
    PIXEL_FEATURE_LABELS = [f'data_pixel_{i}' for i in range(flattened_data.shape[1])]
    # pd.concat(axis=1) aligns on the index, so the pixel frame must share the
    # label frame's index; otherwise labels get paired with the wrong pixels.
    flattened_df = pd.DataFrame(flattened_data, columns=PIXEL_FEATURE_LABELS, index=frame.index)
    return pd.concat([frame[TARGET_LABELS + FEATURE_LABELS], flattened_df], axis=1)

In [292]:
# Task 2: load the whole dataset into one DataFrame
# (load_data also fills in the global PIXEL_FEATURE_LABELS).
df = load_data("data/letters_small")

In [293]:
from sklearn.model_selection import train_test_split


def split_data(data, train_size=0.6, val_size=0.2, test_size=0.2, random_state=42):
    """Split ``data`` into train, validation and test parts.

    The test part is carved off first; the remainder is then divided into
    train and validation so that the overall proportions match the requested
    fractions.

    Args:
        data: DataFrame containing the ``FEATURE_LABELS``,
            ``PIXEL_FEATURE_LABELS`` and ``TARGET_LABELS`` columns.
        train_size: Fraction of rows for training.
        val_size: Fraction of rows for validation.
        test_size: Fraction of rows for testing.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        AssertionError: if the three fractions do not sum to 1.
    """
    # Compare with a tolerance: sums of decimal fractions are rarely exact in
    # binary floating point (0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999).
    total = train_size + val_size + test_size
    assert np.isclose(total, 1.0), "The sum of train_size, val_size, and test_size must be 1"

    X_tv, X_test, y_tv, y_test = train_test_split(
        data[FEATURE_LABELS + PIXEL_FEATURE_LABELS],
        data[TARGET_LABELS],
        test_size=test_size,
        random_state=random_state,
    )

    # The second split works on the train+val remainder, so the validation
    # fraction has to be rescaled relative to that remainder.
    val_size_relative = val_size / (train_size + val_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_tv, y_tv, test_size=val_size_relative, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [294]:
# Task 3: split the loaded frame into train / validation / test parts
# using split_data's default fractions.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df)

# TODO: Task 4 -- unclear whether/where it is covered; revisit.

In [296]:
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression


# One-vs-one multiclass wrapper around logistic regression, fitted on the
# pixel columns only (the 'path' column of X_train is not a model input).
# y_train is a one-column DataFrame, so .values.ravel() flattens it to 1-D.
ovo_clf = OneVsOneClassifier(
    LogisticRegression(random_state=42, max_iter=1_000), n_jobs=-1
).fit(X_train[PIXEL_FEATURE_LABELS], y_train.values.ravel())

In [297]:
# Accuracy is printed for the train, validation and test splits, in that order.
for split_X, split_y in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    split_predictions = ovo_clf.predict(split_X[PIXEL_FEATURE_LABELS])
    print(f"Accuracy score: {accuracy_score(split_y, split_predictions)}")

# Side-by-side view of each validation image path and its predicted letter;
# the prediction frame reuses X_val's index so the rows line up in concat.
table = ovo_clf.predict(X_val[PIXEL_FEATURE_LABELS])
table_df = pd.DataFrame(
    table,
    index=X_val.index,
    columns=['Predictions'],
)
pd.concat([X_val[FEATURE_LABELS], table_df], axis=1)

Accuracy score: 0.9998219690226099
Accuracy score: 0.8696929238985314
Accuracy score: 0.8643524699599466


Unnamed: 0,path,Predictions
13812,data/letters_small/D/QmV0dGVyS2FtcCBCb2xkSXRhb...,D
16618,data/letters_small/E/Q29tc2F0LU5hdnktVW5pdC5vd...,E
6653,data/letters_small/F/RXF1aXBvaXplU2Fucy1SZWd1b...,F
17468,data/letters_small/B/QWJpbGVuZUZMRi5vdGY=.png,D
7210,data/letters_small/F/RmVuaWNlRXhwZXJ0QlEtUmVnd...,F
...,...,...
18542,data/letters_small/B/Q2dZZWFyYm9va0ZpbGxlci50d...,B
5095,data/letters_small/A/Qm9uZWhlYWQudHRm.png,A
163,data/letters_small/I/RWNsaXB0aWMgQlJLLnR0Zg==.png,I
258,data/letters_small/I/Qml4bGVlQ25kLUhlYXZ5LnR0Z...,I


In [299]:
def predict_table(x_test):
    """Build a table comparing predictions with the known targets.

    Args:
        x_test: DataFrame holding the ``PIXEL_FEATURE_LABELS``,
            ``FEATURE_LABELS`` and ``TARGET_LABELS`` columns.

    Returns:
        A DataFrame indexed like ``x_test`` with the feature column(s), a
        ``Predictions`` column from the global ``ovo_clf``, and the target
        column(s), in that order.
    """
    predicted = pd.DataFrame(
        ovo_clf.predict(x_test[PIXEL_FEATURE_LABELS]),
        index=x_test.index,  # keep rows aligned with x_test during concat
        columns=['Predictions'],
    )
    pieces = [x_test[FEATURE_LABELS], predicted, x_test[TARGET_LABELS]]
    return pd.concat(pieces, axis=1)

def load_test_object(path: str, target: str):
    """Load one labelled test image into a row dict.

    Args:
        path: Location of the image file.
        target: Expected label for the image.

    Returns:
        ``{'target': target, 'data': <pixel vector or None>, 'path': path}``.
        ``data`` is ``None`` when ``load_object`` could not load the file.
    """
    arr = load_object(path)
    if arr is None:
        # load_object signals failure with None; reading .shape on it would
        # raise AttributeError and abort the whole batch.
        print(f"Array size: None (failed to load {path})")
    else:
        print(f"Array size: {arr.shape}")
    return {'target': target, 'data': arr, 'path': path}


def df_from_rows(rows):
    """Turn row dicts into a frame with one column per pixel.

    Args:
        rows: Iterable of dicts with ``'target'``, ``'data'`` (1-D pixel
            array, or ``None`` for a failed load) and ``'path'`` keys.

    Returns:
        A DataFrame with the ``TARGET_LABELS`` and ``FEATURE_LABELS`` columns
        followed by ``data_pixel_<i>`` columns. Rows containing ``None`` are
        dropped; surviving rows keep their position in ``rows`` as index.
        When no usable row remains, an empty frame with only the
        label/feature columns is returned.

    Raises:
        ValueError: from ``np.stack`` if the pixel arrays differ in shape.
    """
    frame = pd.DataFrame(rows).dropna()
    if frame.empty:
        print("Shapes: set()")
        return pd.DataFrame(columns=TARGET_LABELS + FEATURE_LABELS)

    shapes = {arr.shape for arr in frame['data'].values}
    print(f"Shapes: {shapes}")
    flattened_data = np.stack(frame['data'].values)
    # Local name on purpose: this helper does not modify the module-level
    # PIXEL_FEATURE_LABELS list.
    pixel_labels = [f'data_pixel_{i}' for i in range(flattened_data.shape[1])]
    # dropna() can leave gaps in frame.index, and pd.concat(axis=1) aligns on
    # the index, so the pixel frame must reuse frame.index to stay paired
    # with the right target/path.
    flattened_df = pd.DataFrame(flattened_data, columns=pixel_labels, index=frame.index)
    result = pd.concat([frame[TARGET_LABELS + FEATURE_LABELS], flattened_df], axis=1)
    return result.dropna()

# Extra images under data/test, one per letter, labelled with the lowercase
# letter taken from the file name.
# NOTE(review): there is no entry for 'e' -- confirm that is intentional.
# NOTE(review): targets here are lowercase while the recorded predictions are
# uppercase, so the two columns cannot be compared with a plain equality.
paths = [
    {'path': f'data/test/{letter}.png', 'target': letter}
    for letter in 'abcdfghij'
]
rows = [load_test_object(**entry) for entry in paths]
test_frame = df_from_rows(rows)

predict_table(test_frame)

Array size: (784,)
Array size: (784,)
Array size: (784,)
Array size: (784,)
Array size: (784,)
Array size: (784,)
Array size: (784,)
Array size: (784,)
Array size: (784,)
Shapes: {(784,)}


Unnamed: 0,path,Predictions,target
0,data/test/a.png,A,a
1,data/test/b.png,I,b
2,data/test/c.png,E,c
3,data/test/d.png,J,d
4,data/test/f.png,F,f
5,data/test/g.png,A,g
6,data/test/h.png,H,h
7,data/test/i.png,D,i
8,data/test/j.png,H,j
