In [1]:
import os
from pathlib import Path
import glob
import struct
import pandas as pd

In [2]:
# Root folder that holds the three unpacked "nws_fixed_*" archives.
# The default is the original author's local path; set the NWS_DATA_ROOT
# environment variable to run the notebook on another machine without
# editing this cell.
DATA_ROOT = Path(os.environ.get("NWS_DATA_ROOT", r"C:\Cambridge Datasets\archive 2"))

# Each archive is located at <root>/<name>/<name> (a folder containing a
# folder of the same name).
ARCHIVE_NAMES = [
    "nws_fixed_00001-01000",
    "nws_fixed_01001-02000",
    "nws_fixed_02001-03000",
]

data_path_list = [DATA_ROOT / name / name for name in ARCHIVE_NAMES]

# Individual names kept for backward compatibility with other cells.
data_path_1, data_path_2, data_path_3 = data_path_list

In [3]:
def get_data_path(data_path_list):
    """Collect weight-file paths and metadata frames, one pair per folder.

    Every directory in ``data_path_list`` is scanned for sub-folders. A
    sub-folder contributes to the result only if it contains
    ``weights/020.bin``; its ``meta.csv`` is then read with
    ``pandas.read_csv``. Because both lists are appended to in the same
    step, ``image_list[i]`` and ``meta_data_list[i]`` always describe the
    same folder and the two lists always have equal length.

    Args:
        data_path_list: iterable of directories (``Path`` or ``str``).

    Returns:
        tuple ``(image_list, meta_data_list)`` where ``image_list`` is a list
        of ``str`` paths to the ``020.bin`` files and ``meta_data_list`` is a
        list of DataFrames read from the matching ``meta.csv`` files.

    Raises:
        FileNotFoundError: if a folder has ``weights/020.bin`` but no
            ``meta.csv`` (raised by ``pandas.read_csv``).
    """
    image_list = []
    meta_data_list = []
    for data_path in data_path_list:
        data_path = Path(data_path)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order (and therefore any seeded split) reproducible.
        for name in sorted(os.listdir(data_path)):
            folder = data_path / name
            weights_file = folder / "weights" / "020.bin"
            # Entries without the weights file (stray files, incomplete
            # folders) are skipped entirely so no unmatched metadata is kept.
            if not weights_file.is_file():
                continue
            image_list.append(str(weights_file))
            meta_data_list.append(pd.read_csv(str(folder / "meta.csv")))

    return image_list, meta_data_list

In [4]:
# Weight-file paths and the metadata frame of each network, in matching order.
X_list_path, meta_data = get_data_path(data_path_list)

# Features and labels are paired purely by list position further down, so
# fail early and clearly if the two lists ever disagree in length.
assert len(X_list_path) == len(meta_data), (
    f"{len(X_list_path)} weight files but {len(meta_data)} metadata frames"
)

In [5]:
def get_weights_from_bin(X_list_path):
    """Decode the payload of each binary file into a tuple of integers.

    For every path the whole file is read, the first 20 bytes and the last
    4 bytes are dropped, and the remainder is unpacked as native 4-byte
    ``"i"`` values with ``struct.unpack``.

    NOTE(review): the values are decoded as integers; if the files actually
    store float32 weights the format character should be ``"f"`` - confirm
    against the dataset's file-format description.

    Args:
        X_list_path: iterable of paths to the binary files.

    Returns:
        list with one tuple of ints per input path, in input order.

    Raises:
        struct.error: if the payload length does not match the computed
            number of values (e.g. it is not a multiple of 4 bytes).
    """
    decoded = []
    for bin_path in X_list_path:
        # 'rb': the content is raw bytes, not text.
        with open(bin_path, mode='rb') as handle:
            raw = handle.read()
        # 24 = 20 leading + 4 trailing bytes that are not part of the payload.
        value_count = (len(raw) - 24) // 4
        payload = raw[20:-4]
        decoded.append(struct.unpack("i" * value_count, payload))
    return decoded

In [6]:
# Feature vectors: one tuple of decoded values per weights file.
train_X = get_weights_from_bin(X_list_path)

# Labels: the 'optimizer' entry at index label 0 of every metadata frame,
# in the same order as the weight files.
y_list = [
    meta_frame['optimizer'][0]
    for meta_frame in meta_data
]

In [7]:
from sklearn.preprocessing import LabelEncoder
import numpy as np


def labels_preprocessing(y_list):
    """Encode labels as integer class codes with a ``LabelEncoder``.

    The encoder is fitted on the distinct values of ``y_list`` and then
    applied to the full list.

    Args:
        y_list: sequence of labels (one per sample).

    Returns:
        numpy array with one integer code per element of ``y_list``.
    """
    distinct_labels = np.unique(y_list)
    encoder = LabelEncoder().fit(distinct_labels)
    return encoder.transform(y_list)

y_array = labels_preprocessing(y_list)
y_array

array([2, 2, 0, ..., 0, 2, 1], dtype=int64)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

  from pandas import MultiIndex, Int64Index


In [9]:
def train_test(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it on the test split and report the accuracy.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        str: the accuracy as a percentage with four decimals, formatted as
        ``" 12.3456%"`` (note the leading space). The two-decimal value is
        also printed.
    """
    model.fit(X_train, y_train)
    # int() already yields a Python int, so a further round() is a no-op.
    predicted_labels = [int(label) for label in model.predict(X_test)]
    accuracy_pct = accuracy_score(y_test, predicted_labels) * 100.0
    print("Accuracy: %.2f%%" % accuracy_pct)
    return " %.4f%%" % accuracy_pct

In [10]:

# Candidate classifiers, keyed by the column name used in the results table.
# RandomForestClassifier draws random bootstrap samples and feature subsets,
# so it is seeded (same seed as the train/test split) to make the reported
# accuracies reproducible across runs.
models = {
    "XGBClassifier": XGBClassifier(),
    "LogisticRegression": LogisticRegression(),
    # "SVC": SVC(),  # disabled in the original notebook; uncomment to include
    "GaussianNB": GaussianNB(),
    "RandomForestClassifier": RandomForestClassifier(random_state=10),
}
models.keys()

dict_keys(['XGBClassifier', 'LogisticRegression', 'GaussianNB', 'RandomForestClassifier'])

In [11]:
# Scratch results table: one column per model, one row per metadata column.
df_solutions = pd.DataFrame(columns=models.keys(), index=meta_data[0].keys())

# Write through .loc: chained indexing (df[col][row] = value) assigns into a
# temporary object and is not guaranteed to update the frame (it never does
# under pandas Copy-on-Write).
df_solutions.loc['batch_size', 'XGBClassifier'] = 1

# 'lrate' and 'test_acc' are removed from the set of rows shown.
df_solutions = df_solutions.drop(['lrate', 'test_acc'])
df_solutions

Unnamed: 0,XGBClassifier,LogisticRegression,GaussianNB,RandomForestClassifier
dataset,,,,
batch_size,1.0,,,
batch_norm,,,,
augmentation,,,,
optimizer,,,,
activation,,,,
initialization,,,,
filter_size,,,,
depth_conv,,,,
depth_fc,,,,


In [12]:
# Metadata columns that are used, one at a time, as the prediction target.
categories = [
    'dataset',
    'batch_size',
    'augmentation',
    'optimizer',
    'activation',
    'initialization',
]

In [13]:
# Results table: one row per predicted metadata column, one column per model.
df_solutions = pd.DataFrame(columns=models.keys(), index=categories)

# Models that are not trained in this run; their columns stay NaN.
# NOTE(review): presumably skipped because their accuracies were already
# recorded by hand in the comment cells below - confirm, or empty this set
# to evaluate every model.
skipped_models = {'XGBClassifier', 'LogisticRegression'}

for model_name in models.keys():
    print(model_name)
    if model_name in skipped_models:
        continue
    for category in categories:

        print(category)
        # Target labels: the entry at index label 0 of this metadata column.
        y_list = [df[category][0] for df in meta_data]
        y_array = labels_preprocessing(y_list)
        print(y_array)
        # Fixed seed so every model/category pair sees the same 80/20 split.
        X_train, X_test, y_train, y_test = train_test_split(
            train_X, list(y_array), test_size=0.2, random_state=10
        )

        model = models[model_name]
        model_accuracy = train_test(model, X_train, y_train, X_test, y_test)
        # Write through .loc: chained indexing (df[col][row] = value) is not
        # guaranteed to update the frame and is a no-op under pandas
        # Copy-on-Write, which would silently leave the table empty.
        df_solutions.loc[category, model_name] = model_accuracy

    print(df_solutions)

df_solutions


XGBClassifier
LogisticRegression
GaussianNB
dataset
[4 0 2 ... 2 4 2]
Accuracy: 30.50%
batch_size
[0 2 0 ... 2 1 3]
Accuracy: 31.67%
augmentation
[0 0 0 ... 0 1 1]
Accuracy: 47.00%
optimizer
[2 2 0 ... 0 2 1]
Accuracy: 51.67%
activation
[2 0 2 ... 0 3 0]
Accuracy: 71.33%
initialization
[1 1 3 ... 0 1 0]
Accuracy: 55.33%
               XGBClassifier LogisticRegression GaussianNB  \
dataset                  NaN                NaN   30.5000%   
batch_size               NaN                NaN   31.6667%   
augmentation             NaN                NaN   47.0000%   
optimizer                NaN                NaN   51.6667%   
activation               NaN                NaN   71.3333%   
initialization           NaN                NaN   55.3333%   

               RandomForestClassifier  
dataset                           NaN  
batch_size                        NaN  
augmentation                      NaN  
optimizer                         NaN  
activation                        NaN  
ini

In [None]:
# 30.50% -dataset - GaussianNB
# 31.67% -batch_size - GaussianNB
# 47.00% -augmentation - GaussianNB
# 51.67% -optimizer - GaussianNB
# 71.33% -activation - GaussianNB
# 55.33% -initialization - GaussianNB

In [None]:
# 35.83% -dataset - LogisticRegression
# 24.17% -batch_size - LogisticRegression
# 46.83% -augmentation - LogisticRegression
# 45.33% -optimizer - LogisticRegression
# 73.67% -activation - LogisticRegression
# 56.17% -initialization - LogisticRegression

' 91.5000%'

In [None]:
# 91.50% -dataset - xgboost
# 49.50% -batch_size - xgboost
# 66.83% -augmentation - xgboost
# 99.17% -optimizer - xgboost
# 98.67% -activation - xgboost
# 70.00% -initialization - xgboost