In [None]:
"""Objective - Pre-process the images in the dataset, train the models on them, and export the model."""

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import image
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
def prep_data(filePath, size, patches=5, random_state=42):
    """Build a rotation-prediction dataset from a joblib file of images.

    Square patches are sampled from every image, each patch is rotated by a
    randomly drawn number of quarter turns (``np.rot90``), and that number
    becomes the patch's label.

    Parameters
    ----------
    filePath : str
        Path to a joblib file that unpacks into two items; the first is the
        image collection, the second is ignored.
    size : int
        Edge length of the square patches.
    patches : int, default=5
        Forwarded to ``PatchExtractor`` as ``max_patches``.
    random_state : int or None, default=42
        Seed for the patch sampling and for the rotation draw, so repeated
        calls return the same dataset. Pass ``None`` for a fresh random
        dataset on every call.

    Returns
    -------
    dict
        ``'features'``: float array of shape ``(n_patches, n_values)`` holding
        the flattened rotated patches.
        ``'labels'``: float array of shape ``(n_patches,)`` with values in
        {0, 1, 2, 3}, the number of quarter turns applied.
    """
    # NOTE: joblib.load unpickles its input, so only point this at trusted files.
    images, _ = joblib.load(filePath)
    patch_extractor = image.PatchExtractor(
        patch_size=(size, size), max_patches=patches, random_state=random_state
    )
    sub_images = patch_extractor.transform(images)

    # A dedicated generator keeps the labels reproducible and independent of
    # whatever else has consumed the global np.random state.
    rng = np.random.default_rng(random_state)
    quarter_turns = rng.integers(0, 4, size=sub_images.shape[0])

    sub_images_rot = np.zeros(sub_images.shape)
    for k in range(4):
        chosen = quarter_turns == k
        # Axis 0 indexes patches, so the per-patch (row, col) plane is axes
        # (1, 2); patches are square, so every rotation keeps the shape.
        sub_images_rot[chosen] = np.rot90(sub_images[chosen], k=k, axes=(1, 2))

    features = sub_images_rot.reshape(len(sub_images), -1)
    # Labels stay float64, matching the array type callers received before.
    labels = quarter_turns.astype(float)
    dataset = {'features': features, 'labels': labels}
    return dataset

In [3]:
# Edge length of the square patches cut from each image.
size = 50
# Upper bound on the number of patches sampled per training image.
patches = 4

# Training images; swap in the commented path to train on the full set.
filePath_train = 'data/train.small.joblib'
#filePath_train = 'data/train.full.joblib'

# Evaluation bundle; it is indexed by patch size after loading.
filePath_test = 'data/eval1.joblib'

In [4]:
# Rotation-labelled training patches: {'features': ..., 'labels': ...}.
train_data = prep_data(filePath_train, size=size, patches=patches)

In [5]:
# Hand joblib the path so it opens and closes the file itself; wrapping it in a
# bare open() left the handle dangling.
# NOTE: joblib.load unpickles its input, so only point this at trusted files.
eval_data = joblib.load(filePath_test)[size]  # bundle is indexed by patch size
x_test = eval_data["x_test"]
y_test = eval_data["y_test"]

In [7]:
# MLP on PCA-reduced patches.
# PCA (whose default solver choice may be the randomized SVD) and the MLP
# (random weight init / shuffling) are both seeded so the score is repeatable.
pca2 = PCA(n_components=50, random_state=42)
# `(200)` is the plain int 200, not a tuple; the trailing comma makes the
# "one hidden layer of 200 units" intent explicit.
mlp = MLPClassifier(
    activation='tanh',
    alpha=0.1,
    hidden_layer_sizes=(200,),
    max_iter=500,
    random_state=42,
)
model4 = Pipeline([('pca', pca2), ('mlp', mlp)])

model4.fit(train_data['features'], train_data['labels'])
model4.score(x_test, y_test)

0.7085

In [None]:
"""
hidden_layer_sizes = (100) --> 0.63
hidden_layer_sizes = (100,100) --> 0.6665
hidden_layer_sizes = (100,100,100) --> 0.6445
hidden_layer_sizes = (50,50) --> 0.6355
hidden_layer_sizes = (200,200) --> 0.716

activation = 'relu' --> 0.694
activation = 'tanh' --> 0.704
activation = 'logistic' --> 0.68
activation = 'identity' --> 0.456

alpha = 0.1 --> 0.714
alpha = 0.01 --> 0.71
alpha = 0.001 --> 0.70
alpha = 0.0001 --> 0.715

n_components = 50   --> 0.668
n_components = 100  --> 0.6695
n_components = 200  --> 0.629
"""

In [22]:
# 1-nearest-neighbour baseline on 90 principal components.
# PCA is seeded: its default solver choice may be the randomized SVD, in which
# case an unseeded fit gives slightly different components on every run.
pca = PCA(n_components=90, random_state=42)
knn = KNeighborsClassifier(n_neighbors=1)
model_90 = Pipeline([('pca', pca), ('knn', knn)])

model_90.fit(train_data['features'], train_data['labels'])

model_90.score(x_test, y_test)

0.718

In [23]:
# SVC - Params: kernel, C

# PCA is seeded so the projection, and therefore the score, is repeatable.
pca2 = PCA(n_components=45, random_state=42)
svc = SVC(C=10, kernel='rbf')
model3 = Pipeline([('pca', pca2), ('svc', svc)])

model3.fit(train_data['features'], train_data['labels'])
model3.score(x_test, y_test)

0.785

In [None]:
"""
C = 1 - 0.643
C = 10 - 0.6895
C = 20 - 0.686
C = 50 - 0.676

n_components = 45 - 0.697
n_components = 90 - 0.6895
n_components = 180 - 0.679
"""

In [26]:
# Sweep n_neighbors for the PCA(90) + KNN pipeline and record the accuracy on
# (x_test, y_test) for each value.
neigh = range(1, 12)

acc = []

for n in neigh:
    knn = KNeighborsClassifier(n_neighbors=n)
    # Use a fresh, seeded PCA per pipeline. Pipeline.fit fits its steps in
    # place, so sharing the `pca` object that already lives inside model_90
    # would refit that trained model's transformer on every iteration.
    model = Pipeline([('pca', PCA(n_components=90, random_state=42)), ('knn', knn)])
    model.fit(train_data['features'], train_data['labels'])
    acc.append(model.score(x_test, y_test))

In [27]:
# n_neighbors with the highest recorded accuracy (np.argmax keeps the first
# maximum, i.e. the smallest k on ties).
bestParam = neigh[np.argmax(acc)]

knn = KNeighborsClassifier(n_neighbors=bestParam)
# Own seeded PCA instead of the `pca` instance that belongs to model_90:
# Pipeline.fit would otherwise refit that model's transformer in place.
model = Pipeline([('pca', PCA(n_components=90, random_state=42)), ('knn', knn)])
model.fit(train_data['features'], train_data['labels'])
# NOTE(review): bestParam was chosen on (x_test, y_test), so the score below is
# an optimistic estimate - confirm whether a separate held-out set exists.
model.score(x_test, y_test)

0.7375

In [None]:
#RandomForestClassifier - Params: n_estimators, max_depth, criterion
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [20, 30],
    'criterion': ['gini', 'entropy']
}

# The forest is seeded so that the ranking of parameter combinations (and so
# best_params_) does not change between runs.
# NOTE(review): this search runs on the raw features, while the pipelines in
# this notebook put PCA in front of the classifier - confirm that is intended.
gsv_rfc = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)
gsv_rfc.fit(train_data['features'], train_data['labels'])

gsv_rfc.best_params_

{'criterion': 'gini', 'max_depth': 30, 'n_estimators': 200}

In [24]:
# Random forest on 30 principal components.
# Both steps are seeded: PCA's default solver choice may be the randomized SVD,
# and the forest bootstraps samples and subsamples features at random.
pca3 = PCA(n_components=30, random_state=42)
rfc = RandomForestClassifier(n_estimators=400, random_state=42)
model2 = Pipeline([('pca', pca3), ('rfc', rfc)])

model2.fit(train_data['features'], train_data['labels'])
model2.score(x_test, y_test)

0.718

In [None]:
"""
n_estimators = 50 --> 0.563
n_estimators = 100 --> 0.5775
n_estimators = 200 --> 0.6185
n_estimators = 400 --> 0.632


n_components = 30 --> 0.671
n_components = 45 --> 0.657
n_components = 90 --> 0.6185
n_components = 180 --> 0.5935
"""


0.7825