In [None]:
import pandas as pd
import numpy as np
import os
import sys
import glob
import warnings
import cv2
import time

from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
# from sklearn.svm import SVC, NuSVC  <--- might use that
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Add the parent directory to the module search path.
sys.path.append("../")

# Load environment variables through python-dotenv; the DPI variable is read just below.
load_dotenv()
# plt.style.use('Solarize_Light2')
# Default figure DPI: taken from the DPI environment variable when it holds an
# integer, otherwise 100.
# int(None) raises TypeError (variable unset) and int("abc") / int("") raises
# ValueError (variable set but not an integer); both cases fall back to 100.
try:
    pc_dpi = int(os.getenv("DPI"))
except (TypeError, ValueError):
    pc_dpi = 100


In [None]:
# Directory holding the images. The trailing slash is kept because
# get_sift_items builds file paths from this value.
image_dir = "../imgs/gs/"

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# image_paths (and therefore image_paths[0]) reproducible across runs/machines.
# os.path.join avoids the doubled slash that f"{image_dir}/*" produced.
image_paths = sorted(glob.glob(os.path.join(image_dir, "*")))


In [None]:
# Load the DataFrame stored in ecommerce_cleaned.pkl.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
df = pd.read_pickle(filepath_or_buffer="../data/ecommerce_cleaned.pkl")

# Preview the first rows.
df.head()


In [None]:
# Let's discard anything that is not image- or target-related.
# List the available columns first to see what can be dropped.

print(df.columns)


In [None]:
# Columns to remove from the frame before the image-based analysis.
droplist = ["product_name", "doc_desc", "lem_desc", "product_specifications", "description", "category_tree"]

# Drop in place; errors="ignore" skips any listed column that is absent, so
# re-running this cell does not raise.
df.drop(columns=droplist, inplace=True, errors="ignore")

df.head()


In [None]:
# Rename the "image" column to "image_name" (in place); later cells read
# df["image_name"] to locate each image file.
df.rename(columns={"image": "image_name"}, inplace=True)

df.head()


In [None]:
# SIFT detector/descriptor extractor shared by the cells and helpers below.
sift = cv2.SIFT_create(
    nOctaveLayers=6  # Doubling default OctaveLayers -> increase sensitivity
)


In [None]:
# Read the first path returned by glob as a grayscale image.
# NOTE(review): raises IndexError if image_paths is empty — confirm the image
# directory is populated.
test_image = cv2.imread(image_paths[0], cv2.IMREAD_GRAYSCALE)


In [None]:
# Display the test image with a gray colormap.
plt.imshow(test_image, cmap="gray")
plt.show()


In [None]:
# Detect SIFT keypoints and compute their descriptors on the test image (no mask).
keypoints, descriptors = sift.detectAndCompute(test_image, None)

# Draw the detected keypoints for visual inspection.
# NOTE(review): test_image is also passed as the output-image argument —
# confirm this is intended and does not overwrite test_image.
image_keypoints = cv2.drawKeypoints(test_image, keypoints, test_image)


In [None]:
# Display the test image with its keypoints drawn on it.
plt.imshow(image_keypoints)
plt.show()


In [None]:
def to_pt(kp_tuple):
    """Extract the ``pt`` attribute of every keypoint.

    Args:
        kp_tuple: Iterable of keypoint objects exposing a ``pt`` attribute
            (called in this notebook with the keypoints returned by
            ``sift.detectAndCompute``).

    Returns:
        list: The ``pt`` value of each keypoint, in input order. Empty when
        ``kp_tuple`` is empty.
    """
    return [cv_kp.pt for cv_kp in kp_tuple]


def get_sift_items(image_name):
    """Compute SIFT keypoint coordinates and descriptors for one image.

    Args:
        image_name: File name of the image, relative to ``image_dir``.

    Returns:
        tuple: ``(points, descriptors)`` where ``points`` is the list built by
        ``to_pt`` from the detected keypoints and ``descriptors`` is the second
        value returned by ``sift.detectAndCompute``.

    Raises:
        FileNotFoundError: If the image cannot be read from disk.
    """
    # os.path.join works whether or not image_dir ends with a separator.
    image_path = os.path.join(image_dir, image_name)
    image = cv2.imread(
        filename=image_path,
        flags=cv2.IMREAD_GRAYSCALE
    )

    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising; fail here with the offending path instead of inside SIFT.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    keypoints, descriptors = sift.detectAndCompute(image, None)

    return to_pt(keypoints), descriptors


In [None]:
# Run SIFT on every image referenced in the frame, collecting the per-image
# keypoint coordinates and descriptors in row order.
keypoints_list, descriptors_list = [], []

for image_name in df["image_name"]:
    keypoints, descriptors = get_sift_items(image_name=image_name)
    keypoints_list.append(keypoints)
    descriptors_list.append(descriptors)

# Store the results as two new columns of the frame.
df["keypoints"] = keypoints_list
df["descriptors"] = descriptors_list


In [None]:
# Preview the frame with the new keypoints/descriptors columns.
df.head()


In [None]:
# Count missing values in the keypoints and descriptors columns.
# to_pt always returns a list, so the keypoints count is expected to be 0;
# presumably descriptors is None for images where SIFT found nothing — confirm.

print("keypoints : ", df["keypoints"].isna().sum())
print("descriptors : ", df["descriptors"].isna().sum())


In [None]:
# Keep only rows that have both keypoints and descriptors.
df = df.dropna(subset=["keypoints", "descriptors"])


In [None]:
# Re-check missing values in keypoints and descriptors; both counts should be
# 0 once the rows with missing values have been dropped.

print("keypoints : ", df["keypoints"].isna().sum())
print("descriptors : ", df["descriptors"].isna().sum())


In [None]:
# Encode the labels in first_category and store the result in enc_category.
le = LabelEncoder()
df["enc_category"] = le.fit_transform(df["first_category"])
# Show the original and encoded labels side by side.
df[["first_category", "enc_category"]].head()


In [None]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split (and every score computed from it) is reproducible across runs.
# NOTE(review): the feature matrix is an object array whose cells hold a list
# of keypoint coordinates and a descriptor array per image — presumably these
# must be turned into fixed-length numeric vectors before fitting; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df[["keypoints", "descriptors"]].values,
    df["enc_category"],
    test_size=0.3,
    random_state=42
)


In [None]:
# Grid search for KNN: try the odd neighbour counts 3, 5, ..., 13, scored by
# accuracy, with n_jobs=-1 for parallel execution.
knn = KNeighborsClassifier()
param_grid = dict(n_neighbors=range(3, 14, 2))

grid_knn = GridSearchCV(knn, param_grid, scoring="accuracy", n_jobs=-1)


In [None]:
# Run the grid search on the training split.
# NOTE(review): X_train is built from the object-typed keypoints/descriptors
# columns — presumably the estimator needs numeric fixed-length features; confirm.
grid_knn.fit(X_train, y_train)


In [None]:
# Inspect the shape of the training features.
X_train.shape


In [None]:
# Inspect the shape of the training labels.
y_train.shape

In [None]:
# Inspect the first training sample (its keypoints and descriptors entries).
X_train[0]