# Data Preprocessing

In [12]:
# Import the required libraries

import pandas as pd
import numpy as np
from skimage import io
from skimage.color import rgb2gray
from skimage.filters import sobel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Locations of the metadata CSV and of the image folder.
# img_link keeps its trailing slash because file names are appended to it directly.

csv_link = 'dataverse_files/HAM10000_metadata.csv'
img_link = 'dataverse_files/HAM10000_images/'

In [3]:
# Load the metadata CSV into a DataFrame.
metadata = pd.read_csv(filepath_or_buffer=csv_link)


In [4]:
# Encode the class labels in the `dx` column as integer ids in a new `target` column.
label_enc = LabelEncoder()
metadata['target'] = label_enc.fit_transform(metadata['dx'])

# Split image ids and targets 80:20 into train and test sets.
# Stratifying on the target keeps each class's share equal in both parts, so
# rare classes are not under-represented (or missing) in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    metadata['image_id'],
    metadata['target'],
    test_size=0.2,
    random_state=42,
    stratify=metadata['target'],
)


In [5]:
# Minority classes are upsampled (with replacement) so that every class ends up
# with as many training rows as the majority class.
train_data = pd.DataFrame({'image_id': X_train, 'target': y_train})
class_counts = train_data['target'].value_counts()
majority_class = class_counts.idxmax()      # label of the largest class
majority_count = int(class_counts.max())    # number of rows in that class
minority_classes = class_counts.drop(majority_class).index

# Collect the extra rows first and concatenate once, instead of growing the
# frame inside the loop. The balanced frame has n_classes * majority_count rows.
upsampled_parts = []
for minority_class in minority_classes:
    minority_data = train_data[train_data['target'] == minority_class]
    # n_samples must be a row count, not the class label: draw only the rows
    # still missing to reach the majority count, since the originals are kept.
    n_missing = majority_count - len(minority_data)
    if n_missing <= 0:
        # A class tied with the majority needs no extra rows.
        continue
    minority_upsampled = resample(minority_data, replace=True, n_samples=n_missing, random_state=42)
    upsampled_parts.append(minority_upsampled)
train_data = pd.concat([train_data, *upsampled_parts])

In [6]:
# Take the training ids and targets from the resampled frame, and give the
# test series a fresh 0..n-1 index.
X_train, y_train = train_data['image_id'], train_data['target']
X_test, y_test = (series.reset_index(drop=True) for series in (X_test, y_test))

In [8]:
# Feature extraction used for both the train and the test images
def feature_extract(img_path):
    """Build a 1-D feature vector for one image file.

    The image at ``img_link + img_path`` is read and converted to grayscale.
    The vector is a 10-bin histogram of the grayscale values over [0, 1],
    followed by the flattened Sobel edge map of the same grayscale image.

    Parameters
    ----------
    img_path : str
        File name appended to the module-level ``img_link`` prefix.

    Returns
    -------
    numpy.ndarray
        The histogram counts followed by the flattened edge values.

    Notes
    -----
    NOTE(review): stacking these vectors into a matrix presumably relies on
    all images having the same size - confirm against the dataset.
    """
    grayscale = rgb2gray(io.imread(img_link + img_path))
    bin_counts, _bin_edges = np.histogram(grayscale, bins=10, range=(0, 1))
    edge_map = sobel(grayscale)
    return np.concatenate([bin_counts, edge_map.ravel()])

def _extract_all(image_ids):
    """Return np.array over the feature vectors of the given image ids ('.jpg' files)."""
    return np.array([feature_extract(image_id + '.jpg') for image_id in image_ids])

# Replace the id series with their feature arrays.
X_train = _extract_all(X_train)
X_test = _extract_all(X_test)

In [9]:
# Keep the 100 features with the highest chi-squared score.
# The selector is fitted on the training data only, then applied to both sets.
selector = SelectKBest(score_func=chi2, k=100)
selector.fit(X_train, y_train)
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

In [10]:
# Persist the selected features and the targets as .npy files for reuse elsewhere.
for filename, array in (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
):
    np.save(filename, array)