# SkinAnaliticAI, Skin Cancer Detection with AI Deep Learning

## __Evaluation of Harvard Dataset with different AI classiffication techniques using FastClassAI papeline__
Author: __Pawel Rosikiewicz__   
prosikiewicz@gmail.com      
License: __MIT__    
ttps://opensource.org/licenses/MIT        
Copyright (C) 2021.01.30 Pawel Rosikiewicz        

#### standard imports

In [1]:
import os # allow changing, and navigating files and folders, 
import sys
import shutil
import re # module to use regular expressions, 
import glob # lists names in folders that match Unix shell patterns
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
# setup basedir
basedir = os.path.dirname(os.getcwd())
os.chdir(basedir)
sys.path.append(basedir)

# set up paths for the project
PATH_raw = os.path.join(basedir, "data/raw")
PATH_interim = os.path.join(basedir, "data/interim")
PATH_models = os.path.join(basedir, "models")
PATH_interim_dataset_summary_tables = os.path.join(PATH_interim, "dataset_summary_tables") # create in that notebook,

In [4]:
# load functions,
from src.utils.feature_extraction_tools import encode_images

# load configs
from src.configs.project_configs import CLASS_DESCRIPTION # information on each class, including descriptive class name and diegnostic description - used to help wiht the project
from src.configs.tfhub_configs import TFHUB_MODELS # names of TF hub modules that I presenlected for featuress extraction with all relevant info,
from src.configs.dataset_configs import DATASET_CONFIGS # names created for clases, assigned to original one, and colors assigned to these classes
from src.configs.dataset_configs import CLASS_LABELS_CONFIGS # names created for clases, assigned to original one, and colors assigned to these classes
from src.configs.dataset_configs import DROPOUT_VALUE # str, special value to indicate samples to remoce in class labels
from src.configs.config_functions import DEFINE_DATASETS # function that creates datasunbsets collections for one dataset (custome made for that project)

# set project variables
PROJECT_NAME                      = "SkinAnaliticAI_Harvard_dataset_evaluation" # 
DATASET_NAME                      = "HAM10000"  # name used in config files to identify all info on that dataset variant
DATASET_VARIANTS                  = DATASET_CONFIGS[DATASET_NAME]["labels"] # class labels that will be used, SORT_FILES_WITH   must be included

## FEATURE EXTRACTION

In [7]:
# preset values 
generator_batch_size = 3000 # no more then 3000 images will be taken,  but we expect no more then 2000 in that tassk. 
use_url = "no" # the script is adapted only to use sys.path, but configs carries url's and ulr can be used with feature extraction function 

# extract features from images in each dataset varinat using one or more tf hub modules, 
for dv_i, dataset_variant in enumerate(DATASET_VARIANTS):
    
    print(f"\n- {dv_i} - Extracting features from: {dataset_variant}")
    
    # find names off train/valid/test subsets in dataset folder,
    os.chdir(os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}"))
    subset_names_to_encode = []
    for file in glob.glob(f"[train|valid|test]*"):
        subset_names_to_encode.append(file)
        
        
    # Create lists with info required for feture extraction from images 
    'this step is super usefull when many models is used for feature extraction'
    tfmodules          = list(TFHUB_MODELS.keys()) # names of tf hub models used 
    module_names       = [TFHUB_MODELS[x]['module_name'] for x in tfmodules]
    module_file_names  = [TFHUB_MODELS[x]['file_name'] for x in tfmodules]
    img_imput_size     = [TFHUB_MODELS[x]['input_size'] for x in tfmodules]

    
    # extract features, from images from each subset, and store them togther as one batch array,  
    for i, (one_module_name, one_module_file_name, one_img_input_size) in enumerate(zip(module_names, module_file_names, img_imput_size)):
        '''
            all data subsets found in load_dir will be encoded automatically, 
            - logfile will be created for a given datasets
            - batch_labels csv file and npz file with encoded features will be created for 
            each data subset will have:
            - 
        '''
        print("\n ................................................")
        print(f"  - {dv_i}/{i} module:          {one_module_name}")
        print(f"  - {dv_i}/{i} filename or url: {one_module_file_name}")
        print(f"  - {dv_i}/{i} RGB image size : {one_img_input_size}")
        print(f"  - {dv_i}/{i} datset subsets : {subset_names_to_encode}")
        print(f"  - Cataloging subsets, then extracting features from all images")
        print(f"  - Important: Each subset will be saved as one matrix")
        print("\n")

        # I am using modules saved in computer memory, thus I need to build fiull path to them, 
        if use_url=="no":
            one_module_full_path = os.path.join(PATH_models, one_module_file_name)
        else:
            one_module_full_path = one_module_file_name # here I am using module url, (no path)
 
        # extract features    
        encode_images(

            # .. dastaset name & directories, 
            dataset_name     = f"{DATASET_NAME}__{dataset_variant}",# name used when saving encoded files, logfiles and other things, related to encoding, 
            subset_names     = subset_names_to_encode,# list, ust names of files in the load_dir, if any, 
            load_dir         = os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}"),   # full path to input data, ie. file folder with either folders with images names after class names, or folders with subsetnames, and folders names after each class in them, 
            save_dir         = os.path.join(PATH_interim, f"{DATASET_NAME}__{dataset_variant}__extracted_features"), # all new files, will be saved as one batch, with logfile, if None, load_dir will be used, 

            # .. encoding module parameters, 
            module_name          = one_module_name, # name used when saving files
            module_location      = one_module_full_path, # full path to a given module, or url, 
            img_target_size      = one_img_input_size, # image resolution in pixels, 
            generator_batch_size = generator_batch_size, # must be larger or equal to in size of the largest subset
            generator_shuffle    = False, 

            # .. other, 
            save_files       = True,
            verbose          = False                            
        )


- 0 - Extracting features from: Cancer_Detection_And_Classification

 ................................................
  - 0/0 module:          MobileNet_v2
  - 0/0 filename or url: imagenet_mobilenet_v2_100_224_feature_vector_2
  - 0/0 RGB image size : (224, 224)
  - 0/0 datset subsets : ['train_05', 'train_02', 'valid_01', 'train_03', 'train_04', 'test_01', 'train_01', 'train_06', 'valid_02', 'train_07', 'test_02']
  - Cataloging subsets, then extracting features from all images
  - Important: Each subset will be saved as one matrix


Found 744 images belonging to 7 classes.
Found 742 images belonging to 7 classes.
Found 740 images belonging to 7 classes.
Found 742 images belonging to 7 classes.
Found 744 images belonging to 7 classes.
Found 367 images belonging to 7 classes.
Found 742 images belonging to 7 classes.
Found 738 images belonging to 7 classes.
Found 741 images belonging to 7 classes.
Found 751 images belonging to 7 classes.
Found 367 images belonging to 7 classes.
INFO: