In [53]:
import numpy as np 
import matplotlib.pyplot as plt
import glob
import cv2
import os
import seaborn as sns
import pandas as pd
from skimage.filters import sobel
from skimage.feature import graycomatrix, graycoprops
from skimage.measure import shannon_entropy
from sklearn.preprocessing import LabelEncoder
import joblib
import threading
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor

In [54]:
data = pd.read_csv("./train-metadata.csv", low_memory=False)
data.shape

(401059, 55)

In [55]:
data.columns

Index(['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', '

In [56]:
def feature_extractor(img,label):
    df = {}

    GSLM = graycomatrix(img, [1], [0])       
    GSLM_Energy = graycoprops(GSLM, 'energy')[0]
    df['0'] = GSLM_Energy[0]
    GSLM_corr = graycoprops(GSLM, 'correlation')[0]
    df['1'] = GSLM_corr[0]       
    GSLM_diss = graycoprops(GSLM, 'dissimilarity')[0]
    df['2'] = GSLM_diss[0]       
    GSLM_hom = graycoprops(GSLM, 'homogeneity')[0]
    df['3'] = GSLM_hom[0]       
    GSLM_contr = graycoprops(GSLM, 'contrast')[0]
    df['4'] = GSLM_contr[0]

    GSLM2 = graycomatrix(img, [3], [0])       
    GSLM_Energy2 = graycoprops(GSLM2, 'energy')[0]
    df['5'] = GSLM_Energy2[0]
    GSLM_corr2 = graycoprops(GSLM2, 'correlation')[0]
    df['6'] = GSLM_corr2[0]       
    GSLM_diss2 = graycoprops(GSLM2, 'dissimilarity')[0]
    df['7'] = GSLM_diss2[0]       
    GSLM_hom2 = graycoprops(GSLM2, 'homogeneity')[0]
    df['8'] = GSLM_hom2[0]       
    GSLM_contr2 = graycoprops(GSLM2, 'contrast')[0]
    df['9'] = GSLM_contr2[0]

    GSLM3 = graycomatrix(img, [5], [0])       
    GSLM_Energy3 = graycoprops(GSLM3, 'energy')[0]
    df['10'] = GSLM_Energy3[0]
    GSLM_corr3 = graycoprops(GSLM3, 'correlation')[0]
    df['11'] = GSLM_corr3[0]       
    GSLM_diss3 = graycoprops(GSLM3, 'dissimilarity')[0]
    df['12'] = GSLM_diss3[0]       
    GSLM_hom3 = graycoprops(GSLM3, 'homogeneity')[0]
    df['13'] = GSLM_hom3[0]       
    GSLM_contr3 = graycoprops(GSLM3, 'contrast')[0]
    df['14'] = GSLM_contr3[0]

    GSLM4 = graycomatrix(img, [0], [np.pi/4])       
    GSLM_Energy4 = graycoprops(GSLM4, 'energy')[0]
    df['15'] = GSLM_Energy4[0]
    
    return df

In [57]:
features = pd.DataFrame(columns=['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15'])

index = 0

for row in data.values:
    img = cv2.imread("./train-image/image/%s.jpg" %row[0], 0)
    img = cv2.resize(img, (800, 800))
    temp = list(row)
    temp.extend(feature_extractor(img,row[1]).values())
    features.loc[index] = temp
    index += 1
    
    if index  == 1000:
        print("%d, " %index, end="")
        break

1000, 

In [58]:
data = features.drop(['image_type','tbp_tile_type','attribution','copyright_license','lesion_id','iddx_full','patient_id','isic_id',"iddx_1","iddx_2","iddx_3","iddx_4","iddx_5","mel_thick_mm", "mel_mitotic_index","7","10","11","13","14"],axis = 1)

In [59]:
data.shape

(1000, 51)

In [60]:
non_numerical_columns = data.select_dtypes(exclude=['number']).columns

# Convert to a list if needed
non_numerical_columns_list = list(non_numerical_columns)

print(non_numerical_columns_list)

['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']


In [61]:
import pickle

data_new = data
data1 = {}
save_file = open("cases.dat", "wb")

for j in non_numerical_columns_list:
    a = list(set(list(data_new[j])))
    index = 0
    dic = {}
    for i in a:
        dic[i] = index
        index += 1

    data1[j] = dic

pickle.dump(data1, save_file)
save_file.close()

In [62]:
data_file = open("cases.dat", "rb")
cases = pickle.load(data_file)
type(cases)
data_file.close()

In [63]:
for i in cases:
    for j in cases[i]:
        data_new[i] = data_new[i].replace(j, cases[i][j])

  data_new[i] = data_new[i].replace(j, cases[i][j])


In [64]:
data_new

Unnamed: 0,target,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,...,1,2,3,4,5,6,8,9,12,15
0,0,60.0,0.0,2.0,3.04,20.244422,16.261975,26.922447,23.954773,33.684638,...,0.995407,0.570045,0.729752,0.718109,0.069896,0.969674,0.439391,4.740494,2.641945,0.184858
1,0,60.0,0.0,4.0,1.10,31.712570,25.364740,26.331000,24.549290,41.219030,...,0.995414,0.883381,0.621010,1.537013,0.049560,0.965190,0.331694,11.668860,4.125442,0.155813
2,0,60.0,0.0,1.0,3.40,22.575830,17.128170,37.970460,33.485410,44.174920,...,0.997565,0.492819,0.761879,0.575757,0.065654,0.984896,0.479583,3.570464,2.288767,0.163991
3,0,65.0,0.0,0.0,3.22,14.242329,12.164757,21.448144,21.121356,25.746200,...,0.998503,0.346510,0.828859,0.367652,0.075459,0.992252,0.584999,1.903342,1.614436,0.159799
4,0,55.0,0.0,0.0,2.73,24.725520,20.057470,26.464900,25.710460,36.217980,...,0.997093,0.513888,0.752340,0.607207,0.064986,0.981493,0.463134,3.862825,2.415203,0.166973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,50.0,0.0,0.0,4.94,18.462110,8.773887,27.207660,23.363460,32.880180,...,0.998292,0.739975,0.749109,6.455488,0.087908,0.986656,0.503408,49.994912,3.482197,0.166807
996,0,50.0,0.0,2.0,3.20,17.451300,12.524040,27.827054,25.922330,32.846504,...,0.997262,0.447852,0.786987,0.558497,0.088612,0.982789,0.532284,3.507693,2.095373,0.196296
997,0,60.0,0.0,3.0,2.98,17.276160,10.673550,32.077050,28.163140,36.433540,...,0.996656,0.682663,0.701688,1.149922,0.069477,0.975496,0.424837,8.424473,3.185310,0.180726
998,0,55.0,0.0,0.0,4.26,24.321700,16.060140,38.068720,35.441760,45.174910,...,0.998672,0.416841,0.796508,0.466275,0.082688,0.992341,0.534920,2.693229,1.927579,0.185978


In [73]:
df_target_1 = data[data['target'] == 1]
df_target_0 = data[data['target'] == 0]

x1train, x1test, y1train, y1test = train_test_split(df_target_1.drop("target", axis=1), df_target_1["target"], test_size=0.1)
x0train, x0test, y0train, y0test = train_test_split(df_target_0.drop("target", axis=1), df_target_0["target"], test_size=0.1)
batch1 = pd.concat([x1train, y1train], ignore_index=True, axis=1)
batch0 = pd.concat([x0train, y0train], ignore_index=True, axis=1)
batch1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
387,80.0,1.0,0.0,9.27,18.093368,13.054772,19.090458,21.211776,26.302386,24.907158,...,0.424115,0.798925,0.537074,0.065269,0.995783,0.547107,3.345182,1.986903,0.141008,1


In [49]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [75]:
tprsacc = {}
final_predict_data = data.drop(["target"],axis=1)

def trainer(number,batch):
    #print(pd.DataFrame(batch))
    #print(number)
    scaler = StandardScaler()

    trainn_set, testn_set = train_test_split(batch, test_size=0.1, stratify=batch['target'], random_state=42)
    X_trainn = trainn_set.drop(columns=['target'])
    y_trainn = trainn_set['target']
    X_testn= testn_set.drop(columns=['target'])
    y_testn = testn_set['target']
    X_trainn = scaler.fit_transform(X_trainn)
    X_testn = scaler.transform(X_testn)
    
    model = lgb.LGBMClassifier(verbose=-1,objective = 'binary')
    model.fit(X_trainn, y_trainn)
    name = "model" + str(number) + ".joblib"
    
    predictions = model.predict(X_testn)
    tn, fp, fn, tp = confusion_matrix(y_testn, predictions).ravel()
    tprsacc[accuracy_score(y_testn,predictions)] = name

    joblib.dump(model, name)

In [None]:
threads = []
import time

start_time = time.time()


for i in range(0,1018):
    batch = batch1 = pd.concat([df_target_1, df_target_0[i*392:(i*392)+392]], ignore_index=True)
       
    thread = threading.Thread(target=trainer, args=(i,batch))
    threads.append(thread)
    thread.start()
end_time = time.time()
total_time = end_time - start_time
total_time

In [None]:
testing_predictions = {}
#testing_predictions_lock = threading.Lock()

def tester(data, model_name):
    scaler = StandardScaler()
    data = pd.DataFrame(scaler.fit_transform(data))
    for i in range(len(data)):
        # Prepare the test data
        test_datan = data.iloc[[i]]
        
        # Load the model
        #model_name = f"model{model_number}.joblib"
        model = joblib.load(model_name)
        
        # Predict using the model
        prediction = model.predict(test_datan)
        
        # Initialize the list for key `i` if it does not exist
        if i not in testing_predictions:
            testing_predictions[i] = []
        
        # Append the prediction to the list
        testing_predictions[i].append(prediction[0])