In [3]:
import numpy as np 
import matplotlib.pyplot as plt
import glob
import cv2
import os
import seaborn as sns
import pandas as pd
from skimage.filters import sobel
from skimage.feature import graycomatrix, graycoprops
from skimage.measure import shannon_entropy
from sklearn.preprocessing import LabelEncoder
import joblib
import threading
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from concurrent.futures import ThreadPoolExecutor

In [4]:
data = pd.read_csv("/kaggle/input/isic-2024-challenge/train-metadata.csv", low_memory=False)
data.shape

(401059, 55)

In [5]:
data.columns

Index(['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', '

In [6]:
def feature_extractor(img,label):
    df = {}

    GSLM = graycomatrix(img, [1], [0])       
    GSLM_Energy = graycoprops(GSLM, 'energy')[0]
    df['0'] = GSLM_Energy[0]
    GSLM_corr = graycoprops(GSLM, 'correlation')[0]
    df['1'] = GSLM_corr[0]       
    GSLM_diss = graycoprops(GSLM, 'dissimilarity')[0]
    df['2'] = GSLM_diss[0]       
    GSLM_hom = graycoprops(GSLM, 'homogeneity')[0]
    df['3'] = GSLM_hom[0]       
    GSLM_contr = graycoprops(GSLM, 'contrast')[0]
    df['4'] = GSLM_contr[0]

    GSLM2 = graycomatrix(img, [3], [0])       
    GSLM_Energy2 = graycoprops(GSLM2, 'energy')[0]
    df['5'] = GSLM_Energy2[0]
    GSLM_corr2 = graycoprops(GSLM2, 'correlation')[0]
    df['6'] = GSLM_corr2[0]       
    GSLM_diss2 = graycoprops(GSLM2, 'dissimilarity')[0]
    df['7'] = GSLM_diss2[0]       
    GSLM_hom2 = graycoprops(GSLM2, 'homogeneity')[0]
    df['8'] = GSLM_hom2[0]       
    GSLM_contr2 = graycoprops(GSLM2, 'contrast')[0]
    df['9'] = GSLM_contr2[0]

    GSLM3 = graycomatrix(img, [5], [0])       
    GSLM_Energy3 = graycoprops(GSLM3, 'energy')[0]
    df['10'] = GSLM_Energy3[0]
    GSLM_corr3 = graycoprops(GSLM3, 'correlation')[0]
    df['11'] = GSLM_corr3[0]       
    GSLM_diss3 = graycoprops(GSLM3, 'dissimilarity')[0]
    df['12'] = GSLM_diss3[0]       
    GSLM_hom3 = graycoprops(GSLM3, 'homogeneity')[0]
    df['13'] = GSLM_hom3[0]       
    GSLM_contr3 = graycoprops(GSLM3, 'contrast')[0]
    df['14'] = GSLM_contr3[0]

    GSLM4 = graycomatrix(img, [0], [np.pi/4])       
    GSLM_Energy4 = graycoprops(GSLM4, 'energy')[0]
    df['15'] = GSLM_Energy4[0]
    
    return df

In [None]:
features = pd.DataFrame(columns=['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15'])

index = 0

for row in data.values:
    img = cv2.imread("/kaggle/input/isic-2024-challenge/train-image/image/%s.jpg" %row[0], 0)
    img = cv2.resize(img,(800, 800))
    temp = list(row)
    temp.extend(feature_extractor(img,row[1]).values())
    features.loc[index] = temp
    index += 1

In [1]:
features.to_csv("/kaggle/working/Processed.csv")

NameError: name 'features' is not defined

In [10]:
features = pd.read_csv("/kaggle/input/processed/Processed.csv", low_memory=False)

In [11]:
features

Unnamed: 0.1,Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,...,6,7,8,9,10,11,12,13,14,15
0,0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,...,0.969674,1.644876,0.439391,4.740494,0.055895,0.923862,2.641945,0.314371,11.900344,0.184858
1,1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,...,0.965190,2.560588,0.331694,11.668860,0.039492,0.910656,4.125442,0.228392,29.953731,0.155813
2,2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,...,0.984896,1.423595,0.479583,3.570464,0.052397,0.962305,2.288767,0.348609,8.906686,0.163991
3,3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,...,0.992252,1.000685,0.584999,1.903342,0.060689,0.981117,1.614436,0.446695,4.628291,0.159799
4,4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,...,0.981493,1.490896,0.463134,3.862825,0.051599,0.953135,2.415203,0.331420,9.775247,0.166973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,...,0.996506,1.441691,0.488419,3.951402,0.058140,0.991298,2.311936,0.359368,9.859451,0.172276
401055,401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,...,0.982444,1.404570,0.486601,3.574448,0.059358,0.956670,2.250429,0.355719,8.828844,0.180330
401056,401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,...,0.988815,0.915801,0.608884,1.619813,0.077038,0.972809,1.482426,0.469522,3.941297,0.193177
401057,401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,...,0.992245,0.939646,0.610181,1.820486,0.074986,0.981249,1.506836,0.477405,4.398475,0.185053


In [12]:
data = features.drop(['Unnamed: 0','image_type','tbp_tile_type','attribution','copyright_license','lesion_id','iddx_full','patient_id',"iddx_1","iddx_2","iddx_3","iddx_4","iddx_5","7","10","11","13","14",'mel_mitotic_index','tbp_lv_dnn_lesion_confidence','mel_thick_mm'],axis = 1)

In [13]:
non_numerical_columns = data.select_dtypes(exclude=['number']).columns

# Convert to a list if needed
non_numerical_columns_list = list(non_numerical_columns)

print(non_numerical_columns_list)

['isic_id', 'sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']


In [14]:
import pickle

data_new = data
data1 = {}
save_file = open("/kaggle/working/cases.dat", "wb")
non_numerical_columns_list.remove('isic_id')
print(non_numerical_columns_list)
for j in non_numerical_columns_list:
    a = list(set(list(data_new[j])))
    index = 0
    dic = {}
    for i in a:
        dic[i] = index
        index += 1

    data1[j] = dic

pickle.dump(data1, save_file)
save_file.close()

['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']


In [16]:
data_file = open("/kaggle/working/cases.dat", "rb")
cases = pickle.load(data_file)
type(cases)
data_file.close()

In [17]:
for i in cases:
    for j in cases[i]:
        data_new[i] = data_new[i].replace(j, cases[i][j])

  data_new[i] = data_new[i].replace(j, cases[i][j])


In [18]:
df_target_1 = data[data['target'] == 1]
df_target_0 = data[data['target'] == 0]

x1train, x1test, y1train, y1test = train_test_split(df_target_1.drop("target", axis=1), df_target_1["target"], test_size=0.1)
x0train, x0test, y0train, y0test = train_test_split(df_target_0.drop("target", axis=1), df_target_0["target"], test_size=0.1)
batch1 = pd.concat([x1train, y1train], ignore_index=True, axis=1)

batch0 = pd.concat([x0train, y0train], ignore_index=True, axis=1)
batch1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
248412,ISIC_6242031,80.0,1.0,5,16.07,26.971403,27.187709,23.842357,26.412866,35.998814,...,0.512417,0.753634,0.611065,0.058978,0.988865,0.467531,3.876303,2.390050,0.148749,1
301360,ISIC_7538059,55.0,0.0,4,5.14,16.026945,11.422144,28.976245,28.293435,33.113226,...,0.375441,0.816711,0.420213,0.062405,0.995458,0.570772,2.293545,1.711497,0.133697,1
24099,ISIC_0664135,60.0,1.0,5,1.65,38.289313,33.240986,27.873545,27.094191,47.360384,...,0.575314,0.738351,0.853174,0.046177,0.994555,0.465373,5.969076,2.689542,0.115805,1
201793,ISIC_5077618,55.0,0.0,0,2.54,23.789009,18.576847,31.602194,29.293924,39.555223,...,0.383448,0.811150,0.412190,0.087968,0.983151,0.552883,2.262472,1.794123,0.193523,1
151409,ISIC_3831348,65.0,1.0,4,1.23,23.230663,19.751201,24.503351,25.829733,33.765040,...,0.477691,0.772779,0.595976,0.072882,0.982765,0.502757,3.746587,2.216761,0.174069,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214765,ISIC_5400739,75.0,0.0,4,4.94,19.137410,15.875270,25.846260,28.846920,32.160060,...,0.520022,0.757407,0.699540,0.057872,0.989463,0.484152,4.647491,2.432803,0.142592,1
375284,ISIC_9360322,70.0,1.0,2,11.00,20.399131,15.160921,24.828003,28.490951,32.133382,...,0.493362,0.763916,0.600418,0.059135,0.997735,0.487940,3.752922,2.265440,0.142570,1
133171,ISIC_3374114,60.0,0.0,2,9.01,24.512110,18.431090,22.882590,27.414540,33.532910,...,0.470239,0.777138,0.597292,0.066590,0.992721,0.511679,3.770657,2.187701,0.153770,1
184988,ISIC_4659886,60.0,1.0,0,2.51,24.107748,20.279458,33.794779,32.393811,41.512294,...,0.322364,0.840124,0.335427,0.072958,0.994613,0.598285,1.655918,1.531822,0.150584,1


In [21]:
tprsacc = {}
from sklearn.metrics import accuracy_score, confusion_matrix
final_predict_data = data.drop(["target"],axis=1)


def trainer(name,batch):
    #print(pd.DataFrame(batch))
    #print(number)
    scaler = StandardScaler()
    batch = batch.drop(batch.select_dtypes(exclude=['number']).columns,axis = 1)
    trainn_set, testn_set = train_test_split(batch, test_size=0.1,random_state=42)
    X_trainn = trainn_set.drop(columns=['target'])
    y_trainn = trainn_set['target']
    X_testn= testn_set.drop(columns=['target'])
    y_testn = testn_set['target']
    X_trainn = scaler.fit_transform(X_trainn)
    X_testn = scaler.fit_transform(X_testn)
    
    
    model = lgb.LGBMClassifier(verbose = -1,learning_rate = 0.07)
    model.fit(X_trainn, y_trainn)
    name = "/kaggle/working/models/model"+str(name)+".joblib"
    
    
    predictions = model.predict(X_testn)
    tn, fp, fn, tp = confusion_matrix(y_testn, predictions).ravel()
    tprsacc[name] = accuracy_score(y_testn,predictions)

    joblib.dump(model, name)
    print(name)

In [None]:
threads = []
import time

start_time = time.time()

for i in range(0,1018):
    batch = pd.concat([df_target_1, df_target_0[i*353:(i*353)+353]], ignore_index=True)
    thread = threading.Thread(target=trainer, args=(i,batch))
    threads.append(thread)
    thread.start()
end_time = time.time()
total_time = end_time - start_time
total_time

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
models = []

for name, accuracy in tprsacc.items():
    if len(models) < 50:
        models.append(name)
        continue
    
    min = tprsacc[models[0]]
    index = 0
    for i in range(1, len(models)):
        length = len(models)
        if tprsacc[models[i]] < min :
            min = tprsacc[models[i]]
            index = i

    if accuracy > min:
        models.pop(index)
        models.append(name)

models

In [None]:

#testing_predictions_lock = threading.Lock()

def tester(id,data):
    for model_number in range(len(models)):
        model_name = f"/kaggle/working/models/model{model_number}.joblib"
        model = joblib.load(model_name)
        
        prediction = model.predict(data)
            
            # Initialize the list for key `i` if it does not exist
        if id not in testing_predictions:
            testing_predictions[id] = []
        
            # Append the prediction to the list
        testing_predictions[id].append(prediction[0])

In [None]:
xInput = pd.concat([x1test,x0test.iloc[:400,:]],ignore_index=True)
yOutput = pd.concat([y1test,y0test[:400]],ignore_index=True)
scaler = StandardScaler()
ids = xInput['isic_id']
xInput = xInput.drop(['isic_id'],axis=1)
xInput = pd.DataFrame(scaler.fit_transform(xInput))
testing_predictions = {}
for i in range(len(ids)):
    tester(ids[i],xInput.iloc[[i]])

In [None]:
for key, value in testing_predictions.items():
    conf = 0.0
    for i in range(len(value)):
        conf += (tprsacc[models[i]] * value[i])
    testing_predictions[key] = conf/(len(models))


In [None]:
from itertools import chain
testing_predictions_list = list(chain.from_iterable(
    [value] if isinstance(value, (int, float, np.integer, np.floating)) else value 
    for value in testing_predictions.values()
))
norm = MinMaxScaler()
tpl = norm.fit_transform(np.array(testing_predictions_list).reshape(-1,1))
tpl1=[]
for i in range(len(tpl)):
    tpl1.append(int(tpl[i][0]))

In [None]:
tn, fp, fn, tp = confusion_matrix(yOutput, tpl1).ravel()
print(tp/(tp+fn))
accuracy = accuracy_score(yOutput,tpl1)
print(tp, tn , fp , fn)
print(accuracy)

In [None]:
def submission(testing_predictions):
    submit = pd.DataFrame(columns = ['isic_id','target'])
    submit['isic_id'] = list(testing_predictions.keys())
    submit['target'] = list(testing_predictions.values())
    submit.to_csv('submission.csv',index = False)


In [None]:
submission(testing_predictions)