In [2]:
import numpy as np 
import matplotlib.pyplot as plt
import glob
import cv2
import os
import seaborn as sns
import pandas as pd
from skimage.filters import sobel
from skimage.feature import graycomatrix, graycoprops
from skimage.measure import shannon_entropy
from sklearn.preprocessing import LabelEncoder
import joblib
import threading
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Load the ISIC 2024 training metadata table; the cell then displays its
# (rows, columns) shape.
data = pd.read_csv("/kaggle/input/isic-2024-challenge/train-metadata.csv", low_memory=False)
data.shape

(401059, 55)

In [4]:
# Show the metadata column names (row tuples are indexed positionally further
# down: position 0 is used as the image id and position 1 as the label).
data.columns

Index(['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', '

In [None]:
def feature_extractor(img,label):
    """Compute 16 grey-level co-occurrence (GLCM) texture features for one image.

    Parameters
    ----------
    img : array passed straight to ``graycomatrix``
        (presumably a 2-D uint8 greyscale image -- TODO confirm against caller).
    label : accepted for call compatibility; it is not used by this function.

    Returns
    -------
    dict
        Keys are the strings ``'0'`` .. ``'15'`` in insertion order:
        ``'0'``-``'4'``   -> distance 1, angle 0
        ``'5'``-``'9'``   -> distance 3, angle 0
        ``'10'``-``'14'`` -> distance 5, angle 0
        (each group: energy, correlation, dissimilarity, homogeneity, contrast)
        ``'15'``          -> energy for distance 0, angle pi/4
    """
    texture_props = ('energy', 'correlation', 'dissimilarity', 'homogeneity', 'contrast')
    df = {}

    # One co-occurrence matrix per horizontal pixel offset; every property is
    # read at [distance 0][angle 0] because a single distance/angle is requested.
    for offset in (1, 3, 5):
        glcm = graycomatrix(img, [offset], [0])
        for prop in texture_props:
            df[str(len(df))] = graycoprops(glcm, prop)[0][0]

    # NOTE(review): a distance of 0 pairs every pixel with itself -- confirm
    # whether distance 1 at 45 degrees was intended here.
    self_pair_glcm = graycomatrix(img, [0], [np.pi/4])
    df[str(len(df))] = graycoprops(self_pair_glcm, 'energy')[0][0]

    return df

In [None]:
# Column layout of the feature table: the 55 metadata columns followed by the
# 16 texture features returned by feature_extractor (keys '0' .. '15').
feature_columns = ['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15']

# Collect plain Python rows and build the DataFrame once at the end: growing a
# DataFrame one `.loc` row at a time re-allocates on every insert and becomes
# prohibitively slow for a table of this size.
records = []

for row in data.values:
    # row[0] is the isic_id (image file stem), row[1] is the target label.
    image_path = "/kaggle/input/isic-2024-challenge/train-image/image/%s.jpg" %row[0]
    img = cv2.imread(image_path, 0)  # flag 0 -> load as greyscale
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # fail here with the path instead of an opaque error inside cv2.resize.
        raise FileNotFoundError("Could not read image: %s" % image_path)
    img = cv2.resize(img,(800, 800))
    temp = list(row)
    temp.extend(feature_extractor(img,row[1]).values())
    records.append(temp)

features = pd.DataFrame(records, columns=feature_columns)

In [None]:
# Cache the extracted feature table on disk. The row index is written as well
# (no index=False), so it comes back as an extra unnamed first column on reload.
features.to_csv("/kaggle/working/Processed.csv")

In [6]:
# Reload the cached feature table. This only works after the extraction and
# save cells have been run in the current session; the recorded output below
# shows the FileNotFoundError raised when they were skipped.
features = pd.read_csv("/kaggle/working/Processed.csv", low_memory=False)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/Processed.csv'

In [7]:
# Display the reloaded feature table for a quick visual check.
features

NameError: name 'features' is not defined

In [None]:
# Columns removed before modelling. `data` is rebound here from the raw
# metadata to the reduced feature table.
dropped_columns = [
    # index column produced by the CSV round-trip
    'Unnamed: 0',
    # acquisition / bookkeeping metadata
    'image_type', 'tbp_tile_type', 'attribution', 'copyright_license',
    'lesion_id', 'patient_id',
    # diagnosis hierarchy and related columns
    'iddx_full', "iddx_1", "iddx_2", "iddx_3", "iddx_4", "iddx_5",
    'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence',
    # texture features excluded from the model
    "7", "10", "11", "13", "14",
]
data = features.drop(columns=dropped_columns)

In [None]:
# Every column whose dtype is not numeric still needs encoding (or dropping)
# before it can be fed to the model.
non_numerical_columns = data.select_dtypes(exclude=['number']).columns

# Plain Python list, so later cells can edit it freely.
non_numerical_columns_list = non_numerical_columns.tolist()

print(non_numerical_columns_list)

In [None]:
import pickle

# `data_new` is an alias of `data`, not a copy: columns assigned through
# `data_new` later on are visible through `data` as well.
data_new = data

# 'isic_id' is an identifier, not a category, so it is excluded from encoding.
# Filtering into a new list (instead of list.remove) lets this cell be re-run
# without raising ValueError once the id has already been removed.
non_numerical_columns_list = [
    column for column in non_numerical_columns_list if column != 'isic_id'
]
print(non_numerical_columns_list)

# Build {column: {category value: integer code}}.
# dict.fromkeys de-duplicates while keeping first-appearance order, so the
# codes are the same on every run; iterating a set() of strings is not stable
# across interpreter sessions because of string hash randomisation.
data1 = {}
for j in non_numerical_columns_list:
    data1[j] = {value: code for code, value in enumerate(dict.fromkeys(data_new[j]))}

# Persist the mapping so the same codes can be re-applied later; the context
# manager closes the file even if pickling fails.
with open("/kaggle/working/cases.dat", "wb") as save_file:
    pickle.dump(data1, save_file)

In [None]:
# Load the {column: {category value: integer code}} mapping back from disk.
# NOTE: pickle.load executes arbitrary code from the file -- only use it on
# files this notebook wrote itself.
# The context manager closes the handle even if unpickling raises.
with open("/kaggle/working/cases.dat", "rb") as data_file:
    cases = pickle.load(data_file)

In [None]:
# Replace every category value with its integer code, one value at a time.
# Each pass rewrites the whole column, so the column ends up fully encoded
# once all of its values have been visited.
for column, value_to_code in cases.items():
    for raw_value, code in value_to_code.items():
        data_new[column] = data_new[column].replace(raw_value, code)

In [None]:
# Fixed seed so the train/evaluation partition is identical on every run.
SPLIT_SEED = 42

# Split each class separately so both partitions contain positives and negatives.
df_target_1 = data[data['target'] == 1]
df_target_0 = data[data['target'] == 0]

x1train, x1test, y1train, y1test = train_test_split(
    df_target_1.drop("target", axis=1), df_target_1["target"],
    test_size=0.1, random_state=SPLIT_SEED)
x0train, x0test, y0train, y0test = train_test_split(
    df_target_0.drop("target", axis=1), df_target_0["target"],
    test_size=0.1, random_state=SPLIT_SEED)

# Re-attach the label column to the training features.
# ignore_index is deliberately NOT used: with axis=1 it would replace every
# column name (including 'target') with 0..n-1.
batch1 = pd.concat([x1train, y1train], axis=1)

batch0 = pd.concat([x0train, y0train], axis=1)
batch1

In [None]:
# Hold-out accuracy of every trained model, keyed by the path of the saved
# model file; filled in by trainer().
tprsacc = {}
from sklearn.metrics import accuracy_score, confusion_matrix
# Feature table without the label column.
final_predict_data = data.drop(["target"],axis=1)


def trainer(name,batch):
    """Train one LightGBM classifier on ``batch`` and save it to disk.

    Parameters
    ----------
    name : value embedded in the output file name
        (``/kaggle/working/models/model<name>.joblib``).
    batch : pandas.DataFrame with a ``target`` column. Non-numeric columns are
        dropped before training.

    Side effects
    ------------
    * Writes the fitted model with ``joblib.dump``.
    * Stores the model's accuracy on an internal 10 % hold-out split in the
      module-level ``tprsacc`` dict, keyed by the model file path.

    Returns
    -------
    None
    """
    batch = batch.drop(batch.select_dtypes(exclude=['number']).columns,axis = 1)
    trainn_set, testn_set = train_test_split(batch, test_size=0.1,random_state=42)
    X_trainn = trainn_set.drop(columns=['target'])
    y_trainn = trainn_set['target']
    X_testn= testn_set.drop(columns=['target'])
    y_testn = testn_set['target']

    # Fit the scaler on the training part only and re-use its statistics for
    # the hold-out part; re-fitting on the hold-out rows would scale them with
    # different means/variances than the model was trained on.
    # NOTE(review): the scaler itself is not persisted with the model.
    scaler = StandardScaler()
    X_trainn = scaler.fit_transform(X_trainn)
    X_testn = scaler.transform(X_testn)

    model = lgb.LGBMClassifier(verbose = -1,learning_rate = 0.07)
    model.fit(X_trainn, y_trainn)

    # joblib.dump does not create missing directories.
    model_dir = "/kaggle/working/models"
    os.makedirs(model_dir, exist_ok=True)
    name = model_dir + "/model"+str(name)+".joblib"

    predictions = model.predict(X_testn)

    # Save first, then record the score, so every key in tprsacc points at a
    # model file that really exists.
    joblib.dump(model, name)
    tprsacc[name] = accuracy_score(y_testn,predictions)

In [None]:
import time

# Each model sees every training positive plus one slice of training negatives.
# 353 matches the number of positives left after the 90/10 split, which keeps
# each batch roughly balanced.
NEGATIVES_PER_BATCH = 353
N_BATCHES = 1018

# Train only on the training partitions. Building batches from the full
# per-class frames would include the rows later used for evaluation.
train_pos = x1train.assign(target=y1train)
train_neg = x0train.assign(target=y0train)

start_time = time.time()

# A bounded pool replaces ~1000 simultaneously running threads, and leaving the
# `with` block waits for every job, so the elapsed time covers the actual
# training and `tprsacc` is complete before the next cell reads it.
with ThreadPoolExecutor() as pool:
    futures = []
    for i in range(N_BATCHES):
        neg_slice = train_neg.iloc[i*NEGATIVES_PER_BATCH:(i+1)*NEGATIVES_PER_BATCH]
        if neg_slice.empty:
            # Ran out of negatives; a positives-only batch cannot train a classifier.
            break
        batch = pd.concat([train_pos, neg_slice], ignore_index=True)
        futures.append(pool.submit(trainer, i, batch))

    # Re-raise any exception from a worker instead of losing it silently.
    for future in futures:
        future.result()

end_time = time.time()
total_time = end_time - start_time
total_time

In [None]:
# Keep the (at most) 50 models with the highest hold-out accuracy.
# `models` holds the keys of `tprsacc`, ordered from most to least accurate;
# sorted() is stable, so among equal scores the earlier-recorded model wins.
# Using sorted() also avoids rebinding the builtin `min` at notebook scope.
TOP_K_MODELS = 50
models = sorted(tprsacc, key=tprsacc.get, reverse=True)[:TOP_K_MODELS]

models

In [None]:

#testing_predictions_lock = threading.Lock()

def tester(id,data):
    """Collect one prediction per selected model for a single sample.

    Parameters
    ----------
    id : key under which the predictions are stored in the module-level
        ``testing_predictions`` dict.
    data : single-row feature frame passed to each model's ``predict``.

    Side effects
    ------------
    Appends ``prediction[0]`` of every model in ``models`` (in that order) to
    ``testing_predictions[id]``, creating the list on first use. The order
    matters: position ``k`` of the list belongs to ``models[k]``.
    """
    # `models` contains the file paths of the selected models, so load exactly
    # those files. Rebuilding paths from 0..len(models)-1 would evaluate the
    # first models trained rather than the selected ones.
    # Every call reloads each model from disk.
    for model_name in models:
        model = joblib.load(model_name)

        prediction = model.predict(data)

        # Create the list for this id on first use, then append the vote.
        testing_predictions.setdefault(id, []).append(prediction[0])

In [None]:
# Evaluation set: all held-out positives plus the first 400 held-out negatives.
xInput = pd.concat([x1test, x0test.iloc[:400, :]], ignore_index=True)
yOutput = pd.concat([y1test, y0test[:400]], ignore_index=True)

# Keep the ids aside; they are keys for the predictions, not model features.
ids = xInput['isic_id']
xInput = xInput.drop(columns=['isic_id'])

# NOTE(review): this scaler is fitted on the evaluation rows themselves, not
# on the data the models were trained with -- confirm this is intended.
scaler = StandardScaler()
xInput = pd.DataFrame(scaler.fit_transform(xInput))

# id -> list of per-model predictions, filled by tester() one row at a time.
testing_predictions = {}
for position, sample_id in enumerate(ids):
    tester(sample_id, xInput.iloc[[position]])

In [None]:
# Collapse each sample's list of per-model votes into one score: every vote is
# weighted by its model's recorded accuracy and the total is divided by the
# number of selected models. The list stored under each key is replaced by
# that single number.
for sample_key, votes in testing_predictions.items():
    weighted_total = 0.0
    for position, vote in enumerate(votes):
        weighted_total += (tprsacc[models[position]] * vote)
    testing_predictions[sample_key] = weighted_total/(len(models))


In [None]:
from itertools import chain

# Scores at or above this value (after min-max scaling to [0, 1]) count as positive.
DECISION_THRESHOLD = 0.5

# Flatten the stored predictions into one list: scalar entries are taken as
# they are, list entries are expanded element by element.
testing_predictions_list = list(chain.from_iterable(
    [value] if isinstance(value, (int, float, np.integer, np.floating)) else value
    for value in testing_predictions.values()
))

# Rescale the scores to [0, 1].
norm = MinMaxScaler()
tpl = norm.fit_transform(np.array(testing_predictions_list).reshape(-1,1))

# Threshold the scaled scores. Truncating with int() would turn every score
# below exactly 1.0 into 0, leaving only the maximum-scoring samples positive.
tpl1 = [int(score >= DECISION_THRESHOLD) for score in tpl[:, 0]]

In [None]:
# Compare the binarised ensemble output with the held-out labels.
accuracy = accuracy_score(yOutput,tpl1)
tn, fp, fn, tp = confusion_matrix(yOutput, tpl1).ravel()

# Share of actual positives that were predicted positive.
recall = tp/(tp+fn)

print(recall)
print(tp, tn , fp , fn)
print(accuracy)

In [None]:
def submission(testing_predictions):
    """Write ``submission.csv`` (columns ``isic_id``, ``target``) to the current directory.

    Parameters
    ----------
    testing_predictions : dict
        Keys become the ``isic_id`` column and values the ``target`` column,
        in the dict's iteration order.
    """
    submit = pd.DataFrame({
        'isic_id': list(testing_predictions.keys()),
        'target': list(testing_predictions.values()),
    })
    submit.to_csv('submission.csv',index = False)


In [None]:
# Write the per-sample scores currently held in testing_predictions to
# submission.csv.
submission(testing_predictions)