In [77]:
import numpy as np 
import matplotlib.pyplot as plt
import glob
import cv2
import os
import seaborn as sns
import pandas as pd
from skimage.filters import sobel
from skimage.feature import graycomatrix, graycoprops
from skimage.measure import shannon_entropy
from sklearn.preprocessing import LabelEncoder
import joblib
import threading
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor

In [78]:
data = pd.read_csv("./train-metadata.csv", low_memory=False)
data.shape

(401059, 55)

In [79]:
data.columns

Index(['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', '

In [80]:
def feature_extractor(img,label):
    df = {}

    GSLM = graycomatrix(img, [1], [0])       
    GSLM_Energy = graycoprops(GSLM, 'energy')[0]
    df['0'] = GSLM_Energy[0]
    GSLM_corr = graycoprops(GSLM, 'correlation')[0]
    df['1'] = GSLM_corr[0]       
    GSLM_diss = graycoprops(GSLM, 'dissimilarity')[0]
    df['2'] = GSLM_diss[0]       
    GSLM_hom = graycoprops(GSLM, 'homogeneity')[0]
    df['3'] = GSLM_hom[0]       
    GSLM_contr = graycoprops(GSLM, 'contrast')[0]
    df['4'] = GSLM_contr[0]

    GSLM2 = graycomatrix(img, [3], [0])       
    GSLM_Energy2 = graycoprops(GSLM2, 'energy')[0]
    df['5'] = GSLM_Energy2[0]
    GSLM_corr2 = graycoprops(GSLM2, 'correlation')[0]
    df['6'] = GSLM_corr2[0]       
    GSLM_diss2 = graycoprops(GSLM2, 'dissimilarity')[0]
    df['7'] = GSLM_diss2[0]       
    GSLM_hom2 = graycoprops(GSLM2, 'homogeneity')[0]
    df['8'] = GSLM_hom2[0]       
    GSLM_contr2 = graycoprops(GSLM2, 'contrast')[0]
    df['9'] = GSLM_contr2[0]

    GSLM3 = graycomatrix(img, [5], [0])       
    GSLM_Energy3 = graycoprops(GSLM3, 'energy')[0]
    df['10'] = GSLM_Energy3[0]
    GSLM_corr3 = graycoprops(GSLM3, 'correlation')[0]
    df['11'] = GSLM_corr3[0]       
    GSLM_diss3 = graycoprops(GSLM3, 'dissimilarity')[0]
    df['12'] = GSLM_diss3[0]       
    GSLM_hom3 = graycoprops(GSLM3, 'homogeneity')[0]
    df['13'] = GSLM_hom3[0]       
    GSLM_contr3 = graycoprops(GSLM3, 'contrast')[0]
    df['14'] = GSLM_contr3[0]

    GSLM4 = graycomatrix(img, [0], [np.pi/4])       
    GSLM_Energy4 = graycoprops(GSLM4, 'energy')[0]
    df['15'] = GSLM_Energy4[0]
    
    return df

In [51]:
features = pd.DataFrame(columns=['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15'])

index = 0

for row in data.values:
    img = cv2.imread("./train-image/image/%s.jpg" %row[0], 0)
    img = cv2.resize(img, (800, 800))
    temp = list(row)
    temp.extend(feature_extractor(img,row[1]).values())
    features.loc[index] = temp
    index += 1
    
    if index % 1000 == 0:
        print("%d, " %index, end="")

KeyboardInterrupt: 

In [None]:
features.to_csv("Processed.csv")

In [81]:
features = pd.read_csv("Processed.csv", low_memory=False)

In [91]:
data = features.drop(['Unnamed: 0', 'image_type','tbp_tile_type','attribution','copyright_license','lesion_id','iddx_full','patient_id','isic_id',"iddx_1","iddx_2","iddx_3","iddx_4","iddx_5","mel_thick_mm", "mel_mitotic_index","7","10","11","13","14"],axis = 1)

In [92]:
data.shape

(401059, 51)

In [93]:
non_numerical_columns = data.select_dtypes(exclude=['number']).columns

# Convert to a list if needed
non_numerical_columns_list = list(non_numerical_columns)

print(non_numerical_columns_list)

['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']


In [94]:
import pickle

data_new = data
data1 = {}
save_file = open("cases.dat", "wb")

for j in non_numerical_columns_list:
    a = list(set(list(data_new[j])))
    index = 0
    dic = {}
    for i in a:
        dic[i] = index
        index += 1

    data1[j] = dic

pickle.dump(data1, save_file)
save_file.close()

In [95]:
data_file = open("cases.dat", "rb")
cases = pickle.load(data_file)
type(cases)
data_file.close()

In [96]:
for i in cases:
    for j in cases[i]:
        data_new[i] = data_new[i].replace(j, cases[i][j])

  data_new[i] = data_new[i].replace(j, cases[i][j])


In [97]:
data_new

Unnamed: 0,target,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,...,1,2,3,4,5,6,8,9,12,15
0,0,60.0,0.0,5,3.04,20.244422,16.261975,26.922447,23.954773,33.684638,...,0.995407,0.570045,0.729752,0.718109,0.069896,0.969674,0.439391,4.740494,2.641945,0.184858
1,0,60.0,0.0,2,1.10,31.712570,25.364740,26.331000,24.549290,41.219030,...,0.995414,0.883381,0.621010,1.537013,0.049560,0.965190,0.331694,11.668860,4.125442,0.155813
2,0,60.0,0.0,1,3.40,22.575830,17.128170,37.970460,33.485410,44.174920,...,0.997565,0.492819,0.761879,0.575757,0.065654,0.984896,0.479583,3.570464,2.288767,0.163991
3,0,65.0,0.0,3,3.22,14.242329,12.164757,21.448144,21.121356,25.746200,...,0.998503,0.346510,0.828859,0.367652,0.075459,0.992252,0.584999,1.903342,1.614436,0.159799
4,0,55.0,0.0,3,2.73,24.725520,20.057470,26.464900,25.710460,36.217980,...,0.997093,0.513888,0.752340,0.607207,0.064986,0.981493,0.463134,3.862825,2.415203,0.166973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,0,70.0,0.0,3,6.80,22.574335,14.944666,27.663259,26.767135,35.705132,...,0.999450,0.499443,0.762344,0.620604,0.072442,0.996506,0.488419,3.951402,2.311936,0.172276
401055,0,60.0,0.0,1,3.11,19.977640,16.026870,34.158840,31.011870,39.571870,...,0.997146,0.488614,0.764851,0.580682,0.074116,0.982444,0.486601,3.574448,2.250429,0.180330
401056,0,65.0,1.0,3,2.05,17.332567,12.364397,29.845326,26.500073,34.513206,...,0.997717,0.315893,0.843501,0.330371,0.095693,0.988815,0.608884,1.619813,1.482426,0.193177
401057,0,30.0,1.0,3,2.80,22.288570,9.564721,28.431200,27.012250,36.126360,...,0.998493,0.327015,0.839117,0.353395,0.092291,0.992245,0.610181,1.820486,1.506836,0.185053


In [98]:
data

Unnamed: 0,target,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,...,1,2,3,4,5,6,8,9,12,15
0,0,60.0,0.0,5,3.04,20.244422,16.261975,26.922447,23.954773,33.684638,...,0.995407,0.570045,0.729752,0.718109,0.069896,0.969674,0.439391,4.740494,2.641945,0.184858
1,0,60.0,0.0,2,1.10,31.712570,25.364740,26.331000,24.549290,41.219030,...,0.995414,0.883381,0.621010,1.537013,0.049560,0.965190,0.331694,11.668860,4.125442,0.155813
2,0,60.0,0.0,1,3.40,22.575830,17.128170,37.970460,33.485410,44.174920,...,0.997565,0.492819,0.761879,0.575757,0.065654,0.984896,0.479583,3.570464,2.288767,0.163991
3,0,65.0,0.0,3,3.22,14.242329,12.164757,21.448144,21.121356,25.746200,...,0.998503,0.346510,0.828859,0.367652,0.075459,0.992252,0.584999,1.903342,1.614436,0.159799
4,0,55.0,0.0,3,2.73,24.725520,20.057470,26.464900,25.710460,36.217980,...,0.997093,0.513888,0.752340,0.607207,0.064986,0.981493,0.463134,3.862825,2.415203,0.166973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,0,70.0,0.0,3,6.80,22.574335,14.944666,27.663259,26.767135,35.705132,...,0.999450,0.499443,0.762344,0.620604,0.072442,0.996506,0.488419,3.951402,2.311936,0.172276
401055,0,60.0,0.0,1,3.11,19.977640,16.026870,34.158840,31.011870,39.571870,...,0.997146,0.488614,0.764851,0.580682,0.074116,0.982444,0.486601,3.574448,2.250429,0.180330
401056,0,65.0,1.0,3,2.05,17.332567,12.364397,29.845326,26.500073,34.513206,...,0.997717,0.315893,0.843501,0.330371,0.095693,0.988815,0.608884,1.619813,1.482426,0.193177
401057,0,30.0,1.0,3,2.80,22.288570,9.564721,28.431200,27.012250,36.126360,...,0.998493,0.327015,0.839117,0.353395,0.092291,0.992245,0.610181,1.820486,1.506836,0.185053


In [99]:
df_target_1 = data[data['target'] == 1]
df_target_0 = data[data['target'] == 0]

x1train, x1test, y1train, y1test = train_test_split(df_target_1.drop("target", axis=1), df_target_1["target"], test_size=0.1)
x0train, x0test, y0train, y0test = train_test_split(df_target_0.drop("target", axis=1), df_target_0["target"], test_size=0.1)
batch1 = pd.concat([x1train, y1train], ignore_index=True, axis=1)
batch0 = pd.concat([x0train, y0train], ignore_index=True, axis=1)
batch1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
65631,60.0,0.0,3,3.12,31.926363,25.735510,37.727808,37.025539,49.423478,45.091096,...,0.627286,0.707205,0.838431,0.049415,0.986769,0.408976,5.844986,2.965379,0.138350,1
334835,60.0,0.0,0,2.19,41.093340,24.777120,23.342800,27.003350,47.260440,36.648140,...,0.858698,0.623045,1.393404,0.042864,0.974912,0.327113,10.430596,4.004088,0.141238,1
192393,60.0,0.0,1,2.32,26.957195,24.175102,28.677592,27.677507,39.358540,36.748877,...,0.504759,0.757505,0.604240,0.067192,0.980734,0.473643,3.821581,2.356039,0.168947,1
184988,60.0,0.0,3,2.51,24.107748,20.279458,33.794779,32.393811,41.512294,38.217998,...,0.322364,0.840124,0.335427,0.072958,0.994613,0.598285,1.655918,1.531822,0.150584,1
253558,55.0,0.0,0,2.11,31.787660,25.694930,33.371060,34.688920,46.087770,43.168860,...,0.665715,0.691659,0.912630,0.060911,0.970634,0.395096,6.323694,3.068789,0.173463,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29299,60.0,0.0,2,1.92,29.157143,20.546514,21.557824,23.550959,36.261257,31.253911,...,0.574246,0.728420,0.729962,0.057304,0.983011,0.438645,4.933607,2.709042,0.154334,1
232025,70.0,0.0,1,14.36,20.077414,13.687077,22.998065,26.071619,30.528897,29.445974,...,0.462182,0.778820,0.561554,0.059186,0.998553,0.516406,3.488723,2.142709,0.127244,1
296351,75.0,0.0,5,1.10,24.376451,21.729852,23.312658,24.914878,33.729681,33.059607,...,0.518630,0.753728,0.650031,0.070199,0.977685,0.473584,4.195066,2.413594,0.175498,1
133171,60.0,1.0,1,9.01,24.512110,18.431090,22.882590,27.414540,33.532910,33.034250,...,0.470239,0.777138,0.597292,0.066590,0.992721,0.511679,3.770657,2.187701,0.153770,1


In [100]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [140]:
tprsacc = {}
final_predict_data = data.drop(["target"],axis=1)
scaler = StandardScaler()
df_target_0 = pd.DataFrame(scaler.fit_transform(df_target_0))
df_target_1 = pd.DataFrame(scaler.fit_transform(df_target_1))

def trainer(number,batch, scaler):
    #print(pd.DataFrame(batch))
    #print(number)
    trainn_set, testn_set = train_test_split(batch, test_size=0.1,random_state=42)
    X_trainn = trainn_set.drop(columns=['target'])
    y_trainn = trainn_set['target']
    X_testn= testn_set.drop(columns=['target'])
    y_testn = testn_set['target']
    
    
    model = lgb.LGBMClassifier(verbose=-1,objective = 'binary')
    model.fit(X_trainn, y_trainn)
    name = "model" + str(number) + ".joblib"
    
    predictions = model.predict(X_testn)
    tn, fp, fn, tp = confusion_matrix(y_testn, predictions).ravel()
    tprsacc[name] = accuracy_score(y_testn,predictions)

    joblib.dump(model, name)

In [141]:
threads = []
import time

start_time = time.time()


for i in range(0,1018):
    batch = pd.concat([df_target_1, df_target_0[i*353:(i*353)+353]], ignore_index=True)
       
    thread = threading.Thread(target=trainer, args=(i,batch, scaler))
    threads.append(thread)
    thread.start()
end_time = time.time()
total_time = end_time - start_time
total_time

Exception in thread Thread-6689 (trainer):
Traceback (most recent call last):
  File "e:\Software\Python3.11.9\Lib\threading.py", line 1045, in _bootstrap_inner
Exception in thread Thread-6690 (trainer):
Traceback (most recent call last):
  File "e:\Software\Python3.11.9\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "C:\Users\Gokul\AppData\Roaming\Python\Python311\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    self.run()
  File "C:\Users\Gokul\AppData\Roaming\Python\Python311\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
Exception in thread Thread-6691 (trainer):
Traceback (most recent call last):
  File "e:\Software\Python3.11.9\Lib\threading.py", line 1045, in _bootstrap_inner
    _threading_Thread_run(self)
  File "e:\Software\Python3.11.9\Lib\threading.py", line 982, in run
    _threading_Thread_run(self)
  File "e:\Software\Python3.11.9\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
 

KeyboardInterrupt: 

In [107]:
testing_predictions = {}
#testing_predictions_lock = threading.Lock()

def tester(data, model_name):
    for i in range(len(data)):
        test_datan = data.iloc[[i]]
        model = joblib.load(model_name)
        prediction = model.predict(test_datan)
        
        if i not in testing_predictions:
            testing_predictions[i] = []
        
        testing_predictions[i].append(prediction[0])

In [106]:
print(len(tprsacc))

1018


In [127]:
models = []

for name, accuracy in tprsacc.items():
    if len(models) < 10:
        models.append(name)
        continue
    
    min = tprsacc[models[0]]
    index = 0
    for i in range(1, len(models)):
        length = len(models)
        if tprsacc[models[i]] < min :
            min = tprsacc[models[i]]
            index = i

    if accuracy > min:
        models.pop(index)
        models.append(name)

models

['model47.joblib',
 'model70.joblib',
 'model133.joblib',
 'model235.joblib',
 'model364.joblib',
 'model454.joblib',
 'model418.joblib',
 'model557.joblib',
 'model575.joblib',
 'model787.joblib']

In [128]:
[tprsacc[models[i]] for i in range(0, 10)]

[0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9466666666666667,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9466666666666667,
 0.9466666666666667]

In [130]:
xInput = pd.concat([x1test,x0test],ignore_index=True)
yOutput = pd.concat([y1test,y0test],ignore_index=True)
scaler = StandardScaler()
xInput = pd.DataFrame(scaler.fit_transform(xInput))
for model_name in models:
    tester(xInput, model_name)

0        1
1        1
2        1
3        1
4        1
        ..
40102    0
40103    0
40104    0
40105    0
40106    0
Name: target, Length: 40107, dtype: int64

In [None]:
dict = testing_predictions
dict

In [67]:
for key, value in testing_predictions.items():
    # Check if the value is a list of lists, a list, or a single value
    if isinstance(value, list):
        # If it's a list of lists, flatten it
        if all(isinstance(sublist, list) for sublist in value):
            flat_predictions = [item for sublist in value for item in sublist]
        else:
            # If it's a simple list, use it directly
            flat_predictions = value
    else:
        # If it's a single value, convert it to a list
        flat_predictions = [value]
    
    # Convert to pandas Series
    prediction_series = pd.Series(flat_predictions)
    
    # Find the mode (most frequent value)
    mode_value = prediction_series.mode().iloc[0]
    
    # Replace the list with the mode value
    testing_predictions[key] = mode_value


In [68]:
from itertools import chain
testing_predictions_list = list(chain.from_iterable(
    [value] if isinstance(value, (int, float, np.integer, np.floating)) else value 
    for value in testing_predictions.values()
))
testing_predictions_list

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [69]:
tn, fp, fn, tp = confusion_matrix(yOutput, testing_predictions_list).ravel()
print(tp/(tp+fn))
accuracy = accuracy_score(yOutput,testing_predictions_list)
print(tp, tn , fp , fn)
print(accuracy)

1.0
40 141 39926 0
0.004512927917819832
